## Separacion de dataframes
### Separamos los dataframes en train, validation y test

In [1]:
import Utilidades as ut
import pandas as pd
import numpy as np

df = pd.read_pickle("Filtrado_entrenamiento.pkl")
df_test = pd.read_pickle("Filtrado_test.pkl")

#Pasamos la columna Stage a entero.
df['Stage'] = (df['Stage'] == 'Closed Won').astype('int64')

#Convertimos Total_Amount y Total_Taxable_Amount a escala logaritmica
#Sumo uno para evitar el cero

df['Total_Amount(USD)'] = np.log10(df['Total_Amount(USD)'] + 1)
df['Total_Taxable_Amount(USD)'] = np.log10(df['Total_Taxable_Amount(USD)'] + 1)
df_test['Total_Amount(USD)'] = np.log10(df_test['Total_Amount(USD)'] + 1)
df_test['Total_Taxable_Amount(USD)'] = np.log10(df_test['Total_Taxable_Amount(USD)'] + 1)

#Realizamos un split del df original a df_train, df_validation

split_date = np.datetime64('2018-01-01')

df = df.sort_values('Opportunity_Created_Date', ascending=True)
df_test = df_test.sort_values('Opportunity_Created_Date', ascending=True)

df_train = df.loc[df.Opportunity_Created_Date < split_date]
df_validation = df.loc[df.Opportunity_Created_Date >= split_date]
train_size = df_train.shape[0]
validation_size = df_validation.shape[0]

split_proportion = validation_size/(validation_size + train_size)

print(f'Se realiza un split {round(1-split_proportion, 2)}-{round(split_proportion, 2)} entre train-validation\n')
df.loc[:, 'Opportunity_Total_Amount'] = df.groupby('Opportunity_ID')['Total_Amount(USD)'].transform('sum')

df.head(3).T

Se realiza un split 0.73-0.27 entre train-validation



Unnamed: 0,16104,16105,16106
Region,EMEA,APAC,Americas
"Pricing, Delivery_Terms_Quote_Appr",0,1,1
"Pricing, Delivery_Terms_Approved",0,1,0
Bureaucratic_Code_0_Approval,0,1,1
Bureaucratic_Code_0_Approved,0,1,0
Bureaucratic_Code,Bureaucratic_Code_4,Bureaucratic_Code_3,Bureaucratic_Code_5
Account_Created_Date,2013-07-27 00:00:00,2014-01-22 00:00:00,2013-08-22 00:00:00
Billing_Country,France,Germany,United States
Account_Name,Account_Name_34,Account_Name_272,Account_Name_359
Opportunity_Name,Opportunity_Name_5265,Opportunity_Name_4731,Opportunity_Name_1390


## Feature Engineering
### Preparamos los datos para los modelos de machine learning

### Posibles features

- Opportunity_Total_Amount: El total amount de todos los productos de la oportunidad.
- Opportunity_Size: La cantidad de productos vendidos en la oportunidad.
- Planned_Opportunity_Duration: El tiempo estimado de duracion de la oportunidad.
- Actual_Opportunity_Duration: El tiempo entre que se crea la oportunidad hasta que es modificada por ultima vez.
- Opportunity_Taxable_Rate: El ratio de Total_Taxable_Amount / Opportunity_Total_Amount
- Opportunity_Currency: Moneda en la cual esta expresado el total amount de la oportunidad. (Estaba en el df original)
- Product_Amount_Deviation_of_Product_Family_rate: Desviacion de la media (o mediana) del precio del producto segun la familia del producto.
- Month: (Cambiar de formato año-mes a solo mes)
- Year: (Formada a partir de la columna original Month)
- Year-Month: Formato año-mes (formada a partir de la columna original Month)
- Avg_Product_Duration: Vida media del producto por region.
- Opportunity_TRF: TRF de la suma de productos de la oportunidad.


### Mezcla de features

- Opportunity_Duration_Ratio: Ratio entre duracion real de la oportunidad y duracion estimada Actual_Opportunity_Duration / Actual_Opportunity_Duration.
- Opportunity_Total_Amount_Region_avg: Total amount promedio por region.
- Opportunity_Total_Amount_Region_std: Desviacion estandar del total amount por region.
- Opportunity_Total_Amount_Region_avg_Ratio: Opportunity_Total_Amount / Opportunity_Total_Amount_Region_avg
- Opportunity_Total_Amount_Region_std_Ratio: Opportunity_Total_Amount / Opportunity_Total_Amount_Region_std
- Opportunity_TRF_Region_avg: Media de TRF de oportunidad para la region.
- Opportunity_TRF_Region_std: Desviacion estandar de TRF de oportunidad para la region.
- Opportunity_TRF_Region_avg_Ratio: Opportunity_TRF / Opportunity_TRF_Region_avg
- Opportunity_TRF_Region_std_Ratio: Opportunity_TRF / Opportunity_TRF_Region_std

### Features en el tiempo

- Product_Family_Total_Sells_Region_this_week: Cantidad de ventas de un producto de product Family en los ultimos 7 dias, para una Region.
- Product_Family_Total_Sells_Region_last_week: Cantidad de ventas de un producto de product Family en la semana pasada, para una Region.
- Product_Family_Total_Sells_Region_last_week_ratio: Product_Family_Total_Sells_Region_this_week / Product_Family_Total_Sells_Region_last_week

- Product_Family_Total_Sells_Region_this_month: Cantidad de ventas de un producto de product Family en el ultimo mes, para una Region.
- Product_Family_Total_Sells_Region_last_month: Cantidad de ventas de un producto de product Family en el mes pasado, para una Region.
- Product_Family_Total_Sells_Region_last_month_ratio: Product_Family_Total_Sells_Region_this_month / Product_Family_Total_Sells_Region_last_month

- Product_Family_Total_Sells_Region_this_quarter: Cantidad de ventas de un producto de product Family en el ultimo trimestre, para una Region.
- Product_Family_Total_Sells_Region_last_quarter: Cantidad de ventas de un producto de product Family en el trimestre pasado, para una Region.
- Product_Family_Total_Sells_Region_last_quarter_ratio: Product_Family_Total_Sells_Region_this_quarter / Product_Family_Total_Sells_Region_last_quarter




### Armado de los features

In [2]:
# Armamos una funcion para aplicar el mismo procedimiento a los distintos dataframes

def feature_creation(df):

    ##### Opportunity_Total_Amount
    
    df['Opportunity_Total_Amount'] = df.groupby('Opportunity_ID')['Total_Amount(USD)'].transform('sum')
    
    ### Opportunity_Size
    
    df['Opportunity_Size'] = df.groupby('Opportunity_ID')['Total_Amount(USD)'].transform('count')
    
    ### Planned_Deliver_Duration
    
    df['Planned_Deliver_Duration'] = (df['Planned_Delivery_End_Date'] - df['Planned_Delivery_Start_Date']) / np.timedelta64(1, 'D')
    
    ### Actual_Opportunity_Duration
    
    df['Actual_Opportunity_Duration'] = (df['Last_Modified_Date'] - df['Opportunity_Created_Date']) / np.timedelta64(1, 'D')
    
    ### OBS: Las dos siguientes tienen valores negativos, no tiene ningun sentido. Ver que hacer.
    
    ### Planned_Time_Until_Deliver
    
    df['Planned_Time_Until_Deliver'] = (df['Planned_Delivery_Start_Date'] - df['Opportunity_Created_Date']) / np.timedelta64(1, 'D')
    
    ### Planned_Opportunity_Duration
    
    df['Planned_Opportunity_Duration'] = (df['Planned_Delivery_End_Date'] - df['Opportunity_Created_Date']) / np.timedelta64(1, 'D')
    
    ### Opportunity_Taxable_Rate
    
    df['Opportunity_Taxable_Rate'] = df['Total_Taxable_Amount(USD)'] / df['Opportunity_Total_Amount']
    
    ### Opportunity_Currency
    
    ## YA INCLUIDA EN ASP_CURRENCY, VER SI VALE LA PENA CAMBIARLE EL NOMBRE A LA COLUMNA
    
    ### Product_Amount_Deviation_of_Product_Family_rate
    
    product_family_median = df.groupby('Product_Family')['Total_Amount(USD)'].transform('median')
    df['Product_Amount_Deviation_of_Product_Family_rate'] = (df['Total_Amount(USD)'] - product_family_median) / product_family_median
    
    ### Year-Month
    
    df['Year-Month'] = df['Month']
    df = df.astype({'Year-Month' : 'category'})
    
    ### Month
    
    df = df.astype({'Month' : object})
    
    for (idx, v) in zip(df['Year-Month'].index, df['Year-Month'].values):
        anio = v[:4]
        mes = v[7:]
        df.loc[idx, 'Month'] = mes
    
    df = df.astype({'Month' : 'category'})
    
    ### Avg_Product_Duration
    
    #Ver como vincular la vida media de los productos.
    
    ### Opportunity_TRF
    
    df['Opportunity_TRF'] = df.groupby('Opportunity_ID')['TRF'].transform('sum')
    
    return df

### Mezcla de features

In [3]:
# Armamos una funcion para aplicar el mismo procedimiento a los distintos dataframes

def feature_composition(df):

    ### Opportunity_Duration_Ratio
    
    df['Opportunity_Duration_Ratio'] = df['Planned_Opportunity_Duration'] / df['Actual_Opportunity_Duration']
    
    #### CHEQUEAR ESTO #####################
    # Llenamos los posibles inf que hayan surgido con un numero "muy grande".
    
    df['Opportunity_Duration_Ratio'].replace(to_replace=np.inf, value=10000, inplace=True)
    df['Opportunity_Duration_Ratio'].replace(to_replace=-np.inf, value=-10000, inplace=True)
    
    #########################################
    
    ### Opportunity_Total_Amount_Region_avg
    
    df['Opportunity_Total_Amount_Region_avg'] = df.groupby('Region')['Opportunity_Total_Amount'].transform('mean')
    
    ### Opportunity_Total_Amount_Region_std
    
    df['Opportunity_Total_Amount_Region_std'] = df.groupby('Region')['Opportunity_Total_Amount'].transform('std')
    
    ### Opportunity_Total_Amount_Region_avg_Ratio
    
    df['Opportunity_Total_Amount_Region_avg_Ratio'] = df['Opportunity_Total_Amount'] / df['Opportunity_Total_Amount_Region_avg']
    
    ### Opportunity_Total_Amount_Region_std_Ratio
    
    df['Opportunity_Total_Amount_Region_std_Ratio'] = df['Opportunity_Total_Amount'] / df['Opportunity_Total_Amount_Region_std']
    
    ### Opportunity_TRF_Region_avg
    
    df['Opportunity_TRF_Region_avg'] = df.groupby('Region')['TRF'].transform('mean')
    
    ### Opportunity_TRF_Region_std
    
    df['Opportunity_TRF_Region_std'] = df.groupby('Region')['TRF'].transform('std')
    
    ### Opportunity_Total_Amount_Region_avg_Ratio
    
    df['Opportunity_TRF_Region_avg_Ratio'] = df['Opportunity_TRF'] / df['Opportunity_TRF_Region_avg']
    
    ### Opportunity_Total_Amount_Region_std_Ratio
    
    df['Opportunity_TRF_Region_std_Ratio'] = df['Opportunity_TRF'] / df['Opportunity_TRF_Region_std']
    
    ### Opportunity_Duration_by_Billing_Country
    
    df["Opportunity_Duration_by_Billing_Country"] = df.groupby("Billing_Country")["Opportunity_Duration"].transform("mean")
    
    ### Opportunity_Duration_by_Account_Type
    
    df["Opportunity_Duration_by_Account_Type"] = df.groupby("Account_Type")["Opportunity_Duration"].transform("mean")
    
    ## ASP_by_Region_mean
    
    df["ASP_by_Region_mean"] = df.groupby("Region")["ASP"].transform("mean")
    
    ## ASP_by_Region_std
    
    df["ASP_by_Region_std"] = df.groupby("Region")["ASP"].transform("std")
    
    ## ASP_by_Billing_Country_mean
    
    df["ASP_by_Billing_Country_mean"] = df.groupby("Billing_Country")["ASP"].transform("mean")
    
    ## ASP_by_Billing_Country_std
    
    df["ASP_by_Billing_Country_std"] = df.groupby("Billing_Country")["ASP"].transform("std")
    
    ## Total_Amount_by_Billing_Country_mean
    
    df["Total_Amount_by_Billing_Country_mean"] = df.groupby("Billing_Country")["Total_Amount_USD"].transform("mean")
    
    ## Total_Amount_by_Billing_Country_std
    
    df["Total_Amount_by_Billing_Country_std"] = df.groupby("Billing_Country")["Total_Amount_USD"].transform("std")
    
    ## Buro_Approved_by_Billing_Country_mean
    
    df["Buro_Approved_by_Billing_Country_mean"] = df.groupby("Billing_Country")["Bureaucratic_Code_0_Approved"].transform("mean")
    
    ## Buro_Approved_by_Billing_Country_std
    
    df["Buro_Approved_by_Billing_Country_std"] = df.groupby("Billing_Country")["Bureaucratic_Code_0_Approved"].transform("std")
    
    ## Opportunity_Duration_by_Product_Family_mean
    
    df["Opportunity_Duration_by_Product_Family_mean"] = df.groupby("Product_Family")["Opportunity_Duration"].transform("mean")
    
    ## Opportunity_Duration_by_Product_Family_std
    
    df["Opportunity_Duration_by_Product_Family_std"] = df.groupby("Product_Family")["Opportunity_Duration"].transform("std")
    
    ## Total_Amount_by_Product_Family_mean
    
    df["Total_Amount_by_Product_Family_mean"] = df.groupby("Product_Family")["Total_Amount"].transform("mean")
    
    ## Total_Amount_by_Product_Family_std
    
    df["Total_Amount_by_Product_Family_std"] = df.groupby("Product_Family")["Total_Amount"].transform("std")
    
    ## Buro_Approved_by_Product_Family_mean
    
    df["Buro_Approved_by_Product_Family"] = df.groupby("Product_Family")["Bureaucratic_Code_0_Approved"].transform("mean")
    
    ## Buro_Approved_by_Product_Family_std
    
    df["Buro_Approved_by_Product_Family"] = df.groupby("Product_Family")["Bureaucratic_Code_0_Approved"].transform("std")
    
    return df

## Features en el tiempo

- Product_Family_Total_Sells_Region_this_week: Cantidad de ventas de un producto de product Family en los ultimos 7 dias, para una Region.
- Product_Family_Total_Sells_Region_last_week: Cantidad de ventas de un producto de product Family en la semana pasada, para una Region.
- Product_Family_Total_Sells_Region_last_week_ratio: Product_Family_Total_Sells_Region_this_week / Product_Family_Total_Sells_Region_last_week

- Product_Family_Total_Sells_Region_this_month: Cantidad de ventas de un producto de product Family en el ultimo mes, para una Region.
- Product_Family_Total_Sells_Region_last_month: Cantidad de ventas de un producto de product Family en el mes pasado, para una Region.
- Product_Family_Total_Sells_Region_last_month_ratio: Product_Family_Total_Sells_Region_this_month / Product_Family_Total_Sells_Region_last_month

- Product_Family_Total_Sells_Region_this_quarter: Cantidad de ventas de un producto de product Family en el ultimo trimestre, para una Region.
- Product_Family_Total_Sells_Region_last_quarter: Cantidad de ventas de un producto de product Family en el trimestre pasado, para una Region.
- Product_Family_Total_Sells_Region_last_quarter_ratio: Product_Family_Total_Sells_Region_this_quarter / Product_Family_Total_Sells_Region_last_quarter

In [4]:
def time_feature_1(df):

    df['YearMonth'] = (df['Opportunity_Created_Date'].dt.year.astype(str) + df['Opportunity_Created_Date'].dt.month.astype(str)).astype(int)
    
    df = df.groupby(['Region','Product_Family']).apply(lambda x: x.sort_values('Opportunity_Created_Date'))
    
    df = df.reset_index(drop=True)
    
    df['Total_Product_Family_Region_This_Month'] = df.groupby(['Region','Product_Family', 'YearMonth'])['Opportunity_Created_Date'].transform('count')
    
    df['Total_Product_Family_Region_Last_Month'] = df.groupby(['Region', 'Product_Family'])['Total_Product_Family_Region_This_Month'].shift()
    
    df['Opp_Created_Month'] = df['Opportunity_Created_Date'].apply(lambda x: x.replace(day=1))
    
    df['Opp_Created_Date_last_row'] = df.groupby(['Region', 'Product_Family'])['Opp_Created_Month'].shift()
    
    df['Less_than_a_month'] = ((df['Opp_Created_Month'] - df['Opp_Created_Date_last_row']) / np.timedelta64(1, 'D')) <= 31
    
    df['Total_Product_Family_Region_Last_Month'] = df['Total_Product_Family_Region_Last_Month'] * df['Less_than_a_month']
    
    df['Total_Product_Family_Region_Last_Month'] = df['Total_Product_Family_Region_Last_Month'].fillna(0)
    
    df['Total_Product_Family_Region_Month_Change'] = df['Total_Product_Family_Region_Last_Month'] / df['Total_Product_Family_Region_This_Month']
    
    df = df.drop(columns=['YearMonth', 'Opp_Created_Month', 'Opp_Created_Date_last_row', 'Less_than_a_month'])
        
    return df

In [5]:
def time_feature_2(df):

    ## Total_Products_Region_This_Week, Total_Products_Region_Last_Week, Total_Products_Region_Week_Change
    
    df['YearWeek'] = (df['Opportunity_Created_Date'].dt.year.astype(str) + df['Opportunity_Created_Date'].dt.isocalendar().week.astype(str)).astype(int)
    
    df = df.groupby(['Region']).apply(lambda x: x.sort_values('Opportunity_Created_Date'))
    
    df = df.reset_index(drop=True)
    
    df['Total_Products_Region_This_Week'] = df.groupby(['Region', 'YearWeek'])['Opportunity_Created_Date'].transform('count')
    
    df['Total_Products_Region_Last_Week'] = df.groupby(['Region'])['Total_Products_Region_This_Week'].shift()
    
    df['Opp_Created_Week'] = df['Opportunity_Created_Date'].apply(lambda x: x.replace(day=1))
    
    df['Opp_Created_Date_last_row'] = df.groupby(['Region'])['Opp_Created_Week'].shift()
    
    df['Less_than_a_week'] = ((df['Opp_Created_Week'] - df['Opp_Created_Date_last_row']) / np.timedelta64(1, 'D')) <= 7
    
    df['Total_Products_Region_Last_Week'] = df['Total_Products_Region_Last_Week'] * df['Less_than_a_week']
    
    df['Total_Products_Region_Last_Week'] = df['Total_Products_Region_Last_Week'].fillna(0)
    
    #Hago el ratio al reves para evitar infinitos
    
    df['Total_Products_Region_Week_Change'] = df['Total_Products_Region_Last_Week'] / df['Total_Products_Region_This_Week']
    
    df = df.drop(columns=['YearWeek', 'Opp_Created_Week', 'Opp_Created_Date_last_row', 'Less_than_a_week'])
    
    return df

In [6]:
def time_feature_3(df):

    ## Total_Products_Region_This_Month, Total_Products_Region_Last_Month, Total_Products_Region_Month_Change
    
    df['YearMonth'] = (df['Opportunity_Created_Date'].dt.year.astype(str) + df['Opportunity_Created_Date'].dt.month.astype(str)).astype(int)
    
    df = df.groupby(['Region']).apply(lambda x: x.sort_values('Opportunity_Created_Date'))
    
    df = df.reset_index(drop=True)
    
    df['Total_Products_Region_This_Month'] = df.groupby(['Region', 'YearMonth'])['Opportunity_Created_Date'].transform('count')
    
    df['Total_Products_Region_Last_Month'] = df.groupby(['Region'])['Total_Products_Region_This_Month'].shift()
    
    df['Opp_Created_Month'] = df['Opportunity_Created_Date'].apply(lambda x: x.replace(day=1))
    
    df['Opp_Created_Date_last_row'] = df.groupby(['Region'])['Opp_Created_Month'].shift()
    
    df['Less_than_a_month'] = ((df['Opp_Created_Month'] - df['Opp_Created_Date_last_row']) / np.timedelta64(1, 'D')) <= 31
    
    df['Total_Products_Region_Last_Month'] = df['Total_Products_Region_Last_Month'] * df['Less_than_a_month']
    
    df['Total_Products_Region_Last_Month'] = df['Total_Products_Region_Last_Month'].fillna(0)
    
    #Hago el ratio al reves para evitar infinitos
    
    df['Total_Products_Region_Month_Change'] = df['Total_Products_Region_Last_Month'] / df['Total_Products_Region_This_Month']
    
    df = df.drop(columns=['YearMonth', 'Opp_Created_Month', 'Opp_Created_Date_last_row', 'Less_than_a_month'])
    
    return df

In [7]:
def time_feature_4(df):

    ## Total_Products_Region_This_Quarter, Total_Products_Region_Last_Quarter, Total_Products_Region_Quarter_Change
    
    df['YearQuarter'] = (df['Opportunity_Created_Date'].dt.year.astype(str) + df['Opportunity_Created_Date'].dt.quarter.astype(str)).astype(int)
    
    df = df.groupby(['Region']).apply(lambda x: x.sort_values('Opportunity_Created_Date'))
    
    df = df.reset_index(drop=True)
    
    df['Total_Products_Region_This_Quarter'] = df.groupby(['Region', 'YearQuarter'])['Opportunity_Created_Date'].transform('count')
    
    df['Total_Products_Region_Last_Quarter'] = df.groupby(['Region'])['Total_Products_Region_This_Quarter'].shift()
    
    df['Opp_Created_Quarter'] = df['Opportunity_Created_Date'].apply(lambda x: x.replace(day=1))
    
    df['Opp_Created_Date_last_row'] = df.groupby(['Region'])['Opp_Created_Quarter'].shift()
    
    df['Less_than_a_Quarter'] = ((df['Opp_Created_Quarter'] - df['Opp_Created_Date_last_row']) / np.timedelta64(1, 'D')) <= 31*3
    
    df['Total_Products_Region_Last_Quarter'] = df['Total_Products_Region_Last_Quarter'] * df['Less_than_a_Quarter']
    
    df['Total_Products_Region_Last_Quarter'] = df['Total_Products_Region_Last_Quarter'].fillna(0)
    
    #Hago el ratio al reves para evitar infinitos
    
    df['Total_Products_Region_Quarter_Change'] = df['Total_Products_Region_Last_Quarter'] / df['Total_Products_Region_This_Quarter']
    
    df = df.drop(columns=['YearQuarter', 'Opp_Created_Quarter', 'Opp_Created_Date_last_row', 'Less_than_a_Quarter'])

    return df

In [1]:
def duration_features(df_train, df_test):
    
    ### Family_Duration
    
    df_familia = df_train[['Stage','Region','Product_Family','Planned_Delivery_Start_Date']]
    df_familia = df_familia[df_familia['Stage'] == 'Closed Won']
    df_familia = df_familia.groupby(['Product_Family'])['Planned_Delivery_Start_Date'].agg(['max','min']).reset_index()
    df_familia['Family_Duration'] = (df_familia['max'] - df_familia['min']).dt.days
    df_train = df_train.merge(df_familia[['Product_Family','Family_Duration','max']],on='Product_Family',how='left')
    df_train['Family_Duration'] =  (df_train['Planned_Delivery_Start_Date'] - df_train['max']).dt.days - df_train['Family_Duration']
    df_train['Family_Duration'].replace(np.nan,0)
    #df_train['Family_Durarion'] =  np.sign(df_train['Family_Durarion'])
    df_train = df_train.drop(columns=['max'])
    df_test = df_test.merge(df_train[['Product_Family','Family_Duration']].drop_duplicates(subset=['Product_Family']),left_on='Product_Family',right_on='Product_Family',how='left')
    
    ### Region_Duration
    
    df_region = df_train[['Stage','Region','Planned_Delivery_Start_Date']]
    df_region = df_region[df_familia['Stage'] == 'Closed Won']
    df_region = df_region.groupby(['Region'])['Planned_Delivery_Start_Date'].agg(['max','min']).reset_index()
    df_region['Region_Duration'] = (df_region['max'] - df_region['min']).dt.days
    df_train = df_train.merge(df_region[['Region','Region_Duration','max']],on='Region',how='left')
    df_train['Region_Duration'] =  (df_train['Planned_Delivery_Start_Date'] - df_train['max']).dt.days - df_train['Region_Duration']
    df_train['Region_Duration'].replace(np.nan,0)
    #df_train['Region_Duration'] =  np.sign(df_train['Region_Duration'])
    df_train = df_train.drop(columns=['max'])
    df_test = df_test.merge(df_train[['Region','Region_Duration']].drop_duplicates(subset=['Region']),left_on='Region',right_on='Region',how='left')
    
    ### Territory_Duration
    
    df_territory = df_train[['Stage','Territory','Planned_Delivery_Start_Date']]
    df_territory = df_territory[df_familia['Stage'] == 'Closed Won']
    df_territory = df_territory.groupby(['Territory'])['Planned_Delivery_Start_Date'].agg(['max','min']).reset_index()
    df_territory['Territory_Duration'] = (df_territory['max'] - df_territory['min']).dt.days
    df_train = df_train.merge(df_territory[['Territory','Territory_Duration','max']],on='Territory',how='left')
    df_train['Territory_Duration'] =  (df_train['Planned_Delivery_Start_Date'] - df_train['max']).dt.days - df_train['Territory_Duration']
    df_train['Territory_Duration'].replace(np.nan,0)
    #df_train['Territory_Duration'] =  np.sign(df_train['Territory_Duration'])
    df_train = df_train.drop(columns=['max'])
    df_test = df_test.merge(df_train[['Territory','Territory_Duration']].drop_duplicates(subset=['Territory']),left_on='Territory',right_on='Territory',how='left')
    
    return df_train, df_test

In [8]:
# Una idea de cuantas categorias hay por cada columna categorica.

columnas = df_train.select_dtypes(['category'])
resultado = []

for col in columnas:
    cuenta = df_train[col].value_counts().count()
    resultado.append((col, cuenta))
resultado.sort(key=lambda x: x[1])

for item in resultado:
    print(f"{item[0]} - {item[1]} categorias")

Quote_Type - 2 categorias
Delivery_Quarter - 4 categorias
Delivery_Year - 4 categorias
Region - 5 categorias
ASP_Currency - 5 categorias
Bureaucratic_Code - 6 categorias
Account_Type - 7 categorias
Delivery_Terms - 9 categorias
Opportunity_Type - 25 categorias
Month - 45 categorias
Account_Owner - 47 categorias
Opportunity_Owner - 53 categorias
Last_Modified_By - 53 categorias
Billing_Country - 79 categorias
Product_Family - 225 categorias
Product_Name - 444 categorias
Account_Name - 1609 categorias
Opportunity_Name - 9652 categorias


## Redes Neuronales: Creamos el set de entrenamiento, validacion y test

In [9]:
def time_features(df):
    
    df = time_feature_1(df)
    df = time_feature_2(df)
    df = time_feature_3(df)
    df = time_feature_4(df)
    
    return df

def prepare_dataframes(df, dtypes_dict=None, test=False):
    
    if (dtypes_dict is not None):
        df = df.astype(dtypes_dict)
    
    df = df.sort_values('Opportunity_Created_Date').reset_index(drop=True)
    
    #No poner Opportunity_ID aca
    df = df.drop(columns=['Opportunity_Name',
                          #'Account_Name',
                          #'Account_Owner',
                          #'Opportunity_Owner',
                          #'Opportunity_Created_Date',
                          #'Account_Created_Date',
                          #'Last_Modified_Date',
                          #'Last_Modified_By',
                          #'Planned_Delivery_Start_Date',
                          #'Planned_Delivery_End_Date',
                          #'Month',
                          'Product_Name',
                          #'Quote_Type',
                          #'Account_Type',
                          'Account_Name'])
    if (not test):
        df = df.drop(columns=['Opportunity_ID'])
    
    df = df.select_dtypes(['category', 'int64', 'float64', 'datetime'])
    
    return df

# Componemos los dataframes para la red neuronal

#df_train = feature_creation(df_train.copy())
#df_train = feature_composition(df_train.copy())
#
#df_validation = feature_creation(df_validation.copy())
#df_validation = feature_composition(df_validation.copy())
#
#df_test = feature_creation(df_test.copy())
#df_test = feature_composition(df_test.copy())

#Combinamos train y validation para los features dependientes del tiempo
split_date_1 = df_validation['Opportunity_Created_Date'].nsmallest(1).values[0]
df_train_validation = df_train.append(df_validation, ignore_index=True)
split_date_2 = df_test['Opportunity_Created_Date'].nsmallest(1).values[0]
df_train_val_test = df_train_validation.append(df_test, ignore_index=True)
df_train_val_test = df_train_val_test.drop(columns=['Stage'])

df_train = time_features(df_train)
df_train_validation = time_features(df_train_validation)
df_train_val_test = time_features(df_train_val_test)


########## VEMOS QUE PASA SI TOMAMOS LOS DATASETS COMPLETOS PARA CALCULAR LOS FEATURES

df_train = feature_creation(df_train.copy())
df_train = feature_composition(df_train.copy())

df_train_validation = feature_creation(df_validation.copy())
df_train_validation = feature_composition(df_validation.copy())

df_train_val_test = feature_creation(df_test.copy())
df_train_val_test = feature_composition(df_test.copy())

###################################################################################


df_train = duration_feature(df_train.copy())
df_train_validation = duration_feature(df_train_validation.copy())
df_train_val_test = duration_feature(df_train_val_test.copy())


#Volvemos a separar los dataframes

df_validation = df_train_validation[df_train_validation.Opportunity_Created_Date >= split_date_1]
df_test = df_train_val_test[df_train_val_test.Opportunity_Created_Date >= split_date_2]


# Recuperamos el tipado del set de test (que no entiendo por que se pierde)

dtypes_dict = df_train.drop(columns=['Stage']).dtypes.apply(lambda x: x.name).to_dict()

#Preparamos los df

df_train = prepare_dataframes(df_train, dtypes_dict=dtypes_dict)
df_validation = prepare_dataframes(df_validation, dtypes_dict=dtypes_dict)
df_test = prepare_dataframes(df_test, dtypes_dict=dtypes_dict, test=True)

In [10]:
a = len(df_train.columns)
b = len(df_validation.columns)
c = len(df_test.columns)

sub_b = df_train.columns.difference(df_validation.columns)
sub_c = df_train.columns.difference(df_test.columns)

print(a, b, c)
print(sub_b, sub_c)

60 60 60
Index([], dtype='object') Index(['Stage'], dtype='object')


In [11]:
df_train_val_test.dtypes

Region                                  category
Pricing, Delivery_Terms_Quote_Appr         int64
Pricing, Delivery_Terms_Approved           int64
Bureaucratic_Code_0_Approval               int64
Bureaucratic_Code_0_Approved               int64
                                          ...   
Total_Products_Region_Last_Month         float64
Total_Products_Region_Month_Change       float64
Total_Products_Region_This_Quarter         int64
Total_Products_Region_Last_Quarter       float64
Total_Products_Region_Quarter_Change     float64
Length: 63, dtype: object

#### Buscamos salvar o eliminar datos incompatibles de los features recien creados
##### Nota: A partir de aca el trabajo es bastante manual, ya que depende de los features que creemos

In [12]:
# Dataframe de train
ut.diagnostico_df(df_train)

Suma: 32, Columna: Opportunity_Taxable_Rate


In [13]:
df_train['Opportunity_Taxable_Rate'] = df_train['Opportunity_Taxable_Rate'].fillna(0)
ut.diagnostico_df(df_train)

Ninguna columna tiene datos incompatibles


In [14]:
# Dataframe de validacion
ut.diagnostico_df(df_validation)

Suma: 2, Columna: Opportunity_Duration_Ratio


In [15]:
df_validation['Opportunity_Duration_Ratio'] = df_validation['Opportunity_Duration_Ratio'].fillna(0)
ut.diagnostico_df(df_validation)

Ninguna columna tiene datos incompatibles


In [16]:
# Dataframe de test
ut.diagnostico_df(df_test)

Suma: 2, Columna: Opportunity_Duration_Ratio


In [17]:
df_test['Opportunity_Duration_Ratio'] = df_test['Opportunity_Duration_Ratio'].fillna(0)
ut.diagnostico_df(df_test)

Ninguna columna tiene datos incompatibles


In [18]:
df_train.to_pickle("Neuronales_train.pkl")
df_validation.to_pickle("Neuronales_validation.pkl")
df_test.to_pickle("Neuronales_test.pkl")

In [19]:
#def neural_network_features(df, prior_df=None):
#    
#    original_size = df.shape[0]
#    added_columns = prior_df.columns.difference(df.columns)
#    df = feature_creation(df)
#    df = feature_composition(df)
#    print("Antes del if")
#    print(df.dtypes)
#    if (prior_df is not None):
#        split_date = df['Opportunity_Created_Date'].nsmallest(1).values[0]
#        combined_df = prior_df.append(df, ignore_index=True)
#        print("En el if, despues del append")
#        print(combined_df.dtypes)
#        combined_df = time_feature_1(combined_df)
#        combined_df = time_feature_2(combined_df)
#        combined_df = time_feature_3(combined_df)
#        combined_df = time_feature_4(combined_df)
#        print("En el if, despues de los features")
#        print(combined_df.dtypes)
#        #Hacer un append puede crear una columna no deseada en el df original
#        combined_df = combined_df.drop(columns=added_columns)
#        df = combined_df[combined_df.Opportunity_Created_Date >= split_date]
#        print("En el if, despues de la separacion")
#        print(df.dtypes)
#    else:
#        df = time_feature_1(df)
#        df = time_feature_2(df)
#        df = time_feature_3(df)
#        df = time_feature_4(df)
#    
#    actual_size = df.shape[0]
#    if (original_size != actual_size):
#        print("ERROR: Cambio la cantidad de filas del dataframe en la funcion 'neural_network_features'. "
#              f"El dataframe original tenia {original_size} filas y el nuevo {actual_size}")
#        return df
#    
#    df = df.sort_values('Opportunity_Created_Date').reset_index(drop=True)
#
#    df = df.drop(columns=['Opportunity_ID',
#                          'Opportunity_Name',
#                          #'Account_Name',
#                          #'Account_Owner',
#                          #'Opportunity_Owner',
#                          #'Opportunity_Created_Date',
#                          #'Account_Created_Date',
#                          #'Last_Modified_Date',
#                          #'Last_Modified_By',
#                          #'Planned_Delivery_Start_Date',
#                          #'Planned_Delivery_End_Date',
#                          #'Month',
#                          'Product_Name',
#                          #'Quote_Type',
#                          #'Account_Type',
#                          'Account_Name'])
#    
#    #df = df.select_dtypes(['category', 'int64', 'float64', 'datetime64'])
#    
#    return df
#
#combined_df = df_train.append(df_validation, ignore_index=True)
##combined_df = combined_df.drop(columns=['Stage'])
#df_test = neural_network_features(df_test, prior_df=combined_df)
#df_validation = neural_network_features(df_validation.copy(), prior_df=df_train.copy())
#df_train = neural_network_features(df_train.copy())

In [20]:
#prueba = df.loc[:, ['Region', 'Opportunity_Created_Date']]
#prueba

In [21]:
#prueba = df.loc[:, ['Region', 'Opportunity_Created_Date', 'Bureaucratic_Code_0_Approved']]
#prueba['This_Month_Sells'] = 1
#prueba['Last_Month_Sells'] = 1

#prueba['Last_Month_Sells'] = prueba.groupby([prueba.Region, prueba.Opportunity_Created_Date.dt.month])['count'].transform('count')
#this_month = prueba.groupby(prueba.Region).rolling('31d', on='Opportunity_Created_Date').agg({'Bureaucratic_Code_0_Approved' : 'count'}).reset_index()
#last_month = prueba.groupby(prueba.Region).rolling('62d', on='Opportunity_Created_Date').agg({'Bureaucratic_Code_0_Approved' : 'count'}).reset_index()
#this_month = this_month.drop(columns=['level_1']).rename(columns={'Bureaucratic_Code_0_Approved' : 'This_Month_Sells'}).set_index('Region')
#last_month = last_month.drop(columns=['level_1']).rename(columns={'Bureaucratic_Code_0_Approved' : 'Last_Month_Sells'}).set_index('Region')

#this_month

In [22]:
#sarasa[sarasa.Region == 'Americas'].tail(20)

In [23]:
#prueba['count'].value_counts()

In [24]:
#prueba[prueba['Region'] == 'Americas'].T

In [26]:
df_train.Opportunity_Created_Date.nlargest(30)

11906   2017-12-30
11899   2017-12-29
11900   2017-12-29
11901   2017-12-29
11902   2017-12-29
11903   2017-12-29
11904   2017-12-29
11905   2017-12-29
11894   2017-12-28
11895   2017-12-28
11896   2017-12-28
11897   2017-12-28
11898   2017-12-28
11891   2017-12-27
11892   2017-12-27
11893   2017-12-27
11880   2017-12-26
11881   2017-12-26
11882   2017-12-26
11883   2017-12-26
11884   2017-12-26
11885   2017-12-26
11886   2017-12-26
11887   2017-12-26
11888   2017-12-26
11889   2017-12-26
11890   2017-12-26
11874   2017-12-22
11875   2017-12-22
11876   2017-12-22
Name: Opportunity_Created_Date, dtype: datetime64[ns]