In [49]:
# Tratamiento de datos
# ==============================================================================
import numpy as np
import pandas as pd
import statsmodels.api as sm
import category_encoders as ce


# Gráficos
# ==============================================================================
import matplotlib.pyplot as plt

# Preprocesado y modelado
# ==============================================================================
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import log_loss
from sklearn.impute import KNNImputer

# Configuración warnings
# ==============================================================================
import warnings
warnings.filterwarnings('once')

In [50]:
train = pd.read_csv('train.csv', encoding='utf-8')
test = pd.read_csv('test.csv', encoding='utf-8')

In [51]:
train['Total_Amount_Sum'] = train.groupby('Opportunity_ID')['Total_Amount'].transform('sum')
test['Total_Amount_Sum'] = test.groupby('Opportunity_ID')['Total_Amount'].transform('sum')

train['ASP_sum_per_Opportunity'] = train.groupby('Opportunity_ID')['ASP'].transform('sum')
train['ASP_converted_sum_per_Opportunity'] = train.groupby('Opportunity_ID')['ASP_(converted)'].transform('sum')
train['Convertibility']=train['ASP_converted_sum_per_Opportunity'] / train['ASP_sum_per_Opportunity']
test['ASP_sum_per_Opportunity'] = test.groupby('Opportunity_ID')['ASP'].transform('sum')
test['ASP_converted_sum_per_Opportunity'] = test.groupby('Opportunity_ID')['ASP_(converted)'].transform('sum')
test['Convertibility']=test['ASP_converted_sum_per_Opportunity'] / test['ASP_sum_per_Opportunity']

train['Total_Amount_USD'] = train['Convertibility']*train['Total_Amount']
train['Total_Amount_Sum_USD'] = train['Convertibility']*train['Total_Amount_Sum']
train['Total_Taxable_Amount_USD'] = train['Convertibility']*train['Total_Taxable_Amount']
test['Total_Amount_USD'] = test['Convertibility']*test['Total_Amount']
test['Total_Amount_Sum_USD'] = test['Convertibility']*test['Total_Amount_Sum']
test['Total_Taxable_Amount_USD'] = test['Convertibility']*test['Total_Taxable_Amount']

In [52]:
train['Target'] = (train.Stage=='Closed Won').astype('int')

In [53]:
features = ['Territory', 'Billing_Country', 'Delivery_Terms', 'Account_Type', 'Bureaucratic_Code', 
            'Opportunity_Type', 'Source ', 'Region']

codif_df = train.drop_duplicates(subset='Opportunity_ID')

encoder = ce.TargetEncoder(cols=features, smoothing = 0.85)
encoder.fit(codif_df[features],codif_df['Target'])

TargetEncoder(cols=['Territory', 'Billing_Country', 'Delivery_Terms',
                    'Account_Type', 'Bureaucratic_Code', 'Opportunity_Type',
                    'Source ', 'Region'],
              smoothing=0.85)

In [54]:
def hasData(t):
    for x in t:
        if x != 'None':
            return 1
    return 0

In [55]:
train.columns

Index(['ID', 'Region', 'Territory', 'Pricing, Delivery_Terms_Quote_Appr',
       'Pricing, Delivery_Terms_Approved', 'Bureaucratic_Code_0_Approval',
       'Bureaucratic_Code_0_Approved', 'Submitted_for_Approval',
       'Bureaucratic_Code', 'Account_Created_Date', 'Source ',
       'Billing_Country', 'Account_Name', 'Opportunity_Name', 'Opportunity_ID',
       'Sales_Contract_No', 'Account_Owner', 'Opportunity_Owner',
       'Account_Type', 'Opportunity_Type', 'Quote_Type', 'Delivery_Terms',
       'Opportunity_Created_Date', 'Brand', 'Product_Type', 'Size',
       'Product_Category_B', 'Price', 'Currency', 'Last_Activity',
       'Quote_Expiry_Date', 'Last_Modified_Date', 'Last_Modified_By',
       'Product_Family', 'Product_Name', 'ASP_Currency', 'ASP',
       'ASP_(converted)_Currency', 'ASP_(converted)',
       'Planned_Delivery_Start_Date', 'Planned_Delivery_End_Date', 'Month',
       'Delivery_Quarter', 'Delivery_Year', 'Actual_Delivery_Date', 'TRF',
       'Total_Amount_Currenc

In [56]:
def set_df(df):
    
    df.Opportunity_Created_Date = pd.to_datetime(df.Opportunity_Created_Date, errors='coerce')
    df.Planned_Delivery_Start_Date = pd.to_datetime(df.Planned_Delivery_Start_Date, errors='coerce')
    df.Planned_Delivery_End_Date = pd.to_datetime(df.Planned_Delivery_End_Date, errors='coerce')
    df.Account_Created_Date = pd.to_datetime(df.Account_Created_Date, errors='coerce')
    df.Last_Modified_Date = pd.to_datetime(df.Last_Modified_Date, errors='coerce')
    
    df['Year_Creation'] = df.Opportunity_Created_Date.dt.year
    df['Month_Creation'] = df.Opportunity_Created_Date.dt.month
    
    df['Year_Delivery'] = df.Planned_Delivery_Start_Date.dt.year
    df['Month_Delivery'] = df.Planned_Delivery_Start_Date.dt.month
    
    df['Days_Passed'] = df.Last_Modified_Date - df.Opportunity_Created_Date
    df.Days_Passed = df.Days_Passed.astype('timedelta64[D]')
    
    df['Wait_Time_Days'] = df.Planned_Delivery_Start_Date - df.Opportunity_Created_Date
    df.Wait_Time_Days = df.Wait_Time_Days.astype('timedelta64[D]')
    
    df['Delivery_Window'] = df.Planned_Delivery_End_Date - df.Planned_Delivery_Start_Date
    df.Delivery_Window = df.Delivery_Window.astype('timedelta64[D]')
    
    df['Account_LifeSpan_at_Creation'] = df.Opportunity_Created_Date - df.Account_Created_Date
    df.Account_LifeSpan_at_Creation = df.Account_LifeSpan_at_Creation.astype('timedelta64[D]')
    
    df['Account_LifeSpan_at_Modif'] = df.Last_Modified_Date - df.Account_Created_Date
    df.Account_LifeSpan_at_Modif = df.Account_LifeSpan_at_Modif.astype('timedelta64[D]')
    
    df['Account_LifeSpan_at_Deliv'] = df.Planned_Delivery_Start_Date - df.Account_Created_Date
    df.Account_LifeSpan_at_Deliv = df.Account_LifeSpan_at_Deliv.astype('timedelta64[D]')
    
    df['Quote_Type'] = (df.Quote_Type == 'Non Binding').astype('int')
    
    df['Wait_Delivery_Cmp'] = (df.Delivery_Window / df.Wait_Time_Days).replace([np.inf, -np.inf], 0)
    df['Days_Left_Cmp'] =  df.Wait_Time_Days - df.Days_Passed
    df['Days_Wait_Cmp'] = (df.Delivery_Window / df.Days_Passed).replace([np.inf, -np.inf], 0)

    df['USD_Per_Day_Waited'] = (df.Total_Amount_Sum_USD/df.Wait_Time_Days).replace([np.inf, -np.inf], 0)
    df['USD_Per_Day_Passed'] = (df.Total_Amount_Sum_USD/df.Days_Passed).replace([np.inf, -np.inf], 0)
    df['USD_Per_Account_Day'] = (df.Total_Amount_Sum_USD/df.Account_LifeSpan_at_Creation).replace([np.inf, -np.inf], 0)
    
    df['Total_Products'] = df.groupby('Opportunity_ID')['Region'].transform('count')
    df['Multiple_Products'] = (df.Total_Products>1).astype('int')
    
    df['Product_Price_Mean'] = df.groupby('Opportunity_ID')['Total_Amount_USD'].transform('mean')
    df['Product_Price_Max'] = df.groupby('Opportunity_ID')['Total_Amount_USD'].transform('max')
    df['Product_Price_Std'] = df.groupby('Opportunity_ID')['Total_Amount_USD'].transform('std')
    df['Product_Price_Min'] = df.groupby('Opportunity_ID')['Total_Amount_USD'].transform('min')
    
    df['Total_TRF'] = df.groupby('Opportunity_ID')['TRF'].transform('sum')
    df['Price_Per_TRF'] = (df.Total_Amount_Sum_USD/df.Total_TRF).replace([np.inf, -np.inf], 0)
    
    df.drop_duplicates(subset='Opportunity_ID', inplace=True)
    
    df = df.join(encoder.transform(df[features]).add_suffix('_target'))
    
    df['Product_Category_B'] = df.groupby('Opportunity_ID')['Product_Category_B'].transform(hasData)
    df['Price'] = df.groupby('Opportunity_ID')['Price'].transform(hasData)
    df['Size'] = df.groupby('Opportunity_ID')['Size'].transform(hasData)
    df['Product_Type'] = df.groupby('Opportunity_ID')['Product_Type'].transform(hasData)
    df['Brand'] = df.groupby('Opportunity_ID')['Brand'].transform(hasData)
    df['Currency'] = df.groupby('Opportunity_ID')['Currency'].transform(hasData)
    
    df['Or'] = ((df.Product_Category_B)|(df.Price)|(df.Size)|(df.Product_Type)|(df.Brand)|(df.Currency)).astype('int')
    
    df = df.replace([np.inf,-np.inf], np.nan)
    
    return df.drop(columns=['ID', 'Submitted_for_Approval',
       'Account_Created_Date', 'Account_Name', 'Opportunity_Name',
       'Sales_Contract_No', 'Account_Owner', 'Opportunity_Owner',
       'Opportunity_Created_Date', 'Last_Activity',
       'Quote_Expiry_Date', 'Last_Modified_Date', 'Last_Modified_By',
       'Product_Family', 'Product_Name', 'ASP_Currency', 'ASP',
       'ASP_(converted)_Currency', 'ASP_(converted)',
       'Planned_Delivery_Start_Date', 'Planned_Delivery_End_Date', 'Month',
       'Delivery_Quarter', 'Delivery_Year', 'Actual_Delivery_Date', 'TRF',
       'Total_Amount_Currency', 'Total_Amount',
       'Total_Taxable_Amount_Currency', 'Total_Taxable_Amount',
       'Prod_Category_A', 'Total_Amount_Sum', 'ASP_sum_per_Opportunity',
       'ASP_converted_sum_per_Opportunity',
       'Total_Amount_USD', 'Territory', 'Billing_Country', 'Delivery_Terms', 
       'Account_Type', 'Bureaucratic_Code', 'Opportunity_Type', 'Source ', 'Region', 
       'Product_Category_B', 'Price', 'Size', 'Product_Type', 'Brand', 'Currency'])

In [57]:
train_set = set_df(train)
train_set = train_set.drop(columns='Stage')
train_set

Unnamed: 0,"Pricing, Delivery_Terms_Quote_Appr","Pricing, Delivery_Terms_Approved",Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Opportunity_ID,Quote_Type,Convertibility,Total_Amount_Sum_USD,Total_Taxable_Amount_USD,Target,...,Price_Per_TRF,Territory_target,Billing_Country_target,Delivery_Terms_target,Account_Type_target,Bureaucratic_Code_target,Opportunity_Type_target,Source _target,Region_target,Or
0,1,1,1,1,0,1,1.131096,5.964044e+06,5.964044e+06,0,...,596404.380000,0.531334,0.477157,0.479354,0.484398,0.593111,0.517054,0.509776,0.462820,0
1,0,0,0,0,1,1,1.131094,5.455268e+04,5.455268e+04,1,...,0.000000,0.531334,0.477157,0.479354,0.484398,0.593111,0.517054,0.509776,0.462820,0
2,0,0,0,0,2,1,1.000000,8.386560e+04,8.386560e+04,1,...,0.000000,0.629841,0.450638,0.543384,0.296500,0.593111,0.517054,0.438859,0.444896,0
3,1,0,1,0,3,1,1.000000,7.421882e+06,7.421882e+06,0,...,530134.392857,0.629841,0.450638,0.445609,0.296500,0.266444,0.156954,0.632822,0.444896,1
4,1,0,1,0,4,1,1.000000,1.335719e+07,1.335719e+07,0,...,534287.700000,0.629841,0.450638,0.445609,0.296500,0.266444,0.156954,0.632822,0.444896,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16939,1,1,0,0,12799,1,1.131087,4.543578e+05,4.543578e+05,1,...,454357.800000,0.368421,0.512232,0.479354,0.681871,0.593111,0.517054,0.556701,0.462820,0
16940,1,0,0,0,12800,1,1.000000,2.133250e+07,2.133250e+07,0,...,533312.500000,0.347490,0.512232,0.543384,0.296500,0.266444,0.156954,0.509776,0.444896,0
16942,1,1,1,1,12801,1,1.131096,3.390065e+05,3.390065e+05,1,...,0.000000,0.805556,0.788732,0.479354,0.296500,0.593111,0.517054,0.438859,0.462820,0
16945,1,1,1,1,12802,1,1.000000,2.346797e+06,0.000000e+00,0,...,586699.220000,0.347490,0.450638,0.543384,0.296500,0.593111,0.517054,0.509776,0.444896,0


In [58]:
test['Target'] = (test['Sales_Contract_No'] != 'None').astype('int')
test_set = set_df(test)

In [59]:
X = train_set.drop(columns=['Target', 'Opportunity_ID'])
y = train_set.Target

In [60]:
X_test_set = test_set.drop(columns=['Target', 'Opportunity_ID'])
y_test_set = test_set.Target

X_train_fts = X.columns
X_test_fts = X_test_set.columns

imputer_train = KNNImputer()
imputer_train.fit(X)
X=pd.DataFrame(imputer_train.transform(X))

imputer_test = KNNImputer()
imputer_test.fit(X_test_set)
X_test_set = pd.DataFrame(imputer_test.transform(X_test_set))

X.columns = X_train_fts
X_test_set.columns=X_test_fts

In [61]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [62]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 5)]

max_features = ['auto', 'sqrt']

max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)

min_samples_split = [2, 5, 10]

min_samples_leaf = [1, 2, 4]

bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 650, 1100, 1550, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 35, 60, 85, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [None]:
clf=RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 100, cv = 2, verbose=2, random_state=42, n_jobs = -1)

#Train the model using the training sets y_pred=clf.predict(X_test)
rf_random.fit(X_train,y_train)

y_pred=rf_random.predict_proba(X_test)

Fitting 2 folds for each of 100 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


In [None]:
rf_random.best_params_

In [None]:
clf=RandomForestClassifier()
clf.fit(X_train, y_train)

In [None]:
y_pred=clf.predict_proba(X_test)
print(log_loss(y_test, y_pred))

In [None]:
c = RandomForestClassifier(n_estimators= 650, min_samples_split= 2, 
                           min_samples_leaf=1, max_features= 'auto', 
                           max_depth= None,bootstrap= False)
c.fit(X_train, y_train)
print(log_loss(y_test, c.predict_proba(X_test)))

In [None]:
test_pred = c.predict_proba(test_set.drop(columns=['Opportunity_ID', 'Target']))

In [None]:
log_loss(test_set.Target, test_pred)