In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

kfold = 10

In [2]:
train_set = pd.read_csv('train.csv', encoding='utf-8')
test_set = pd.read_csv('test.csv', encoding='utf-8')

In [3]:
train_set.columns = ['ID', 'Region', 'Territory', 'Pricing_Delivery_Terms_Quote_Appr',
       'Pricing_Delivery_Terms_Approved', 'Bureaucratic_Code_0_Approval',
       'Bureaucratic_Code_0_Approved', 'Submitted_for_Approval',
       'Bureaucratic_Code', 'Account_Created_Date', 'Source',
       'Billing_Country', 'Account_Name', 'Opportunity_Name', 'Opportunity_ID',
       'Sales_Contract_No', 'Account_Owner', 'Opportunity_Owner',
       'Account_Type', 'Opportunity_Type', 'Quote_Type', 'Delivery_Terms',
       'Opportunity_Created_Date', 'Brand', 'Product_Type', 'Size',
       'Product_Category_B', 'Price', 'Currency', 'Last_Activity',
       'Quote_Expiry_Date', 'Last_Modified_Date', 'Last_Modified_By',
       'Product_Family', 'Product_Name', 'ASP_Currency', 'ASP',
       'ASP_(converted)_Currency', 'ASP_(converted)',
       'Planned_Delivery_Start_Date', 'Planned_Delivery_End_Date', 'Month',
       'Delivery_Quarter', 'Delivery_Year', 'Actual_Delivery_Date', 'TRF',
       'Total_Amount_Currency', 'Total_Amount',
       'Total_Taxable_Amount_Currency', 'Total_Taxable_Amount', 'Stage',
       'Prod_Category_A']

test_set.columns = ['ID', 'Region', 'Territory', 'Pricing_Delivery_Terms_Quote_Appr',
       'Pricing_Delivery_Terms_Approved', 'Bureaucratic_Code_0_Approval',
       'Bureaucratic_Code_0_Approved', 'Submitted_for_Approval',
       'Bureaucratic_Code', 'Account_Created_Date', 'Source',
       'Billing_Country', 'Account_Name', 'Opportunity_Name', 'Opportunity_ID',
       'Sales_Contract_No', 'Account_Owner', 'Opportunity_Owner',
       'Account_Type', 'Opportunity_Type', 'Quote_Type', 'Delivery_Terms',
       'Opportunity_Created_Date', 'Brand', 'Product_Type', 'Size',
       'Product_Category_B', 'Price', 'Currency', 'Last_Activity',
       'Quote_Expiry_Date', 'Last_Modified_Date', 'Last_Modified_By',
       'Product_Family', 'Product_Name', 'ASP_Currency', 'ASP',
       'ASP_(converted)_Currency', 'ASP_(converted)',
       'Planned_Delivery_Start_Date', 'Planned_Delivery_End_Date', 'Month',
       'Delivery_Quarter', 'Delivery_Year', 'Actual_Delivery_Date', 'TRF',
       'Total_Amount_Currency', 'Total_Amount',
       'Total_Taxable_Amount_Currency', 'Total_Taxable_Amount',
       'Prod_Category_A']

In [4]:
train_set['Target'] = (train_set.Stage=='Closed Won').astype(int)
test_set['Target'] = (test_set.Sales_Contract_No!='None').astype(int)
train_set['Target'].value_counts()

1    9533
0    7414
Name: Target, dtype: int64

In [5]:
train_set.drop_duplicates(subset='Opportunity_ID', inplace=True)
test_set.drop_duplicates(subset='Opportunity_ID', inplace=True)

In [6]:
train_set.replace([np.inf, -np.inf], np.nan, inplace=True)

In [7]:
def set_df(df):
    
    df.Opportunity_Created_Date = pd.to_datetime(df.Opportunity_Created_Date, errors='coerce')
    df.Planned_Delivery_Start_Date = pd.to_datetime(df.Planned_Delivery_Start_Date, errors='coerce')
    df.Planned_Delivery_End_Date = pd.to_datetime(df.Planned_Delivery_End_Date, errors='coerce')
    df.Account_Created_Date = pd.to_datetime(df.Account_Created_Date, errors='coerce')
    df.Last_Modified_Date = pd.to_datetime(df.Last_Modified_Date, errors='coerce')
    
    df['Year_Creation'] = df.Opportunity_Created_Date.dt.year
    df['Month_Creation'] = df.Opportunity_Created_Date.dt.month
    
    df['Year_Delivery'] = df.Planned_Delivery_Start_Date.dt.year
    df['Month_Delivery'] = df.Planned_Delivery_Start_Date.dt.month
    
    df['Days_Passed'] = df.Last_Modified_Date - df.Opportunity_Created_Date
    df.Days_Passed = df.Days_Passed.astype('timedelta64[D]')
    
    df['Wait_Time_Days'] = df.Planned_Delivery_Start_Date - df.Opportunity_Created_Date
    df.Wait_Time_Days = df.Wait_Time_Days.astype('timedelta64[D]')
    
    df['Delivery_Window'] = df.Planned_Delivery_End_Date - df.Planned_Delivery_Start_Date
    df.Delivery_Window = df.Delivery_Window.astype('timedelta64[D]')
    
    df['Account_LifeSpan_at_Creation'] = df.Opportunity_Created_Date - df.Account_Created_Date
    df.Account_LifeSpan_at_Creation = df.Account_LifeSpan_at_Creation.astype('timedelta64[D]')
    
    df['Account_LifeSpan_at_Modif'] = df.Last_Modified_Date - df.Account_Created_Date
    df.Account_LifeSpan_at_Modif = df.Account_LifeSpan_at_Modif.astype('timedelta64[D]')
    
    df['Account_LifeSpan_at_Deliv'] = df.Planned_Delivery_Start_Date - df.Account_Created_Date
    df.Account_LifeSpan_at_Deliv = df.Account_LifeSpan_at_Deliv.astype('timedelta64[D]')
    
    df['Last_Activity_In_Time'] = ((df.Last_Modified_Date > df.Planned_Delivery_Start_Date)&(df.Last_Modified_Date<df.Planned_Delivery_End_Date)).astype('int')
    
    df['Quote_Type'] = (df.Quote_Type == 'Non Binding').astype('int')
    
    df['Delivered_Hot_Season'] = ((((df.Region=='EMEA')|(df.Region=='Americas')|(df.Region=='Japan')|\
                                   ((df.Region=='APAC')&(df.Territory!='Australia')))\
                                      &\
                                 ((df.Delivery_Quarter=='Q2')|(df.Delivery_Quarter=='Q3')))\
                                      |
                                 ((df.Territory=='Australia')&((df.Delivery_Quarter=='Q1')|(df.Delivery_Quarter=='Q4'))))
    
    df['Wait_Delivery_Cmp'] = (df.Delivery_Window / df.Wait_Time_Days).replace([np.inf, -np.inf], 0)
    df['Days_Left_Cmp'] =  df.Wait_Time_Days - df.Days_Passed
    df['Days_Wait_Cmp'] = (df.Delivery_Window / df.Days_Passed).replace([np.inf, -np.inf], 0)

    df['USD_Per_Day_Waited'] = (df.Total_Amount_Sum_USD/df.Wait_Time_Days).replace([np.inf, -np.inf], 0)
    df['USD_Per_Day_Passed'] = (df.Total_Amount_Sum_USD/df.Days_Passed).replace([np.inf, -np.inf], 0)
    df['USD_Per_Account_Day'] = (df.Total_Amount_Sum_USD/df.Account_LifeSpan_at_Creation).replace([np.inf, -np.inf], 0)
    
    df['Total_Products'] = df.groupby('Opportunity_ID')['Region'].transform('count')
    df['Multiple_Products'] = (df.Total_Products>1).astype('int')
    
    df['Product_Price_Mean'] = df.groupby('Opportunity_ID')['Total_Amount_USD'].transform('mean')
    df['Product_Price_Max'] = df.groupby('Opportunity_ID')['Total_Amount_USD'].transform('max')
    df['Product_Price_Std'] = df.groupby('Opportunity_ID')['Total_Amount_USD'].transform('std')
    df['Product_Price_Min'] = df.groupby('Opportunity_ID')['Total_Amount_USD'].transform('min')
    
    df['Total_TRF'] = df.groupby('Opportunity_ID')['TRF'].transform('sum')
    df['Price_Per_TRF'] = (df.Total_Amount_Sum_USD/df.Total_TRF).replace([np.inf, -np.inf], 0)
    
    #reg = pd.get_dummies(df['Region'],drop_first=False)
    #df = pd.concat([df,reg],axis=1)
    
    df.drop_duplicates(subset='Opportunity_ID', inplace=True)
    
    df['Territory'] = df['Territory'].map(ter_smooth)
    df['Billing_Country'] = df['Billing_Country'].map(bill_smooth)
    df['Delivery_Terms'] = df['Delivery_Terms'].map(deliv_smooth)
    df['Account_Type'] = df['Account_Type'].map(acc_smooth)
    df['Bureaucratic_Code'] = df['Bureaucratic_Code'].map(bur_smooth)
    df['Opportunity_Type'] = df['Opportunity_Type'].map(opp_smooth)
    df['Source '] = df['Source '].map(source_smooth)
    df['Region'] = df['Region'].map(reg_smooth)
    
    df['Quote_Type'] = (df.Quote_Type == 'Non Binding').astype('int')
    df['Product_Category_B'] = df.groupby('Opportunity_ID')['Product_Category_B'].transform(hasData)
    df['Price'] = df.groupby('Opportunity_ID')['Price'].transform(hasData)
    df['Size'] = df.groupby('Opportunity_ID')['Size'].transform(hasData)
    df['Product_Type'] = df.groupby('Opportunity_ID')['Product_Type'].transform(hasData)
    df['Brand'] = df.groupby('Opportunity_ID')['Brand'].transform(hasData)
    df['Currency'] = df.groupby('Opportunity_ID')['Currency'].transform(hasData)
    
    return df.drop(columns=['ID', 'Submitted_for_Approval',
       'Account_Created_Date', 'Account_Name', 'Opportunity_Name',
       'Sales_Contract_No', 'Account_Owner', 'Opportunity_Owner',
       'Opportunity_Created_Date', 'Last_Activity',
       'Quote_Expiry_Date', 'Last_Modified_Date', 'Last_Modified_By',
       'Product_Family', 'Product_Name', 'ASP_Currency', 'ASP',
       'ASP_(converted)_Currency', 'ASP_(converted)',
       'Planned_Delivery_Start_Date', 'Planned_Delivery_End_Date', 'Month',
       'Delivery_Quarter', 'Delivery_Year', 'Actual_Delivery_Date', 'TRF',
       'Total_Amount_Currency', 'Total_Amount',
       'Total_Taxable_Amount_Currency', 'Total_Taxable_Amount',
       'Prod_Category_A', 'Total_Amount_Sum', 'ASP_sum_per_Opportunity',
       'ASP_converted_sum_per_Opportunity',
       'Total_Amount_USD'])

In [8]:
#train_set = set_df(train_set)
#test_set = set_df(test_test)

In [9]:
train_set.drop(columns=['Stage', 'Sales_Contract_No', 'ID'], inplace=True)
test_set.drop(columns=['Sales_Contract_No', 'ID'], inplace=True)

In [10]:
def expanding_mean_train(df):
    new = df.copy()
    for col in new.columns.values.tolist():
        if new[col].dtype == 'object':
            cumsum = new.groupby(col)['Target'].cumsum() - new['Target']
            cumcnt = new.groupby(col).cumcount()
            new[col+'_mean_target'] = cumsum/cumcnt
            new[col+'_mean_target'].fillna(0.0, inplace=True)
        else:
            new[col].fillna(0.0, inplace=True)
            #new[col+'_mean_target'] = new[col]
    return new

In [11]:
train = expanding_mean_train(train_set)

In [12]:
def expanding_mean_test(train, test):
    new = test.copy()
    for col in new.columns.values.tolist():
        if (test[col].dtype != "object"): continue
        aux = train.copy()
        x = aux.groupby(col).agg({col+'_mean_target':"mean"}).to_dict()
        new[col] = new[col].map(x[col+'_mean_target'])
    return new.fillna(0.0)

In [13]:
test = expanding_mean_test(train, test_set)

In [14]:
for col in train.columns.values.tolist():
    if (train[col].dtype == "object"): train.drop(columns=[col], inplace=True)

In [15]:
test

Unnamed: 0,Region,Territory,Pricing_Delivery_Terms_Quote_Appr,Pricing_Delivery_Terms_Approved,Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Submitted_for_Approval,Bureaucratic_Code,Account_Created_Date,Source,...,Delivery_Quarter,Delivery_Year,Actual_Delivery_Date,TRF,Total_Amount_Currency,Total_Amount,Total_Taxable_Amount_Currency,Total_Taxable_Amount,Prod_Category_A,Target
0,0.429976,0.514701,1,1,1,1,0,0.606286,0.438530,0.390520,...,0.430512,2019,0.494805,0,0.444586,162240.0,0.443432,367419.0,0.494805,1
3,0.429976,0.514701,1,1,1,1,0,0.606286,0.438530,0.390520,...,0.430512,2019,0.494805,1,0.444586,243360.0,0.443432,757783.5,0.494805,1
8,0.404426,0.356992,1,1,0,0,0,0.606286,0.000000,0.539295,...,0.473798,2019,0.494805,0,0.422554,21037.5,0.423222,21037.5,0.494805,1
9,0.404426,0.545195,1,1,1,0,0,0.256229,0.580266,0.576520,...,0.473798,2019,0.494805,1,0.422554,342490.5,0.423222,2169106.5,0.494805,1
15,0.404426,0.545195,0,0,0,0,0,0.606286,0.917572,0.539295,...,0.430512,2019,0.494805,0,0.422554,5752.5,0.423222,5752.5,0.494805,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2545,0.429976,0.175310,1,1,0,0,0,0.606286,0.406702,0.542876,...,0.430512,2019,0.494805,0,0.422554,23010.0,0.423222,147750.0,0.494805,1
2547,0.404426,0.299970,1,1,0,0,0,0.606286,0.636012,0.484193,...,0.473798,2019,0.494805,0,0.422554,45054.9,0.423222,45054.9,0.494805,1
2548,0.404426,0.299970,1,1,1,1,0,0.606286,0.636012,0.484193,...,0.473798,2019,0.494805,0,0.422554,100122.0,0.423222,100122.0,0.494805,1
2549,0.404426,0.356992,1,1,0,0,0,0.606286,0.000000,0.484193,...,0.406693,2019,0.494805,0,0.422554,143220.0,0.423222,143220.0,0.494805,0


In [16]:
X_train = train.drop(columns=['Target'])
y_train = train['Target']

X_test = test.drop(columns=['Target'])
y_test = test['Target']

In [17]:
X_train.columns.size

49

In [18]:
X_test.columns

Index(['Region', 'Territory', 'Pricing_Delivery_Terms_Quote_Appr',
       'Pricing_Delivery_Terms_Approved', 'Bureaucratic_Code_0_Approval',
       'Bureaucratic_Code_0_Approved', 'Submitted_for_Approval',
       'Bureaucratic_Code', 'Account_Created_Date', 'Source',
       'Billing_Country', 'Account_Name', 'Opportunity_Name', 'Opportunity_ID',
       'Account_Owner', 'Opportunity_Owner', 'Account_Type',
       'Opportunity_Type', 'Quote_Type', 'Delivery_Terms',
       'Opportunity_Created_Date', 'Brand', 'Product_Type', 'Size',
       'Product_Category_B', 'Price', 'Currency', 'Last_Activity',
       'Quote_Expiry_Date', 'Last_Modified_Date', 'Last_Modified_By',
       'Product_Family', 'Product_Name', 'ASP_Currency', 'ASP',
       'ASP_(converted)_Currency', 'ASP_(converted)',
       'Planned_Delivery_Start_Date', 'Planned_Delivery_End_Date', 'Month',
       'Delivery_Quarter', 'Delivery_Year', 'Actual_Delivery_Date', 'TRF',
       'Total_Amount_Currency', 'Total_Amount',
       'To

In [19]:
def training_model(X_train, y_train):
        model = lgb.LGBMClassifier()
        
        # Grid search CV
        parameters = {'max_depth'     : [6,8,10],
                      'learning_rate' : [0.01, 0.05, 0.1],
                      'num_iteration' : [1000, 5000, 10000],
                      'n_estimators'  : [100,300,500]
                       #Add more parameters here for tuning
                      }        
        grid = GridSearchCV(estimator=model, param_grid = parameters, cv = kfold, 
                            verbose = 1, n_jobs = -1, refit = True)
        grid.fit(X_train, y_train)

        # Results from Grid Search
        print("\n========================================================")
        print(" Results from Grid Search " )
        print("========================================================")    
        print("\n The best estimator across ALL searched params:\n",
              grid.best_estimator_)
        print("\n The best parameters across ALL searched params:\n",
              grid.best_params_)
        print("\n ========================================================")

        # Random Search CV
        parameters = {'max_depth'     : sp_randInt(6, 10),
                      'learning_rate' : sp_randFloat(0.1, 0.9),
                      'num_iteration' : sp_randInt(1000, 10000),
                      'n_estimators'  : sp_randInt(100, 1000)
                      # Add more parameters here for tuning
                      }
        
        randm = RandomizedSearchCV(estimator=model, 
                                   param_distributions = parameters, cv = kfold, 
                                   n_iter = 10, verbose = 1, n_jobs = -1)
        randm.fit(X_train, y_train)

        # Results from Random Search
        print("\n========================================================")
        print(" Results from Random Search " )
        print("========================================================")    
        print("\n The best estimator across ALL searched params:\n",
              randm.best_estimator_)
        print("\n The best score across ALL searched params:\n",
              randm.best_score_)
        print("\n The best parameters across ALL searched params:\n",
              randm.best_params_)
        print("\n ========================================================")
        print()

        print()
        print("Random Search score: ", randm.best_score_)
        print()
        print("Grid Search score: ", grid.best_score_)        
        print()

        if grid.best_score_ > randm.best_score_:
            print("The better model found in Grid Search ... ... ... ...\n\n")
            return(grid.best_estimator_)
        else:
            print("The better model found in Random Search ... ... ... ...\n\n")
            return(randm.best_estimator_)

model = training_model(X_train, y_train)

Fitting 10 folds for each of 81 candidates, totalling 810 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed: 11.9min
[Parallel(n_jobs=-1)]: Done 438 tasks      | elapsed: 26.5min
[Parallel(n_jobs=-1)]: Done 788 tasks      | elapsed: 42.7min
[Parallel(n_jobs=-1)]: Done 810 out of 810 | elapsed: 44.1min finished



 Results from Grid Search 

 The best estimator across ALL searched params:
 LGBMClassifier(learning_rate=0.01, max_depth=10, num_iteration=1000)

 The best parameters across ALL searched params:
 {'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 100, 'num_iteration': 1000}

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:   58.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.4min finished



 Results from Random Search 

 The best estimator across ALL searched params:
 LGBMClassifier(learning_rate=0.4050583591111818, max_depth=6, n_estimators=859,
               num_iteration=3944)

 The best score across ALL searched params:
 0.7406688745821468

 The best parameters across ALL searched params:
 {'learning_rate': 0.4050583591111818, 'max_depth': 6, 'n_estimators': 859, 'num_iteration': 3944}



Random Search score:  0.7406688745821468

Grid Search score:  0.7663815979530353

The better model found in Grid Search ... ... ... ...




In [20]:
def cross_validatin_and_fitting(model, X_train, y_train):
        cv_results = cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'r2', 
                                 n_jobs = -1, verbose = 1)
        # Cross Validation Results
        print()
        print("Cross Validation results: ", cv_results)
        prt_string = "CV Mean r2 score: %f (Std: %f)"% (cv_results.mean(), cv_results.std())
        print(prt_string)
        
        # Final fitting of the Model
        model.fit(X_train, y_train)
        
        print(); print('========================================================')
        print(); print(model.get_params(deep = True))
        print(); print('========================================================')        
                
        return model
    
model = cross_validatin_and_fitting(model, X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.8s finished



Cross Validation results:  [ 0.21953152  0.19437312  0.23506134 -0.12299505 -0.54208378 -0.10265094
  0.19030429  0.3530572   0.43850247 -0.21673394]
CV Mean r2 score: 0.064637 (Std: 0.286198)


{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.01, 'max_depth': 10, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'num_iteration': 1000}



In [21]:
def evaluate_model(model, X_test, y_test):
        # Evaluate the skill of the Trained model
        # Evaluate the skill of the Trained model
        pred          = model.predict(X_test)
        r2            = r2_score(y_test, pred)

        
        print(); print('Evaluation of the trained model: ')
        print(); print('R2 Score : ', r2)
        
        return model

model = evaluate_model(model, X_test, y_test)


Evaluation of the trained model: 

R2 Score :  -1.0250426965999164


In [22]:
pp = model.predict_proba(X_test)
pp

array([[0.774948  , 0.225052  ],
       [0.61999479, 0.38000521],
       [0.8472653 , 0.1527347 ],
       ...,
       [0.86288774, 0.13711226],
       [0.77146387, 0.22853613],
       [0.75447052, 0.24552948]])

In [23]:
from sklearn.metrics import log_loss

In [28]:
log_loss(test.Target, pp)

0.8652523467530342

In [25]:
subm = test.loc[:, ['Opportunity_ID', 'Target']]
subm.loc[:, ['Opportunity_ID', 'Target']].to_csv('sub_lgb.csv', index=False)