In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

kfold = 10

In [2]:
train = pd.read_csv('train.csv', encoding='utf-8')
test = pd.read_csv('test.csv', encoding='utf-8')

In [3]:
train.columns = ['ID', 'Region', 'Territory', 'Pricing_Delivery_Terms_Quote_Appr',
       'Pricing_Delivery_Terms_Approved', 'Bureaucratic_Code_0_Approval',
       'Bureaucratic_Code_0_Approved', 'Submitted_for_Approval',
       'Bureaucratic_Code', 'Account_Created_Date', 'Source',
       'Billing_Country', 'Account_Name', 'Opportunity_Name', 'Opportunity_ID',
       'Sales_Contract_No', 'Account_Owner', 'Opportunity_Owner',
       'Account_Type', 'Opportunity_Type', 'Quote_Type', 'Delivery_Terms',
       'Opportunity_Created_Date', 'Brand', 'Product_Type', 'Size',
       'Product_Category_B', 'Price', 'Currency', 'Last_Activity',
       'Quote_Expiry_Date', 'Last_Modified_Date', 'Last_Modified_By',
       'Product_Family', 'Product_Name', 'ASP_Currency', 'ASP',
       'ASP_(converted)_Currency', 'ASP_(converted)',
       'Planned_Delivery_Start_Date', 'Planned_Delivery_End_Date', 'Month',
       'Delivery_Quarter', 'Delivery_Year', 'Actual_Delivery_Date', 'TRF',
       'Total_Amount_Currency', 'Total_Amount',
       'Total_Taxable_Amount_Currency', 'Total_Taxable_Amount', 'Stage',
       'Prod_Category_A']

test.columns = ['ID', 'Region', 'Territory', 'Pricing_Delivery_Terms_Quote_Appr',
       'Pricing_Delivery_Terms_Approved', 'Bureaucratic_Code_0_Approval',
       'Bureaucratic_Code_0_Approved', 'Submitted_for_Approval',
       'Bureaucratic_Code', 'Account_Created_Date', 'Source',
       'Billing_Country', 'Account_Name', 'Opportunity_Name', 'Opportunity_ID',
       'Sales_Contract_No', 'Account_Owner', 'Opportunity_Owner',
       'Account_Type', 'Opportunity_Type', 'Quote_Type', 'Delivery_Terms',
       'Opportunity_Created_Date', 'Brand', 'Product_Type', 'Size',
       'Product_Category_B', 'Price', 'Currency', 'Last_Activity',
       'Quote_Expiry_Date', 'Last_Modified_Date', 'Last_Modified_By',
       'Product_Family', 'Product_Name', 'ASP_Currency', 'ASP',
       'ASP_(converted)_Currency', 'ASP_(converted)',
       'Planned_Delivery_Start_Date', 'Planned_Delivery_End_Date', 'Month',
       'Delivery_Quarter', 'Delivery_Year', 'Actual_Delivery_Date', 'TRF',
       'Total_Amount_Currency', 'Total_Amount',
       'Total_Taxable_Amount_Currency', 'Total_Taxable_Amount',
       'Prod_Category_A']

In [4]:
train['Target'] = (train.Stage=='Closed Won').astype(int)
test['Target'] = (test.Sales_Contract_No!='None').astype(int)
train['Target'].value_counts()

1    9533
0    7414
Name: Target, dtype: int64

In [5]:
train.drop_duplicates(subset='Opportunity_ID', inplace=True)
test.drop_duplicates(subset='Opportunity_ID', inplace=True)

In [6]:
train.replace([np.inf, -np.inf], np.nan, inplace=True)

In [7]:
def expanding_mean(df):
    for col in df.columns.values.tolist():
        if df[col].dtype == 'object':
            cumsum = df.groupby(col)['Target'].cumsum() - df['Target']
            cumcnt = df.groupby(col).cumcount()
            df[col] = cumsum/cumcnt
            df[col].fillna(0.0, inplace=True)
        else:
            df[col].fillna(0.0, inplace=True)

In [8]:
expanding_mean(train)
expanding_mean(test)

In [9]:
train

Unnamed: 0,ID,Region,Territory,Pricing_Delivery_Terms_Quote_Appr,Pricing_Delivery_Terms_Approved,Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Submitted_for_Approval,Bureaucratic_Code,Account_Created_Date,...,Delivery_Year,Actual_Delivery_Date,TRF,Total_Amount_Currency,Total_Amount,Total_Taxable_Amount_Currency,Total_Taxable_Amount,Stage,Prod_Category_A,Target
0,27761,0.000000,0.000000,1,1,1,1,0,0.000000,0.000000,...,2016,0.000000,10,0.000000,5272800.00,0.000000,5272800.0,0.0,0.000000,0
1,27760,0.000000,0.000000,0,0,0,0,0,0.000000,0.000000,...,2016,0.000000,0,0.000000,48230.00,0.000000,48230.0,0.0,0.000000,1
2,27446,0.000000,0.000000,0,0,0,0,0,0.500000,0.000000,...,2016,0.500000,0,0.000000,83865.60,0.000000,83865.6,1.0,0.500000,1
3,16808,1.000000,1.000000,1,0,1,0,0,0.000000,0.000000,...,2018,0.666667,14,1.000000,7421881.50,1.000000,7421881.5,0.0,0.666667,0
4,16805,0.500000,0.500000,1,0,1,0,0,0.000000,0.000000,...,2018,0.500000,25,0.500000,13357192.50,0.500000,13357192.5,0.0,0.500000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16939,18324,0.462488,0.333333,1,1,0,0,0,0.593161,0.645243,...,2016,0.515453,1,0.470682,401700.00,0.469199,401700.0,1.0,0.515453,1
16940,20827,0.445440,0.349515,1,0,0,0,0,0.266571,0.000000,...,2016,0.515503,20,0.461282,10751580.00,0.462150,21332500.0,0.0,0.515503,0
16942,8781,0.462654,0.802817,1,1,1,1,0,0.593216,0.000000,...,2016,0.515450,0,0.470852,103350.00,0.469368,299715.0,1.0,0.515450,1
16945,28561,0.445258,0.348837,1,1,1,1,0,0.593271,0.000000,...,2016,0.515500,4,0.461185,2346796.88,0.462053,0.0,0.0,0.515500,0


In [10]:
X_train = train.drop(columns=['Target', 'Stage', 'Sales_Contract_No'])
y_train = train['Target']

X_test = test.drop(columns=['Target', 'Sales_Contract_No'])
y_test = test['Target']

In [11]:
def training_model(X_train, y_train):
        model = lgb.LGBMRegressor()
        
        # Grid search CV
        #parameters = {'max_depth'     : [6,8,10],
                      #'learning_rate' : [0.01, 0.05, 0.1],
                      #'num_iteration' : [1000, 5000, 10000],
                      #'n_estimators'  : [100,300,500]
                      # Add more parameters here for tuning
                      #}        
        #grid = GridSearchCV(estimator=model, param_grid = parameters, cv = kfold, 
                            #verbose = 1, n_jobs = -1, refit = True)
        #grid.fit(X_train, y_train)

        # Results from Grid Search
        #print("\n========================================================")
        #print(" Results from Grid Search " )
        #print("========================================================")    
        #print("\n The best estimator across ALL searched params:\n",
              #grid.best_estimator_)
        #print("\n The best parameters across ALL searched params:\n",
              #grid.best_params_)
        #print("\n ========================================================")

        # Random Search CV
        parameters = {'max_depth'     : sp_randInt(6, 10),
                      'learning_rate' : sp_randFloat(0.1, 0.9),
                      'num_iteration' : sp_randInt(1000, 10000),
                      'n_estimators'  : sp_randInt(100, 1000)
                      # Add more parameters here for tuning
                      }
        
        randm = RandomizedSearchCV(estimator=model, 
                                   param_distributions = parameters, cv = kfold, 
                                   n_iter = 10, verbose = 1, n_jobs = -1)
        randm.fit(X_train, y_train)

        # Results from Random Search
        print("\n========================================================")
        print(" Results from Random Search " )
        print("========================================================")    
        print("\n The best estimator across ALL searched params:\n",
              randm.best_estimator_)
        print("\n The best score across ALL searched params:\n",
              randm.best_score_)
        print("\n The best parameters across ALL searched params:\n",
              randm.best_params_)
        print("\n ========================================================")
        print()

        print()
        print("Random Search score: ", randm.best_score_)
        print()
        #print("Grid Search score: ", grid.best_score_)        
        print()

        #if grid.best_score_ > randm.best_score_:
            #print("The better model found in Grid Search ... ... ... ...\n\n")
            #return(grid.best_estimator_)
        #else:
            #print("The better model found in Random Search ... ... ... ...\n\n")
        return(randm.best_estimator_)

model = training_model(X_train, y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  7.3min finished



 Results from Random Search 

 The best estimator across ALL searched params:
 LGBMRegressor(learning_rate=0.25559837697821414, max_depth=8, n_estimators=615,
              num_iteration=6864)

 The best score across ALL searched params:
 0.3821796642384795

 The best parameters across ALL searched params:
 {'learning_rate': 0.25559837697821414, 'max_depth': 8, 'n_estimators': 615, 'num_iteration': 6864}



Random Search score:  0.3821796642384795




In [12]:
def cross_validatin_and_fitting(model, X_train, y_train):
        cv_results = cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'r2', 
                                 n_jobs = -1, verbose = 1)
        # Cross Validation Results
        print()
        print("Cross Validation results: ", cv_results)
        prt_string = "CV Mean r2 score: %f (Std: %f)"% (cv_results.mean(), cv_results.std())
        print(prt_string)
        
        # Final fitting of the Model
        model.fit(X_train, y_train)
        
        print(); print('========================================================')
        print(); print(model.get_params(deep = True))
        print(); print('========================================================')        
                
        return model
    
model = cross_validatin_and_fitting(model, X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   49.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   49.4s finished



Cross Validation results:  [0.3676699  0.40621232 0.51760957 0.46773828 0.37734057 0.26168012
 0.37250423 0.45613315 0.51690015 0.07800835]
CV Mean r2 score: 0.382180 (Std: 0.125390)


{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.25559837697821414, 'max_depth': 8, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 615, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'num_iteration': 6864}



In [13]:
def evaluate_model(model, X_test, y_test):
        # Evaluate the skill of the Trained model
        # Evaluate the skill of the Trained model
        pred          = model.predict(X_test)
        r2            = r2_score(y_test, pred)

        
        print(); print('Evaluation of the trained model: ')
        print(); print('R2 Score : ', r2)
        
        return model

model = evaluate_model(model, X_test, y_test)


Evaluation of the trained model: 

R2 Score :  0.2299299384730601
