In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import log_loss

kfold = 10

In [2]:
train = pd.read_csv('train_prepared.csv', encoding='utf-8')
test = pd.read_csv('test_prepared.csv', encoding='utf-8')

In [3]:
train.rename(columns={"Pricing, Delivery_Terms_Quote_Appr": "Pricing_Delivery_Terms_Quote_Appr", "Pricing, Delivery_Terms_Approved": "Pricing_Delivery_Terms_Approved"}, inplace=True)
test.rename(columns={"Pricing, Delivery_Terms_Quote_Appr": "Pricing_Delivery_Terms_Quote_Appr", "Pricing, Delivery_Terms_Approved": "Pricing_Delivery_Terms_Approved"}, inplace=True)

In [4]:
X_train = train.drop(columns=['Target'])
y_train = train['Target']

X_test = test.drop(columns=['Target'])
y_test = test['Target']

In [5]:
def training_model(X_train, y_train):
        model = lgb.LGBMClassifier()
        
        # Grid search CV
        parameters = {'max_depth'     : [6,8,10],
                      'learning_rate' : [0.01, 0.05, 0.1],
                      'num_iteration' : [1000, 5000, 10000],
                      'n_estimators'  : [100,300,500]
                       #Add more parameters here for tuning
                      }        
        grid = GridSearchCV(estimator=model, param_grid = parameters, cv = kfold, 
                            verbose = 1, n_jobs = -1, refit = True)
        grid.fit(X_train, y_train)

        # Results from Grid Search
        print("\n========================================================")
        print(" Results from Grid Search " )
        print("========================================================")    
        print("\n The best estimator across ALL searched params:\n",
              grid.best_estimator_)
        print("\n The best parameters across ALL searched params:\n",
              grid.best_params_)
        print("\n ========================================================")

        # Random Search CV
        parameters = {'max_depth'     : sp_randInt(6, 10),
                      'learning_rate' : sp_randFloat(0.1, 0.9),
                      'num_iteration' : sp_randInt(1000, 10000),
                      'n_estimators'  : sp_randInt(100, 1000)
                      # Add more parameters here for tuning
                      }
        
        randm = RandomizedSearchCV(estimator=model, 
                                   param_distributions = parameters, cv = kfold, 
                                   n_iter = 10, verbose = 1, n_jobs = -1)
        randm.fit(X_train, y_train)

        # Results from Random Search
        print("\n========================================================")
        print(" Results from Random Search " )
        print("========================================================")    
        print("\n The best estimator across ALL searched params:\n",
              randm.best_estimator_)
        print("\n The best score across ALL searched params:\n",
              randm.best_score_)
        print("\n The best parameters across ALL searched params:\n",
              randm.best_params_)
        print("\n ========================================================")
        print()

        print()
        print("Random Search score: ", randm.best_score_)
        print()
        print("Grid Search score: ", grid.best_score_)        
        print()

        if grid.best_score_ > randm.best_score_:
            print("The better model found in Grid Search ... ... ... ...\n\n")
            return(grid.best_estimator_)
        else:
            print("The better model found in Random Search ... ... ... ...\n\n")
            return(randm.best_estimator_)

model = training_model(X_train, y_train)

Fitting 10 folds for each of 81 candidates, totalling 810 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 188 tasks      | elapsed:  7.6min
[Parallel(n_jobs=-1)]: Done 438 tasks      | elapsed: 16.7min
[Parallel(n_jobs=-1)]: Done 788 tasks      | elapsed: 28.4min
[Parallel(n_jobs=-1)]: Done 810 out of 810 | elapsed: 29.4min finished



 Results from Grid Search 

 The best estimator across ALL searched params:
 LGBMClassifier(learning_rate=0.01, max_depth=6, num_iteration=1000)

 The best parameters across ALL searched params:
 {'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 100, 'num_iteration': 1000}

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  3.4min finished



 Results from Random Search 

 The best estimator across ALL searched params:
 LGBMClassifier(learning_rate=0.3574944356443658, max_depth=7, n_estimators=345,
               num_iteration=8964)

 The best score across ALL searched params:
 0.7337223530883242

 The best parameters across ALL searched params:
 {'learning_rate': 0.3574944356443658, 'max_depth': 7, 'n_estimators': 345, 'num_iteration': 8964}



Random Search score:  0.7337223530883242

Grid Search score:  0.7649801963686393

The better model found in Grid Search ... ... ... ...




In [6]:
def cross_validatin_and_fitting(model, X_train, y_train):
        cv_results = cross_val_score(model, X_train, y_train, cv = kfold, scoring = 'r2', 
                                 n_jobs = -1, verbose = 1)
        # Cross Validation Results
        print()
        print("Cross Validation results: ", cv_results)
        prt_string = "CV Mean r2 score: %f (Std: %f)"% (cv_results.mean(), cv_results.std())
        print(prt_string)
        
        # Final fitting of the Model
        model.fit(X_train, y_train)
        
        print(); print('========================================================')
        print(); print(model.get_params(deep = True))
        print(); print('========================================================')        
                
        return model
    
model = cross_validatin_and_fitting(model, X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.2s finished



Cross Validation results:  [ 0.37466636  0.27179654  0.26361448 -0.14139755 -0.49731722 -0.10048725
  0.0795181   0.19406696  0.08770016  0.05482973]
CV Mean r2 score: 0.058699 (Std: 0.241119)


{'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.01, 'max_depth': 6, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': None, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': True, 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0, 'num_iteration': 1000}



In [7]:
def evaluate_model(model, X_test, y_test):
        # Evaluate the skill of the Trained model
        # Evaluate the skill of the Trained model
        pred          = model.predict(X_test)
        r2            = r2_score(y_test, pred)

        
        print(); print('Evaluation of the trained model: ')
        print(); print('R2 Score : ', r2)
        
        return model

model = evaluate_model(model, X_test, y_test)


Evaluation of the trained model: 

R2 Score :  0.04493572695035464


In [8]:
pp = model.predict_proba(X_test)

In [9]:
log_loss(test.Target, pp)

0.5456546763120896

In [10]:
subm = test.loc[:, ['Opportunity_ID', 'Target']]
subm.loc[:, ['Opportunity_ID', 'Target']].to_csv('sub_lgb_prepared.csv', index=False)