In [1]:
import os

import category_encoders as ce
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

In [2]:
# Read the two pre-processed tables (UTF-8 CSV) into DataFrames.
train_set, test_set = (
    pd.read_csv(csv_name, encoding='utf-8')
    for csv_name in ('train_prepared.csv', 'test_prepared.csv')
)

In [3]:
# Strip the ", " from two column names in both tables.
# NOTE(review): presumably needed because LightGBM rejects feature names
# containing special JSON characters such as commas — confirm.
# A single shared mapping keeps the train and test renames in sync, and
# rebinding (instead of `inplace=True`) makes the data lineage explicit.
COLUMN_RENAMES = {
    "Pricing, Delivery_Terms_Quote_Appr": "Pricing_Delivery_Terms_Quote_Appr",
    "Pricing, Delivery_Terms_Approved": "Pricing_Delivery_Terms_Approved",
}

train_set = train_set.rename(columns=COLUMN_RENAMES)
test_set = test_set.rename(columns=COLUMN_RENAMES)

In [4]:
# Label vector, then the predictor matrix with the label and the
# Opportunity_ID column removed.
y = train_set['Target']
X = train_set.drop(['Target', 'Opportunity_ID'], axis=1)

In [5]:
# Baseline: hold out 20% of the training rows for validation (fixed seed)
# and fit a LightGBM classifier with its default hyper-parameters.
# `train_test_split` comes from the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)

# Log-loss of the baseline on the held-out 20%.
log_loss(y_test, clf.predict_proba(X_test))

0.26060698882538164

In [6]:
# Number of predictor columns the baseline model was trained on.
X_train.shape[1]

53

In [7]:
# Print every feature's importance from the baseline model and collect the
# names whose importance reaches the cut-off.
MIN_IMPORTANCE = 50  # a feature is kept when its importance is >= this value

f_imp = clf.feature_importances_
f = X_test.columns  # same columns (and order) as the frame `clf` was fitted on

# Seed the list with the two non-feature columns so `fts` can be used to
# slice the full train/test frames directly.
fts = ['Opportunity_ID', 'Target']

# Walk names and importances in lockstep instead of indexing both by position.
for name, importance in zip(f, f_imp):
    print("{: >10}\t\t{: >50}".format(name, importance))
    if importance >= MIN_IMPORTANCE:
        fts.append(name)

Pricing_Delivery_Terms_Quote_Appr		                                                 8
Pricing_Delivery_Terms_Approved		                                                10
Bureaucratic_Code_0_Approval		                                                15
Bureaucratic_Code_0_Approved		                                                 6
Quote_Type		                                                 0
Convertibility		                                                59
Total_Amount_Sum_USD		                                                36
Total_Taxable_Amount_USD		                                                68
Year_Creation		                                                25
Month_Creation		                                                42
Year_Delivery		                                                23
Month_Delivery		                                                31
Days_Passed		                                               113
Wait_Time_Days		                                          

In [8]:
# Length of the selected column list; the count includes the two
# non-feature entries ('Opportunity_ID' and 'Target') that seed `fts`.
len(fts)

29

In [9]:
# Restrict the training table to the columns listed in `fts`.
train_filtered = train_set[fts]

# Rebuild label and predictors from the reduced table.
y = train_filtered['Target']
X = train_filtered.drop(['Opportunity_ID', 'Target'], axis=1)

# Same split settings as the baseline (20% hold-out, seed 123).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

In [10]:
# Using the training split (the search below fits on X_train / y_train)

#from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

#classifier = lgb.LGBMClassifier()

#param_grid = {
#    'boosting_type': ['gbdt', 'goss', 'dart'],
#    'num_leaves': list(range(20, 150)),
#    'learning_rate': list(np.logspace(np.log10(0.005), np.log10(0.5), base = 10, num = 1000)),
#    'subsample_for_bin': list(range(20000, 300000, 20000)),
#    'min_child_samples': list(range(20, 500, 5)),
#    'reg_alpha': list(np.linspace(0, 1)),
#    'reg_lambda': list(np.linspace(0, 1)),
#    'colsample_bytree': list(np.linspace(0.6, 1, 10)),
#    'subsample': list(np.linspace(0.5, 1, 100)),
#}

#random_search=RandomizedSearchCV(classifier,param_distributions=param_grid,n_iter=750, scoring='neg_log_loss',n_jobs=-1,cv=5,verbose=3)

#random_search.fit(X_train,y_train)

In [11]:
#random_search.best_params_

In [12]:
#random_search.best_score_

In [26]:
# Hyper-parameters for the tuned LightGBM classifier.
# NOTE(review): the keys match the commented-out random-search grid, so these
# are presumably that search's `best_params_` — confirm.
# NOTE(review): per the LightGBM docs, `subsample` only takes effect when
# `subsample_freq` > 0, which is not set here — confirm bagging was intended.
best = dict(
    subsample_for_bin=250000,
    subsample=0.8131313131313131,
    reg_lambda=0.2857142857142857,
    reg_alpha=0.18367346938775508,
    num_leaves=35,
    min_child_samples=83,
    learning_rate=0.07016445423929361,
    colsample_bytree=0.6,
    boosting_type='gbdt',
)

In [27]:
# Build the tuned model from `best` and train it on the reduced feature set;
# `fit` returns the estimator itself, so it can be bound in one step.
classifier = lgb.LGBMClassifier(**best).fit(X_train, y_train)

# Echo the fitted estimator as the cell output.
classifier

LGBMClassifier(colsample_bytree=0.6, learning_rate=0.07016445423929361,
               min_child_samples=83, num_leaves=35,
               reg_alpha=0.18367346938775508, reg_lambda=0.2857142857142857,
               subsample=0.8131313131313131, subsample_for_bin=250000)

In [28]:
# Log-loss of the tuned model on the held-out part of the training data
# (`X_test` / `y_test` here are the validation split, not `test_set`).
holdout_proba = classifier.predict_proba(X_test)
log_loss(y_test, holdout_proba)

0.2822109367166231

In [29]:
# Apply the same column selection (`fts`) to the test table.
test_filtered = test_set[fts]

In [30]:
# Predict class probabilities for the test rows using only the predictor
# columns (ID and label removed).
test_features = test_filtered.drop(['Opportunity_ID', 'Target'], axis=1)
preds = classifier.predict_proba(test_features)
preds

array([[0.0829492 , 0.9170508 ],
       [0.2055841 , 0.7944159 ],
       [0.77146678, 0.22853322],
       ...,
       [0.83257129, 0.16742871],
       [0.46385284, 0.53614716],
       [0.82884668, 0.17115332]])

In [31]:
# Log-loss of the tuned model against the `Target` column of the test table.
log_loss(test_filtered['Target'], preds)

0.4840809602918056

In [19]:
# Build the output table: the opportunity IDs plus, in `Target`, column 1 of
# `preds` (the predicted probability of the second class).
final = test_filtered.loc[:, ['Opportunity_ID', 'Target']].copy()
final['Target'] = preds[:, 1]
display(final)

# `to_csv` does not create missing directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs('predictions', exist_ok=True)
final.to_csv('predictions/lgbm.csv', index=False)

Unnamed: 0,Opportunity_ID,Target
0,10689,0.878618
1,10690,0.769735
2,10691,0.252953
3,10692,0.420642
4,10693,0.969231
...,...,...
1562,12364,0.993051
1563,12365,0.214578
1564,12366,0.202032
1565,12367,0.479124
