In [82]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from category_encoders import CatBoostEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn import model_selection
import scipy.stats as stats
from sklearn.feature_selection import SelectFromModel

In [83]:
train = pd.read_csv('../../Feature_Engineering/data/other-cleaned_train.csv')
test = pd.read_csv('../../Feature_Engineering/data/other-cleaned_test.csv')

In [84]:
train.drop(columns = ['Unnamed: 0'], inplace = True)
test.drop(columns = ['Unnamed: 0'], inplace = True)

In [85]:
X_train = train.copy()
X_test = test.copy()

In [86]:
categ_columns = train.drop(columns = ["Opportunity_ID","ID", "Pricing, Delivery_Terms_Quote_Appr",\
                                    "Bureaucratic_Code_0_Approval","Bureaucratic_Code_0_Approved",\
                                    "Submitted_for_Approval","ASP","ASP_(converted)","TRF","Total_Amount",\
                                    "Total_Taxable_Amount","diferencia_en_dias","Last_Modified_DOY","Last_Modified_Year",\
                                    "Opportunity_Created_DOY","Opportunity_Created_Year","Quote_Expiry_DOY",\
                                     "Quote_Expiry_Year","Planned_Delivery_Start_DOY","Planned_Delivery_Start_Year",\
                                    "Planned_Delivery_End_DOY","Planned_Delivery_End_Year",\
                                    "Target"]).columns
for column in categ_columns:
    encoder = CatBoostEncoder()
    encoder.fit(train[column], train['Target'])
    feature_encoded = encoder.transform(train[column])
    X_train = X_train.join(feature_encoded.add_suffix('_cat_boost'))
    X_train.drop(columns=[column], inplace = True)

In [87]:
categ_columns = test.drop(columns = ["Opportunity_ID","ID", "Pricing, Delivery_Terms_Quote_Appr",\
                                    "Bureaucratic_Code_0_Approval","Bureaucratic_Code_0_Approved",\
                                    "Submitted_for_Approval","ASP","ASP_(converted)","TRF","Total_Amount",\
                                    "Total_Taxable_Amount","diferencia_en_dias","Last_Modified_DOY","Last_Modified_Year",\
                                    "Opportunity_Created_DOY","Opportunity_Created_Year","Quote_Expiry_DOY",\
                                     "Quote_Expiry_Year","Planned_Delivery_Start_DOY","Planned_Delivery_Start_Year",\
                                    "Planned_Delivery_End_DOY","Planned_Delivery_End_Year"]).columns
for column in categ_columns:
    encoder = CatBoostEncoder()
    encoder.fit(train[column], train['Target'])
    feature_encoded = encoder.transform(test[column])
    X_test = X_test.join(feature_encoded.add_suffix('_cat_boost'))
    X_test.drop(columns=[column], inplace = True)

In [88]:
X_train["Total_Amount"] = pd.to_numeric(X_train["Total_Amount"],errors='coerce').fillna(X_train["Total_Amount"].mean())
X_train["Opportunity_Created_Year"] = pd.to_numeric(X_train["Opportunity_Created_Year"],errors='coerce').fillna(0)
X_train["Quote_Expiry_DOY"] = pd.to_numeric(X_train["Quote_Expiry_DOY"],errors='coerce').fillna(0)
X_train["Quote_Expiry_Year"] = pd.to_numeric(X_train["Quote_Expiry_Year"],errors='coerce').fillna(0)
X_train["Planned_Delivery_End_DOY"] = pd.to_numeric(X_train["Planned_Delivery_End_DOY"],errors='coerce').fillna(0)
X_train["Planned_Delivery_End_Year"] = pd.to_numeric(X_train["Planned_Delivery_End_Year"],errors='coerce').fillna(0)

X_train = X_train.drop(columns = 'Target')
X_train = X_train.drop(columns = 'ID')
X_train = X_train.drop(columns = 'Account_Name_cat_boost')

In [89]:
X_test["Total_Amount"] = pd.to_numeric(X_test["Total_Amount"],errors='coerce').fillna(test["Total_Amount"].mean())
X_test["Opportunity_Created_Year"] = pd.to_numeric(X_test["Opportunity_Created_Year"],errors='coerce').fillna(0)
X_test["Quote_Expiry_DOY"] = pd.to_numeric(X_test["Quote_Expiry_DOY"],errors='coerce').fillna(0)
X_test["Quote_Expiry_Year"] = pd.to_numeric(X_test["Quote_Expiry_Year"],errors='coerce').fillna(0)
X_test["Planned_Delivery_End_DOY"] = pd.to_numeric(X_test["Planned_Delivery_End_DOY"],errors='coerce').fillna(0)
X_test["Planned_Delivery_End_Year"] = pd.to_numeric(X_test["Planned_Delivery_End_Year"],errors='coerce').fillna(0)

X_test = X_test.drop(columns = 'ID')
X_test = X_test.drop(columns = 'Account_Name_cat_boost')

## Model: Catboost

In [90]:
def cross_val(model, x_train, y_train):
    score_cross_val = model_selection.cross_val_score(model, x_train, y_train, cv=5)
    print(score_cross_val.mean())

In [91]:
def catboost(x_train, y_train, x_validation, y_validation):
    catb_classifier = CatBoostClassifier()
    params_catb = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.05, 0.1], 'l2_leaf_reg': [0, 1, 5]}    
    catb_gs = GridSearchCV(catb_classifier, params_catb, cv=5)
    catb_gs.fit(x_train, y_train)
    catb_best = catb_gs.best_estimator_
    print(catb_gs.best_params_)
    print('catb: {}'.format(catb_best.score(x_validation, y_validation)))
    return catb_best

In [92]:
def catboost_2(x_train, y_train, x_validation, y_validation):
    params_catb_2={
        'n_estimators':stats.randint(10,150),'learning_rate':stats.uniform(0.01,0.3),
        'subsample':stats.uniform(0.3,0.7),'l2_leaf_reg':[1,5,10],
        'max_depth':[3,10,6],'colsample_bylevel':stats.uniform(0.,0.6)
    }
    
    catb_rs = RandomizedSearchCV(CatBoostClassifier(),
                          param_distributions=params_catb_2,
                          cv=3,
                          scoring='neg_log_loss',
                          verbose=1,
                          n_iter=150)
    
    catb_rs.fit(x_train, y_train)
    catb_best = catb_rs.best_estimator_
    print(catb_rs.best_params_)
    print('catb: {}'.format(catb_best.score(x_validation, y_validation)))
    return catb_best

In [93]:
def test_model(model, x_test, y_test):
    predictions = model.predict_proba(x_test)[:,1]
    logloss = log_loss(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions.round())
    print("Accuracy: %.2f%%, Logloss: %.2f" % (accuracy*100.0, logloss))

In [106]:
def best_features(model,train):
    importance = model.feature_importances_
    result = pd.DataFrame([train.columns,importance]).transpose()
    result.columns = ["Feature","Importance"]
    return result.sort_values(by='Importance', ascending=False).head(15)["Feature"].to_list()
    
    
def plot_features(model,train):
    fig = plt.gcf()
    fig.set_size_inches(350, 350)
    selection = SelectFromModel(model, threshold=0.040, prefit=True)
    selected_dataset = selection.transform(train)
    model.plot_importance(booster=model)

    plt.rcParams["figure.figsize"] = (40,20)
    plt.xlabel("\nFeature importance", fontsize=40)
    plt.ylabel("Features", fontsize=35)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.show()

### cat boost 1 with all features

In [95]:
y = train.Target
x_train, x_validation, y_train, y_validation = train_test_split(X_train, y, test_size=0.3, stratify=y)

In [96]:
catboost_model = catboost(x_train, y_train, x_validation, y_validation)
test_model(catboost_model,x_validation,y_validation)
cross_val(catboost_model, x_train, y_train)

0:	learn: 0.6835167	total: 5.83ms	remaining: 286ms
1:	learn: 0.6749030	total: 11ms	remaining: 263ms
2:	learn: 0.6649422	total: 14.9ms	remaining: 234ms
3:	learn: 0.6567963	total: 20.3ms	remaining: 233ms
4:	learn: 0.6479985	total: 24.4ms	remaining: 220ms
5:	learn: 0.6420231	total: 28.3ms	remaining: 208ms
6:	learn: 0.6341443	total: 32.2ms	remaining: 198ms
7:	learn: 0.6271869	total: 36.3ms	remaining: 191ms
8:	learn: 0.6197940	total: 40.2ms	remaining: 183ms
9:	learn: 0.6132611	total: 44.1ms	remaining: 177ms
10:	learn: 0.6079966	total: 48.2ms	remaining: 171ms
11:	learn: 0.6010736	total: 52.2ms	remaining: 165ms
12:	learn: 0.5944196	total: 56.1ms	remaining: 160ms
13:	learn: 0.5889073	total: 60ms	remaining: 154ms
14:	learn: 0.5835825	total: 66.7ms	remaining: 156ms
15:	learn: 0.5778984	total: 70.8ms	remaining: 151ms
16:	learn: 0.5729695	total: 76ms	remaining: 148ms
17:	learn: 0.5680799	total: 80ms	remaining: 142ms
18:	learn: 0.5636063	total: 83.8ms	remaining: 137ms
19:	learn: 0.5590488	total: 87

In [97]:
best_features = best_features(catboost_model,X_train)

In [98]:
y_pred = catboost_model.predict_proba(X_test)[:,1]
submission_cb = pd.DataFrame(data={'Opportunity_ID':X_test['Opportunity_ID'], 'Target': y_pred})
submission_cb = submission_cb.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb.to_csv('../submits/cat_boost_with_cat_boost_encoding.csv', index=False)

### cat boost 1 with best features

In [99]:
X_train_best_features = X_train.loc[:,best_features]
X_test_best_features = X_test.loc[:,best_features]

In [100]:
x_best_train, x_best_validation, y_best_train, y_best_validation = train_test_split(X_train_best_features, y, test_size=0.3, stratify=y)

In [101]:
catboost_model_2 = catboost(x_best_train, y_best_train, x_best_validation, y_best_validation)
test_model(catboost_model_2,x_best_validation,y_best_validation)
cross_val(catboost_model_2, x_best_train, y_best_train)

0:	learn: 0.6885733	total: 12.6ms	remaining: 620ms
1:	learn: 0.6836491	total: 20.7ms	remaining: 496ms
2:	learn: 0.6788740	total: 31.6ms	remaining: 495ms
3:	learn: 0.6741792	total: 37ms	remaining: 426ms
4:	learn: 0.6694030	total: 41.2ms	remaining: 371ms
5:	learn: 0.6649443	total: 45.1ms	remaining: 331ms
6:	learn: 0.6605609	total: 49.3ms	remaining: 303ms
7:	learn: 0.6564074	total: 53.5ms	remaining: 281ms
8:	learn: 0.6521092	total: 57.7ms	remaining: 263ms
9:	learn: 0.6479345	total: 61.5ms	remaining: 246ms
10:	learn: 0.6438464	total: 65.6ms	remaining: 233ms
11:	learn: 0.6397943	total: 69.9ms	remaining: 221ms
12:	learn: 0.6358038	total: 73.5ms	remaining: 209ms
13:	learn: 0.6318552	total: 77.3ms	remaining: 199ms
14:	learn: 0.6281530	total: 81ms	remaining: 189ms
15:	learn: 0.6243219	total: 85.2ms	remaining: 181ms
16:	learn: 0.6207163	total: 89.5ms	remaining: 174ms
17:	learn: 0.6174426	total: 93.6ms	remaining: 166ms
18:	learn: 0.6137912	total: 98.7ms	remaining: 161ms
19:	learn: 0.6101947	total

In [102]:
y_pred_2 = catboost_model_2.predict_proba(X_test_best_features)[:,1]
submission_cb_2 = pd.DataFrame(data={'Opportunity_ID':X_test_best_features['Opportunity_ID'], 'Target': y_pred_2})
submission_cb_2 = submission_cb_2.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_2.to_csv('../submits/cat_boost_best_features_with_cat_boost_encoding.csv', index=False)

### cat boost 2 with all features

In [103]:
catboost_model_3 = catboost_2(x_train, y_train, x_validation, y_validation)
test_model(catboost_model_3,x_validation,y_validation)
cross_val(catboost_model_3, x_train, y_train)

Fitting 3 folds for each of 150 candidates, totalling 450 fits
0:	learn: 0.6835056	total: 3.82ms	remaining: 141ms
1:	learn: 0.6742692	total: 6.75ms	remaining: 122ms
2:	learn: 0.6656475	total: 9.76ms	remaining: 114ms
3:	learn: 0.6556813	total: 13ms	remaining: 110ms
4:	learn: 0.6462432	total: 16.6ms	remaining: 110ms
5:	learn: 0.6392269	total: 18.9ms	remaining: 101ms
6:	learn: 0.6311136	total: 21.2ms	remaining: 94.1ms
7:	learn: 0.6227815	total: 23.8ms	remaining: 89.2ms
8:	learn: 0.6180973	total: 26.1ms	remaining: 84ms
9:	learn: 0.6131907	total: 28.6ms	remaining: 80.1ms
10:	learn: 0.6067873	total: 30.9ms	remaining: 75.9ms
11:	learn: 0.6003412	total: 33.2ms	remaining: 72ms
12:	learn: 0.5931011	total: 35.7ms	remaining: 68.7ms
13:	learn: 0.5871928	total: 38.2ms	remaining: 65.5ms
14:	learn: 0.5832698	total: 40.7ms	remaining: 62.4ms
15:	learn: 0.5775555	total: 43.1ms	remaining: 59.2ms
16:	learn: 0.5713606	total: 45.6ms	remaining: 56.4ms
17:	learn: 0.5647397	total: 48.1ms	remaining: 53.4ms
18:	l

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


12:	learn: 0.5880888	total: 42.6ms	remaining: 81.8ms
13:	learn: 0.5832153	total: 46.6ms	remaining: 79.8ms
14:	learn: 0.5793970	total: 49.7ms	remaining: 76.3ms
15:	learn: 0.5733570	total: 52.9ms	remaining: 72.8ms
16:	learn: 0.5666537	total: 57.9ms	remaining: 71.6ms
17:	learn: 0.5599207	total: 62.3ms	remaining: 69.2ms
18:	learn: 0.5547629	total: 66.1ms	remaining: 66.1ms
19:	learn: 0.5512544	total: 71.6ms	remaining: 64.5ms
20:	learn: 0.5469059	total: 75.1ms	remaining: 60.8ms
21:	learn: 0.5429419	total: 78.8ms	remaining: 57.3ms
22:	learn: 0.5381900	total: 82.1ms	remaining: 53.5ms
23:	learn: 0.5339184	total: 85.4ms	remaining: 49.8ms
24:	learn: 0.5283900	total: 89ms	remaining: 46.3ms
25:	learn: 0.5220109	total: 92.7ms	remaining: 42.8ms
26:	learn: 0.5185638	total: 96.2ms	remaining: 39.2ms
27:	learn: 0.5137634	total: 98.7ms	remaining: 35.3ms
28:	learn: 0.5093470	total: 101ms	remaining: 31.4ms
29:	learn: 0.5048397	total: 104ms	remaining: 27.7ms
30:	learn: 0.5017464	total: 106ms	remaining: 24ms


[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:  5.3min finished


2:	learn: 0.4583230	total: 78.5ms	remaining: 3.14s
3:	learn: 0.4174136	total: 123ms	remaining: 3.65s
4:	learn: 0.3866067	total: 157ms	remaining: 3.71s
5:	learn: 0.3557135	total: 192ms	remaining: 3.75s
6:	learn: 0.3362477	total: 225ms	remaining: 3.73s
7:	learn: 0.3242957	total: 260ms	remaining: 3.73s
8:	learn: 0.3122227	total: 293ms	remaining: 3.71s
9:	learn: 0.3027308	total: 328ms	remaining: 3.71s
10:	learn: 0.2907849	total: 363ms	remaining: 3.7s
11:	learn: 0.2774771	total: 399ms	remaining: 3.69s
12:	learn: 0.2656176	total: 432ms	remaining: 3.65s
13:	learn: 0.2594902	total: 461ms	remaining: 3.59s
14:	learn: 0.2543000	total: 494ms	remaining: 3.55s
15:	learn: 0.2460838	total: 526ms	remaining: 3.52s
16:	learn: 0.2407637	total: 563ms	remaining: 3.51s
17:	learn: 0.2350381	total: 600ms	remaining: 3.5s
18:	learn: 0.2292114	total: 633ms	remaining: 3.46s
19:	learn: 0.2236208	total: 665ms	remaining: 3.43s
20:	learn: 0.2159263	total: 698ms	remaining: 3.39s
21:	learn: 0.2132281	total: 730ms	remain

In [107]:
best_features_2 = best_features(catboost_model_3,X_train)

In [108]:
y_pred_3 = catboost_model_3.predict_proba(X_test)[:,1]
submission_cb_3 = pd.DataFrame(data={'Opportunity_ID':X_test['Opportunity_ID'], 'Target': y_pred_3})
submission_cb_3 = submission_cb_3.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_3.to_csv('../submits/cat_boost_2_with_cat_boost_encoding.csv', index=False)

### cat boost 2 with best features

In [109]:
X_train_best_features_2 = X_train.loc[:,best_features_2]
X_test_best_features_2 = X_test.loc[:,best_features_2]

In [110]:
x_best_train_2, x_best_validation_2, y_best_train_2, y_best_validation_2 = train_test_split(X_train_best_features_2, y, test_size=0.3, stratify=y)

In [111]:
catboost_model_4 = catboost_2(x_best_train_2, y_best_train_2, x_best_validation_2, y_best_validation_2)
test_model(catboost_model_4,x_best_validation_2,y_best_validation_2)
cross_val(catboost_model_4, x_best_train_2, y_best_train_2)

Fitting 3 folds for each of 150 candidates, totalling 450 fits
0:	learn: 0.6315270	total: 2.62ms	remaining: 68.1ms
1:	learn: 0.5831487	total: 5.82ms	remaining: 72.7ms
2:	learn: 0.5402189	total: 8.54ms	remaining: 68.3ms
3:	learn: 0.5109950	total: 11.3ms	remaining: 64.8ms
4:	learn: 0.4832747	total: 13.7ms	remaining: 60.5ms
5:	learn: 0.4589623	total: 16.1ms	remaining: 56.4ms
6:	learn: 0.4399634	total: 18.5ms	remaining: 52.9ms
7:	learn: 0.4225794	total: 20.8ms	remaining: 49.4ms
8:	learn: 0.4081123	total: 23.1ms	remaining: 46.2ms
9:	learn: 0.3960084	total: 25.3ms	remaining: 43ms
10:	learn: 0.3845886	total: 27.5ms	remaining: 39.9ms
11:	learn: 0.3752281	total: 30.3ms	remaining: 37.8ms
12:	learn: 0.3686705	total: 32.9ms	remaining: 35.4ms
13:	learn: 0.3612124	total: 35.4ms	remaining: 32.9ms
14:	learn: 0.3547341	total: 37.9ms	remaining: 30.3ms
15:	learn: 0.3489254	total: 42.6ms	remaining: 29.3ms
16:	learn: 0.3442213	total: 44.7ms	remaining: 26.3ms
17:	learn: 0.3400578	total: 47.1ms	remaining: 23

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


5:	learn: 0.4541635	total: 17.2ms	remaining: 60.3ms
6:	learn: 0.4363866	total: 21.4ms	remaining: 61.1ms
7:	learn: 0.4195639	total: 26ms	remaining: 61.8ms
8:	learn: 0.4068957	total: 30.8ms	remaining: 61.5ms
9:	learn: 0.3946037	total: 36.6ms	remaining: 62.2ms
10:	learn: 0.3838864	total: 41ms	remaining: 59.6ms
11:	learn: 0.3752776	total: 44.7ms	remaining: 55.9ms
12:	learn: 0.3683462	total: 47.1ms	remaining: 50.7ms
13:	learn: 0.3627817	total: 49.7ms	remaining: 46.2ms
14:	learn: 0.3553619	total: 52.9ms	remaining: 42.3ms
15:	learn: 0.3502018	total: 55.8ms	remaining: 38.3ms
16:	learn: 0.3463190	total: 57.8ms	remaining: 34ms
17:	learn: 0.3423797	total: 60.3ms	remaining: 30.1ms
18:	learn: 0.3369067	total: 62.4ms	remaining: 26.3ms
19:	learn: 0.3334983	total: 64.8ms	remaining: 22.7ms
20:	learn: 0.3295652	total: 66.7ms	remaining: 19.1ms
21:	learn: 0.3265255	total: 69ms	remaining: 15.7ms
22:	learn: 0.3237834	total: 71.3ms	remaining: 12.4ms
23:	learn: 0.3207477	total: 73.5ms	remaining: 9.19ms
24:	le

[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:  2.5min finished


8:	learn: 0.3490366	total: 109ms	remaining: 1.69s
9:	learn: 0.3373937	total: 120ms	remaining: 1.66s
10:	learn: 0.3332460	total: 123ms	remaining: 1.53s
11:	learn: 0.3233978	total: 139ms	remaining: 1.58s
12:	learn: 0.3143384	total: 153ms	remaining: 1.59s
13:	learn: 0.3057517	total: 168ms	remaining: 1.6s
14:	learn: 0.2983115	total: 183ms	remaining: 1.62s
15:	learn: 0.2928102	total: 192ms	remaining: 1.58s
16:	learn: 0.2881548	total: 204ms	remaining: 1.57s
17:	learn: 0.2840468	total: 218ms	remaining: 1.57s
18:	learn: 0.2782755	total: 231ms	remaining: 1.57s
19:	learn: 0.2746999	total: 246ms	remaining: 1.57s
20:	learn: 0.2711007	total: 256ms	remaining: 1.55s
21:	learn: 0.2674435	total: 267ms	remaining: 1.53s
22:	learn: 0.2649954	total: 277ms	remaining: 1.5s
23:	learn: 0.2623461	total: 288ms	remaining: 1.49s
24:	learn: 0.2594937	total: 302ms	remaining: 1.49s
25:	learn: 0.2566923	total: 316ms	remaining: 1.48s
26:	learn: 0.2512404	total: 331ms	remaining: 1.48s
27:	learn: 0.2483160	total: 345ms	r

In [112]:
y_pred_4 = catboost_model_4.predict_proba(X_test_best_features_2)[:,1]
submission_cb_4 = pd.DataFrame(data={'Opportunity_ID':X_test_best_features_2['Opportunity_ID'], 'Target': y_pred_4})
submission_cb_4 = submission_cb_4.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_4.to_csv('../submits/cat_boost_2_best_features_with_cat_boost_encoding.csv', index=False)