In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from category_encoders import MEstimateEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn import model_selection
import scipy.stats as stats
from sklearn.feature_selection import SelectFromModel

In [8]:
# Read the cleaned train/test CSVs from the Feature_Engineering data directory.
train_csv = '../../Feature_Engineering/data/other-cleaned_train.csv'
test_csv = '../../Feature_Engineering/data/other-cleaned_test.csv'
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [9]:
# Drop the 'Unnamed: 0' column from both frames
# (presumably the index written by an earlier to_csv -- TODO confirm).
# Rebinding with errors='ignore' instead of inplace=True keeps the cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
train = train.drop(columns=['Unnamed: 0'], errors='ignore')
test = test.drop(columns=['Unnamed: 0'], errors='ignore')

In [10]:
# Independent working copies; the encoded features are attached to these
# while `train` / `test` keep the raw columns.
X_train, X_test = train.copy(), test.copy()

In [11]:
# Columns excluded from encoding (judging by their names: identifiers,
# approval flags, amounts, date parts and the label). Every remaining column
# is passed through an M-estimate encoder fitted against Target.
passthrough_train_columns = [
    "Opportunity_ID", "ID", "Pricing, Delivery_Terms_Quote_Appr",
    "Bureaucratic_Code_0_Approval", "Bureaucratic_Code_0_Approved",
    "Submitted_for_Approval", "ASP", "ASP_(converted)", "TRF", "Total_Amount",
    "Total_Taxable_Amount", "diferencia_en_dias", "Last_Modified_DOY", "Last_Modified_Year",
    "Opportunity_Created_DOY", "Opportunity_Created_Year", "Quote_Expiry_DOY",
    "Quote_Expiry_Year", "Planned_Delivery_Start_DOY", "Planned_Delivery_Start_Year",
    "Planned_Delivery_End_DOY", "Planned_Delivery_End_Year",
    "Target",
]
categ_columns = train.drop(columns=passthrough_train_columns).columns

for categ_name in categ_columns:
    # One encoder per column, fitted on the training values and training label.
    fitted_encoder = MEstimateEncoder().fit(train[categ_name], train['Target'])
    encoded_train = fitted_encoder.transform(train[categ_name]).add_suffix('_m_est')
    # Swap the raw column for its '<name>_m_est' counterpart.
    X_train = X_train.join(encoded_train).drop(columns=[categ_name])

In [12]:
# Same exclusion list as for the training frame, minus 'Target'
# (it is not dropped from `test` here).
passthrough_test_columns = [
    "Opportunity_ID", "ID", "Pricing, Delivery_Terms_Quote_Appr",
    "Bureaucratic_Code_0_Approval", "Bureaucratic_Code_0_Approved",
    "Submitted_for_Approval", "ASP", "ASP_(converted)", "TRF", "Total_Amount",
    "Total_Taxable_Amount", "diferencia_en_dias", "Last_Modified_DOY", "Last_Modified_Year",
    "Opportunity_Created_DOY", "Opportunity_Created_Year", "Quote_Expiry_DOY",
    "Quote_Expiry_Year", "Planned_Delivery_Start_DOY", "Planned_Delivery_Start_Year",
    "Planned_Delivery_End_DOY", "Planned_Delivery_End_Year",
]
categ_columns = test.drop(columns=passthrough_test_columns).columns

for categ_name in categ_columns:
    # The encoder is fitted on the TRAINING column and label, then applied to
    # the test column, so test rows are mapped with training statistics.
    fitted_encoder = MEstimateEncoder().fit(train[categ_name], train['Target'])
    encoded_test = fitted_encoder.transform(test[categ_name]).add_suffix('_m_est')
    # Swap the raw column for its '<name>_m_est' counterpart.
    X_test = X_test.join(encoded_test).drop(columns=[categ_name])

In [13]:
# Coerce first, then impute from the coerced values: taking .mean() of the raw
# column is unreliable if it still contains non-numeric entries (which is the
# reason errors='coerce' is needed in the first place).
total_amount_train = pd.to_numeric(X_train["Total_Amount"], errors='coerce')
X_train["Total_Amount"] = total_amount_train.fillna(total_amount_train.mean())

# Date-part columns: anything missing or unparseable becomes 0.
for date_part_column in [
    "Opportunity_Created_Year",
    "Quote_Expiry_DOY",
    "Quote_Expiry_Year",
    "Planned_Delivery_End_DOY",
    "Planned_Delivery_End_Year",
]:
    X_train[date_part_column] = pd.to_numeric(X_train[date_part_column], errors='coerce').fillna(0)

# The label must not be part of the feature matrix. errors='ignore' keeps the
# cell re-runnable after 'Target' has already been removed.
X_train = X_train.drop(columns='Target', errors='ignore')

In [14]:
# Impute missing Total_Amount in the test set with the TRAINING mean, computed
# on the coerced numeric values. Using the test set's own mean would give the
# filled value a different meaning than the one the model saw during training.
train_total_amount_mean = pd.to_numeric(train["Total_Amount"], errors='coerce').mean()
X_test["Total_Amount"] = pd.to_numeric(X_test["Total_Amount"], errors='coerce').fillna(train_total_amount_mean)

# Date-part columns: anything missing or unparseable becomes 0 (same rule as for train).
for date_part_column in [
    "Opportunity_Created_Year",
    "Quote_Expiry_DOY",
    "Quote_Expiry_Year",
    "Planned_Delivery_End_DOY",
    "Planned_Delivery_End_Year",
]:
    X_test[date_part_column] = pd.to_numeric(X_test[date_part_column], errors='coerce').fillna(0)


## Model: CatBoost

In [15]:
def cross_val(model, x_train, y_train):
    """Print the mean of the 5-fold cross-validation scores of `model`.

    The scores come from `model_selection.cross_val_score` with its default
    scorer; nothing is returned.
    """
    fold_scores = model_selection.cross_val_score(model, x_train, y_train, cv=5)
    mean_score = fold_scores.mean()
    print(mean_score)

In [16]:
def catboost(x_train, y_train, x_validation, y_validation):
    """Grid-search a CatBoostClassifier and return the best refitted estimator.

    The grid covers n_estimators, learning_rate and l2_leaf_reg with 5-fold CV.
    The winning parameters and the winner's `score()` on the validation data
    are printed.
    """
    param_grid = {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.05, 0.1],
        'l2_leaf_reg': [0, 1, 5],
    }
    search = GridSearchCV(CatBoostClassifier(), param_grid, cv=5)
    search.fit(x_train, y_train)

    print(search.best_params_)
    best_model = search.best_estimator_
    validation_score = best_model.score(x_validation, y_validation)
    print('catb: {}'.format(validation_score))
    return best_model

In [17]:
def catboost_2(x_train, y_train, x_validation, y_validation, n_iter=150, random_state=None):
    """Randomized-search a CatBoostClassifier and return the best refitted estimator.

    Parameters
    ----------
    x_train, y_train : training features and labels used for the 3-fold search.
    x_validation, y_validation : hold-out data; only used for the printed score.
    n_iter : number of sampled parameter settings (default 150, as before).
    random_state : seed forwarded to RandomizedSearchCV so the sampled
        candidates are reproducible. The default None keeps the previous
        unseeded behaviour.

    Candidates are ranked by 'neg_log_loss'. The printed 'catb' value is the
    estimator's own `score()` on the validation data, not the search metric.
    """
    # scipy frozen distributions take (loc, scale): uniform(a, b) samples from
    # [a, a + b], and randint(low, high) samples integers in [low, high).
    params_catb_2 = {
        'n_estimators': stats.randint(10, 150),
        'learning_rate': stats.uniform(0.01, 0.3),
        'subsample': stats.uniform(0.3, 0.7),
        'l2_leaf_reg': [1, 5, 10],
        'max_depth': [3, 10, 6],
        'colsample_bylevel': stats.uniform(0., 0.6),
    }

    catb_rs = RandomizedSearchCV(CatBoostClassifier(),
                                 param_distributions=params_catb_2,
                                 cv=3,
                                 scoring='neg_log_loss',
                                 verbose=1,
                                 n_iter=n_iter,
                                 random_state=random_state)

    catb_rs.fit(x_train, y_train)
    catb_best = catb_rs.best_estimator_
    print(catb_rs.best_params_)
    print('catb: {}'.format(catb_best.score(x_validation, y_validation)))
    return catb_best

In [18]:
def test_model(model, x_test, y_test):
    """Print accuracy (in percent) and log-loss of `model` on a labelled set.

    Uses the second column of `predict_proba` as the score; the rounded score
    is the hard label compared against `y_test` for accuracy.
    """
    positive_proba = model.predict_proba(x_test)[:, 1]
    hard_labels = positive_proba.round()

    accuracy = accuracy_score(y_test, hard_labels)
    logloss = log_loss(y_test, positive_proba)
    print("Accuracy: %.2f%%, Logloss: %.2f" % (accuracy*100.0, logloss))

In [30]:
def best_features(model, train, top_n=15):
    """Return the names of the `top_n` most important features, best first.

    Parameters
    ----------
    model : fitted estimator exposing `feature_importances_`.
    train : DataFrame whose columns are, in order, the features the model
        was fitted on.
    top_n : how many feature names to return (default 15).

    Returns
    -------
    list of column labels sorted by decreasing importance; ties keep their
    original column order.

    Raises
    ------
    ValueError if the number of importances differs from the number of
        columns, since the two could not be paired up reliably.
    """
    importance = np.asarray(model.feature_importances_)
    if len(importance) != len(train.columns):
        raise ValueError(
            "model has {} feature importances but the frame has {} columns".format(
                len(importance), len(train.columns)
            )
        )
    ranking = pd.Series(importance, index=train.columns)
    # mergesort is stable, so equal importances are returned deterministically.
    ranked = ranking.sort_values(ascending=False, kind="mergesort")
    return ranked.head(top_n).index.to_list()
    
    
def plot_features(model, train):
    """Show a horizontal bar chart of the model's feature importances.

    Parameters
    ----------
    model : fitted estimator exposing `feature_importances_` (the same
        attribute `best_features` reads).
    train : DataFrame whose columns name the features, in fitting order.

    The previous version called `model.plot_importance(booster=model)`, which
    is the XGBoost/LightGBM plotting API rather than a CatBoostClassifier
    method, so this draws the bars directly. It also no longer mutates the
    global `plt.rcParams`.
    """
    ranking = pd.Series(model.feature_importances_, index=train.columns).sort_values()

    # Explicit figure/axes instead of resizing whatever figure is current.
    fig, ax = plt.subplots(figsize=(40, 20))
    ax.barh(ranking.index.astype(str), ranking.to_numpy())
    ax.set_xlabel("\nFeature importance", fontsize=40)
    ax.set_ylabel("Features", fontsize=35)
    ax.tick_params(axis="both", labelsize=20)
    plt.show()

### CatBoost 1 (grid search) with all features

In [20]:
y = train["Target"]
# Stratified 70/30 hold-out split. The fixed seed makes the split (and every
# score derived from it) reproducible across Restart & Run All.
x_train, x_validation, y_train, y_validation = train_test_split(
    X_train, y, test_size=0.3, stratify=y, random_state=42
)

In [21]:
# Grid-search on the training split, then report hold-out metrics and the
# mean cross-validation score of the selected model.
catboost_model = catboost(
    x_train=x_train,
    y_train=y_train,
    x_validation=x_validation,
    y_validation=y_validation,
)
test_model(model=catboost_model, x_test=x_validation, y_test=y_validation)
cross_val(model=catboost_model, x_train=x_train, y_train=y_train)

0:	learn: 0.6832863	total: 65.8ms	remaining: 3.23s
1:	learn: 0.6751348	total: 72.4ms	remaining: 1.74s
2:	learn: 0.6632146	total: 79.6ms	remaining: 1.25s
3:	learn: 0.6523255	total: 85.2ms	remaining: 980ms
4:	learn: 0.6430091	total: 89.9ms	remaining: 809ms
5:	learn: 0.6360678	total: 95.6ms	remaining: 701ms
6:	learn: 0.6284322	total: 99.9ms	remaining: 614ms
7:	learn: 0.6192172	total: 104ms	remaining: 545ms
8:	learn: 0.6092791	total: 108ms	remaining: 491ms
9:	learn: 0.6020186	total: 112ms	remaining: 447ms
10:	learn: 0.5947144	total: 117ms	remaining: 414ms
11:	learn: 0.5871659	total: 123ms	remaining: 390ms
12:	learn: 0.5812153	total: 127ms	remaining: 362ms
13:	learn: 0.5740384	total: 132ms	remaining: 339ms
14:	learn: 0.5673494	total: 136ms	remaining: 318ms
15:	learn: 0.5614275	total: 140ms	remaining: 298ms
16:	learn: 0.5545276	total: 145ms	remaining: 282ms
17:	learn: 0.5489961	total: 150ms	remaining: 266ms
18:	learn: 0.5419294	total: 156ms	remaining: 255ms
19:	learn: 0.5349079	total: 162ms	

In [22]:
# Top-ranked columns of the first model. Opportunity_ID is forced into the
# list because the submission cells group predictions by that column.
# NOTE(review): this assignment rebinds the name `best_features` from the
# helper function to a list, so any later `best_features(...)` call needs the
# defining cell re-run first -- consider renaming this variable.
# NOTE(review): Opportunity_ID is also part of the matrix the models are
# fitted on -- confirm that using an identifier as a feature is intended.
best_features = best_features(model=catboost_model, train=X_train)
if "Opportunity_ID" not in best_features:
    best_features.append("Opportunity_ID")

In [23]:
# Score every test row (second column of predict_proba), average the scores
# per Opportunity_ID, and write the submission file.
y_pred = catboost_model.predict_proba(X_test)[:, 1]
row_scores_cb = pd.DataFrame({'Opportunity_ID': X_test['Opportunity_ID'], 'Target': y_pred})
submission_cb = row_scores_cb.groupby("Opportunity_ID", as_index=False)["Target"].mean()
submission_cb.to_csv('../submits/cat_boost_with_m_est_encoding.csv', index=False)

### CatBoost 1 (grid search) with best features

In [24]:
# Keep only the selected columns, in the same order, in both matrices.
X_train_best_features = X_train[best_features]
X_test_best_features = X_test[best_features]

In [25]:
# Stratified 70/30 hold-out split of the reduced feature set. The fixed seed
# makes the split reproducible across Restart & Run All.
x_best_train, x_best_validation, y_best_train, y_best_validation = train_test_split(
    X_train_best_features, y, test_size=0.3, stratify=y, random_state=42
)

In [26]:
# Grid-search on the reduced feature set, then report hold-out metrics and
# the mean cross-validation score of the selected model.
catboost_model_2 = catboost(
    x_train=x_best_train,
    y_train=y_best_train,
    x_validation=x_best_validation,
    y_validation=y_best_validation,
)
test_model(model=catboost_model_2, x_test=x_best_validation, y_test=y_best_validation)
cross_val(model=catboost_model_2, x_train=x_best_train, y_train=y_best_train)

0:	learn: 0.6878950	total: 3.46ms	remaining: 170ms
1:	learn: 0.6827879	total: 6.12ms	remaining: 147ms
2:	learn: 0.6775966	total: 8.41ms	remaining: 132ms
3:	learn: 0.6723991	total: 10.8ms	remaining: 125ms
4:	learn: 0.6670983	total: 13.4ms	remaining: 120ms
5:	learn: 0.6618486	total: 15.7ms	remaining: 115ms
6:	learn: 0.6565304	total: 18ms	remaining: 111ms
7:	learn: 0.6517137	total: 20.4ms	remaining: 107ms
8:	learn: 0.6470222	total: 22.8ms	remaining: 104ms
9:	learn: 0.6421862	total: 25.2ms	remaining: 101ms
10:	learn: 0.6374551	total: 27.5ms	remaining: 97.6ms
11:	learn: 0.6328366	total: 29.9ms	remaining: 94.8ms
12:	learn: 0.6284724	total: 32.2ms	remaining: 91.8ms
13:	learn: 0.6241749	total: 34.6ms	remaining: 89.1ms
14:	learn: 0.6197458	total: 37ms	remaining: 86.3ms
15:	learn: 0.6154882	total: 39.4ms	remaining: 83.7ms
16:	learn: 0.6112478	total: 41.7ms	remaining: 81ms
17:	learn: 0.6072358	total: 44.1ms	remaining: 78.4ms
18:	learn: 0.6031183	total: 46.4ms	remaining: 75.8ms
19:	learn: 0.599111

In [27]:
# Score every test row (second column of predict_proba), average the scores
# per Opportunity_ID, and write the submission file.
y_pred_2 = catboost_model_2.predict_proba(X_test_best_features)[:, 1]
row_scores_cb_2 = pd.DataFrame({'Opportunity_ID': X_test_best_features['Opportunity_ID'], 'Target': y_pred_2})
submission_cb_2 = row_scores_cb_2.groupby("Opportunity_ID", as_index=False)["Target"].mean()
submission_cb_2.to_csv('../submits/cat_boost_best_features_with_m_est_encoding.csv', index=False)

### CatBoost 2 (randomized search) with all features

In [28]:
# Randomized search on the full feature set, then report hold-out metrics and
# the mean cross-validation score of the selected model.
catboost_model_3 = catboost_2(
    x_train=x_train,
    y_train=y_train,
    x_validation=x_validation,
    y_validation=y_validation,
)
test_model(model=catboost_model_3, x_test=x_validation, y_test=y_validation)
cross_val(model=catboost_model_3, x_train=x_train, y_train=y_train)

Fitting 3 folds for each of 150 candidates, totalling 450 fits
0:	learn: 0.6811961	total: 8.98ms	remaining: 1.23s
1:	learn: 0.6708542	total: 20.4ms	remaining: 1.39s
2:	learn: 0.6634751	total: 27.5ms	remaining: 1.24s
3:	learn: 0.6553044	total: 35.4ms	remaining: 1.19s
4:	learn: 0.6460158	total: 41.8ms	remaining: 1.11s
5:	learn: 0.6407678	total: 47.5ms	remaining: 1.05s
6:	learn: 0.6338142	total: 54.7ms	remaining: 1.02s
7:	learn: 0.6260445	total: 58.8ms	remaining: 956ms
8:	learn: 0.6194214	total: 62.4ms	remaining: 894ms
9:	learn: 0.6134191	total: 65ms	remaining: 831ms
10:	learn: 0.6089260	total: 67.6ms	remaining: 781ms
11:	learn: 0.6014088	total: 70.4ms	remaining: 740ms
12:	learn: 0.5966897	total: 73.1ms	remaining: 703ms
13:	learn: 0.5899166	total: 75.9ms	remaining: 672ms
14:	learn: 0.5849806	total: 78.7ms	remaining: 645ms
15:	learn: 0.5799360	total: 81.6ms	remaining: 622ms
16:	learn: 0.5780515	total: 83.7ms	remaining: 595ms
17:	learn: 0.5716822	total: 86.3ms	remaining: 575ms
18:	learn: 0.

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


43:	learn: 0.4632924	total: 174ms	remaining: 371ms
44:	learn: 0.4623071	total: 177ms	remaining: 365ms
45:	learn: 0.4606252	total: 179ms	remaining: 358ms
46:	learn: 0.4588930	total: 182ms	remaining: 352ms
47:	learn: 0.4574322	total: 185ms	remaining: 347ms
48:	learn: 0.4554507	total: 188ms	remaining: 341ms
49:	learn: 0.4521248	total: 192ms	remaining: 338ms
50:	learn: 0.4492559	total: 195ms	remaining: 333ms
51:	learn: 0.4460010	total: 199ms	remaining: 330ms
52:	learn: 0.4435474	total: 203ms	remaining: 326ms
53:	learn: 0.4408046	total: 206ms	remaining: 320ms
54:	learn: 0.4386781	total: 209ms	remaining: 315ms
55:	learn: 0.4363449	total: 211ms	remaining: 309ms
56:	learn: 0.4341232	total: 214ms	remaining: 304ms
57:	learn: 0.4322117	total: 217ms	remaining: 299ms
58:	learn: 0.4298744	total: 220ms	remaining: 294ms
59:	learn: 0.4280968	total: 223ms	remaining: 289ms
60:	learn: 0.4262234	total: 226ms	remaining: 285ms
61:	learn: 0.4252431	total: 229ms	remaining: 281ms
62:	learn: 0.4237430	total: 232

[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:  6.0min finished


1:	learn: 0.4584658	total: 107ms	remaining: 7.74s
2:	learn: 0.3994517	total: 162ms	remaining: 7.78s
3:	learn: 0.3583492	total: 215ms	remaining: 7.67s
4:	learn: 0.3270853	total: 264ms	remaining: 7.51s
5:	learn: 0.3084108	total: 314ms	remaining: 7.38s
6:	learn: 0.2928336	total: 364ms	remaining: 7.27s
7:	learn: 0.2785842	total: 414ms	remaining: 7.19s
8:	learn: 0.2688528	total: 464ms	remaining: 7.11s
9:	learn: 0.2608979	total: 507ms	remaining: 6.95s
10:	learn: 0.2509614	total: 560ms	remaining: 6.92s
11:	learn: 0.2424272	total: 607ms	remaining: 6.83s
12:	learn: 0.2373167	total: 653ms	remaining: 6.73s
13:	learn: 0.2321553	total: 698ms	remaining: 6.63s
14:	learn: 0.2279758	total: 748ms	remaining: 6.58s
15:	learn: 0.2226337	total: 797ms	remaining: 6.52s
16:	learn: 0.2189452	total: 848ms	remaining: 6.48s
17:	learn: 0.2161985	total: 895ms	remaining: 6.41s
18:	learn: 0.2121705	total: 941ms	remaining: 6.34s
19:	learn: 0.2091840	total: 984ms	remaining: 6.25s
20:	learn: 0.2039485	total: 1.03s	remain

In [31]:
# The name `best_features` was rebound to a list by the earlier selection
# cell, so calling it here fails on a top-to-bottom run with
# "'list' object is not callable". Rank the importances inline instead:
# the 15 columns with the highest importance, best first.
importance_model_3 = pd.Series(catboost_model_3.feature_importances_, index=X_train.columns)
best_features_2 = (
    importance_model_3
    .sort_values(ascending=False, kind="mergesort")  # stable: ties keep column order
    .head(15)
    .index
    .to_list()
)
# Opportunity_ID must be present because the submission cell groups by it.
if "Opportunity_ID" not in best_features_2:
    best_features_2.append("Opportunity_ID")

In [32]:
# Score every test row (second column of predict_proba), average the scores
# per Opportunity_ID, and write the submission file.
y_pred_3 = catboost_model_3.predict_proba(X_test)[:, 1]
row_scores_cb_3 = pd.DataFrame({'Opportunity_ID': X_test['Opportunity_ID'], 'Target': y_pred_3})
submission_cb_3 = row_scores_cb_3.groupby("Opportunity_ID", as_index=False)["Target"].mean()
submission_cb_3.to_csv('../submits/cat_boost_2_with__m_est_encoding.csv', index=False)

### CatBoost 2 (randomized search) with best features

In [33]:
# Keep only the selected columns, in the same order, in both matrices.
X_train_best_features_2 = X_train[best_features_2]
X_test_best_features_2 = X_test[best_features_2]

In [34]:
# Stratified 70/30 hold-out split of the second reduced feature set. The fixed
# seed makes the split reproducible across Restart & Run All.
x_best_train_2, x_best_validation_2, y_best_train_2, y_best_validation_2 = train_test_split(
    X_train_best_features_2, y, test_size=0.3, stratify=y, random_state=42
)

In [35]:
# Randomized search on the reduced feature set, then report hold-out metrics
# and the mean cross-validation score of the selected model.
catboost_model_4 = catboost_2(
    x_train=x_best_train_2,
    y_train=y_best_train_2,
    x_validation=x_best_validation_2,
    y_validation=y_best_validation_2,
)
test_model(model=catboost_model_4, x_test=x_best_validation_2, y_test=y_best_validation_2)
cross_val(model=catboost_model_4, x_train=x_best_train_2, y_train=y_best_train_2)

Fitting 3 folds for each of 150 candidates, totalling 450 fits
0:	learn: 0.6154890	total: 6.14ms	remaining: 836ms
1:	learn: 0.5589942	total: 13.2ms	remaining: 893ms
2:	learn: 0.5100947	total: 22.4ms	remaining: 1000ms
3:	learn: 0.4719879	total: 32.4ms	remaining: 1.08s
4:	learn: 0.4412706	total: 43.2ms	remaining: 1.14s
5:	learn: 0.4136790	total: 52.6ms	remaining: 1.15s
6:	learn: 0.3893945	total: 64.7ms	remaining: 1.2s
7:	learn: 0.3704535	total: 74.5ms	remaining: 1.2s
8:	learn: 0.3522778	total: 87.6ms	remaining: 1.25s
9:	learn: 0.3368696	total: 99.9ms	remaining: 1.27s
10:	learn: 0.3245365	total: 112ms	remaining: 1.28s
11:	learn: 0.3125867	total: 123ms	remaining: 1.28s
12:	learn: 0.3029234	total: 133ms	remaining: 1.27s
13:	learn: 0.2928472	total: 143ms	remaining: 1.26s
14:	learn: 0.2845904	total: 156ms	remaining: 1.27s
15:	learn: 0.2779201	total: 170ms	remaining: 1.29s
16:	learn: 0.2725532	total: 181ms	remaining: 1.28s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


17:	learn: 0.2667710	total: 192ms	remaining: 1.27s
18:	learn: 0.2617548	total: 209ms	remaining: 1.3s
19:	learn: 0.2578387	total: 221ms	remaining: 1.29s
20:	learn: 0.2535305	total: 232ms	remaining: 1.28s
21:	learn: 0.2501425	total: 242ms	remaining: 1.27s
22:	learn: 0.2461680	total: 253ms	remaining: 1.25s
23:	learn: 0.2402115	total: 263ms	remaining: 1.24s
24:	learn: 0.2364611	total: 275ms	remaining: 1.23s
25:	learn: 0.2334786	total: 286ms	remaining: 1.22s
26:	learn: 0.2308439	total: 296ms	remaining: 1.21s
27:	learn: 0.2283148	total: 302ms	remaining: 1.18s
28:	learn: 0.2243080	total: 313ms	remaining: 1.16s
29:	learn: 0.2218288	total: 323ms	remaining: 1.15s
30:	learn: 0.2198699	total: 334ms	remaining: 1.14s
31:	learn: 0.2164554	total: 343ms	remaining: 1.13s
32:	learn: 0.2142623	total: 355ms	remaining: 1.12s
33:	learn: 0.2120325	total: 367ms	remaining: 1.11s
34:	learn: 0.2094658	total: 378ms	remaining: 1.1s
35:	learn: 0.2079171	total: 387ms	remaining: 1.09s
36:	learn: 0.2062017	total: 404ms

[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:  2.6min finished


0:	learn: 0.5756205	total: 36.7ms	remaining: 5.25s
1:	learn: 0.4982873	total: 63.6ms	remaining: 4.52s
2:	learn: 0.4428506	total: 90.7ms	remaining: 4.26s
3:	learn: 0.4015193	total: 112ms	remaining: 3.91s
4:	learn: 0.3699250	total: 131ms	remaining: 3.64s
5:	learn: 0.3435322	total: 150ms	remaining: 3.44s
6:	learn: 0.3253100	total: 169ms	remaining: 3.31s
7:	learn: 0.3113989	total: 190ms	remaining: 3.23s
8:	learn: 0.2983728	total: 210ms	remaining: 3.15s
9:	learn: 0.2865618	total: 226ms	remaining: 3.03s
10:	learn: 0.2771349	total: 246ms	remaining: 2.97s
11:	learn: 0.2684141	total: 265ms	remaining: 2.91s
12:	learn: 0.2637068	total: 274ms	remaining: 2.77s
13:	learn: 0.2579988	total: 292ms	remaining: 2.71s
14:	learn: 0.2501679	total: 312ms	remaining: 2.68s
15:	learn: 0.2461104	total: 331ms	remaining: 2.65s
16:	learn: 0.2410321	total: 350ms	remaining: 2.62s
17:	learn: 0.2370950	total: 368ms	remaining: 2.58s
18:	learn: 0.2324579	total: 387ms	remaining: 2.54s
19:	learn: 0.2286774	total: 407ms	rema

In [36]:
# Score every test row (second column of predict_proba), average the scores
# per Opportunity_ID, and write the submission file.
y_pred_4 = catboost_model_4.predict_proba(X_test_best_features_2)[:, 1]
row_scores_cb_4 = pd.DataFrame({'Opportunity_ID': X_test_best_features_2['Opportunity_ID'], 'Target': y_pred_4})
submission_cb_4 = row_scores_cb_4.groupby("Opportunity_ID", as_index=False)["Target"].mean()
submission_cb_4.to_csv('../submits/cat_boost_2_best_features_with_m_est_encoding.csv', index=False)