In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from category_encoders import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn import model_selection
import scipy.stats as stats
from sklearn.feature_selection import SelectFromModel

In [2]:
train = pd.read_csv('../../Feature_Engineering/data/other-cleaned_train.csv')
test = pd.read_csv('../../Feature_Engineering/data/other-cleaned_test.csv')

In [3]:
train.drop(columns = ['Unnamed: 0'], inplace = True)
test.drop(columns = ['Unnamed: 0'], inplace = True)

In [4]:
X_train = train.copy()
X_test = test.copy()

In [5]:
categ_columns = train.drop(columns = ["Opportunity_ID","ID", "Pricing, Delivery_Terms_Quote_Appr",\
                                    "Bureaucratic_Code_0_Approval","Bureaucratic_Code_0_Approved",\
                                    "Submitted_for_Approval","ASP","ASP_(converted)","TRF","Total_Amount",\
                                    "Total_Taxable_Amount","diferencia_en_dias","Last_Modified_DOY","Last_Modified_Year",\
                                    "Opportunity_Created_DOY","Opportunity_Created_Year","Quote_Expiry_DOY",\
                                     "Quote_Expiry_Year","Planned_Delivery_Start_DOY","Planned_Delivery_Start_Year",\
                                    "Planned_Delivery_End_DOY","Planned_Delivery_End_Year",\
                                    "Target"]).columns
for column in categ_columns:
    encoder = OrdinalEncoder()
    encoder.fit(train[column], train['Target'])
    feature_encoded = encoder.transform(train[column])
    X_train = X_train.join(feature_encoded.add_suffix('_ord'))
    X_train.drop(columns=[column], inplace = True)

In [6]:
categ_columns = test.drop(columns = ["Opportunity_ID","ID", "Pricing, Delivery_Terms_Quote_Appr",\
                                    "Bureaucratic_Code_0_Approval","Bureaucratic_Code_0_Approved",\
                                    "Submitted_for_Approval","ASP","ASP_(converted)","TRF","Total_Amount",\
                                    "Total_Taxable_Amount","diferencia_en_dias","Last_Modified_DOY","Last_Modified_Year",\
                                    "Opportunity_Created_DOY","Opportunity_Created_Year","Quote_Expiry_DOY",\
                                     "Quote_Expiry_Year","Planned_Delivery_Start_DOY","Planned_Delivery_Start_Year",\
                                    "Planned_Delivery_End_DOY","Planned_Delivery_End_Year"]).columns
for column in categ_columns:
    encoder = OrdinalEncoder()
    encoder.fit(train[column], train['Target'])
    feature_encoded = encoder.transform(test[column])
    X_test = X_test.join(feature_encoded.add_suffix('_ord'))
    X_test.drop(columns=[column], inplace = True)

In [7]:
X_train["Total_Amount"] = pd.to_numeric(X_train["Total_Amount"],errors='coerce').fillna(X_train["Total_Amount"].mean())
X_train["Opportunity_Created_Year"] = pd.to_numeric(X_train["Opportunity_Created_Year"],errors='coerce').fillna(0)
X_train["Quote_Expiry_DOY"] = pd.to_numeric(X_train["Quote_Expiry_DOY"],errors='coerce').fillna(0)
X_train["Quote_Expiry_Year"] = pd.to_numeric(X_train["Quote_Expiry_Year"],errors='coerce').fillna(0)
X_train["Planned_Delivery_End_DOY"] = pd.to_numeric(X_train["Planned_Delivery_End_DOY"],errors='coerce').fillna(0)
X_train["Planned_Delivery_End_Year"] = pd.to_numeric(X_train["Planned_Delivery_End_Year"],errors='coerce').fillna(0)

X_train = X_train.drop(columns = 'Target')

In [8]:
X_test["Total_Amount"] = pd.to_numeric(X_test["Total_Amount"],errors='coerce').fillna(test["Total_Amount"].mean())
X_test["Opportunity_Created_Year"] = pd.to_numeric(X_test["Opportunity_Created_Year"],errors='coerce').fillna(0)
X_test["Quote_Expiry_DOY"] = pd.to_numeric(X_test["Quote_Expiry_DOY"],errors='coerce').fillna(0)
X_test["Quote_Expiry_Year"] = pd.to_numeric(X_test["Quote_Expiry_Year"],errors='coerce').fillna(0)
X_test["Planned_Delivery_End_DOY"] = pd.to_numeric(X_test["Planned_Delivery_End_DOY"],errors='coerce').fillna(0)
X_test["Planned_Delivery_End_Year"] = pd.to_numeric(X_test["Planned_Delivery_End_Year"],errors='coerce').fillna(0)


## Model: Catboost

In [9]:
def cross_val(model, x_train, y_train):
    score_cross_val = model_selection.cross_val_score(model, x_train, y_train, cv=5)
    print(score_cross_val.mean())

In [10]:
def catboost(x_train, y_train, x_validation, y_validation):
    catb_classifier = CatBoostClassifier()
    params_catb = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.05, 0.1], 'l2_leaf_reg': [0, 1, 5]}    
    catb_gs = GridSearchCV(catb_classifier, params_catb, cv=5)
    catb_gs.fit(x_train, y_train)
    catb_best = catb_gs.best_estimator_
    print(catb_gs.best_params_)
    print('catb: {}'.format(catb_best.score(x_validation, y_validation)))
    return catb_best

In [11]:
def catboost_2(x_train, y_train, x_validation, y_validation):
    params_catb_2={
        'n_estimators':stats.randint(10,150),'learning_rate':stats.uniform(0.01,0.3),
        'subsample':stats.uniform(0.3,0.7),'l2_leaf_reg':[1,5,10],
        'max_depth':[3,10,6],'colsample_bylevel':stats.uniform(0.,0.6)
    }
    
    catb_rs = RandomizedSearchCV(CatBoostClassifier(),
                          param_distributions=params_catb_2,
                          cv=3,
                          scoring='neg_log_loss',
                          verbose=1,
                          n_iter=150)
    
    catb_rs.fit(x_train, y_train)
    catb_best = catb_rs.best_estimator_
    print(catb_rs.best_params_)
    print('catb: {}'.format(catb_best.score(x_validation, y_validation)))
    return catb_best

In [12]:
def test_model(model, x_test, y_test):
    predictions = model.predict_proba(x_test)[:,1]
    logloss = log_loss(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions.round())
    print("Accuracy: %.2f%%, Logloss: %.2f" % (accuracy*100.0, logloss))

In [24]:
def best_features(model,train):
    importance = model.feature_importances_
    result = pd.DataFrame([train.columns,importance]).transpose()
    result.columns = ["Feature","Importance"]
    return result.sort_values(by='Importance', ascending=False).head(15)["Feature"].to_list()
    
    
def plot_features(model,train):
    fig = plt.gcf()
    fig.set_size_inches(350, 350)
    selection = SelectFromModel(model, threshold=0.040, prefit=True)
    selected_dataset = selection.transform(train)
    model.plot_importance(booster=model)

    plt.rcParams["figure.figsize"] = (40,20)
    plt.xlabel("\nFeature importance", fontsize=40)
    plt.ylabel("Features", fontsize=35)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.show()

### cat boost 1 with all features

In [14]:
y = train.Target
x_train, x_validation, y_train, y_validation = train_test_split(X_train, y, test_size=0.3, stratify=y)

In [15]:
catboost_model = catboost(x_train, y_train, x_validation, y_validation)
test_model(catboost_model,x_validation,y_validation)
cross_val(catboost_model, x_train, y_train)

0:	learn: 0.6849198	total: 62.5ms	remaining: 3.06s
1:	learn: 0.6774208	total: 68.9ms	remaining: 1.65s
2:	learn: 0.6675605	total: 73.5ms	remaining: 1.15s
3:	learn: 0.6591157	total: 80.5ms	remaining: 926ms
4:	learn: 0.6516750	total: 85.9ms	remaining: 773ms
5:	learn: 0.6451307	total: 90.7ms	remaining: 665ms
6:	learn: 0.6385695	total: 94.9ms	remaining: 583ms
7:	learn: 0.6335466	total: 99.4ms	remaining: 522ms
8:	learn: 0.6263354	total: 103ms	remaining: 471ms
9:	learn: 0.6207665	total: 108ms	remaining: 430ms
10:	learn: 0.6144038	total: 112ms	remaining: 396ms
11:	learn: 0.6074045	total: 116ms	remaining: 367ms
12:	learn: 0.6028855	total: 120ms	remaining: 342ms
13:	learn: 0.5979874	total: 124ms	remaining: 319ms
14:	learn: 0.5933044	total: 128ms	remaining: 299ms
15:	learn: 0.5873330	total: 132ms	remaining: 281ms
16:	learn: 0.5816095	total: 137ms	remaining: 265ms
17:	learn: 0.5772075	total: 141ms	remaining: 250ms
18:	learn: 0.5719888	total: 145ms	remaining: 237ms
19:	learn: 0.5665471	total: 149ms

In [16]:
best_features = best_features(catboost_model,X_train)
if "Opportunity_ID" not in best_features: 
    best_features.append("Opportunity_ID")

In [17]:
y_pred = catboost_model.predict_proba(X_test)[:,1]
submission_cb = pd.DataFrame(data={'Opportunity_ID':X_test['Opportunity_ID'], 'Target': y_pred})
submission_cb = submission_cb.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb.to_csv('../submits/cat_boost_with_ordinal_encoding.csv', index=False)

### cat boost 1 with best features

In [18]:
X_train_best_features = X_train.loc[:,best_features]
X_test_best_features = X_test.loc[:,best_features]

In [19]:
x_best_train, x_best_validation, y_best_train, y_best_validation = train_test_split(X_train_best_features, y, test_size=0.3, stratify=y)

In [20]:
catboost_model_2 = catboost(x_best_train, y_best_train, x_best_validation, y_best_validation)
test_model(catboost_model_2,x_best_validation,y_best_validation)
cross_val(catboost_model_2, x_best_train, y_best_train)

0:	learn: 0.6888721	total: 3.07ms	remaining: 150ms
1:	learn: 0.6847511	total: 5.48ms	remaining: 131ms
2:	learn: 0.6807087	total: 7.78ms	remaining: 122ms
3:	learn: 0.6765098	total: 10.2ms	remaining: 118ms
4:	learn: 0.6724429	total: 12.6ms	remaining: 113ms
5:	learn: 0.6685352	total: 15.2ms	remaining: 111ms
6:	learn: 0.6645547	total: 17.6ms	remaining: 108ms
7:	learn: 0.6606636	total: 19.9ms	remaining: 104ms
8:	learn: 0.6568353	total: 22.4ms	remaining: 102ms
9:	learn: 0.6530769	total: 24.8ms	remaining: 99.2ms
10:	learn: 0.6492296	total: 27.2ms	remaining: 96.3ms
11:	learn: 0.6455690	total: 29.5ms	remaining: 93.4ms
12:	learn: 0.6422486	total: 31.8ms	remaining: 90.6ms
13:	learn: 0.6386035	total: 34.2ms	remaining: 87.9ms
14:	learn: 0.6351062	total: 36.6ms	remaining: 85.5ms
15:	learn: 0.6315817	total: 39ms	remaining: 82.9ms
16:	learn: 0.6283028	total: 41.4ms	remaining: 80.3ms
17:	learn: 0.6250714	total: 43.7ms	remaining: 77.7ms
18:	learn: 0.6218799	total: 46.1ms	remaining: 75.2ms
19:	learn: 0.6

In [21]:
y_pred_2 = catboost_model_2.predict_proba(X_test_best_features)[:,1]
submission_cb_2 = pd.DataFrame(data={'Opportunity_ID':X_test_best_features['Opportunity_ID'], 'Target': y_pred_2})
submission_cb_2 = submission_cb_2.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_2.to_csv('../submits/cat_boost_best_features_with_ordinal_encoding.csv', index=False)

### cat boost 2 with all features

In [22]:
catboost_model_3 = catboost_2(x_train, y_train, x_validation, y_validation)
test_model(catboost_model_3,x_validation,y_validation)
cross_val(catboost_model_3, x_train, y_train)

Fitting 3 folds for each of 150 candidates, totalling 450 fits
0:	learn: 0.6673623	total: 3.05ms	remaining: 391ms
1:	learn: 0.6444914	total: 6.34ms	remaining: 403ms
2:	learn: 0.6210462	total: 9.61ms	remaining: 404ms
3:	learn: 0.6001128	total: 13.1ms	remaining: 409ms
4:	learn: 0.5855776	total: 16.1ms	remaining: 400ms
5:	learn: 0.5697539	total: 19.2ms	remaining: 393ms
6:	learn: 0.5558808	total: 22.4ms	remaining: 391ms
7:	learn: 0.5417065	total: 25.6ms	remaining: 388ms
8:	learn: 0.5313688	total: 30ms	remaining: 400ms
9:	learn: 0.5224385	total: 35.6ms	remaining: 424ms
10:	learn: 0.5150714	total: 38.6ms	remaining: 414ms
11:	learn: 0.5071021	total: 42.3ms	remaining: 412ms
12:	learn: 0.4987979	total: 46.8ms	remaining: 417ms
13:	learn: 0.4943579	total: 51.1ms	remaining: 420ms
14:	learn: 0.4884392	total: 54.2ms	remaining: 412ms
15:	learn: 0.4823835	total: 57.8ms	remaining: 408ms
16:	learn: 0.4760371	total: 61.8ms	remaining: 407ms
17:	learn: 0.4707523	total: 66.1ms	remaining: 408ms
18:	learn: 0.

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


38:	learn: 0.4088382	total: 170ms	remaining: 391ms
39:	learn: 0.4074139	total: 176ms	remaining: 392ms
40:	learn: 0.4065882	total: 181ms	remaining: 389ms
41:	learn: 0.4057975	total: 186ms	remaining: 385ms
42:	learn: 0.4033850	total: 191ms	remaining: 381ms
43:	learn: 0.4017558	total: 195ms	remaining: 376ms
44:	learn: 0.4007301	total: 202ms	remaining: 377ms
45:	learn: 0.3997811	total: 207ms	remaining: 374ms
46:	learn: 0.3983941	total: 212ms	remaining: 369ms
47:	learn: 0.3966600	total: 217ms	remaining: 366ms
48:	learn: 0.3953971	total: 222ms	remaining: 362ms
49:	learn: 0.3940130	total: 226ms	remaining: 358ms
50:	learn: 0.3921341	total: 232ms	remaining: 354ms
51:	learn: 0.3914000	total: 235ms	remaining: 349ms
52:	learn: 0.3899364	total: 240ms	remaining: 345ms
53:	learn: 0.3890649	total: 243ms	remaining: 338ms
54:	learn: 0.3868189	total: 246ms	remaining: 331ms
55:	learn: 0.3850924	total: 252ms	remaining: 328ms
56:	learn: 0.3836879	total: 257ms	remaining: 325ms
57:	learn: 0.3828755	total: 276

[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:  5.5min finished


1:	learn: 0.5066071	total: 110ms	remaining: 6.61s
2:	learn: 0.4523210	total: 168ms	remaining: 6.68s
3:	learn: 0.4086514	total: 220ms	remaining: 6.49s
4:	learn: 0.3830197	total: 268ms	remaining: 6.26s
5:	learn: 0.3615121	total: 321ms	remaining: 6.2s
6:	learn: 0.3463893	total: 378ms	remaining: 6.21s
7:	learn: 0.3314791	total: 432ms	remaining: 6.15s
8:	learn: 0.3176140	total: 484ms	remaining: 6.08s
9:	learn: 0.3085775	total: 539ms	remaining: 6.04s
10:	learn: 0.3028544	total: 590ms	remaining: 5.95s
11:	learn: 0.2896980	total: 640ms	remaining: 5.87s
12:	learn: 0.2790435	total: 695ms	remaining: 5.83s
13:	learn: 0.2642707	total: 750ms	remaining: 5.78s
14:	learn: 0.2569337	total: 799ms	remaining: 5.7s
15:	learn: 0.2516390	total: 853ms	remaining: 5.65s
16:	learn: 0.2461239	total: 903ms	remaining: 5.58s
17:	learn: 0.2420315	total: 955ms	remaining: 5.52s
18:	learn: 0.2380355	total: 1.02s	remaining: 5.54s
19:	learn: 0.2328166	total: 1.08s	remaining: 5.54s
20:	learn: 0.2291899	total: 1.14s	remainin

In [25]:
best_features_2 = best_features(catboost_model_3,X_train)
if "Opportunity_ID" not in best_features_2: 
    best_features_2.append("Opportunity_ID")

In [26]:
y_pred_3 = catboost_model_3.predict_proba(X_test)[:,1]
submission_cb_3 = pd.DataFrame(data={'Opportunity_ID':X_test['Opportunity_ID'], 'Target': y_pred_3})
submission_cb_3 = submission_cb_3.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_3.to_csv('../submits/cat_boost_2_with_ordinal_encoding.csv', index=False)

### cat boost 2 with best features

In [27]:
X_train_best_features_2 = X_train.loc[:,best_features_2]
X_test_best_features_2 = X_test.loc[:,best_features_2]

In [28]:
x_best_train_2, x_best_validation_2, y_best_train_2, y_best_validation_2 = train_test_split(X_train_best_features_2, y, test_size=0.3, stratify=y)

In [30]:
catboost_model_4 = catboost_2(x_best_train_2, y_best_train_2, x_best_validation_2, y_best_validation_2)
test_model(catboost_model_4,x_best_validation_2,y_best_validation_2)
cross_val(catboost_model_4, x_best_train_2, y_best_train_2)

Fitting 3 folds for each of 150 candidates, totalling 450 fits
0:	learn: 0.5896310	total: 2.48ms	remaining: 302ms
1:	learn: 0.5286270	total: 5.5ms	remaining: 333ms
2:	learn: 0.4852980	total: 8.4ms	remaining: 336ms
3:	learn: 0.4530977	total: 11ms	remaining: 327ms
4:	learn: 0.4365059	total: 13.8ms	remaining: 325ms
5:	learn: 0.4168479	total: 16.2ms	remaining: 317ms
6:	learn: 0.4054547	total: 18.9ms	remaining: 314ms
7:	learn: 0.3963737	total: 21.4ms	remaining: 307ms
8:	learn: 0.3882923	total: 24ms	remaining: 304ms
9:	learn: 0.3815241	total: 26.3ms	remaining: 297ms
10:	learn: 0.3753615	total: 29ms	remaining: 295ms
11:	learn: 0.3698142	total: 31.2ms	remaining: 288ms
12:	learn: 0.3639048	total: 33.8ms	remaining: 286ms
13:	learn: 0.3590989	total: 36ms	remaining: 281ms
14:	learn: 0.3560052	total: 38.9ms	remaining: 280ms
15:	learn: 0.3498695	total: 41.5ms	remaining: 278ms
16:	learn: 0.3463332	total: 44.2ms	remaining: 276ms
17:	learn: 0.3424218	total: 46.4ms	remaining: 271ms
18:	learn: 0.3379429	

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


61:	learn: 0.2501573	total: 184ms	remaining: 181ms
62:	learn: 0.2492659	total: 188ms	remaining: 179ms
63:	learn: 0.2478895	total: 191ms	remaining: 176ms
64:	learn: 0.2466613	total: 194ms	remaining: 173ms
65:	learn: 0.2437204	total: 197ms	remaining: 170ms
66:	learn: 0.2430679	total: 200ms	remaining: 167ms
67:	learn: 0.2410769	total: 203ms	remaining: 164ms
68:	learn: 0.2397540	total: 205ms	remaining: 160ms
69:	learn: 0.2388597	total: 208ms	remaining: 157ms
70:	learn: 0.2382720	total: 213ms	remaining: 156ms
71:	learn: 0.2374196	total: 216ms	remaining: 153ms
72:	learn: 0.2360243	total: 219ms	remaining: 150ms
73:	learn: 0.2342552	total: 221ms	remaining: 147ms
74:	learn: 0.2327093	total: 224ms	remaining: 144ms
75:	learn: 0.2321368	total: 226ms	remaining: 140ms
76:	learn: 0.2304164	total: 229ms	remaining: 137ms
77:	learn: 0.2292114	total: 232ms	remaining: 134ms
78:	learn: 0.2280195	total: 235ms	remaining: 131ms
79:	learn: 0.2272666	total: 238ms	remaining: 128ms
80:	learn: 0.2266128	total: 241

[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:  2.7min finished


13:	learn: 0.3281848	total: 147ms	remaining: 1.25s
14:	learn: 0.3207002	total: 158ms	remaining: 1.24s
15:	learn: 0.3142275	total: 169ms	remaining: 1.23s
16:	learn: 0.3142273	total: 171ms	remaining: 1.17s
17:	learn: 0.3128055	total: 175ms	remaining: 1.12s
18:	learn: 0.3074242	total: 186ms	remaining: 1.11s
19:	learn: 0.3037868	total: 200ms	remaining: 1.13s
20:	learn: 0.2997685	total: 211ms	remaining: 1.12s
21:	learn: 0.2952705	total: 219ms	remaining: 1.1s
22:	learn: 0.2900600	total: 226ms	remaining: 1.08s
23:	learn: 0.2881504	total: 235ms	remaining: 1.06s
24:	learn: 0.2838217	total: 246ms	remaining: 1.06s
25:	learn: 0.2810075	total: 256ms	remaining: 1.05s
26:	learn: 0.2807741	total: 258ms	remaining: 1.01s
27:	learn: 0.2787427	total: 266ms	remaining: 999ms
28:	learn: 0.2765733	total: 277ms	remaining: 992ms
29:	learn: 0.2738045	total: 286ms	remaining: 982ms
30:	learn: 0.2705622	total: 297ms	remaining: 976ms
31:	learn: 0.2663676	total: 308ms	remaining: 972ms
32:	learn: 0.2660843	total: 311m

In [31]:
y_pred_4 = catboost_model_4.predict_proba(X_test_best_features_2)[:,1]
submission_cb_4 = pd.DataFrame(data={'Opportunity_ID':X_test_best_features_2['Opportunity_ID'], 'Target': y_pred_4})
submission_cb_4 = submission_cb_4.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_4.to_csv('../submits/cat_boost_2_best_features_with_ordinal_encoding.csv', index=False)