In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from category_encoders import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn import model_selection
import scipy.stats as stats
from sklearn.feature_selection import SelectFromModel

In [2]:
train = pd.read_csv('../../Feature_Engineering/data/other-cleaned_train.csv')
test = pd.read_csv('../../Feature_Engineering/data/other-cleaned_test.csv')

In [3]:
train.drop(columns = ['Unnamed: 0'], inplace = True)
test.drop(columns = ['Unnamed: 0'], inplace = True)

In [4]:
X_train = train.copy()
X_test = test.copy()

In [5]:
categ_columns = train.drop(columns = ["Opportunity_ID","ID", "Pricing, Delivery_Terms_Quote_Appr",\
                                    "Bureaucratic_Code_0_Approval","Bureaucratic_Code_0_Approved",\
                                    "Submitted_for_Approval","ASP","ASP_(converted)","TRF","Total_Amount",\
                                    "Total_Taxable_Amount","diferencia_en_dias","Last_Modified_DOY","Last_Modified_Year",\
                                    "Opportunity_Created_DOY","Opportunity_Created_Year","Quote_Expiry_DOY",\
                                     "Quote_Expiry_Year","Planned_Delivery_Start_DOY","Planned_Delivery_Start_Year",\
                                    "Planned_Delivery_End_DOY","Planned_Delivery_End_Year",\
                                    "Target"]).columns
for column in categ_columns:
    encoder = OneHotEncoder()
    encoder.fit(train[column], train['Target'])
    feature_encoded = encoder.transform(train[column])
    X_train = X_train.join(feature_encoded.add_suffix('_one_hot_enc'))
    X_train.drop(columns=[column], inplace = True)

In [6]:
categ_columns = test.drop(columns = ["Opportunity_ID","ID", "Pricing, Delivery_Terms_Quote_Appr",\
                                    "Bureaucratic_Code_0_Approval","Bureaucratic_Code_0_Approved",\
                                    "Submitted_for_Approval","ASP","ASP_(converted)","TRF","Total_Amount",\
                                    "Total_Taxable_Amount","diferencia_en_dias","Last_Modified_DOY","Last_Modified_Year",\
                                    "Opportunity_Created_DOY","Opportunity_Created_Year","Quote_Expiry_DOY",\
                                     "Quote_Expiry_Year","Planned_Delivery_Start_DOY","Planned_Delivery_Start_Year",\
                                    "Planned_Delivery_End_DOY","Planned_Delivery_End_Year"]).columns
for column in categ_columns:
    encoder = OneHotEncoder()
    encoder.fit(train[column], train['Target'])
    feature_encoded = encoder.transform(test[column])
    X_test = X_test.join(feature_encoded.add_suffix('_one_hot_enc'))
    X_test.drop(columns=[column], inplace = True)

In [7]:
X_train["Total_Amount"] = pd.to_numeric(X_train["Total_Amount"],errors='coerce').fillna(X_train["Total_Amount"].mean())
X_train["Opportunity_Created_Year"] = pd.to_numeric(X_train["Opportunity_Created_Year"],errors='coerce').fillna(0)
X_train["Quote_Expiry_DOY"] = pd.to_numeric(X_train["Quote_Expiry_DOY"],errors='coerce').fillna(0)
X_train["Quote_Expiry_Year"] = pd.to_numeric(X_train["Quote_Expiry_Year"],errors='coerce').fillna(0)
X_train["Planned_Delivery_End_DOY"] = pd.to_numeric(X_train["Planned_Delivery_End_DOY"],errors='coerce').fillna(0)
X_train["Planned_Delivery_End_Year"] = pd.to_numeric(X_train["Planned_Delivery_End_Year"],errors='coerce').fillna(0)

X_train = X_train.drop(columns = 'Target')
X_train = X_train.drop(columns = 'ID')

In [8]:
X_test["Total_Amount"] = pd.to_numeric(X_test["Total_Amount"],errors='coerce').fillna(test["Total_Amount"].mean())
X_test["Opportunity_Created_Year"] = pd.to_numeric(X_test["Opportunity_Created_Year"],errors='coerce').fillna(0)
X_test["Quote_Expiry_DOY"] = pd.to_numeric(X_test["Quote_Expiry_DOY"],errors='coerce').fillna(0)
X_test["Quote_Expiry_Year"] = pd.to_numeric(X_test["Quote_Expiry_Year"],errors='coerce').fillna(0)
X_test["Planned_Delivery_End_DOY"] = pd.to_numeric(X_test["Planned_Delivery_End_DOY"],errors='coerce').fillna(0)
X_test["Planned_Delivery_End_Year"] = pd.to_numeric(X_test["Planned_Delivery_End_Year"],errors='coerce').fillna(0)

X_test = X_test.drop(columns = 'ID')

## Model: Catboost

In [9]:
def cross_val(model, x_train, y_train):
    score_cross_val = model_selection.cross_val_score(model, x_train, y_train, cv=5)
    print(score_cross_val.mean())

In [10]:
def catboost(x_train, y_train, x_validation, y_validation):
    catb_classifier = CatBoostClassifier()
    params_catb = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.05, 0.1], 'l2_leaf_reg': [0, 1, 5]}    
    catb_gs = GridSearchCV(catb_classifier, params_catb, cv=5)
    catb_gs.fit(x_train, y_train)
    catb_best = catb_gs.best_estimator_
    print(catb_gs.best_params_)
    print('catb: {}'.format(catb_best.score(x_validation, y_validation)))
    return catb_best

In [11]:
def catboost_2(x_train, y_train, x_validation, y_validation):
    params_catb_2={
        'n_estimators':stats.randint(10,150),'learning_rate':stats.uniform(0.01,0.3),
        'subsample':stats.uniform(0.3,0.7),'l2_leaf_reg':[1,5,10],
        'max_depth':[3,10,6],'colsample_bylevel':stats.uniform(0.,0.6)
    }
    
    catb_rs = RandomizedSearchCV(CatBoostClassifier(),
                          param_distributions=params_catb_2,
                          cv=3,
                          scoring='neg_log_loss',
                          verbose=1,
                          n_iter=150)
    
    catb_rs.fit(x_train, y_train)
    catb_best = catb_rs.best_estimator_
    print(catb_rs.best_params_)
    print('catb: {}'.format(catb_best.score(x_validation, y_validation)))
    return catb_best

In [12]:
def test_model(model, x_test, y_test):
    predictions = model.predict_proba(x_test)[:,1]
    logloss = log_loss(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions.round())
    print("Accuracy: %.2f%%, Logloss: %.2f" % (accuracy*100.0, logloss))

In [24]:
def best_features(model,train):
    importance = model.feature_importances_
    result = pd.DataFrame([train.columns,importance]).transpose()
    result.columns = ["Feature","Importance"]
    return result.sort_values(by='Importance', ascending=False).head(15)["Feature"].to_list()
    
    
def plot_features(model,train):
    fig = plt.gcf()
    fig.set_size_inches(350, 350)
    selection = SelectFromModel(model, threshold=0.040, prefit=True)
    selected_dataset = selection.transform(train)
    model.plot_importance(booster=model)

    plt.rcParams["figure.figsize"] = (40,20)
    plt.xlabel("\nFeature importance", fontsize=40)
    plt.ylabel("Features", fontsize=35)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.show()

### cat boost 1 with all features

In [14]:
y = train.Target
x_train, x_validation, y_train, y_validation = train_test_split(X_train, y, test_size=0.3, stratify=y)

In [15]:
catboost_model = catboost(x_train, y_train, x_validation, y_validation)
test_model(catboost_model,x_validation,y_validation)
cross_val(catboost_model, x_train, y_train)

0:	learn: 0.6831233	total: 78.5ms	remaining: 3.84s
1:	learn: 0.6737516	total: 91.1ms	remaining: 2.19s
2:	learn: 0.6644179	total: 102ms	remaining: 1.59s
3:	learn: 0.6566583	total: 112ms	remaining: 1.29s
4:	learn: 0.6487465	total: 123ms	remaining: 1.1s
5:	learn: 0.6407520	total: 133ms	remaining: 976ms
6:	learn: 0.6342014	total: 143ms	remaining: 880ms
7:	learn: 0.6267903	total: 154ms	remaining: 807ms
8:	learn: 0.6187562	total: 164ms	remaining: 747ms
9:	learn: 0.6132939	total: 174ms	remaining: 696ms
10:	learn: 0.6061748	total: 185ms	remaining: 654ms
11:	learn: 0.5996078	total: 195ms	remaining: 617ms
12:	learn: 0.5942756	total: 205ms	remaining: 583ms
13:	learn: 0.5879288	total: 215ms	remaining: 553ms
14:	learn: 0.5815053	total: 226ms	remaining: 526ms
15:	learn: 0.5768629	total: 236ms	remaining: 502ms
16:	learn: 0.5727226	total: 247ms	remaining: 479ms
17:	learn: 0.5681711	total: 257ms	remaining: 457ms
18:	learn: 0.5643089	total: 268ms	remaining: 437ms
19:	learn: 0.5593937	total: 279ms	remain

In [16]:
best_features = best_features(catboost_model,X_train)
if "Opportunity_ID" not in best_features: 
    best_features.append("Opportunity_ID")

In [17]:
y_pred = catboost_model.predict_proba(X_test)[:,1]
submission_cb = pd.DataFrame(data={'Opportunity_ID':X_test['Opportunity_ID'], 'Target': y_pred})
submission_cb = submission_cb.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb.to_csv('../submits/cat_boost_with_one_hot_encoding.csv', index=False)

### cat boost 1 with best features

In [18]:
X_train_best_features = X_train.loc[:,best_features]
X_test_best_features = X_test.loc[:,best_features]

In [19]:
x_best_train, x_best_validation, y_best_train, y_best_validation = train_test_split(X_train_best_features, y, test_size=0.3, stratify=y)

In [20]:
catboost_model_2 = catboost(x_best_train, y_best_train, x_best_validation, y_best_validation)
test_model(catboost_model_2,x_best_validation,y_best_validation)
cross_val(catboost_model_2, x_best_train, y_best_train)

0:	learn: 0.6888676	total: 3.02ms	remaining: 148ms
1:	learn: 0.6845837	total: 8.43ms	remaining: 202ms
2:	learn: 0.6804423	total: 12.6ms	remaining: 198ms
3:	learn: 0.6766813	total: 14.8ms	remaining: 170ms
4:	learn: 0.6728592	total: 16.8ms	remaining: 151ms
5:	learn: 0.6688966	total: 21.4ms	remaining: 157ms
6:	learn: 0.6650375	total: 23.5ms	remaining: 145ms
7:	learn: 0.6609702	total: 25.5ms	remaining: 134ms
8:	learn: 0.6571941	total: 27.5ms	remaining: 125ms
9:	learn: 0.6534984	total: 29.5ms	remaining: 118ms
10:	learn: 0.6496260	total: 31.5ms	remaining: 112ms
11:	learn: 0.6460873	total: 34.2ms	remaining: 108ms
12:	learn: 0.6425949	total: 36.5ms	remaining: 104ms
13:	learn: 0.6392524	total: 38.5ms	remaining: 99ms
14:	learn: 0.6356401	total: 40.5ms	remaining: 94.5ms
15:	learn: 0.6323915	total: 42.5ms	remaining: 90.3ms
16:	learn: 0.6292332	total: 44.5ms	remaining: 86.3ms
17:	learn: 0.6261520	total: 46.4ms	remaining: 82.5ms
18:	learn: 0.6229967	total: 48.8ms	remaining: 79.6ms
19:	learn: 0.61994

In [21]:
y_pred_2 = catboost_model_2.predict_proba(X_test_best_features)[:,1]
submission_cb_2 = pd.DataFrame(data={'Opportunity_ID':X_test_best_features['Opportunity_ID'], 'Target': y_pred_2})
submission_cb_2 = submission_cb_2.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_2.to_csv('../submits/cat_boost_best_features_with_one_hot_encoding.csv', index=False)

### cat boost 2 with all features

In [22]:
catboost_model_3 = catboost_2(x_train, y_train, x_validation, y_validation)
test_model(catboost_model_3,x_validation,y_validation)
cross_val(catboost_model_3, x_train, y_train)

Fitting 3 folds for each of 150 candidates, totalling 450 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 0.5555670	total: 65ms	remaining: 1.69s
1:	learn: 0.4882602	total: 131ms	remaining: 1.63s
2:	learn: 0.4397157	total: 202ms	remaining: 1.62s
3:	learn: 0.4073356	total: 269ms	remaining: 1.54s
4:	learn: 0.3763308	total: 335ms	remaining: 1.47s
5:	learn: 0.3624688	total: 402ms	remaining: 1.41s
6:	learn: 0.3448784	total: 471ms	remaining: 1.35s
7:	learn: 0.3337850	total: 539ms	remaining: 1.28s
8:	learn: 0.3227261	total: 601ms	remaining: 1.2s
9:	learn: 0.3116787	total: 663ms	remaining: 1.13s
10:	learn: 0.2999319	total: 724ms	remaining: 1.05s
11:	learn: 0.2968260	total: 785ms	remaining: 981ms
12:	learn: 0.2906158	total: 845ms	remaining: 910ms
13:	learn: 0.2882462	total: 904ms	remaining: 840ms
14:	learn: 0.2818519	total: 965ms	remaining: 772ms
15:	learn: 0.2750828	total: 1.03s	remaining: 706ms
16:	learn: 0.2707089	total: 1.09s	remaining: 640ms
17:	learn: 0.2662819	total: 1.15s	remaining: 575ms
18:	learn: 0.2620291	total: 1.21s	remaining: 511ms
19:	learn: 0.2581099	total: 1.27s	remaining

[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed: 20.5min finished


0:	learn: 0.5077817	total: 72.9ms	remaining: 8.67s
1:	learn: 0.4281297	total: 147ms	remaining: 8.7s
2:	learn: 0.3807655	total: 224ms	remaining: 8.75s
3:	learn: 0.3510734	total: 306ms	remaining: 8.88s
4:	learn: 0.3273607	total: 385ms	remaining: 8.85s
5:	learn: 0.3164756	total: 460ms	remaining: 8.74s
6:	learn: 0.3069138	total: 536ms	remaining: 8.66s
7:	learn: 0.2999780	total: 610ms	remaining: 8.54s
8:	learn: 0.2922007	total: 686ms	remaining: 8.46s
9:	learn: 0.2849560	total: 760ms	remaining: 8.37s
10:	learn: 0.2808098	total: 834ms	remaining: 8.26s
11:	learn: 0.2725583	total: 908ms	remaining: 8.17s
12:	learn: 0.2650757	total: 981ms	remaining: 8.08s
13:	learn: 0.2618568	total: 1.06s	remaining: 8s
14:	learn: 0.2591253	total: 1.13s	remaining: 7.93s
15:	learn: 0.2567899	total: 1.21s	remaining: 7.85s
16:	learn: 0.2529111	total: 1.28s	remaining: 7.77s
17:	learn: 0.2499330	total: 1.36s	remaining: 7.7s
18:	learn: 0.2446310	total: 1.44s	remaining: 7.66s
19:	learn: 0.2423059	total: 1.52s	remaining: 

In [25]:
best_features_2 = best_features(catboost_model_3,X_train)

In [26]:
y_pred_3 = catboost_model_3.predict_proba(X_test)[:,1]
submission_cb_3 = pd.DataFrame(data={'Opportunity_ID':X_test['Opportunity_ID'], 'Target': y_pred_3})
submission_cb_3 = submission_cb_3.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_3.to_csv('../submits/cat_boost_2_with_one_hot_encoding.csv', index=False)

### cat boost 2 with best features

In [27]:
X_train_best_features_2 = X_train.loc[:,best_features_2]
X_test_best_features_2 = X_test.loc[:,best_features_2]

In [28]:
x_best_train_2, x_best_validation_2, y_best_train_2, y_best_validation_2 = train_test_split(X_train_best_features_2, y, test_size=0.3, stratify=y)

In [29]:
catboost_model_4 = catboost_2(x_best_train_2, y_best_train_2, x_best_validation_2, y_best_validation_2)
test_model(catboost_model_4,x_best_validation_2,y_best_validation_2)
cross_val(catboost_model_4, x_best_train_2, y_best_train_2)

Fitting 3 folds for each of 150 candidates, totalling 450 fits
0:	learn: 0.6846146	total: 1.58ms	remaining: 226ms
1:	learn: 0.6712387	total: 3.14ms	remaining: 223ms
2:	learn: 0.6568389	total: 7.77ms	remaining: 365ms
3:	learn: 0.6428940	total: 10.7ms	remaining: 373ms
4:	learn: 0.6313690	total: 12.9ms	remaining: 357ms
5:	learn: 0.6199013	total: 18.2ms	remaining: 418ms
6:	learn: 0.6076558	total: 20.6ms	remaining: 403ms
7:	learn: 0.5969093	total: 22.1ms	remaining: 376ms
8:	learn: 0.5940355	total: 23ms	remaining: 345ms
9:	learn: 0.5894363	total: 24.3ms	remaining: 325ms
10:	learn: 0.5852485	total: 25.1ms	remaining: 304ms
11:	learn: 0.5778068	total: 26.5ms	remaining: 291ms
12:	learn: 0.5716532	total: 27.8ms	remaining: 280ms
13:	learn: 0.5714618	total: 28.5ms	remaining: 265ms
14:	learn: 0.5712882	total: 29.3ms	remaining: 252ms
15:	learn: 0.5636180	total: 30.2ms	remaining: 242ms
16:	learn: 0.5581135	total: 31.2ms	remaining: 233ms
17:	learn: 0.5563542	total: 32.1ms	remaining: 225ms
18:	learn: 0.

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


85:	learn: 0.4045495	total: 175ms	remaining: 118ms
86:	learn: 0.4035970	total: 178ms	remaining: 116ms
87:	learn: 0.4025543	total: 180ms	remaining: 115ms
88:	learn: 0.4017322	total: 183ms	remaining: 113ms
89:	learn: 0.3999692	total: 191ms	remaining: 115ms
90:	learn: 0.3988422	total: 197ms	remaining: 115ms
91:	learn: 0.3974157	total: 204ms	remaining: 116ms
92:	learn: 0.3963507	total: 209ms	remaining: 114ms
93:	learn: 0.3962721	total: 210ms	remaining: 112ms
94:	learn: 0.3951503	total: 214ms	remaining: 110ms
95:	learn: 0.3943395	total: 216ms	remaining: 108ms
96:	learn: 0.3928564	total: 219ms	remaining: 106ms
97:	learn: 0.3925787	total: 220ms	remaining: 103ms
98:	learn: 0.3915622	total: 226ms	remaining: 103ms
99:	learn: 0.3911715	total: 228ms	remaining: 100ms
100:	learn: 0.3911692	total: 228ms	remaining: 97.2ms
101:	learn: 0.3900711	total: 231ms	remaining: 95.1ms
102:	learn: 0.3900100	total: 232ms	remaining: 92.4ms
103:	learn: 0.3900055	total: 233ms	remaining: 89.6ms
104:	learn: 0.3897510	t

[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:  2.4min finished


10:	learn: 0.3408587	total: 132ms	remaining: 1.27s
11:	learn: 0.3345987	total: 145ms	remaining: 1.27s
12:	learn: 0.3296916	total: 164ms	remaining: 1.31s
13:	learn: 0.3213907	total: 180ms	remaining: 1.33s
14:	learn: 0.3164612	total: 196ms	remaining: 1.33s
15:	learn: 0.3127851	total: 209ms	remaining: 1.32s
16:	learn: 0.3087384	total: 223ms	remaining: 1.31s
17:	learn: 0.3040286	total: 235ms	remaining: 1.29s
18:	learn: 0.2983207	total: 248ms	remaining: 1.28s
19:	learn: 0.2955009	total: 262ms	remaining: 1.27s
20:	learn: 0.2909212	total: 276ms	remaining: 1.26s
21:	learn: 0.2874903	total: 287ms	remaining: 1.24s
22:	learn: 0.2845223	total: 297ms	remaining: 1.21s
23:	learn: 0.2810643	total: 309ms	remaining: 1.2s
24:	learn: 0.2776489	total: 323ms	remaining: 1.19s
25:	learn: 0.2748038	total: 337ms	remaining: 1.18s
26:	learn: 0.2724471	total: 352ms	remaining: 1.17s
27:	learn: 0.2704626	total: 364ms	remaining: 1.16s
28:	learn: 0.2677048	total: 377ms	remaining: 1.14s
29:	learn: 0.2658257	total: 388m

In [30]:
y_pred_4 = catboost_model_4.predict_proba(X_test_best_features_2)[:,1]
submission_cb_4 = pd.DataFrame(data={'Opportunity_ID':X_test_best_features_2['Opportunity_ID'], 'Target': y_pred_4})
submission_cb_4 = submission_cb_4.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_4.to_csv('../submits/cat_boost_2_best_features_with_one_hot_encoding.csv', index=False)