In [77]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from category_encoders import CatBoostEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn import model_selection
import scipy.stats as stats
from sklearn.feature_selection import SelectFromModel

In [52]:
train = pd.read_csv('../../Feature_Engineering/data/other-cleaned_train.csv')
test = pd.read_csv('../../Feature_Engineering/data/other-cleaned_test.csv')

In [53]:
train.drop(columns = ['Unnamed: 0'], inplace = True)
test.drop(columns = ['Unnamed: 0'], inplace = True)

In [54]:
X_train = train.copy()
X_test = test.copy()

In [55]:
categ_columns = train.drop(columns = ["Opportunity_ID","ID", "Pricing, Delivery_Terms_Quote_Appr",\
                                    "Bureaucratic_Code_0_Approval","Bureaucratic_Code_0_Approved",\
                                    "Submitted_for_Approval","ASP","ASP_(converted)","TRF","Total_Amount",\
                                    "Total_Taxable_Amount","diferencia_en_dias","Last_Modified_DOY","Last_Modified_Year",\
                                    "Opportunity_Created_DOY","Opportunity_Created_Year","Quote_Expiry_DOY",\
                                     "Quote_Expiry_Year","Planned_Delivery_Start_DOY","Planned_Delivery_Start_Year",\
                                    "Planned_Delivery_End_DOY","Planned_Delivery_End_Year",\
                                    "Target"]).columns
for column in categ_columns:
    encoder = CatBoostEncoder()
    encoder.fit(train[column], train['Target'])
    feature_encoded = encoder.transform(train[column])
    X_train = X_train.join(feature_encoded.add_suffix('_cat_boost'))
    X_train.drop(columns=[column], inplace = True)

In [56]:
categ_columns = test.drop(columns = ["Opportunity_ID","ID", "Pricing, Delivery_Terms_Quote_Appr",\
                                    "Bureaucratic_Code_0_Approval","Bureaucratic_Code_0_Approved",\
                                    "Submitted_for_Approval","ASP","ASP_(converted)","TRF","Total_Amount",\
                                    "Total_Taxable_Amount","diferencia_en_dias","Last_Modified_DOY","Last_Modified_Year",\
                                    "Opportunity_Created_DOY","Opportunity_Created_Year","Quote_Expiry_DOY",\
                                     "Quote_Expiry_Year","Planned_Delivery_Start_DOY","Planned_Delivery_Start_Year",\
                                    "Planned_Delivery_End_DOY","Planned_Delivery_End_Year"]).columns
for column in categ_columns:
    encoder = CatBoostEncoder()
    encoder.fit(train[column], train['Target'])
    feature_encoded = encoder.transform(test[column])
    X_test = X_test.join(feature_encoded.add_suffix('_cat_boost'))
    X_test.drop(columns=[column], inplace = True)

In [57]:
X_train["Total_Amount"] = pd.to_numeric(X_train["Total_Amount"],errors='coerce').fillna(X_train["Total_Amount"].mean())
X_train["Opportunity_Created_Year"] = pd.to_numeric(X_train["Opportunity_Created_Year"],errors='coerce').fillna(0)
X_train["Quote_Expiry_DOY"] = pd.to_numeric(X_train["Quote_Expiry_DOY"],errors='coerce').fillna(0)
X_train["Quote_Expiry_Year"] = pd.to_numeric(X_train["Quote_Expiry_Year"],errors='coerce').fillna(0)
X_train["Planned_Delivery_End_DOY"] = pd.to_numeric(X_train["Planned_Delivery_End_DOY"],errors='coerce').fillna(0)
X_train["Planned_Delivery_End_Year"] = pd.to_numeric(X_train["Planned_Delivery_End_Year"],errors='coerce').fillna(0)

X_train = X_train.drop(columns = 'Target')
X_train = X_train.drop(columns = 'ID')
X_train = X_train.drop(columns = 'Account_Name_cat_boost')

In [58]:
X_test["Total_Amount"] = pd.to_numeric(X_test["Total_Amount"],errors='coerce').fillna(test["Total_Amount"].mean())
X_test["Opportunity_Created_Year"] = pd.to_numeric(X_test["Opportunity_Created_Year"],errors='coerce').fillna(0)
X_test["Quote_Expiry_DOY"] = pd.to_numeric(X_test["Quote_Expiry_DOY"],errors='coerce').fillna(0)
X_test["Quote_Expiry_Year"] = pd.to_numeric(X_test["Quote_Expiry_Year"],errors='coerce').fillna(0)
X_test["Planned_Delivery_End_DOY"] = pd.to_numeric(X_test["Planned_Delivery_End_DOY"],errors='coerce').fillna(0)
X_test["Planned_Delivery_End_Year"] = pd.to_numeric(X_test["Planned_Delivery_End_Year"],errors='coerce').fillna(0)

X_test = X_test.drop(columns = 'ID')
X_test = X_test.drop(columns = 'Account_Name_cat_boost')

## Model: Catboost

In [59]:
def cross_val(model, x_train, y_train):
    score_cross_val = model_selection.cross_val_score(model, x_train, y_train, cv=5)
    print(score_cross_val.mean())

In [60]:
def catboost(x_train, y_train, x_validation, y_validation):
    catb_classifier = CatBoostClassifier()
    params_catb = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.05, 0.1], 'l2_leaf_reg': [0, 1, 5]}    
    catb_gs = GridSearchCV(catb_classifier, params_catb, cv=5)
    catb_gs.fit(x_train, y_train)
    catb_best = catb_gs.best_estimator_
    print(catb_gs.best_params_)
    print('catb: {}'.format(catb_best.score(x_validation, y_validation)))
    return catb_best

In [73]:
def catboost_2(x_train, y_train, x_validation, y_validation):
    params_catb_2={
        'n_estimators':stats.randint(10,150),'learning_rate':stats.uniform(0.01,0.3),
        'subsample':stats.uniform(0.3,0.7),'l2_leaf_reg':[1,5,10],
        'max_depth':[3,10,6],'colsample_bylevel':stats.uniform(0.,0.6)
    }
    
    catb_rs = RandomizedSearchCV(CatBoostClassifier(),
                          param_distributions=params_catb_2,
                          cv=3,
                          scoring='neg_log_loss',
                          verbose=1,
                          n_iter=150)
    
    catb_rs.fit(x_train, y_train)
    catb_best = catb_rs.best_estimator_
    print(catb_rs.best_params_)
    print('catb: {}'.format(catb_best.score(x_validation, y_validation)))
    return catb_best

In [62]:
def test_model(model, x_test, y_test):
    predictions = model.predict_proba(x_test)[:,1]
    logloss = log_loss(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions.round())
    print("Accuracy: %.2f%%, Logloss: %.2f" % (accuracy*100.0, logloss))

In [63]:
def best_features(model,train):
    importance = model.feature_importances_
    result = pd.DataFrame([train.columns,importance]).transpose()
    result.columns = ["Feature","Importance"]
    return result.sort_values(by='Importance', ascending=False).head(15)["Feature"]
    
    
def plot_features(model,train):
    fig = plt.gcf()
    fig.set_size_inches(350, 350)
    selection = SelectFromModel(model, threshold=0.040, prefit=True)
    selected_dataset = selection.transform(train)
    model.plot_importance(booster=model)

    plt.rcParams["figure.figsize"] = (40,20)
    plt.xlabel("\nFeature importance", fontsize=40)
    plt.ylabel("Features", fontsize=35)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.show()

### cat boost 1 with all features

In [64]:
y = train.Target
x_train, x_validation, y_train, y_validation = train_test_split(X_train, y, test_size=0.3, stratify=y)

In [65]:
catboost_model = catboost(x_train, y_train, x_validation, y_validation)
test_model(catboost_model,x_validation,y_validation)
cross_val(catboost_model, x_train, y_train)

0:	learn: 0.6835900	total: 7.51ms	remaining: 368ms
1:	learn: 0.6740854	total: 12.9ms	remaining: 309ms
2:	learn: 0.6641543	total: 19.7ms	remaining: 309ms
3:	learn: 0.6560257	total: 25.1ms	remaining: 289ms
4:	learn: 0.6494654	total: 32ms	remaining: 288ms
5:	learn: 0.6427978	total: 39.5ms	remaining: 289ms
6:	learn: 0.6348136	total: 44.9ms	remaining: 276ms
7:	learn: 0.6276159	total: 52.1ms	remaining: 273ms
8:	learn: 0.6205143	total: 57.2ms	remaining: 261ms
9:	learn: 0.6136747	total: 65.1ms	remaining: 260ms
10:	learn: 0.6081685	total: 72ms	remaining: 255ms
11:	learn: 0.6000896	total: 78.2ms	remaining: 247ms
12:	learn: 0.5934061	total: 85.3ms	remaining: 243ms
13:	learn: 0.5882681	total: 90.3ms	remaining: 232ms
14:	learn: 0.5804546	total: 97.8ms	remaining: 228ms
15:	learn: 0.5749885	total: 104ms	remaining: 221ms
16:	learn: 0.5695045	total: 110ms	remaining: 213ms
17:	learn: 0.5640736	total: 117ms	remaining: 207ms
18:	learn: 0.5586275	total: 121ms	remaining: 198ms
19:	learn: 0.5534333	total: 12

In [66]:
best_features = best_features(catboost_model,X_train)

In [67]:
y_pred = catboost_model.predict_proba(X_test)[:,1]
submission_cb = pd.DataFrame(data={'Opportunity_ID':X_test['Opportunity_ID'], 'Target': y_pred})
submission_cb = submission_cb.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb.to_csv('../submits/cat_boost_with_cat_boost_encoding.csv', index=False)

### cat boost 1 with best features

In [68]:
X_train_best_features = X_train.loc[:,best_features]
X_test_best_features = X_test.loc[:,best_features]

In [69]:
x_best_train, x_best_validation, y_best_train, y_best_validation = train_test_split(X_train_best_features, y, test_size=0.3, stratify=y)

In [70]:
catboost_model_2 = catboost(x_best_train, y_best_train, x_best_validation, y_best_validation)
test_model(catboost_model_2,x_best_validation,y_best_validation)
cross_val(catboost_model_2, x_best_train, y_best_train)

0:	learn: 0.6883497	total: 6ms	remaining: 294ms
1:	learn: 0.6832312	total: 12ms	remaining: 289ms
2:	learn: 0.6785870	total: 17.2ms	remaining: 269ms
3:	learn: 0.6738641	total: 22.5ms	remaining: 259ms
4:	learn: 0.6694285	total: 28.4ms	remaining: 255ms
5:	learn: 0.6649676	total: 34.2ms	remaining: 251ms
6:	learn: 0.6604248	total: 39.4ms	remaining: 242ms
7:	learn: 0.6562446	total: 44.9ms	remaining: 236ms
8:	learn: 0.6520812	total: 50.2ms	remaining: 229ms
9:	learn: 0.6475694	total: 55.4ms	remaining: 222ms
10:	learn: 0.6433853	total: 61.2ms	remaining: 217ms
11:	learn: 0.6392332	total: 67.6ms	remaining: 214ms
12:	learn: 0.6352140	total: 72.9ms	remaining: 208ms
13:	learn: 0.6311225	total: 78.3ms	remaining: 201ms
14:	learn: 0.6271044	total: 83.4ms	remaining: 195ms
15:	learn: 0.6231461	total: 89.2ms	remaining: 190ms
16:	learn: 0.6192793	total: 94.9ms	remaining: 184ms
17:	learn: 0.6154866	total: 100ms	remaining: 178ms
18:	learn: 0.6116898	total: 106ms	remaining: 173ms
19:	learn: 0.6081533	total: 1

In [71]:
y_pred_2 = catboost_model_2.predict_proba(X_test_best_features)[:,1]
submission_cb_2 = pd.DataFrame(data={'Opportunity_ID':X_test_best_features['Opportunity_ID'], 'Target': y_pred_2})
submission_cb_2 = submission_cb_2.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_2.to_csv('../submits/cat_boost_best_features_with_cat_boost_encoding.csv', index=False)

### cat boost 2 with all features

In [78]:
catboost_model_3 = catboost_2(x_train, y_train, x_validation, y_validation)
test_model(catboost_model_3,x_validation,y_validation)
cross_val(catboost_model_3, x_train, y_train)

Fitting 3 folds for each of 150 candidates, totalling 450 fits
0:	learn: 0.6881142	total: 1.77ms	remaining: 204ms
1:	learn: 0.6869910	total: 3.53ms	remaining: 201ms
2:	learn: 0.6860497	total: 5ms	remaining: 188ms
3:	learn: 0.6820812	total: 6.74ms	remaining: 189ms
4:	learn: 0.6814184	total: 8.28ms	remaining: 184ms
5:	learn: 0.6808629	total: 9.78ms	remaining: 179ms
6:	learn: 0.6798025	total: 11.4ms	remaining: 177ms
7:	learn: 0.6628516	total: 13.2ms	remaining: 178ms
8:	learn: 0.6623878	total: 14.8ms	remaining: 176ms
9:	learn: 0.6590498	total: 16.7ms	remaining: 177ms
10:	learn: 0.6588258	total: 18.3ms	remaining: 175ms
11:	learn: 0.6578262	total: 20.4ms	remaining: 177ms
12:	learn: 0.6560132	total: 21.9ms	remaining: 174ms
13:	learn: 0.6521335	total: 23.6ms	remaining: 172ms
14:	learn: 0.6494155	total: 25.2ms	remaining: 169ms
15:	learn: 0.6359345	total: 27ms	remaining: 169ms
16:	learn: 0.6285630	total: 28.8ms	remaining: 168ms
17:	learn: 0.6215434	total: 31.2ms	remaining: 170ms
18:	learn: 0.619

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


91:	learn: 0.4213443	total: 167ms	remaining: 43.7ms
92:	learn: 0.4213443	total: 169ms	remaining: 41.8ms
93:	learn: 0.4200166	total: 171ms	remaining: 40.1ms
94:	learn: 0.4200166	total: 173ms	remaining: 38.2ms
95:	learn: 0.4200166	total: 175ms	remaining: 36.4ms
96:	learn: 0.4198982	total: 177ms	remaining: 34.6ms
97:	learn: 0.4198982	total: 178ms	remaining: 32.8ms
98:	learn: 0.4188268	total: 181ms	remaining: 31.2ms
99:	learn: 0.4188268	total: 183ms	remaining: 29.4ms
100:	learn: 0.4180592	total: 186ms	remaining: 27.5ms
101:	learn: 0.4167005	total: 188ms	remaining: 25.8ms
102:	learn: 0.4167005	total: 190ms	remaining: 23.9ms
103:	learn: 0.4167005	total: 191ms	remaining: 22.1ms
104:	learn: 0.4088737	total: 194ms	remaining: 20.3ms
105:	learn: 0.4088736	total: 196ms	remaining: 18.5ms
106:	learn: 0.4059778	total: 198ms	remaining: 16.7ms
107:	learn: 0.4020485	total: 200ms	remaining: 14.8ms
108:	learn: 0.4020484	total: 202ms	remaining: 13ms
109:	learn: 0.4017507	total: 204ms	remaining: 11.1ms
110:

[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:  7.7min finished


0:	learn: 0.5232964	total: 74.6ms	remaining: 10.3s
1:	learn: 0.4320719	total: 147ms	remaining: 10.1s
2:	learn: 0.3787844	total: 239ms	remaining: 10.8s
3:	learn: 0.3471153	total: 328ms	remaining: 11.1s
4:	learn: 0.3203682	total: 406ms	remaining: 10.9s
5:	learn: 0.3058340	total: 482ms	remaining: 10.7s
6:	learn: 0.2926940	total: 568ms	remaining: 10.7s
7:	learn: 0.2777419	total: 646ms	remaining: 10.6s
8:	learn: 0.2693563	total: 721ms	remaining: 10.4s
9:	learn: 0.2608482	total: 798ms	remaining: 10.3s
10:	learn: 0.2551710	total: 883ms	remaining: 10.3s
11:	learn: 0.2488844	total: 955ms	remaining: 10.1s
12:	learn: 0.2417769	total: 1.02s	remaining: 9.92s
13:	learn: 0.2371842	total: 1.09s	remaining: 9.75s
14:	learn: 0.2327419	total: 1.16s	remaining: 9.61s
15:	learn: 0.2270092	total: 1.23s	remaining: 9.49s
16:	learn: 0.2201131	total: 1.31s	remaining: 9.39s
17:	learn: 0.2146056	total: 1.38s	remaining: 9.3s
18:	learn: 0.2123899	total: 1.45s	remaining: 9.17s
19:	learn: 0.2082056	total: 1.59s	remaini

In [79]:
best_features_2 = best_features(catboost_model_3,X_train)

TypeError: 'Series' object is not callable

In [80]:
y_pred_3 = catboost_model_3.predict_proba(X_test)[:,1]
submission_cb_3 = pd.DataFrame(data={'Opportunity_ID':X_test['Opportunity_ID'], 'Target': y_pred_3})
submission_cb_3 = submission_cb_3.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_3.to_csv('../submits/cat_boost_2_with_cat_boost_encoding.csv', index=False)

### cat boost 2 with best features

In [81]:
X_train_best_features_2 = X_train.loc[:,best_features_2]
X_test_best_features_2 = X_test.loc[:,best_features_2]

NameError: name 'best_features_2' is not defined

In [None]:
x_best_train_2, x_best_validation_2, y_best_train_2, y_best_validation_2 = train_test_split(X_train_best_features_2, y, test_size=0.3, stratify=y)

In [None]:
catboost_model_4 = catboost_2(x_best_train_2, y_best_train_2, x_best_validation_2, y_best_validation_2)
test_model(catboost_model_4,x_best_validation_2,y_best_validation_2)
cross_val(catboost_model_4, x_best_train_2, y_best_train_2)

In [None]:
y_pred_4 = catboost_model_4.predict_proba(X_test_best_features_2)[:,1]
submission_cb_4 = pd.DataFrame(data={'Opportunity_ID':X_test_best_features_2['Opportunity_ID'], 'Target': y_pred_4})
submission_cb_4 = submission_cb_4.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_4.to_csv('../submits/cat_boost_2_best_features_with_cat_boost_encoding.csv', index=False)