In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from category_encoders import WOEEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn import model_selection
import scipy.stats as stats
from sklearn.feature_selection import SelectFromModel

In [2]:
train = pd.read_csv('../../Feature_Engineering/data/other-cleaned_train.csv')
test = pd.read_csv('../../Feature_Engineering/data/other-cleaned_test.csv')

In [3]:
train.drop(columns = ['Unnamed: 0'], inplace = True)
test.drop(columns = ['Unnamed: 0'], inplace = True)

In [4]:
X_train = train.copy()
X_test = test.copy()

In [5]:
categ_columns = train.drop(columns = ["Opportunity_ID","ID", "Pricing, Delivery_Terms_Quote_Appr",\
                                    "Bureaucratic_Code_0_Approval","Bureaucratic_Code_0_Approved",\
                                    "Submitted_for_Approval","ASP","ASP_(converted)","TRF","Total_Amount",\
                                    "Total_Taxable_Amount","diferencia_en_dias","Last_Modified_DOY","Last_Modified_Year",\
                                    "Opportunity_Created_DOY","Opportunity_Created_Year","Quote_Expiry_DOY",\
                                     "Quote_Expiry_Year","Planned_Delivery_Start_DOY","Planned_Delivery_Start_Year",\
                                    "Planned_Delivery_End_DOY","Planned_Delivery_End_Year",\
                                    "Target"]).columns
for column in categ_columns:
    encoder =  WOEEncoder()
    encoder.fit(train[column], train['Target'])
    feature_encoded = encoder.transform(train[column])
    X_train = X_train.join(feature_encoded.add_suffix('_woe'))
    X_train.drop(columns=[column], inplace = True)
    if 'intercept_woe' in X_train:
        X_train.drop(columns=['intercept_woe'], inplace = True)

In [6]:
categ_columns = test.drop(columns = ["Opportunity_ID","ID", "Pricing, Delivery_Terms_Quote_Appr",\
                                    "Bureaucratic_Code_0_Approval","Bureaucratic_Code_0_Approved",\
                                    "Submitted_for_Approval","ASP","ASP_(converted)","TRF","Total_Amount",\
                                    "Total_Taxable_Amount","diferencia_en_dias","Last_Modified_DOY","Last_Modified_Year",\
                                    "Opportunity_Created_DOY","Opportunity_Created_Year","Quote_Expiry_DOY",\
                                     "Quote_Expiry_Year","Planned_Delivery_Start_DOY","Planned_Delivery_Start_Year",\
                                    "Planned_Delivery_End_DOY","Planned_Delivery_End_Year"]).columns
for column in categ_columns:
    encoder =  WOEEncoder()
    encoder.fit(train[column], train['Target'])
    feature_encoded = encoder.transform(test[column])
    X_test = X_test.join(feature_encoded.add_suffix('_woe'))
    X_test.drop(columns=[column], inplace = True)
    if 'intercept_woe' in X_test:
        X_test.drop(columns=['intercept_woe'], inplace = True)

In [7]:
X_train["Total_Amount"] = pd.to_numeric(X_train["Total_Amount"],errors='coerce').fillna(X_train["Total_Amount"].mean())
X_train["Opportunity_Created_Year"] = pd.to_numeric(X_train["Opportunity_Created_Year"],errors='coerce').fillna(0)
X_train["Quote_Expiry_DOY"] = pd.to_numeric(X_train["Quote_Expiry_DOY"],errors='coerce').fillna(0)
X_train["Quote_Expiry_Year"] = pd.to_numeric(X_train["Quote_Expiry_Year"],errors='coerce').fillna(0)
X_train["Planned_Delivery_End_DOY"] = pd.to_numeric(X_train["Planned_Delivery_End_DOY"],errors='coerce').fillna(0)
X_train["Planned_Delivery_End_Year"] = pd.to_numeric(X_train["Planned_Delivery_End_Year"],errors='coerce').fillna(0)

X_train = X_train.drop(columns = 'Target')

In [8]:
X_test["Total_Amount"] = pd.to_numeric(X_test["Total_Amount"],errors='coerce').fillna(test["Total_Amount"].mean())
X_test["Opportunity_Created_Year"] = pd.to_numeric(X_test["Opportunity_Created_Year"],errors='coerce').fillna(0)
X_test["Quote_Expiry_DOY"] = pd.to_numeric(X_test["Quote_Expiry_DOY"],errors='coerce').fillna(0)
X_test["Quote_Expiry_Year"] = pd.to_numeric(X_test["Quote_Expiry_Year"],errors='coerce').fillna(0)
X_test["Planned_Delivery_End_DOY"] = pd.to_numeric(X_test["Planned_Delivery_End_DOY"],errors='coerce').fillna(0)
X_test["Planned_Delivery_End_Year"] = pd.to_numeric(X_test["Planned_Delivery_End_Year"],errors='coerce').fillna(0)


## Model: Catboost

In [9]:
def cross_val(model, x_train, y_train):
    score_cross_val = model_selection.cross_val_score(model, x_train, y_train, cv=5)
    print(score_cross_val.mean())

In [10]:
def catboost(x_train, y_train, x_validation, y_validation):
    catb_classifier = CatBoostClassifier()
    params_catb = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.05, 0.1], 'l2_leaf_reg': [0, 1, 5]}    
    catb_gs = GridSearchCV(catb_classifier, params_catb, cv=5)
    catb_gs.fit(x_train, y_train)
    catb_best = catb_gs.best_estimator_
    print(catb_gs.best_params_)
    print('catb: {}'.format(catb_best.score(x_validation, y_validation)))
    return catb_best

In [11]:
def catboost_2(x_train, y_train, x_validation, y_validation):
    params_catb_2={
        'n_estimators':stats.randint(10,150),'learning_rate':stats.uniform(0.01,0.3),
        'subsample':stats.uniform(0.3,0.7),'l2_leaf_reg':[1,5,10],
        'max_depth':[3,10,6],'colsample_bylevel':stats.uniform(0.,0.6)
    }
    
    catb_rs = RandomizedSearchCV(CatBoostClassifier(),
                          param_distributions=params_catb_2,
                          cv=3,
                          scoring='neg_log_loss',
                          verbose=1,
                          n_iter=150)
    
    catb_rs.fit(x_train, y_train)
    catb_best = catb_rs.best_estimator_
    print(catb_rs.best_params_)
    print('catb: {}'.format(catb_best.score(x_validation, y_validation)))
    return catb_best

In [12]:
def test_model(model, x_test, y_test):
    predictions = model.predict_proba(x_test)[:,1]
    logloss = log_loss(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions.round())
    print("Accuracy: %.2f%%, Logloss: %.2f" % (accuracy*100.0, logloss))

In [13]:
def best_features(model,train):
    importance = model.feature_importances_
    result = pd.DataFrame([train.columns,importance]).transpose()
    result.columns = ["Feature","Importance"]
    return result.sort_values(by='Importance', ascending=False).head(15)["Feature"].to_list()
    
    
def plot_features(model,train):
    fig = plt.gcf()
    fig.set_size_inches(350, 350)
    selection = SelectFromModel(model, threshold=0.040, prefit=True)
    selected_dataset = selection.transform(train)
    model.plot_importance(booster=model)

    plt.rcParams["figure.figsize"] = (40,20)
    plt.xlabel("\nFeature importance", fontsize=40)
    plt.ylabel("Features", fontsize=35)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.show()

### cat boost 1 with all features

In [14]:
y = train.Target
x_train, x_validation, y_train, y_validation = train_test_split(X_train, y, test_size=0.3, stratify=y)

In [15]:
catboost_model = catboost(x_train, y_train, x_validation, y_validation)
test_model(catboost_model,x_validation,y_validation)
cross_val(catboost_model, x_train, y_train)

0:	learn: 0.6851092	total: 66.2ms	remaining: 3.25s
1:	learn: 0.6774467	total: 73.3ms	remaining: 1.76s
2:	learn: 0.6672952	total: 80.5ms	remaining: 1.26s
3:	learn: 0.6565720	total: 85.9ms	remaining: 988ms
4:	learn: 0.6465936	total: 90.1ms	remaining: 811ms
5:	learn: 0.6388086	total: 94.2ms	remaining: 691ms
6:	learn: 0.6308221	total: 98.4ms	remaining: 604ms
7:	learn: 0.6226522	total: 103ms	remaining: 538ms
8:	learn: 0.6131316	total: 107ms	remaining: 488ms
9:	learn: 0.6044859	total: 111ms	remaining: 444ms
10:	learn: 0.5957024	total: 115ms	remaining: 409ms
11:	learn: 0.5886379	total: 120ms	remaining: 379ms
12:	learn: 0.5818776	total: 124ms	remaining: 353ms
13:	learn: 0.5746330	total: 128ms	remaining: 329ms
14:	learn: 0.5669464	total: 132ms	remaining: 309ms
15:	learn: 0.5619990	total: 137ms	remaining: 291ms
16:	learn: 0.5558349	total: 141ms	remaining: 274ms
17:	learn: 0.5496684	total: 145ms	remaining: 259ms
18:	learn: 0.5427904	total: 150ms	remaining: 244ms
19:	learn: 0.5368168	total: 154ms	

In [16]:
best_features = best_features(catboost_model,X_train)
if "Opportunity_ID" not in best_features: 
    best_features.append("Opportunity_ID")

In [17]:
y_pred = catboost_model.predict_proba(X_test)[:,1]
submission_cb = pd.DataFrame(data={'Opportunity_ID':X_test['Opportunity_ID'], 'Target': y_pred})
submission_cb = submission_cb.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb.to_csv('../submits/cat_boost_with_woe_encoding.csv', index=False)

### cat boost 1 with best features

In [18]:
X_train_best_features = X_train.loc[:,best_features]
X_test_best_features = X_test.loc[:,best_features]

In [19]:
x_best_train, x_best_validation, y_best_train, y_best_validation = train_test_split(X_train_best_features, y, test_size=0.3, stratify=y)

In [20]:
catboost_model_2 = catboost(x_best_train, y_best_train, x_best_validation, y_best_validation)
test_model(catboost_model_2,x_best_validation,y_best_validation)
cross_val(catboost_model_2, x_best_train, y_best_train)

0:	learn: 0.6876140	total: 3ms	remaining: 147ms
1:	learn: 0.6823320	total: 5.42ms	remaining: 130ms
2:	learn: 0.6773410	total: 7.69ms	remaining: 120ms
3:	learn: 0.6722991	total: 10ms	remaining: 115ms
4:	learn: 0.6674489	total: 12.2ms	remaining: 110ms
5:	learn: 0.6626022	total: 14.5ms	remaining: 107ms
6:	learn: 0.6578540	total: 17ms	remaining: 104ms
7:	learn: 0.6532402	total: 21.3ms	remaining: 112ms
8:	learn: 0.6485579	total: 25.1ms	remaining: 114ms
9:	learn: 0.6441190	total: 28.5ms	remaining: 114ms
10:	learn: 0.6397156	total: 32.2ms	remaining: 114ms
11:	learn: 0.6353298	total: 34.6ms	remaining: 109ms
12:	learn: 0.6309758	total: 36.8ms	remaining: 105ms
13:	learn: 0.6266698	total: 39.1ms	remaining: 101ms
14:	learn: 0.6226828	total: 41.4ms	remaining: 96.5ms
15:	learn: 0.6185686	total: 43.6ms	remaining: 92.7ms
16:	learn: 0.6144085	total: 46.1ms	remaining: 89.5ms
17:	learn: 0.6106456	total: 48.4ms	remaining: 86ms
18:	learn: 0.6070543	total: 50.6ms	remaining: 82.6ms
19:	learn: 0.6032689	total

In [21]:
y_pred_2 = catboost_model_2.predict_proba(X_test_best_features)[:,1]
submission_cb_2 = pd.DataFrame(data={'Opportunity_ID':X_test_best_features['Opportunity_ID'], 'Target': y_pred_2})
submission_cb_2 = submission_cb_2.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_2.to_csv('../submits/cat_boost_best_features_with_woe_encoding.csv', index=False)

### cat boost 2 with all features

In [22]:
catboost_model_3 = catboost_2(x_train, y_train, x_validation, y_validation)
test_model(catboost_model_3,x_validation,y_validation)
cross_val(catboost_model_3, x_train, y_train)

Fitting 3 folds for each of 150 candidates, totalling 450 fits
0:	learn: 0.4894535	total: 4.86ms	remaining: 102ms
1:	learn: 0.4213546	total: 9.44ms	remaining: 94.4ms
2:	learn: 0.3661586	total: 15.5ms	remaining: 98.5ms
3:	learn: 0.3417814	total: 19.7ms	remaining: 88.8ms
4:	learn: 0.3207957	total: 25.7ms	remaining: 87.4ms
5:	learn: 0.3102712	total: 29.9ms	remaining: 79.8ms
6:	learn: 0.3005796	total: 35.8ms	remaining: 76.8ms
7:	learn: 0.2928281	total: 39.8ms	remaining: 69.7ms
8:	learn: 0.2835555	total: 45.7ms	remaining: 66ms
9:	learn: 0.2796522	total: 49.5ms	remaining: 59.4ms
10:	learn: 0.2749184	total: 55.2ms	remaining: 55.2ms
11:	learn: 0.2714233	total: 58.9ms	remaining: 49.1ms
12:	learn: 0.2699647	total: 64.7ms	remaining: 44.8ms
13:	learn: 0.2671808	total: 68.4ms	remaining: 39.1ms
14:	learn: 0.2639267	total: 74.1ms	remaining: 34.6ms
15:	learn: 0.2581230	total: 78.4ms	remaining: 29.4ms
16:	learn: 0.2533012	total: 84.3ms	remaining: 24.8ms
17:	learn: 0.2493748	total: 88.1ms	remaining: 19.

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


8:	learn: 0.2898984	total: 49.3ms	remaining: 71.3ms
9:	learn: 0.2866599	total: 54.5ms	remaining: 65.4ms
10:	learn: 0.2796103	total: 60.3ms	remaining: 60.3ms
11:	learn: 0.2754280	total: 64.2ms	remaining: 53.5ms
12:	learn: 0.2736117	total: 70.8ms	remaining: 49ms
13:	learn: 0.2698020	total: 74.6ms	remaining: 42.6ms
14:	learn: 0.2662724	total: 79.9ms	remaining: 37.3ms
15:	learn: 0.2626323	total: 83.6ms	remaining: 31.3ms
16:	learn: 0.2587041	total: 89ms	remaining: 26.2ms
17:	learn: 0.2541097	total: 92.7ms	remaining: 20.6ms
18:	learn: 0.2516662	total: 98ms	remaining: 15.5ms
19:	learn: 0.2484293	total: 102ms	remaining: 10.2ms
20:	learn: 0.2461643	total: 107ms	remaining: 5.1ms
21:	learn: 0.2437448	total: 111ms	remaining: 0us
0:	learn: 0.4861133	total: 4.62ms	remaining: 97.1ms
1:	learn: 0.4103042	total: 8.59ms	remaining: 85.9ms
2:	learn: 0.3649535	total: 14.1ms	remaining: 89.2ms
3:	learn: 0.3394493	total: 19.1ms	remaining: 85.8ms
4:	learn: 0.3231667	total: 24.8ms	remaining: 84.3ms
5:	learn: 0.3

[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:  5.3min finished


0:	learn: 0.5585681	total: 48.9ms	remaining: 6.46s
1:	learn: 0.4715585	total: 98.2ms	remaining: 6.43s
2:	learn: 0.4129620	total: 144ms	remaining: 6.22s
3:	learn: 0.3801054	total: 183ms	remaining: 5.89s
4:	learn: 0.3493396	total: 225ms	remaining: 5.76s
5:	learn: 0.3270925	total: 270ms	remaining: 5.71s
6:	learn: 0.3108437	total: 316ms	remaining: 5.69s
7:	learn: 0.2974972	total: 360ms	remaining: 5.62s
8:	learn: 0.2859662	total: 402ms	remaining: 5.54s
9:	learn: 0.2778971	total: 449ms	remaining: 5.53s
10:	learn: 0.2723616	total: 492ms	remaining: 5.46s
11:	learn: 0.2674266	total: 536ms	remaining: 5.41s
12:	learn: 0.2603871	total: 583ms	remaining: 5.38s
13:	learn: 0.2539795	total: 628ms	remaining: 5.33s
14:	learn: 0.2492039	total: 670ms	remaining: 5.27s
15:	learn: 0.2392092	total: 718ms	remaining: 5.25s
16:	learn: 0.2344192	total: 764ms	remaining: 5.21s
17:	learn: 0.2309795	total: 809ms	remaining: 5.17s
18:	learn: 0.2276465	total: 854ms	remaining: 5.13s
19:	learn: 0.2258195	total: 896ms	remai

In [23]:
best_features_2 = best_features(catboost_model_3,X_train)
if "Opportunity_ID" not in best_features_2: 
    best_features_2.append("Opportunity_ID")

TypeError: 'list' object is not callable

In [None]:
y_pred_3 = catboost_model_3.predict_proba(X_test)[:,1]
submission_cb_3 = pd.DataFrame(data={'Opportunity_ID':X_test['Opportunity_ID'], 'Target': y_pred_3})
submission_cb_3 = submission_cb_3.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_3.to_csv('../submits/cat_boost_2_with_woe_encoding.csv', index=False)

### cat boost 2 with best features

In [None]:
X_train_best_features_2 = X_train.loc[:,best_features_2]
X_test_best_features_2 = X_test.loc[:,best_features_2]

In [None]:
x_best_train_2, x_best_validation_2, y_best_train_2, y_best_validation_2 = train_test_split(X_train_best_features_2, y, test_size=0.3, stratify=y)

In [None]:
catboost_model_4 = catboost_2(x_best_train_2, y_best_train_2, x_best_validation_2, y_best_validation_2)
test_model(catboost_model_4,x_best_validation_2,y_best_validation_2)
cross_val(catboost_model_4, x_best_train_2, y_best_train_2)

In [None]:
y_pred_4 = catboost_model_4.predict_proba(X_test_best_features_2)[:,1]
submission_cb_4 = pd.DataFrame(data={'Opportunity_ID':X_test_best_features_2['Opportunity_ID'], 'Target': y_pred_4})
submission_cb_4 = submission_cb_4.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_4.to_csv('../submits/cat_boost_2_best_features_with_woe_encoding.csv', index=False)