In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from category_encoders import SumEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn import model_selection
import scipy.stats as stats
from sklearn.feature_selection import SelectFromModel

In [2]:
train = pd.read_csv('../../Feature_Engineering/data/other-cleaned_train.csv')
test = pd.read_csv('../../Feature_Engineering/data/other-cleaned_test.csv')

In [3]:
train.drop(columns = ['Unnamed: 0'], inplace = True)
test.drop(columns = ['Unnamed: 0'], inplace = True)

In [4]:
X_train = train.copy()
X_test = test.copy()

In [5]:
categ_columns = train.drop(columns = ["Opportunity_ID","ID", "Pricing, Delivery_Terms_Quote_Appr",\
                                    "Bureaucratic_Code_0_Approval","Bureaucratic_Code_0_Approved",\
                                    "Submitted_for_Approval","ASP","ASP_(converted)","TRF","Total_Amount",\
                                    "Total_Taxable_Amount","diferencia_en_dias","Last_Modified_DOY","Last_Modified_Year",\
                                    "Opportunity_Created_DOY","Opportunity_Created_Year","Quote_Expiry_DOY",\
                                     "Quote_Expiry_Year","Planned_Delivery_Start_DOY","Planned_Delivery_Start_Year",\
                                    "Planned_Delivery_End_DOY","Planned_Delivery_End_Year",\
                                    "Target"]).columns
for column in categ_columns:
    encoder = SumEncoder()
    encoder.fit(train[column], train['Target'])
    feature_encoded = encoder.transform(train[column])
    X_train = X_train.join(feature_encoded.add_suffix('_sum'))
    X_train.drop(columns=[column], inplace = True)
    if 'intercept_sum' in X_train:
        X_train.drop(columns=['intercept_sum'], inplace = True)

In [6]:
categ_columns = test.drop(columns = ["Opportunity_ID","ID", "Pricing, Delivery_Terms_Quote_Appr",\
                                    "Bureaucratic_Code_0_Approval","Bureaucratic_Code_0_Approved",\
                                    "Submitted_for_Approval","ASP","ASP_(converted)","TRF","Total_Amount",\
                                    "Total_Taxable_Amount","diferencia_en_dias","Last_Modified_DOY","Last_Modified_Year",\
                                    "Opportunity_Created_DOY","Opportunity_Created_Year","Quote_Expiry_DOY",\
                                     "Quote_Expiry_Year","Planned_Delivery_Start_DOY","Planned_Delivery_Start_Year",\
                                    "Planned_Delivery_End_DOY","Planned_Delivery_End_Year"]).columns
for column in categ_columns:
    encoder = SumEncoder()
    encoder.fit(train[column], train['Target'])
    feature_encoded = encoder.transform(test[column])
    X_test = X_test.join(feature_encoded.add_suffix('_sum'))
    X_test.drop(columns=[column], inplace = True)
    if 'intercept_sum' in X_test:
        X_test.drop(columns=['intercept_sum'], inplace = True)

In [7]:
X_train["Total_Amount"] = pd.to_numeric(X_train["Total_Amount"],errors='coerce').fillna(X_train["Total_Amount"].mean())
X_train["Opportunity_Created_Year"] = pd.to_numeric(X_train["Opportunity_Created_Year"],errors='coerce').fillna(0)
X_train["Quote_Expiry_DOY"] = pd.to_numeric(X_train["Quote_Expiry_DOY"],errors='coerce').fillna(0)
X_train["Quote_Expiry_Year"] = pd.to_numeric(X_train["Quote_Expiry_Year"],errors='coerce').fillna(0)
X_train["Planned_Delivery_End_DOY"] = pd.to_numeric(X_train["Planned_Delivery_End_DOY"],errors='coerce').fillna(0)
X_train["Planned_Delivery_End_Year"] = pd.to_numeric(X_train["Planned_Delivery_End_Year"],errors='coerce').fillna(0)

X_train = X_train.drop(columns = 'Target')

In [8]:
X_test["Total_Amount"] = pd.to_numeric(X_test["Total_Amount"],errors='coerce').fillna(test["Total_Amount"].mean())
X_test["Opportunity_Created_Year"] = pd.to_numeric(X_test["Opportunity_Created_Year"],errors='coerce').fillna(0)
X_test["Quote_Expiry_DOY"] = pd.to_numeric(X_test["Quote_Expiry_DOY"],errors='coerce').fillna(0)
X_test["Quote_Expiry_Year"] = pd.to_numeric(X_test["Quote_Expiry_Year"],errors='coerce').fillna(0)
X_test["Planned_Delivery_End_DOY"] = pd.to_numeric(X_test["Planned_Delivery_End_DOY"],errors='coerce').fillna(0)
X_test["Planned_Delivery_End_Year"] = pd.to_numeric(X_test["Planned_Delivery_End_Year"],errors='coerce').fillna(0)


## Model: Catboost

In [9]:
def cross_val(model, x_train, y_train):
    score_cross_val = model_selection.cross_val_score(model, x_train, y_train, cv=5)
    print(score_cross_val.mean())

In [10]:
def catboost(x_train, y_train, x_validation, y_validation):
    catb_classifier = CatBoostClassifier()
    params_catb = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.05, 0.1], 'l2_leaf_reg': [0, 1, 5]}    
    catb_gs = GridSearchCV(catb_classifier, params_catb, cv=5)
    catb_gs.fit(x_train, y_train)
    catb_best = catb_gs.best_estimator_
    print(catb_gs.best_params_)
    print('catb: {}'.format(catb_best.score(x_validation, y_validation)))
    return catb_best

In [11]:
def catboost_2(x_train, y_train, x_validation, y_validation):
    params_catb_2={
        'n_estimators':stats.randint(10,150),'learning_rate':stats.uniform(0.01,0.3),
        'subsample':stats.uniform(0.3,0.7),'l2_leaf_reg':[1,5,10],
        'max_depth':[3,10,6],'colsample_bylevel':stats.uniform(0.,0.6)
    }
    
    catb_rs = RandomizedSearchCV(CatBoostClassifier(),
                          param_distributions=params_catb_2,
                          cv=3,
                          scoring='neg_log_loss',
                          verbose=1,
                          n_iter=150)
    
    catb_rs.fit(x_train, y_train)
    catb_best = catb_rs.best_estimator_
    print(catb_rs.best_params_)
    print('catb: {}'.format(catb_best.score(x_validation, y_validation)))
    return catb_best

In [12]:
def test_model(model, x_test, y_test):
    predictions = model.predict_proba(x_test)[:,1]
    logloss = log_loss(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions.round())
    print("Accuracy: %.2f%%, Logloss: %.2f" % (accuracy*100.0, logloss))

In [24]:
def best_features(model,train):
    importance = model.feature_importances_
    result = pd.DataFrame([train.columns,importance]).transpose()
    result.columns = ["Feature","Importance"]
    return result.sort_values(by='Importance', ascending=False).head(15)["Feature"].to_list()
    
    
def plot_features(model,train):
    fig = plt.gcf()
    fig.set_size_inches(350, 350)
    selection = SelectFromModel(model, threshold=0.040, prefit=True)
    selected_dataset = selection.transform(train)
    model.plot_importance(booster=model)

    plt.rcParams["figure.figsize"] = (40,20)
    plt.xlabel("\nFeature importance", fontsize=40)
    plt.ylabel("Features", fontsize=35)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.show()

### cat boost 1 with all features

In [14]:
y = train.Target
x_train, x_validation, y_train, y_validation = train_test_split(X_train, y, test_size=0.3, stratify=y)

In [15]:
catboost_model = catboost(x_train, y_train, x_validation, y_validation)
test_model(catboost_model,x_validation,y_validation)
cross_val(catboost_model, x_train, y_train)

0:	learn: 0.6844083	total: 138ms	remaining: 6.77s
1:	learn: 0.6766326	total: 237ms	remaining: 5.68s
2:	learn: 0.6675227	total: 329ms	remaining: 5.15s
3:	learn: 0.6604788	total: 406ms	remaining: 4.67s
4:	learn: 0.6538460	total: 459ms	remaining: 4.13s
5:	learn: 0.6482093	total: 516ms	remaining: 3.78s
6:	learn: 0.6429361	total: 575ms	remaining: 3.53s
7:	learn: 0.6365745	total: 654ms	remaining: 3.43s
8:	learn: 0.6315261	total: 724ms	remaining: 3.3s
9:	learn: 0.6273310	total: 803ms	remaining: 3.21s
10:	learn: 0.6223253	total: 861ms	remaining: 3.05s
11:	learn: 0.6177263	total: 927ms	remaining: 2.94s
12:	learn: 0.6123114	total: 1.03s	remaining: 2.93s
13:	learn: 0.6079278	total: 1.12s	remaining: 2.88s
14:	learn: 0.6039862	total: 1.2s	remaining: 2.81s
15:	learn: 0.5990291	total: 1.29s	remaining: 2.74s
16:	learn: 0.5940280	total: 1.38s	remaining: 2.69s
17:	learn: 0.5893332	total: 1.48s	remaining: 2.62s
18:	learn: 0.5838351	total: 1.57s	remaining: 2.56s
19:	learn: 0.5807314	total: 1.63s	remaining

In [16]:
best_features = best_features(catboost_model,X_train)
if "Opportunity_ID" not in best_features: 
    best_features.append("Opportunity_ID")

In [17]:
y_pred = catboost_model.predict_proba(X_test)[:,1]
submission_cb = pd.DataFrame(data={'Opportunity_ID':X_test['Opportunity_ID'], 'Target': y_pred})
submission_cb = submission_cb.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb.to_csv('../submits/cat_boost_with_sum_encoding.csv', index=False)

### cat boost 1 with best features

In [18]:
X_train_best_features = X_train.loc[:,best_features]
X_test_best_features = X_test.loc[:,best_features]

In [19]:
x_best_train, x_best_validation, y_best_train, y_best_validation = train_test_split(X_train_best_features, y, test_size=0.3, stratify=y)

In [20]:
catboost_model_2 = catboost(x_best_train, y_best_train, x_best_validation, y_best_validation)
test_model(catboost_model_2,x_best_validation,y_best_validation)
cross_val(catboost_model_2, x_best_train, y_best_train)

0:	learn: 0.6888063	total: 3.48ms	remaining: 170ms
1:	learn: 0.6846674	total: 6.27ms	remaining: 150ms
2:	learn: 0.6804920	total: 8.98ms	remaining: 141ms
3:	learn: 0.6762832	total: 11.5ms	remaining: 133ms
4:	learn: 0.6723366	total: 14.5ms	remaining: 130ms
5:	learn: 0.6681840	total: 17ms	remaining: 125ms
6:	learn: 0.6642245	total: 19.6ms	remaining: 120ms
7:	learn: 0.6605458	total: 22.1ms	remaining: 116ms
8:	learn: 0.6568534	total: 24.7ms	remaining: 113ms
9:	learn: 0.6533679	total: 28.3ms	remaining: 113ms
10:	learn: 0.6497974	total: 31.7ms	remaining: 112ms
11:	learn: 0.6463239	total: 35.5ms	remaining: 112ms
12:	learn: 0.6428790	total: 40.1ms	remaining: 114ms
13:	learn: 0.6396835	total: 46ms	remaining: 118ms
14:	learn: 0.6365156	total: 50.4ms	remaining: 118ms
15:	learn: 0.6330781	total: 55.9ms	remaining: 119ms
16:	learn: 0.6296829	total: 59ms	remaining: 115ms
17:	learn: 0.6263304	total: 61.6ms	remaining: 110ms
18:	learn: 0.6230545	total: 64.3ms	remaining: 105ms
19:	learn: 0.6200690	total: 

In [21]:
y_pred_2 = catboost_model_2.predict_proba(X_test_best_features)[:,1]
submission_cb_2 = pd.DataFrame(data={'Opportunity_ID':X_test_best_features['Opportunity_ID'], 'Target': y_pred_2})
submission_cb_2 = submission_cb_2.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_2.to_csv('../submits/cat_boost_best_features_with_sum_encoding.csv', index=False)

### cat boost 2 with all features

In [22]:
catboost_model_3 = catboost_2(x_train, y_train, x_validation, y_validation)
test_model(catboost_model_3,x_validation,y_validation)
cross_val(catboost_model_3, x_train, y_train)

Fitting 3 folds for each of 150 candidates, totalling 450 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 0.6102705	total: 106ms	remaining: 6.46s
1:	learn: 0.5409178	total: 302ms	remaining: 9.05s
2:	learn: 0.4934608	total: 500ms	remaining: 9.84s
3:	learn: 0.4606382	total: 685ms	remaining: 9.93s
4:	learn: 0.4347180	total: 878ms	remaining: 10s
5:	learn: 0.4125132	total: 1.1s	remaining: 10.3s
6:	learn: 0.3920672	total: 1.31s	remaining: 10.3s
7:	learn: 0.3827524	total: 1.5s	remaining: 10.2s
8:	learn: 0.3755541	total: 1.71s	remaining: 10.1s
9:	learn: 0.3699127	total: 1.91s	remaining: 9.95s
10:	learn: 0.3615984	total: 2.03s	remaining: 9.43s
11:	learn: 0.3562148	total: 2.15s	remaining: 8.97s
12:	learn: 0.3529321	total: 2.23s	remaining: 8.39s
13:	learn: 0.3516706	total: 2.28s	remaining: 7.81s
14:	learn: 0.3435781	total: 2.48s	remaining: 7.77s
15:	learn: 0.3425226	total: 2.51s	remaining: 7.21s
16:	learn: 0.3394332	total: 2.71s	remaining: 7.19s
17:	learn: 0.3353484	total: 2.92s	remaining: 7.13s
18:	learn: 0.3341253	total: 2.97s	remaining: 6.72s
19:	learn: 0.3321149	total: 3.19s	remaining: 

[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed: 60.6min finished


0:	learn: 0.5560385	total: 321ms	remaining: 45s
1:	learn: 0.4855844	total: 452ms	remaining: 31.4s
2:	learn: 0.4176947	total: 701ms	remaining: 32.3s
3:	learn: 0.3844627	total: 965ms	remaining: 33.1s
4:	learn: 0.3728425	total: 1.17s	remaining: 31.9s
5:	learn: 0.3482089	total: 1.39s	remaining: 31.2s
6:	learn: 0.3357681	total: 1.6s	remaining: 30.6s
7:	learn: 0.3286556	total: 1.78s	remaining: 29.6s
8:	learn: 0.3223527	total: 2s	remaining: 29.3s
9:	learn: 0.3150348	total: 2.21s	remaining: 28.9s
10:	learn: 0.3050140	total: 2.41s	remaining: 28.5s
11:	learn: 0.3018658	total: 2.64s	remaining: 28.3s
12:	learn: 0.2989014	total: 2.82s	remaining: 27.8s
13:	learn: 0.2979230	total: 3.04s	remaining: 27.6s
14:	learn: 0.2880604	total: 3.25s	remaining: 27.3s
15:	learn: 0.2873385	total: 3.31s	remaining: 25.9s
16:	learn: 0.2849505	total: 3.52s	remaining: 25.7s
17:	learn: 0.2838739	total: 3.61s	remaining: 24.7s
18:	learn: 0.2813085	total: 3.83s	remaining: 24.6s
19:	learn: 0.2804415	total: 4.02s	remaining: 24

In [25]:
best_features_2 = best_features(catboost_model_3,X_train)
if "Opportunity_ID" not in best_features_2: 
    best_features_2.append("Opportunity_ID")

In [26]:
y_pred_3 = catboost_model_3.predict_proba(X_test)[:,1]
submission_cb_3 = pd.DataFrame(data={'Opportunity_ID':X_test['Opportunity_ID'], 'Target': y_pred_3})
submission_cb_3 = submission_cb_3.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_3.to_csv('../submits/cat_boost_2_with_sum_encoding.csv', index=False)

### cat boost 2 with best features

In [27]:
X_train_best_features_2 = X_train.loc[:,best_features_2]
X_test_best_features_2 = X_test.loc[:,best_features_2]

In [28]:
x_best_train_2, x_best_validation_2, y_best_train_2, y_best_validation_2 = train_test_split(X_train_best_features_2, y, test_size=0.3, stratify=y)

In [29]:
catboost_model_4 = catboost_2(x_best_train_2, y_best_train_2, x_best_validation_2, y_best_validation_2)
test_model(catboost_model_4,x_best_validation_2,y_best_validation_2)
cross_val(catboost_model_4, x_best_train_2, y_best_train_2)

Fitting 3 folds for each of 150 candidates, totalling 450 fits
0:	learn: 0.6091690	total: 1.31ms	remaining: 74.6ms
1:	learn: 0.5550694	total: 2.61ms	remaining: 73.2ms
2:	learn: 0.5277594	total: 3.69ms	remaining: 67.6ms
3:	learn: 0.5036666	total: 4.91ms	remaining: 66.3ms
4:	learn: 0.4879779	total: 7.06ms	remaining: 74.8ms
5:	learn: 0.4740069	total: 8.31ms	remaining: 72ms
6:	learn: 0.4604844	total: 9.65ms	remaining: 70.3ms
7:	learn: 0.4519497	total: 10.9ms	remaining: 68.3ms
8:	learn: 0.4448100	total: 12.3ms	remaining: 66.9ms
9:	learn: 0.4387244	total: 13.7ms	remaining: 65.9ms
10:	learn: 0.4341280	total: 14.9ms	remaining: 63.6ms
11:	learn: 0.4290385	total: 16ms	remaining: 61.4ms
12:	learn: 0.4252309	total: 17.2ms	remaining: 59.5ms
13:	learn: 0.4222866	total: 18.4ms	remaining: 58ms
14:	learn: 0.4206814	total: 19.6ms	remaining: 56.2ms
15:	learn: 0.4191141	total: 20.7ms	remaining: 54.5ms
16:	learn: 0.4144563	total: 22.2ms	remaining: 53.4ms
17:	learn: 0.4126324	total: 23.4ms	remaining: 52.1ms

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 0.6074664	total: 2.8ms	remaining: 160ms
1:	learn: 0.5568002	total: 5.4ms	remaining: 151ms
2:	learn: 0.5201182	total: 8ms	remaining: 147ms
3:	learn: 0.5005114	total: 10.7ms	remaining: 144ms
4:	learn: 0.4850525	total: 13ms	remaining: 138ms
5:	learn: 0.4707892	total: 15.1ms	remaining: 131ms
6:	learn: 0.4593961	total: 17ms	remaining: 124ms
7:	learn: 0.4521140	total: 19.2ms	remaining: 120ms
8:	learn: 0.4433230	total: 21.4ms	remaining: 116ms
9:	learn: 0.4345981	total: 23.3ms	remaining: 112ms
10:	learn: 0.4282679	total: 25.4ms	remaining: 108ms
11:	learn: 0.4240196	total: 27.6ms	remaining: 106ms
12:	learn: 0.4192374	total: 29.9ms	remaining: 104ms
13:	learn: 0.4147470	total: 32.3ms	remaining: 102ms
14:	learn: 0.4115010	total: 34.2ms	remaining: 97.9ms
15:	learn: 0.4081199	total: 35.3ms	remaining: 92.7ms
16:	learn: 0.4066244	total: 36.5ms	remaining: 87.9ms
17:	learn: 0.4041643	total: 37.8ms	remaining: 84.1ms
18:	learn: 0.4019186	total: 39.3ms	remaining: 80.7ms
19:	learn: 0.3999312	total

[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:  3.5min finished


5:	learn: 0.3827696	total: 121ms	remaining: 2.52s
6:	learn: 0.3677570	total: 142ms	remaining: 2.51s
7:	learn: 0.3551258	total: 171ms	remaining: 2.63s
8:	learn: 0.3435271	total: 197ms	remaining: 2.67s
9:	learn: 0.3329009	total: 216ms	remaining: 2.61s
10:	learn: 0.3267503	total: 235ms	remaining: 2.56s
11:	learn: 0.3190283	total: 253ms	remaining: 2.5s
12:	learn: 0.3112067	total: 273ms	remaining: 2.48s
13:	learn: 0.3060153	total: 290ms	remaining: 2.42s
14:	learn: 0.2998218	total: 311ms	remaining: 2.41s
15:	learn: 0.2948441	total: 335ms	remaining: 2.41s
16:	learn: 0.2880705	total: 356ms	remaining: 2.39s
17:	learn: 0.2838716	total: 382ms	remaining: 2.4s
18:	learn: 0.2792261	total: 400ms	remaining: 2.36s
19:	learn: 0.2767102	total: 422ms	remaining: 2.34s
20:	learn: 0.2734899	total: 441ms	remaining: 2.31s
21:	learn: 0.2694221	total: 460ms	remaining: 2.28s
22:	learn: 0.2665486	total: 478ms	remaining: 2.24s
23:	learn: 0.2640616	total: 501ms	remaining: 2.23s
24:	learn: 0.2608591	total: 531ms	rema

In [30]:
y_pred_4 = catboost_model_4.predict_proba(X_test_best_features_2)[:,1]
submission_cb_4 = pd.DataFrame(data={'Opportunity_ID':X_test_best_features_2['Opportunity_ID'], 'Target': y_pred_4})
submission_cb_4 = submission_cb_4.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_4.to_csv('../submits/cat_boost_2_best_features_with_sum_encoding.csv', index=False)