In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from category_encoders import PolynomialEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn import model_selection
import scipy.stats as stats
from sklearn.feature_selection import SelectFromModel

In [2]:
train = pd.read_csv('../../Feature_Engineering/data/other-cleaned_train.csv')
test = pd.read_csv('../../Feature_Engineering/data/other-cleaned_test.csv')

In [3]:
train.drop(columns = ['Unnamed: 0'], inplace = True)
test.drop(columns = ['Unnamed: 0'], inplace = True)

In [4]:
X_train = train.copy()
X_test = test.copy()

In [5]:
categ_columns = train.drop(columns = ["Opportunity_ID","ID", "Pricing, Delivery_Terms_Quote_Appr",\
                                    "Bureaucratic_Code_0_Approval","Bureaucratic_Code_0_Approved",\
                                    "Submitted_for_Approval","ASP","ASP_(converted)","TRF","Total_Amount",\
                                    "Total_Taxable_Amount","diferencia_en_dias","Last_Modified_DOY","Last_Modified_Year",\
                                    "Opportunity_Created_DOY","Opportunity_Created_Year","Quote_Expiry_DOY",\
                                     "Quote_Expiry_Year","Planned_Delivery_Start_DOY","Planned_Delivery_Start_Year",\
                                    "Planned_Delivery_End_DOY","Planned_Delivery_End_Year",\
                                    "Target"]).columns
for column in categ_columns:
    encoder = PolynomialEncoder()
    encoder.fit(train[column], train['Target'])
    feature_encoded = encoder.transform(train[column])
    X_train = X_train.join(feature_encoded.add_suffix('_poly'))
    X_train.drop(columns=[column], inplace = True)
    if 'intercept_poly' in X_train:
        X_train.drop(columns=['intercept_poly'], inplace = True)

  raw_poly = scores.reshape((-1, 1)) ** np.arange(n).reshape((1, -1))


In [6]:
categ_columns = test.drop(columns = ["Opportunity_ID","ID", "Pricing, Delivery_Terms_Quote_Appr",\
                                    "Bureaucratic_Code_0_Approval","Bureaucratic_Code_0_Approved",\
                                    "Submitted_for_Approval","ASP","ASP_(converted)","TRF","Total_Amount",\
                                    "Total_Taxable_Amount","diferencia_en_dias","Last_Modified_DOY","Last_Modified_Year",\
                                    "Opportunity_Created_DOY","Opportunity_Created_Year","Quote_Expiry_DOY",\
                                     "Quote_Expiry_Year","Planned_Delivery_Start_DOY","Planned_Delivery_Start_Year",\
                                    "Planned_Delivery_End_DOY","Planned_Delivery_End_Year"]).columns
for column in categ_columns:
    encoder = PolynomialEncoder()
    encoder.fit(train[column], train['Target'])
    feature_encoded = encoder.transform(test[column])
    X_test = X_test.join(feature_encoded.add_suffix('_poly'))
    X_test.drop(columns=[column], inplace = True)
    if 'intercept_poly' in X_test:
        X_test.drop(columns=['intercept_poly'], inplace = True)

In [7]:
X_train["Total_Amount"] = pd.to_numeric(X_train["Total_Amount"],errors='coerce').fillna(X_train["Total_Amount"].mean())
X_train["Opportunity_Created_Year"] = pd.to_numeric(X_train["Opportunity_Created_Year"],errors='coerce').fillna(0)
X_train["Quote_Expiry_DOY"] = pd.to_numeric(X_train["Quote_Expiry_DOY"],errors='coerce').fillna(0)
X_train["Quote_Expiry_Year"] = pd.to_numeric(X_train["Quote_Expiry_Year"],errors='coerce').fillna(0)
X_train["Planned_Delivery_End_DOY"] = pd.to_numeric(X_train["Planned_Delivery_End_DOY"],errors='coerce').fillna(0)
X_train["Planned_Delivery_End_Year"] = pd.to_numeric(X_train["Planned_Delivery_End_Year"],errors='coerce').fillna(0)

X_train = X_train.drop(columns = 'Target')

In [8]:
X_test["Total_Amount"] = pd.to_numeric(X_test["Total_Amount"],errors='coerce').fillna(test["Total_Amount"].mean())
X_test["Opportunity_Created_Year"] = pd.to_numeric(X_test["Opportunity_Created_Year"],errors='coerce').fillna(0)
X_test["Quote_Expiry_DOY"] = pd.to_numeric(X_test["Quote_Expiry_DOY"],errors='coerce').fillna(0)
X_test["Quote_Expiry_Year"] = pd.to_numeric(X_test["Quote_Expiry_Year"],errors='coerce').fillna(0)
X_test["Planned_Delivery_End_DOY"] = pd.to_numeric(X_test["Planned_Delivery_End_DOY"],errors='coerce').fillna(0)
X_test["Planned_Delivery_End_Year"] = pd.to_numeric(X_test["Planned_Delivery_End_Year"],errors='coerce').fillna(0)


## Model: Catboost

In [9]:
def cross_val(model, x_train, y_train):
    score_cross_val = model_selection.cross_val_score(model, x_train, y_train, cv=5)
    print(score_cross_val.mean())

In [10]:
def catboost(x_train, y_train, x_validation, y_validation):
    catb_classifier = CatBoostClassifier()
    params_catb = {'n_estimators': [50, 100, 150], 'learning_rate': [0.01, 0.05, 0.1], 'l2_leaf_reg': [0, 1, 5]}    
    catb_gs = GridSearchCV(catb_classifier, params_catb, cv=5)
    catb_gs.fit(x_train, y_train)
    catb_best = catb_gs.best_estimator_
    print(catb_gs.best_params_)
    print('catb: {}'.format(catb_best.score(x_validation, y_validation)))
    return catb_best

In [11]:
def catboost_2(x_train, y_train, x_validation, y_validation):
    params_catb_2={
        'n_estimators':stats.randint(10,150),'learning_rate':stats.uniform(0.01,0.3),
        'subsample':stats.uniform(0.3,0.7),'l2_leaf_reg':[1,5,10],
        'max_depth':[3,10,6],'colsample_bylevel':stats.uniform(0.,0.6)
    }
    
    catb_rs = RandomizedSearchCV(CatBoostClassifier(),
                          param_distributions=params_catb_2,
                          cv=3,
                          scoring='neg_log_loss',
                          verbose=1,
                          n_iter=150)
    
    catb_rs.fit(x_train, y_train)
    catb_best = catb_rs.best_estimator_
    print(catb_rs.best_params_)
    print('catb: {}'.format(catb_best.score(x_validation, y_validation)))
    return catb_best

In [12]:
def test_model(model, x_test, y_test):
    predictions = model.predict_proba(x_test)[:,1]
    logloss = log_loss(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions.round())
    print("Accuracy: %.2f%%, Logloss: %.2f" % (accuracy*100.0, logloss))

In [24]:
def best_features(model,train):
    importance = model.feature_importances_
    result = pd.DataFrame([train.columns,importance]).transpose()
    result.columns = ["Feature","Importance"]
    return result.sort_values(by='Importance', ascending=False).head(15)["Feature"].to_list()
    
    
def plot_features(model,train):
    fig = plt.gcf()
    fig.set_size_inches(350, 350)
    selection = SelectFromModel(model, threshold=0.040, prefit=True)
    selected_dataset = selection.transform(train)
    model.plot_importance(booster=model)

    plt.rcParams["figure.figsize"] = (40,20)
    plt.xlabel("\nFeature importance", fontsize=40)
    plt.ylabel("Features", fontsize=35)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.show()

### cat boost 1 with all features

In [14]:
y = train.Target
x_train, x_validation, y_train, y_validation = train_test_split(X_train, y, test_size=0.3, stratify=y)

In [15]:
catboost_model = catboost(x_train, y_train, x_validation, y_validation)
test_model(catboost_model,x_validation,y_validation)
cross_val(catboost_model, x_train, y_train)

0:	learn: 0.6845075	total: 92.7ms	remaining: 4.54s
1:	learn: 0.6756349	total: 115ms	remaining: 2.75s
2:	learn: 0.6679605	total: 138ms	remaining: 2.16s
3:	learn: 0.6611921	total: 190ms	remaining: 2.19s
4:	learn: 0.6546643	total: 213ms	remaining: 1.91s
5:	learn: 0.6485334	total: 235ms	remaining: 1.72s
6:	learn: 0.6414941	total: 256ms	remaining: 1.57s
7:	learn: 0.6338227	total: 278ms	remaining: 1.46s
8:	learn: 0.6262705	total: 300ms	remaining: 1.37s
9:	learn: 0.6212198	total: 327ms	remaining: 1.31s
10:	learn: 0.6157354	total: 351ms	remaining: 1.25s
11:	learn: 0.6105596	total: 374ms	remaining: 1.18s
12:	learn: 0.6057962	total: 395ms	remaining: 1.12s
13:	learn: 0.6000902	total: 415ms	remaining: 1.07s
14:	learn: 0.5941738	total: 435ms	remaining: 1.01s
15:	learn: 0.5902279	total: 455ms	remaining: 968ms
16:	learn: 0.5862793	total: 476ms	remaining: 924ms
17:	learn: 0.5811760	total: 496ms	remaining: 881ms
18:	learn: 0.5765478	total: 522ms	remaining: 851ms
19:	learn: 0.5713241	total: 542ms	remain

In [16]:
best_features = best_features(catboost_model,X_train)
if "Opportunity_ID" not in best_features: 
    best_features.append("Opportunity_ID")

In [17]:
y_pred = catboost_model.predict_proba(X_test)[:,1]
submission_cb = pd.DataFrame(data={'Opportunity_ID':X_test['Opportunity_ID'], 'Target': y_pred})
submission_cb = submission_cb.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb.to_csv('../submits/cat_boost_with_polynomial_encoding.csv', index=False)

### cat boost 1 with best features

In [18]:
X_train_best_features = X_train.loc[:,best_features]
X_test_best_features = X_test.loc[:,best_features]

In [19]:
x_best_train, x_best_validation, y_best_train, y_best_validation = train_test_split(X_train_best_features, y, test_size=0.3, stratify=y)

In [20]:
catboost_model_2 = catboost(x_best_train, y_best_train, x_best_validation, y_best_validation)
test_model(catboost_model_2,x_best_validation,y_best_validation)
cross_val(catboost_model_2, x_best_train, y_best_train)

0:	learn: 0.6887659	total: 4.34ms	remaining: 213ms
1:	learn: 0.6845109	total: 10.1ms	remaining: 242ms
2:	learn: 0.6806354	total: 15.7ms	remaining: 246ms
3:	learn: 0.6767179	total: 20.3ms	remaining: 233ms
4:	learn: 0.6728539	total: 26.4ms	remaining: 238ms
5:	learn: 0.6692416	total: 33.1ms	remaining: 243ms
6:	learn: 0.6653864	total: 40ms	remaining: 246ms
7:	learn: 0.6616520	total: 46.9ms	remaining: 246ms
8:	learn: 0.6580549	total: 50ms	remaining: 228ms
9:	learn: 0.6543571	total: 55.4ms	remaining: 221ms
10:	learn: 0.6508512	total: 58.5ms	remaining: 208ms
11:	learn: 0.6473481	total: 62.7ms	remaining: 198ms
12:	learn: 0.6438347	total: 65.5ms	remaining: 186ms
13:	learn: 0.6403514	total: 72.4ms	remaining: 186ms
14:	learn: 0.6370209	total: 76.3ms	remaining: 178ms
15:	learn: 0.6337027	total: 79.3ms	remaining: 168ms
16:	learn: 0.6304337	total: 81.9ms	remaining: 159ms
17:	learn: 0.6273417	total: 87.7ms	remaining: 156ms
18:	learn: 0.6242342	total: 94.6ms	remaining: 154ms
19:	learn: 0.6212929	total

In [21]:
y_pred_2 = catboost_model_2.predict_proba(X_test_best_features)[:,1]
submission_cb_2 = pd.DataFrame(data={'Opportunity_ID':X_test_best_features['Opportunity_ID'], 'Target': y_pred_2})
submission_cb_2 = submission_cb_2.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_2.to_csv('../submits/cat_boost_best_features_with_polynomial_encoding.csv', index=False)

### cat boost 2 with all features

In [22]:
catboost_model_3 = catboost_2(x_train, y_train, x_validation, y_validation)
test_model(catboost_model_3,x_validation,y_validation)
cross_val(catboost_model_3, x_train, y_train)

Fitting 3 folds for each of 150 candidates, totalling 450 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 0.6524435	total: 10.3ms	remaining: 708ms
1:	learn: 0.6067316	total: 21.6ms	remaining: 735ms
2:	learn: 0.5703726	total: 31.1ms	remaining: 695ms
3:	learn: 0.5444478	total: 40.3ms	remaining: 665ms
4:	learn: 0.5257627	total: 49.1ms	remaining: 638ms
5:	learn: 0.5075906	total: 58.5ms	remaining: 623ms
6:	learn: 0.4904024	total: 68.2ms	remaining: 614ms
7:	learn: 0.4811871	total: 76.6ms	remaining: 593ms
8:	learn: 0.4661193	total: 85.9ms	remaining: 582ms
9:	learn: 0.4561125	total: 94.4ms	remaining: 567ms
10:	learn: 0.4464642	total: 105ms	remaining: 561ms
11:	learn: 0.4426434	total: 113ms	remaining: 546ms
12:	learn: 0.4384112	total: 123ms	remaining: 539ms
13:	learn: 0.4333518	total: 131ms	remaining: 524ms
14:	learn: 0.4264996	total: 140ms	remaining: 514ms
15:	learn: 0.4213400	total: 149ms	remaining: 503ms
16:	learn: 0.4172691	total: 158ms	remaining: 492ms
17:	learn: 0.4143212	total: 166ms	remaining: 481ms
18:	learn: 0.4096292	total: 175ms	remaining: 470ms
19:	learn: 0.4050409	total: 184

[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed: 44.9min finished


0:	learn: 0.5525003	total: 144ms	remaining: 21s
1:	learn: 0.4758084	total: 286ms	remaining: 20.7s
2:	learn: 0.4216575	total: 439ms	remaining: 21s
3:	learn: 0.3939733	total: 599ms	remaining: 21.4s
4:	learn: 0.3724223	total: 740ms	remaining: 21s
5:	learn: 0.3521637	total: 887ms	remaining: 20.8s
6:	learn: 0.3373353	total: 1.04s	remaining: 20.8s
7:	learn: 0.3265763	total: 1.19s	remaining: 20.8s
8:	learn: 0.3094039	total: 1.35s	remaining: 20.7s
9:	learn: 0.3045751	total: 1.5s	remaining: 20.5s
10:	learn: 0.2982878	total: 1.65s	remaining: 20.4s
11:	learn: 0.2892605	total: 1.81s	remaining: 20.3s
12:	learn: 0.2874944	total: 1.83s	remaining: 18.8s
13:	learn: 0.2823518	total: 1.97s	remaining: 18.7s
14:	learn: 0.2789970	total: 2.11s	remaining: 18.6s
15:	learn: 0.2744234	total: 2.25s	remaining: 18.5s
16:	learn: 0.2698603	total: 2.39s	remaining: 18.3s
17:	learn: 0.2657693	total: 2.54s	remaining: 18.2s
18:	learn: 0.2584321	total: 2.68s	remaining: 18.1s
19:	learn: 0.2538372	total: 2.83s	remaining: 18s

In [25]:
best_features_2 = best_features(catboost_model_3,X_train)
if "Opportunity_ID" not in best_features_2: 
    best_features_2.append("Opportunity_ID")

In [26]:
y_pred_3 = catboost_model_3.predict_proba(X_test)[:,1]
submission_cb_3 = pd.DataFrame(data={'Opportunity_ID':X_test['Opportunity_ID'], 'Target': y_pred_3})
submission_cb_3 = submission_cb_3.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_3.to_csv('../submits/cat_boost_2_with_polynomial_encoding.csv', index=False)

### cat boost 2 with best features

In [27]:
X_train_best_features_2 = X_train.loc[:,best_features_2]
X_test_best_features_2 = X_test.loc[:,best_features_2]

In [28]:
x_best_train_2, x_best_validation_2, y_best_train_2, y_best_validation_2 = train_test_split(X_train_best_features_2, y, test_size=0.3, stratify=y)

In [30]:
catboost_model_4 = catboost_2(x_best_train_2, y_best_train_2, x_best_validation_2, y_best_validation_2)
test_model(catboost_model_4,x_best_validation_2,y_best_validation_2)
cross_val(catboost_model_4, x_best_train_2, y_best_train_2)

Fitting 3 folds for each of 150 candidates, totalling 450 fits
0:	learn: 0.6891730	total: 1.13ms	remaining: 64.2ms
1:	learn: 0.6872929	total: 2.24ms	remaining: 62.8ms
2:	learn: 0.6860531	total: 3.1ms	remaining: 56.8ms
3:	learn: 0.6852352	total: 3.97ms	remaining: 53.6ms
4:	learn: 0.6846955	total: 4.86ms	remaining: 51.5ms
5:	learn: 0.6843395	total: 5.73ms	remaining: 49.7ms
6:	learn: 0.6841047	total: 6.65ms	remaining: 48.5ms
7:	learn: 0.6839499	total: 7.55ms	remaining: 47.2ms
8:	learn: 0.6838478	total: 8.44ms	remaining: 45.9ms
9:	learn: 0.6837805	total: 9.33ms	remaining: 44.8ms
10:	learn: 0.6592946	total: 11.4ms	remaining: 48.7ms
11:	learn: 0.6592638	total: 12.4ms	remaining: 47.7ms
12:	learn: 0.6592435	total: 14ms	remaining: 48.6ms
13:	learn: 0.6335891	total: 15.1ms	remaining: 47.5ms
14:	learn: 0.6335790	total: 16.1ms	remaining: 46.1ms
15:	learn: 0.6335724	total: 17.1ms	remaining: 44.9ms
16:	learn: 0.6335680	total: 18.2ms	remaining: 43.8ms
17:	learn: 0.6335651	total: 19.1ms	remaining: 42.

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


17:	learn: 0.6320916	total: 19.5ms	remaining: 43.3ms
18:	learn: 0.6320897	total: 20.6ms	remaining: 42.3ms
19:	learn: 0.6320885	total: 22.1ms	remaining: 41.9ms
20:	learn: 0.6320876	total: 23.3ms	remaining: 41.1ms
21:	learn: 0.6320871	total: 24.4ms	remaining: 39.9ms
22:	learn: 0.6320867	total: 25.8ms	remaining: 39.2ms
23:	learn: 0.6320865	total: 26.8ms	remaining: 37.9ms
24:	learn: 0.6320863	total: 27.8ms	remaining: 36.7ms
25:	learn: 0.6320862	total: 29.1ms	remaining: 35.8ms
26:	learn: 0.6320862	total: 30ms	remaining: 34.4ms
27:	learn: 0.6320861	total: 30.9ms	remaining: 33.1ms
28:	learn: 0.6320861	total: 31.7ms	remaining: 31.7ms
29:	learn: 0.6320861	total: 32.9ms	remaining: 30.7ms
30:	learn: 0.6320861	total: 33.8ms	remaining: 29.4ms
31:	learn: 0.6217089	total: 34.8ms	remaining: 28.2ms
32:	learn: 0.6217089	total: 35.6ms	remaining: 27ms
33:	learn: 0.6217089	total: 36.5ms	remaining: 25.8ms
34:	learn: 0.6217089	total: 37.4ms	remaining: 24.6ms
35:	learn: 0.6141892	total: 38.4ms	remaining: 23.5

[Parallel(n_jobs=1)]: Done 450 out of 450 | elapsed:  2.1min finished


9:	learn: 0.3813757	total: 79.3ms	remaining: 801ms
10:	learn: 0.3743008	total: 91.1ms	remaining: 829ms
11:	learn: 0.3736938	total: 93.7ms	remaining: 773ms
12:	learn: 0.3727384	total: 97.5ms	remaining: 735ms
13:	learn: 0.3634424	total: 108ms	remaining: 748ms
14:	learn: 0.3615277	total: 112ms	remaining: 715ms
15:	learn: 0.3555853	total: 122ms	remaining: 727ms
16:	learn: 0.3534226	total: 128ms	remaining: 707ms
17:	learn: 0.3473302	total: 138ms	remaining: 711ms
18:	learn: 0.3468766	total: 140ms	remaining: 677ms
19:	learn: 0.3434691	total: 150ms	remaining: 681ms
20:	learn: 0.3387579	total: 159ms	remaining: 681ms
21:	learn: 0.3387574	total: 161ms	remaining: 651ms
22:	learn: 0.3336813	total: 169ms	remaining: 645ms
23:	learn: 0.3284997	total: 180ms	remaining: 652ms
24:	learn: 0.3273654	total: 185ms	remaining: 636ms
25:	learn: 0.3238170	total: 196ms	remaining: 641ms
26:	learn: 0.3225122	total: 201ms	remaining: 625ms
27:	learn: 0.3188453	total: 211ms	remaining: 624ms
28:	learn: 0.3154270	total: 

In [31]:
y_pred_4 = catboost_model_4.predict_proba(X_test_best_features_2)[:,1]
submission_cb_4 = pd.DataFrame(data={'Opportunity_ID':X_test_best_features_2['Opportunity_ID'], 'Target': y_pred_4})
submission_cb_4 = submission_cb_4.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_cb_4.to_csv('../submits/cat_boost_2_best_features_with_polynomial_encoding.csv', index=False)