In [1]:
import pandas as pd
import sklearn.metrics 

from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection

import xgboost as xgb
from xgboost import XGBClassifier

import scipy.stats as stats

In [2]:
df_train_mean_encoding = pd.read_csv('../Feature_Encoding/data/train_mean_encoding.csv')
df_test_mean_encoding = pd.read_csv('../Feature_Encoding/data/test_mean_encoding.csv')
df_train_binary_encoding = pd.read_csv('../Feature_Encoding/data/train_binary_encoding.csv')
df_test_binary_encoding = pd.read_csv('../Feature_Encoding/data/test_binary_encoding.csv')
df_train_cat_boost_encoding = pd.read_csv('../Feature_Encoding/data/train_cat_boost_encoding.csv')
df_test_cat_boost_encoding = pd.read_csv('../Feature_Encoding/data/test_cat_boost_encoding.csv')
df_train_backward_diff_encoding = pd.read_csv('../Feature_Encoding/data/train_backward_diff_encoding.csv')
df_test_backward_diff_encoding = pd.read_csv('../Feature_Encoding/data/test_backward_diff_encoding.csv')
train = pd.read_csv('../Feature_Engineering/data/other-cleaned_train.csv')

In [3]:
def cross_val(model, x_train, y_train):
    score_cross_val = model_selection.cross_val_score(model, x_train, y_train, cv=5)
    print(score_cross_val.mean())

In [4]:
#Realiza busqueda completa combinando los parametros
def xgboost(x_train, y_train, x_validation, y_validation):
    xgb_classifier = XGBClassifier()
    params_xgb = {'n_estimators': [50, 100,150], 'learning_rate': [0.01, 0.05, 0.1], 'gamma': [0, 1, 5],'max_depth':[3,10,6,15]}    
    xgb_gs = GridSearchCV(xgb_classifier, params_xgb, cv=5)
    xgb_gs.fit(x_train, y_train)
    xgb_best = xgb_gs.best_estimator_
    print(xgb_gs.best_params_)
    print('xgb: {}'.format(xgb_best.score(x_validation, y_validation)))
    return xgb_best


#Realiza busqueda random dentro de los parametros validos
def xgboost2(x_train, y_train, x_validation, y_validation):
    params_xgb_2={
        'n_estimators':stats.randint(10,500),'learning_rate':stats.uniform(0.01,0.3),
        'subsample':stats.uniform(0.3,0.7),'min_child_weight':[1,5,10],
        'max_depth':[3,10,6,15],'gamma':stats.randint(0,10),'colsample_bytree':stats.uniform(0.,0.6)
    }
    
    xgb_rs = RandomizedSearchCV(xgb.XGBClassifier(n_jobs=-1),
                          param_distributions=params_xgb_2,
                          cv=2,
                          scoring='neg_log_loss',
                          verbose=1,
                          n_iter=200)
    
    xgb_rs.fit(x_train, y_train)
    xgb_best = xgb_rs.best_estimator_
    print(xgb_rs.best_params_)
    print('xgb: {}'.format(xgb_best.score(x_validation, y_validation)))
    return xgb_best

def test_model(model, x_test, y_test):
    predictions = model.predict_proba(x_test)[:,1]
    logloss = log_loss(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions.round())
    print("Accuracy: %.2f%%, Logloss: %.2f" % (accuracy*100.0, logloss))

In [18]:
y = train.Target
x_train_mean_encoding, x_validation_mean_encoding, y_train_mean_encoding, y_validation_mean_encoding = train_test_split(df_train_mean_encoding, y, test_size=0.3, stratify=y)
x_train_binary_encoding, x_validation_binary_encoding, y_train_binary_encoding, y_validation_binary_encoding = train_test_split(df_train_binary_encoding, y, test_size=0.3, stratify=y)
x_train_cat_boost_encoding, x_validation_cat_boost_encoding, y_train_cat_boost_encoding, y_validation_cat_boost_encoding = train_test_split(df_train_cat_boost_encoding, y, test_size=0.3, stratify=y)
x_train_backdiff_encoding, x_validation_backdiff_encoding, y_train_backdiff_encoding,  y_validation_backdiff_encoding = train_test_split(df_train_backward_diff_encoding, y, test_size=0.3, stratify=y)

## Mean encoding

In [6]:
xgboost_mean_encoding = xgboost(x_train_mean_encoding, y_train_mean_encoding, x_validation_mean_encoding, y_validation_mean_encoding)
test_model(xgboost_mean_encoding,x_validation_mean_encoding,y_validation_mean_encoding)
cross_val(xgboost_mean_encoding, x_train_mean_encoding, y_train_mean_encoding)



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































{'gamma': 0, 'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 150}
xgb: 0.9362290227048371
Accuracy: 93.62%, Logloss: 0.18




















0.9269761974833391


In [7]:
xgboost_mean_encoding = xgboost2(x_train_mean_encoding, y_train_mean_encoding, x_validation_mean_encoding, y_validation_mean_encoding)
test_model(xgboost_mean_encoding,x_validation_mean_encoding,y_validation_mean_encoding)
cross_val(xgboost_mean_encoding, x_train_mean_encoding, y_train_mean_encoding)

Fitting 2 folds for each of 200 candidates, totalling 400 fits












































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































{'colsample_bytree': 0.2829962571291043, 'gamma': 0, 'learning_rate': 0.12017593847419084, 'max_depth': 6, 'min_child_weight': 1, 'n_estimators': 236, 'subsample': 0.7400405446783502}
xgb: 0.9332675222112538
Accuracy: 93.33%, Logloss: 0.18
















0.9252843291207583


## Binary encoding

In [8]:
xgboost_binary_encoding = xgboost(x_train_binary_encoding, y_train_binary_encoding, x_validation_binary_encoding, y_validation_binary_encoding)
test_model(xgboost_binary_encoding,x_validation_binary_encoding,y_validation_binary_encoding)
cross_val(xgboost_binary_encoding, x_train_binary_encoding, y_train_binary_encoding)



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































{'gamma': 0, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 150}
xgb: 0.9237907206317868
Accuracy: 92.38%, Logloss: 0.19
















0.9176677887310933


In [9]:
xgboost_binary_encoding = xgboost2(x_train_binary_encoding, y_train_binary_encoding, x_validation_binary_encoding, y_validation_binary_encoding)
test_model(xgboost_binary_encoding,x_validation_binary_encoding,y_validation_binary_encoding)
cross_val(xgboost_binary_encoding, x_train_binary_encoding, y_train_binary_encoding)

Fitting 2 folds for each of 200 candidates, totalling 400 fits








































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































{'colsample_bytree': 0.2936995851814212, 'gamma': 0, 'learning_rate': 0.10332811118228277, 'max_depth': 15, 'min_child_weight': 1, 'n_estimators': 134, 'subsample': 0.8737000137421238}
xgb: 0.92142152023692
Accuracy: 92.14%, Logloss: 0.19
















0.9182603991455984


In [10]:
y_pred = xgboost_mean_encoding.predict_proba(df_test_mean_encoding)[:,1]
submission_xgboost = pd.DataFrame(data={'Opportunity_ID':df_test_mean_encoding['Opportunity_ID'], 'Target': y_pred})
submission_xgboost = submission_xgboost.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_xgboost.to_csv('submits/mean_xgboost.csv', index=False)

In [11]:
y_pred = xgboost_binary_encoding.predict_proba(df_test_binary_encoding)[:,1]
submission_xgboost = pd.DataFrame(data={'Opportunity_ID':df_test_binary_encoding['Opportunity_ID'], 'Target': y_pred})
submission_xgboost = submission_xgboost.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_xgboost.to_csv('submits/binary_xgboost.csv', index=False)

## Cat boost encoding

In [12]:
xgboost_cat_boost_encoding = xgboost(x_train_cat_boost_encoding, y_train_cat_boost_encoding, x_validation_cat_boost_encoding, y_validation_cat_boost_encoding)
test_model(xgboost_cat_boost_encoding,x_validation_cat_boost_encoding,y_validation_cat_boost_encoding)
cross_val(xgboost_cat_boost_encoding, x_train_cat_boost_encoding, y_train_cat_boost_encoding)



















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































{'gamma': 0, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 150}
xgb: 0.9293188548864758
Accuracy: 92.93%, Logloss: 0.17
















0.9262130934249315


In [13]:
xgboost_2_cat_boost_encoding = xgboost2(x_train_cat_boost_encoding, y_train_cat_boost_encoding, x_validation_cat_boost_encoding, y_validation_cat_boost_encoding)
test_model(xgboost_2_cat_boost_encoding,x_validation_cat_boost_encoding,y_validation_cat_boost_encoding)
cross_val(xgboost_2_cat_boost_encoding, x_train_cat_boost_encoding, y_train_cat_boost_encoding)

Fitting 2 folds for each of 200 candidates, totalling 400 fits




















































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































{'colsample_bytree': 0.2861828208788495, 'gamma': 1, 'learning_rate': 0.06374345768040443, 'max_depth': 15, 'min_child_weight': 5, 'n_estimators': 475, 'subsample': 0.9026253173412635}
xgb: 0.932872655478776
Accuracy: 93.29%, Logloss: 0.17




















0.9273136402791771


In [14]:
y_pred = xgboost_cat_boost_encoding.predict_proba(df_test_cat_boost_encoding)[:,1]
submission_xgb_cat_boost_enc = pd.DataFrame(data={'Opportunity_ID':df_test_cat_boost_encoding['Opportunity_ID'], 'Target': y_pred})
submission_xgb_cat_boost_enc = submission_xgb_cat_boost_enc.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_xgb_cat_boost_enc.to_csv('submits/xgb_cat_boost_enc.csv', index=False)

In [15]:
y_2_pred = xgboost_2_cat_boost_encoding.predict_proba(df_test_cat_boost_encoding)[:,1]
submission_xgb_2_cat_boost_enc = pd.DataFrame(data={'Opportunity_ID':df_test_cat_boost_encoding['Opportunity_ID'], 'Target': y_2_pred})
submission_xgb_2_cat_boost_enc = submission_xgb_2_cat_boost_enc.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_xgb_2_cat_boost_enc.to_csv('submits/xgb_2_cat_boost_enc.csv', index=False)

## Backward Difference encoding

In [None]:
xgboost_backdiff_encoding = xgboost(x_train_backdiff_encoding, y_train_backdiff_encoding, x_validation_backdiff_encoding, y_validation_backdiff_encoding)
test_model(xgboost_backdiff_encoding,x_validation_backdiff_encoding,y_validation_backdiff_encoding)
cross_val(xgboost_backdiff_encoding, x_train_backdiff_encoding, y_train_backdiff_encoding)





































In [None]:
xgboost_2_backward_diff_encoding = xgboost2(x_train_backward_diff_encoding, y_train_backward_diff_encoding, x_validation_backward_diff_encoding, y_validation_backward_diff_encoding)
test_model(xgboost_2_backward_diff_encoding,x_validation_backward_diff_encoding,y_validation_backward_diff_encoding)
cross_val(xgboost_2_backward_diff_encoding, x_train_backward_diff_encoding, y_train_backward_diff_encoding)

In [None]:
y_pred = xgboost_backward_diff_encoding.predict_proba(df_test_backward_diff_encoding)[:,1]
submission_xgb_backward_diff_enc = pd.DataFrame(data={'Opportunity_ID':df_test_backward_diff_encoding['Opportunity_ID'], 'Target': y_pred})
submission_xgb_backward_diff_enc = submission_xgb_backward_diff_enc.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_xgb_backward_diff_enc.to_csv('submits/xgb_backward_diff_enc.csv', index=False)

In [None]:
y_2_pred = xgboost_2_backward_diff_encoding.predict_proba(df_test_backward_diff_encoding)[:,1]
submission_xgb_2_backward_diff_enc = pd.DataFrame(data={'Opportunity_ID':df_test_backward_diff_encoding['Opportunity_ID'], 'Target': y_2_pred})
submission_xgb_2_backward_diff_enc = submission_xgb_2_backward_diff_enc.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_xgb_2_backward_diff_enc.to_csv('submits/xgb_2_backward_diff_enc.csv', index=False)