In [33]:
import os

import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [34]:
# Feature matrices from the files named *_mean_encoding, read as (train, test).
df_train_mean_encoding, df_test_mean_encoding = map(
    pd.read_csv,
    (
        '../Feature_Encoding/data/train_mean_encoding.csv',
        '../Feature_Encoding/data/test_mean_encoding.csv',
    ),
)

# Feature matrices from the files named *_binary_encoding, read as (train, test).
df_train_binary_encoding, df_test_binary_encoding = map(
    pd.read_csv,
    (
        '../Feature_Encoding/data/train_binary_encoding.csv',
        '../Feature_Encoding/data/test_binary_encoding.csv',
    ),
)

# Cleaned training table; a later cell takes its `Target` column as the labels.
train = pd.read_csv('../Feature_Engineering/data/other-cleaned_train.csv')

In [35]:
def cross_val(model, x_train, y_train):
    """Print the mean 5-fold cross-validation score of ``model``.

    Delegates to ``model_selection.cross_val_score`` with ``cv=5`` and no
    explicit scoring argument, then writes the mean of the returned per-fold
    scores to stdout. Nothing is returned.

    Parameters
    ----------
    model : estimator handed to ``cross_val_score``.
    x_train : feature data to cross-validate on.
    y_train : labels matching ``x_train``.
    """
    fold_scores = model_selection.cross_val_score(
        estimator=model, X=x_train, y=y_train, cv=5
    )
    mean_score = fold_scores.mean()
    print(mean_score)

In [36]:
def bagging(x_train, y_train, x_validation, y_validation,
            param_grid=None, cv=3, scoring=None, random_state=None):
    """Tune a ``BaggingClassifier`` by exhaustive grid search and report it.

    Every combination in ``param_grid`` is tried with ``GridSearchCV``. The
    winning parameters and the winner's ``score`` on the validation data are
    printed, and the winning estimator is returned.

    Parameters
    ----------
    x_train, y_train : data the grid search is fitted on.
    x_validation, y_validation : hold-out data used only for the printed score.
    param_grid : dict, optional
        Grid handed to ``GridSearchCV``. When omitted, the original grid is
        used: ``n_estimators`` in 1..10, 15, 25 with ``max_samples`` and
        ``max_features`` fixed at 1.0.
    cv : cross-validation setting forwarded to ``GridSearchCV`` (default 3).
    scoring : optional single metric forwarded to ``GridSearchCV``; ``None``
        keeps the estimator's default score. Pass e.g. ``'neg_log_loss'`` when
        the predicted probabilities are what matters.
    random_state : optional seed for the ``BaggingClassifier``; ``None`` keeps
        the previous unseeded behaviour, an int makes repeated runs comparable.

    Returns
    -------
    The ``best_estimator_`` found by the grid search.
    """
    if param_grid is None:
        param_grid = {
            'n_estimators': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 25],
            'max_samples': [1.0],
            'max_features': [1.0],
        }
    bag_classifier = BaggingClassifier(random_state=random_state)
    bag_gs = GridSearchCV(bag_classifier, param_grid, cv=cv, scoring=scoring)
    bag_gs.fit(x_train, y_train)
    bag_best = bag_gs.best_estimator_
    print(bag_gs.best_params_)
    # The validation data is never seen by the search; this is the hold-out score.
    print('bag: {}'.format(bag_best.score(x_validation, y_validation)))
    return bag_best

In [37]:
def test_model(model, x_test, y_test):
    predictions = model.predict_proba(x_test)[:,1]
    logloss = log_loss(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions.round())
    print("Accuracy: %.2f%%, Logloss: %.2f" % (accuracy*100.0, logloss))


In [38]:
# Labels come from the cleaned training table. This relies on its rows being in
# the same order as both encoded training frames -- TODO confirm.
y = train.Target

# A fixed seed makes the stratified 70/30 split reproducible. Because both calls
# share the seed and stratify on the same y, the same rows land in train and
# validation for both encodings, so their scores are directly comparable.
SPLIT_SEED = 42

x_train_mean_encoding, x_validation_mean_encoding, y_train_mean_encoding, y_validation_mean_encoding = train_test_split(
    df_train_mean_encoding, y, test_size=0.3, stratify=y, random_state=SPLIT_SEED
)
x_train_binary_encoding, x_validation_binary_encoding, y_train_binary_encoding, y_validation_binary_encoding = train_test_split(
    df_train_binary_encoding, y, test_size=0.3, stratify=y, random_state=SPLIT_SEED
)

## Mean Encoding

In [39]:
# Grid-search a bagging model on the mean-encoded training split, then report
# its hold-out accuracy/log-loss and its cross-validated score on the training split.
bagging_mean_encoding = bagging(
    x_train=x_train_mean_encoding,
    y_train=y_train_mean_encoding,
    x_validation=x_validation_mean_encoding,
    y_validation=y_validation_mean_encoding,
)
test_model(
    model=bagging_mean_encoding,
    x_test=x_validation_mean_encoding,
    y_test=y_validation_mean_encoding,
)
cross_val(
    model=bagging_mean_encoding,
    x_train=x_train_mean_encoding,
    y_train=y_train_mean_encoding,
)

{'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 25}
bag: 0.9125370187561698
Accuracy: 91.25%, Logloss: 0.35
0.9148747290611823


In [41]:
# Score every test row; column 1 of predict_proba is kept as the prediction.
# NOTE(review): the frame passed to predict_proba still contains Opportunity_ID,
# so the ID is presumably one of the model's features -- confirm this is intended.
y_pred_bag_mean = bagging_mean_encoding.predict_proba(df_test_mean_encoding)[:,1]
submission_bag_mean = pd.DataFrame(data={'Opportunity_ID':df_test_mean_encoding['Opportunity_ID'], 'Target': y_pred_bag_mean})
# Collapse to one row per Opportunity_ID by averaging its row-level predictions.
submission_bag_mean = submission_bag_mean.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
# to_csv does not create missing directories; make sure the output folder exists.
os.makedirs('submits', exist_ok=True)
submission_bag_mean.to_csv('submits/mean_bag.csv', index=False)

## Binary Encoding

In [40]:
# Grid-search a bagging model on the binary-encoded training split, then report
# its hold-out accuracy/log-loss and its cross-validated score on the training split.
bagging_binary_encoding = bagging(
    x_train=x_train_binary_encoding,
    y_train=y_train_binary_encoding,
    x_validation=x_validation_binary_encoding,
    y_validation=y_validation_binary_encoding,
)
test_model(
    model=bagging_binary_encoding,
    x_test=x_validation_binary_encoding,
    y_test=y_validation_binary_encoding,
)
cross_val(
    model=bagging_binary_encoding,
    x_train=x_train_binary_encoding,
    y_train=y_train_binary_encoding,
)

{'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 25}
bag: 0.9087857847976308
Accuracy: 90.88%, Logloss: 0.26
0.8959224737260059


In [42]:
# Score every test row; column 1 of predict_proba is kept as the prediction.
# NOTE(review): the frame passed to predict_proba still contains Opportunity_ID,
# so the ID is presumably one of the model's features -- confirm this is intended.
y_pred_bag_binary = bagging_binary_encoding.predict_proba(df_test_binary_encoding)[:,1]
submission_bag_binary = pd.DataFrame(data={'Opportunity_ID':df_test_binary_encoding['Opportunity_ID'], 'Target': y_pred_bag_binary})
# Collapse to one row per Opportunity_ID by averaging its row-level predictions.
submission_bag_binary = submission_bag_binary.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
# to_csv does not create missing directories; make sure the output folder exists.
os.makedirs('submits', exist_ok=True)
submission_bag_binary.to_csv('submits/binary_bag.csv', index=False)

## Binary Encoding

In [40]:
# NOTE(review): this cell repeats an earlier Binary Encoding cell verbatim; each
# run refits the grid search and rebinds bagging_binary_encoding.
# Grid-search a bagging model on the binary-encoded training split, then report
# its hold-out accuracy/log-loss and its cross-validated score on the training split.
bagging_binary_encoding = bagging(
    x_train=x_train_binary_encoding,
    y_train=y_train_binary_encoding,
    x_validation=x_validation_binary_encoding,
    y_validation=y_validation_binary_encoding,
)
test_model(
    model=bagging_binary_encoding,
    x_test=x_validation_binary_encoding,
    y_test=y_validation_binary_encoding,
)
cross_val(
    model=bagging_binary_encoding,
    x_train=x_train_binary_encoding,
    y_train=y_train_binary_encoding,
)

{'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 25}
bag: 0.9087857847976308
Accuracy: 90.88%, Logloss: 0.26
0.8959224737260059


In [42]:
# NOTE(review): this cell repeats an earlier submission cell verbatim and
# overwrites the same 'submits/binary_bag.csv' file.
# Score every test row; column 1 of predict_proba is kept as the prediction.
y_pred_bag_binary = bagging_binary_encoding.predict_proba(df_test_binary_encoding)[:,1]
submission_bag_binary = pd.DataFrame(data={'Opportunity_ID':df_test_binary_encoding['Opportunity_ID'], 'Target': y_pred_bag_binary})
# Collapse to one row per Opportunity_ID by averaging its row-level predictions.
submission_bag_binary = submission_bag_binary.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
# to_csv does not create missing directories; make sure the output folder exists.
os.makedirs('submits', exist_ok=True)
submission_bag_binary.to_csv('submits/binary_bag.csv', index=False)

## Binary Encoding

In [40]:
# NOTE(review): this cell repeats an earlier Binary Encoding cell verbatim; each
# run refits the grid search and rebinds bagging_binary_encoding.
# Grid-search a bagging model on the binary-encoded training split, then report
# its hold-out accuracy/log-loss and its cross-validated score on the training split.
bagging_binary_encoding = bagging(
    x_train=x_train_binary_encoding,
    y_train=y_train_binary_encoding,
    x_validation=x_validation_binary_encoding,
    y_validation=y_validation_binary_encoding,
)
test_model(
    model=bagging_binary_encoding,
    x_test=x_validation_binary_encoding,
    y_test=y_validation_binary_encoding,
)
cross_val(
    model=bagging_binary_encoding,
    x_train=x_train_binary_encoding,
    y_train=y_train_binary_encoding,
)

{'max_features': 1.0, 'max_samples': 1.0, 'n_estimators': 25}
bag: 0.9087857847976308
Accuracy: 90.88%, Logloss: 0.26
0.8959224737260059


In [42]:
# NOTE(review): this cell repeats an earlier submission cell verbatim and
# overwrites the same 'submits/binary_bag.csv' file.
# Score every test row; column 1 of predict_proba is kept as the prediction.
y_pred_bag_binary = bagging_binary_encoding.predict_proba(df_test_binary_encoding)[:,1]
submission_bag_binary = pd.DataFrame(data={'Opportunity_ID':df_test_binary_encoding['Opportunity_ID'], 'Target': y_pred_bag_binary})
# Collapse to one row per Opportunity_ID by averaging its row-level predictions.
submission_bag_binary = submission_bag_binary.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
# to_csv does not create missing directories; make sure the output folder exists.
os.makedirs('submits', exist_ok=True)
submission_bag_binary.to_csv('submits/binary_bag.csv', index=False)