In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn import model_selection
import sklearn.naive_bayes as nb

In [19]:
df_train_mean_encoding = pd.read_csv('../Feature_Encoding/data/train_mean_encoding.csv')
df_test_mean_encoding = pd.read_csv('../Feature_Encoding/data/test_mean_encoding.csv')
df_train_binary_encoding = pd.read_csv('../Feature_Encoding/data/train_binary_encoding.csv')
df_test_binary_encoding = pd.read_csv('../Feature_Encoding/data/test_binary_encoding.csv')
train = pd.read_csv('../Feature_Engineering/data/other-cleaned_train.csv')

In [20]:
def cross_val(model, x_train, y_train):
    score_cross_val = model_selection.cross_val_score(model, x_train, y_train, cv=5)
    print(score_cross_val.mean())

In [21]:
#Realiza busqueda completa combinando los parametros
def bernoulli(x_train, y_train, x_validation, y_validation):
    bernoulli_classifier = nb.BernoulliNB()
    params_ber={'alpha':[0.0,0.1,0.2,1.0,2.0,10.0,50.3]}    
    ber_gs = GridSearchCV(bernoulli_classifier, params_ber, cv=3)
    ber_gs.fit(x_train, y_train)
    ber_best = ber_gs.best_estimator_
    print(ber_gs.best_params_)
    print('bernoulli: {}'.format(ber_best.score(x_validation, y_validation)))
    return ber_best

#Realiza busqueda completa combinando los parametros
def gauss(x_train, y_train, x_validation, y_validation):
    gauss_classifier = nb.GaussianNB()
    params_gauss={}    
    gauss_gs = GridSearchCV(gauss_classifier, params_gauss, cv=3)
    gauss_gs.fit(x_train, y_train)
    gauss_best = gauss_gs.best_estimator_
    print(gauss_gs.best_params_)
    print('gauss: {}'.format(gauss_best.score(x_validation, y_validation)))
    return gauss_best

In [22]:
def test_model(model, x_test, y_test):
    predictions = model.predict_proba(x_test)[:,1]
    logloss = log_loss(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions.round())
    print("Accuracy: %.2f%%, Logloss: %.2f" % (accuracy*100.0, logloss))

In [23]:
y = train.Target
x_train_mean_encoding, x_validation_mean_encoding, y_train_mean_encoding, y_validation_mean_encoding = train_test_split(df_train_mean_encoding, y, test_size=0.3, stratify=y)
x_train_binary_encoding, x_validation_binary_encoding, y_train_binary_encoding, y_validation_binary_encoding = train_test_split(df_train_binary_encoding, y, test_size=0.3, stratify=y)

## Mean encoding

In [24]:
bernoulli_mean_encoding = bernoulli(x_train_mean_encoding, y_train_mean_encoding, x_validation_mean_encoding, y_validation_mean_encoding)
test_model(bernoulli_mean_encoding,x_validation_mean_encoding,y_validation_mean_encoding)
cross_val(bernoulli_mean_encoding, x_train_mean_encoding, y_train_mean_encoding)

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


{'alpha': 0.0}
bernoulli: 0.7638696939782823
Accuracy: 76.39%, Logloss: 0.60
0.765866076920488


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


In [25]:
gauss_mean_encoding = gauss(x_train_mean_encoding, y_train_mean_encoding, x_validation_mean_encoding, y_validation_mean_encoding)
test_model(gauss_mean_encoding,x_validation_mean_encoding,y_validation_mean_encoding)
cross_val(gauss_mean_encoding, x_train_mean_encoding, y_train_mean_encoding)

{}
gauss: 0.6015794669299112
Accuracy: 60.16%, Logloss: 0.89
0.6009450188430921


## Binary Encoding

In [26]:
bernoulli_binary_encoding = bernoulli(x_train_binary_encoding, y_train_binary_encoding, x_validation_binary_encoding, y_validation_binary_encoding)
test_model(bernoulli_binary_encoding,x_validation_binary_encoding,y_validation_binary_encoding)
cross_val(bernoulli_binary_encoding, x_train_binary_encoding, y_train_binary_encoding)

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


{'alpha': 0.0}
bernoulli: 0.7344521224086871
Accuracy: 73.45%, Logloss: 1.03
0.7427655844867254


  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


In [27]:
gauss_binary_encoding = gauss(x_train_binary_encoding, y_train_binary_encoding, x_validation_binary_encoding, y_validation_binary_encoding)
test_model(gauss_binary_encoding,x_validation_binary_encoding,y_validation_binary_encoding)
cross_val(gauss_binary_encoding, x_train_binary_encoding, y_train_binary_encoding)

{}
gauss: 0.5974333662388944
Accuracy: 59.74%, Logloss: 0.86
0.6120296835090901


In [28]:
y_pred_bern_mean = bernoulli_mean_encoding.predict_proba(df_test_mean_encoding)[:,1]
submission_bern_mean = pd.DataFrame(data={'Opportunity_ID':df_test_mean_encoding['Opportunity_ID'], 'Target': y_pred_bern_mean})
submission_bern_mean = submission_bern_mean.groupby("Opportunity_ID").agg({"Target":"mean"}).reset_index()
submission_bern_mean.to_csv('submits/mean_bernoulli.csv', index=False)