In [1]:
import pandas as pd
from sklearn import model_selection, metrics, naive_bayes

In [2]:
df_train = pd.read_csv('../../Feature_Encoding/data/train_count_encoding.csv')
df_test = pd.read_csv('../../Feature_Encoding/data/test_count_encoding.csv')
train = pd.read_csv('../../Feature_Engineering/data/other-cleaned_train.csv')

In [3]:
def cross_val(model, x_train, y_train):
    """Print the mean 5-fold cross-validation score of ``model``.

    Args:
        model: estimator handed to ``cross_val_score``.
        x_train: feature matrix used for cross-validation.
        y_train: labels matching ``x_train``.

    No ``scoring`` argument is passed, so the library default applies.
    The function only prints; it returns ``None``.
    """
    fold_scores = model_selection.cross_val_score(
        estimator=model, X=x_train, y=y_train, cv=5
    )
    print(fold_scores.mean())

In [4]:
# Exhaustive grid search over the smoothing parameter of Bernoulli Naive Bayes.
def bernoulli(x_train, y_train, x_validation, y_validation):
    """Tune ``BernoulliNB`` with a 3-fold grid search and report its validation score.

    Args:
        x_train: training features used to fit the grid search.
        y_train: training labels.
        x_validation: held-out features used only for the printed score.
        y_validation: held-out labels.

    Returns:
        The best estimator found by the search (``best_estimator_``).

    Side effects:
        Prints the winning parameters and the validation score.
    """
    # The smallest candidate is 1e-10 rather than a literal 0.0. Older
    # scikit-learn releases silently clip alpha below 1e-10 up to 1e-10 (with
    # a warning), while newer releases can use 0 as-is, which removes all
    # smoothing and produces log(0) feature log-probabilities. Spelling the
    # floor out keeps the candidate meaningful on every version.
    params_ber = {'alpha': [1e-10, 0.1, 0.2, 1.0, 2.0, 10.0, 50.3]}
    ber_gs = model_selection.GridSearchCV(naive_bayes.BernoulliNB(), params_ber, cv=3)
    ber_gs.fit(x_train, y_train)
    ber_best = ber_gs.best_estimator_
    print(ber_gs.best_params_)
    print('bernoulli: {}'.format(ber_best.score(x_validation, y_validation)))
    return ber_best

# Fits Gaussian Naive Bayes through GridSearchCV. The parameter grid is empty,
# so only the estimator's default configuration is evaluated.
def gauss(x_train, y_train, x_validation, y_validation):
    """Fit ``GaussianNB`` via a 3-fold ``GridSearchCV`` and report its validation score.

    Args:
        x_train: training features used to fit the search.
        y_train: training labels.
        x_validation: held-out features used only for the printed score.
        y_validation: held-out labels.

    Returns:
        The search's ``best_estimator_``.

    Side effects:
        Prints ``best_params_`` and the validation score.
    """
    search = model_selection.GridSearchCV(naive_bayes.GaussianNB(), {}, cv=3)
    search.fit(x_train, y_train)
    fitted = search.best_estimator_
    print(search.best_params_)
    validation_score = fitted.score(x_validation, y_validation)
    print('gauss: {}'.format(validation_score))
    return fitted

In [5]:
def test_model(model, x_test, y_test):
    """Print accuracy (as a percentage) and log loss of ``model`` on labelled data.

    Args:
        model: fitted classifier exposing ``predict_proba``.
        x_test: features to score.
        y_test: true labels for ``x_test``.

    The second column of ``predict_proba`` is used as the score; hard labels
    for the accuracy figure are obtained by rounding that score.
    Returns ``None``.
    """
    second_class_proba = model.predict_proba(x_test)[:, 1]
    logloss = metrics.log_loss(y_test, second_class_proba)
    hard_labels = second_class_proba.round()
    accuracy_pct = metrics.accuracy_score(y_test, hard_labels) * 100.0
    print("Accuracy: %.2f%%, Logloss: %.2f" % (accuracy_pct, logloss))

In [6]:
# Labels are read from the cleaned training file, features from the
# count-encoded file.
# NOTE(review): this presumes both files keep their rows in the same order — confirm.
y = train.Target

# Fixed seed: without it the stratified 70/30 split (and every score computed
# from it) changes on each run, so results cannot be reproduced.
RANDOM_STATE = 42
x_train, x_validation, y_train, y_validation = model_selection.train_test_split(
    df_train, y, test_size=0.3, stratify=y, random_state=RANDOM_STATE
)

In [7]:
# Replace missing values, then empty strings, in the test features with 0.
df_test = df_test.fillna(0).replace('', 0)

In [8]:
# Tune Bernoulli NB on the training split, then print hold-out metrics and the
# mean 5-fold cross-validation score on the training split.
bernoulli_ = bernoulli(
    x_train=x_train,
    y_train=y_train,
    x_validation=x_validation,
    y_validation=y_validation,
)
test_model(model=bernoulli_, x_test=x_validation, y_test=y_validation)
cross_val(model=bernoulli_, x_train=x_train, y_train=y_train)



{'alpha': 50.3}
bernoulli: 0.7445212240868707
Accuracy: 74.45%, Logloss: 0.63
0.7543584720160569


In [9]:
# Fit Gaussian NB on the training split, then print hold-out metrics and the
# mean 5-fold cross-validation score on the training split.
gauss_ = gauss(
    x_train=x_train,
    y_train=y_train,
    x_validation=x_validation,
    y_validation=y_validation,
)
test_model(model=gauss_, x_test=x_validation, y_test=y_validation)
cross_val(model=gauss_, x_train=x_train, y_train=y_train)

{}
gauss: 0.648568608094768
Accuracy: 64.86%, Logloss: 1.48
0.6564572050928978


In [10]:
# Score the test set with the tuned Bernoulli model and write one averaged
# prediction per opportunity.
# NOTE(review): df_test is passed whole, so its 'Opportunity_ID' column is fed
# to the model as a feature; presumably df_train carries the same column — verify.
y_pred = bernoulli_.predict_proba(df_test)[:, 1]
submission_bern = (
    pd.DataFrame({'Opportunity_ID': df_test['Opportunity_ID'], 'Target': y_pred})
    .groupby("Opportunity_ID")
    .agg({"Target": "mean"})  # several rows can share an ID; average their scores
    .reset_index()
)
submission_bern.to_csv('../submits/count_bernoulli.csv', index=False)