In [21]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss
from sklearn.impute import KNNImputer

In [22]:
train_set = pd.read_csv('train_prepared.csv', encoding='utf-8')
test_set = pd.read_csv('test_prepared.csv', encoding='utf-8')

train_set = train_set.replace([-np.inf, np.inf], np.nan)
test_set = test_set.replace([-np.inf, np.inf], np.nan)

In [23]:
train_fts = train_set.columns

imputer_train = KNNImputer()
imputer_train.fit(train_set)
train_set = pd.DataFrame(imputer_train.transform(train_set))

train_set.columns = train_fts

In [24]:
X_test_set = test_set.drop(columns=['Target', 'Opportunity_ID'])
y_test_set = test_set.Target

X_test_fts = X_test_set.columns

imputer_test = KNNImputer()
imputer_test.fit(X_test_set)
X_test_set = pd.DataFrame(imputer_test.transform(X_test_set))

X_test_set.columns=X_test_fts

In [25]:
X = train_set.drop(columns=['Opportunity_ID', 'Target'])
y = train_set.Target

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.2, random_state=123)

In [26]:
#from sklearn.model_selection import RandomizedSearchCV
#from scipy.stats import loguniform

#space = dict()
#space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
#space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
#space['C'] = loguniform(1e-5, 100)

In [27]:
model = GaussianNB()

params_NB = {'var_smoothing': np.logspace(0,-9, num=100)}
gs_NB = GridSearchCV(estimator=model, 
                 param_grid=params_NB, 
                 cv=5,   # use any cross validation technique 
                 verbose=3, 
                 scoring='neg_log_loss') 
_ = gs_NB.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] var_smoothing=1.0 ...............................................
[CV] .................. var_smoothing=1.0, score=-0.679, total=   0.0s
[CV] var_smoothing=1.0 ...............................................
[CV] .................. var_smoothing=1.0, score=-0.678, total=   0.0s
[CV] var_smoothing=1.0 ...............................................
[CV] .................. var_smoothing=1.0, score=-0.678, total=   0.0s
[CV] var_smoothing=1.0 ...............................................
[CV] .................. var_smoothing=1.0, score=-1.086, total=   0.0s
[CV] var_smoothing=1.0 ...............................................
[CV] .................. var_smoothing=1.0, score=-0.682, total=   0.0s
[CV] var_smoothing=0.8111308307896871 ................................
[CV] ... var_smoothing=0.8111308307896871, score=-0.676, total=   0.0s
[CV] var_smoothing=0.8111308307896871 ................................
[CV] ... var_s

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] .... var_smoothing=0.657933224657568, score=-0.675, total=   0.0s
[CV] var_smoothing=0.657933224657568 .................................
[CV] .... var_smoothing=0.657933224657568, score=-0.675, total=   0.0s
[CV] var_smoothing=0.657933224657568 .................................
[CV] .... var_smoothing=0.657933224657568, score=-1.295, total=   0.0s
[CV] var_smoothing=0.657933224657568 .................................
[CV] .... var_smoothing=0.657933224657568, score=-0.682, total=   0.0s
[CV] var_smoothing=0.533669923120631 .................................
[CV] .... var_smoothing=0.533669923120631, score=-0.670, total=   0.0s
[CV] var_smoothing=0.533669923120631 .................................
[CV] .... var_smoothing=0.533669923120631, score=-0.676, total=   0.0s
[CV] var_smoothing=0.533669923120631 .................................
[CV] .... var_smoothing=0.533669923120631, score=-0.676, total=   0.0s
[CV] var_smoothing=0.533669923120631 .................................
[CV] .

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    7.4s finished


In [28]:
gs_NB.best_params_

{'var_smoothing': 1.0}

In [29]:
model = GaussianNB(var_smoothing=1)

In [34]:
model.fit(X_train, y_train)

from sklearn.metrics import f1_score

print(log_loss(y_train, model.predict_proba(X_train)))
print(f1_score(y_train, model.predict(X_train)))
print()
print(log_loss(y_test, model.predict_proba(X_test)))
print(f1_score(y_test, model.predict(X_test)))

0.6777799198247245
0.6959909948913325

0.6773582493539407
0.7050408719346049
