# Carga de BoW

In [14]:
# Importo paquetes

import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline

from scipy import sparse

from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, recall_score, accuracy_score, precision_score, roc_curve, roc_auc_score, auc

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [11]:
# Sparse bag-of-words feature matrices for the train and test splits.
X_train, X_test = (
    sparse.load_npz(npz_path)
    for npz_path in ("./Data/X_train_BoW.npz", "./Data/X_test_BoW.npz")
)
# The label files have no header row, so each one is read as a DataFrame
# whose columns are numbered from 0.
y_train, y_test = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ("./Data/y_train.csv", "./Data/y_test.csv")
)

In [12]:
# Keep only the first column (the labels) as a Series.
# Wrapping in pd.DataFrame first makes this cell idempotent: if it is re-run on an
# already-extracted Series, the Series is turned back into a one-column frame and the
# same Series comes out, instead of `[0]` picking a single scalar label out of it.
y_train = pd.DataFrame(y_train).iloc[:, 0]
y_test = pd.DataFrame(y_test).iloc[:, 0]

In [13]:
# Percentage of training labels that fall in each class.
class_counts = y_train.value_counts()
n_labels = y_train.count()
class_counts / n_labels * 100

0    97.481913
1     2.518087
Name: 0, dtype: float64

In [5]:
# Ratios passed to RandomUnderSampler as `sampling_strategy`, one resampled training set
# per value. Each value is the minority/majority ratio after resampling
# (e.g. 0.1 -> one positive per ten negatives, i.e. ~9% positives).
sampling_strategys = [0.1, 0.25, 0.5, 0.75, 0.9]

In [6]:
# Build one randomly undersampled copy of the training set per target ratio.
# Each entry is an [X, y] pair, so later cells can index it as [i][0] / [i][1].
# The list grows with `sampling_strategys` instead of assuming exactly five ratios.
under_samples = []
for ratio in sampling_strategys:
    under_sampler = RandomUnderSampler(random_state=12, sampling_strategy=ratio)
    # `fit_resample` replaces `fit_sample`, which was removed from imbalanced-learn.
    X_train_sample, y_train_sample = under_sampler.fit_resample(X_train, y_train)
    under_samples.append([X_train_sample, y_train_sample])

In [9]:
# Show the class proportions of every undersampled training set.
for sample in under_samples:
    sampled_labels = pd.Series(sample[1])
    print(sampled_labels.value_counts() / sampled_labels.count())

0    0.909091
1    0.090909
dtype: float64
0    0.8
1    0.2
dtype: float64
0    0.666667
1    0.333333
dtype: float64
0    0.571427
1    0.428573
dtype: float64
0    0.526311
1    0.473689
dtype: float64


# Modelado Bernoulli Naive Bayes

In [7]:
# Bernoulli Naive Bayes wrapped in a grid search over an empty grid: a single candidate
# (BernoulliNB with its default parameters) scored by 3-fold cross-validated ROC AUC.
# GridSearchCV takes `param_grid`; `param_distributions` is RandomizedSearchCV's keyword.
modelo_NB = GridSearchCV(
    BernoulliNB(),
    param_grid={},
    scoring='roc_auc',
    verbose=10,
    n_jobs=-1,
    cv=3,
)

In [8]:
# Fit the Naive Bayes search on each undersampled training set and record the
# ROC AUC obtained on the test set.
auc_us = []
for X_sample, y_sample in under_samples:
    modelo_NB.fit(X_sample, y_sample)
    # Column 1 of predict_proba is the probability of the positive class.
    y_probs = modelo_NB.predict_proba(X_test)[:, 1]
    fpr_nb, tpr_nb, thr_nb = roc_curve(y_test, y_probs)
    auc_us.append(auc(fpr_nb, tpr_nb))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    2.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    1.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1250s.) Setting batch_size=2.


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1093s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.1093s.) Setting batch_size=2.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:    0.0s finished


Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [9]:
# Test-set ROC AUC of Bernoulli Naive Bayes, one value per entry of `under_samples`.
auc_us

[0.8390796428193598,
 0.8400158316099,
 0.8409217855699516,
 0.8416304292422913,
 0.8423708186099134]

El AUC en test mejora levemente a medida que aumenta `sampling_strategy` (benchmark = 0.8389243670726327). Vamos a probar con Regresión Logística (el caballito de batalla).

Nota: no hacemos oversampling porque SMOTE (basado en k-NN) tarda mucho y es menos efectivo que el random undersampling en problemas de alta dimensionalidad: https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-14-106

# Modelado Logistic Regression

In [39]:
# Same minority/majority ratios that were used for the Bernoulli Naive Bayes run.
sampling_strategys = [0.1, 0.25, 0.5, 0.75, 0.9]

In [41]:
# One randomly undersampled training set per ratio in `sampling_strategys`.
# Entries are [X, y] pairs (indexed downstream as [i][0] and [i][1]); the list is
# built by appending so it always matches the number of ratios.
under_samples = []
for ratio in sampling_strategys:
    under_sampler = RandomUnderSampler(random_state=12, sampling_strategy=ratio)
    # `fit_sample` no longer exists in current imbalanced-learn; `fit_resample` is the API.
    X_train_sample, y_train_sample = under_sampler.fit_resample(X_train, y_train)
    under_samples.append([X_train_sample, y_train_sample])

In [42]:
# Class proportions of each undersampled training set, in ratio order.
for sample in under_samples:
    sampled_labels = pd.Series(sample[1])
    print(sampled_labels.value_counts() / sampled_labels.count())

0    0.909091
1    0.090909
dtype: float64
0    0.8
1    0.2
dtype: float64
0    0.666667
1    0.333333
dtype: float64
0    0.571427
1    0.428573
dtype: float64
0    0.526311
1    0.473689
dtype: float64


In [43]:
# Logistic regression tuned over the inverse regularisation strength C,
# selecting the candidate with the best 3-fold cross-validated ROC AUC.
c_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
modelo_LR = GridSearchCV(
    LogisticRegression(),
    param_grid=c_grid,
    scoring='roc_auc',
    verbose=10,
    n_jobs=-1,
    cv=3,
)

In [44]:
# Tune logistic regression on each undersampled training set, report the winning
# candidate and its cross-validation score, then record the ROC AUC on the test set.
auc_us = []
for X_sample, y_sample in under_samples:
    modelo_LR.fit(X_sample, y_sample)
    print("Mejor Modelo: ", modelo_LR.best_estimator_)
    # best_score_ is the cross-validated score on the undersampled data, not the test AUC.
    print("AUC: ", modelo_LR.best_score_)
    # Column 1 of predict_proba is the probability of the positive class.
    y_probs = modelo_LR.predict_proba(X_test)[:, 1]
    fpr_lr, tpr_lr, thr_lr = roc_curve(y_test, y_probs)
    auc_us.append(auc(fpr_lr, tpr_lr))

Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done   9 out of  21 | elapsed:    9.5s remaining:   12.7s
[Parallel(n_jobs=-1)]: Done  12 out of  21 | elapsed:   30.5s remaining:   22.8s
[Parallel(n_jobs=-1)]: Done  15 out of  21 | elapsed:   40.7s remaining:   16.2s
[Parallel(n_jobs=-1)]: Done  18 out of  21 | elapsed:   51.3s remaining:    8.5s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:   55.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:   55.5s finished


Mejor Modelo:  LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
AUC:  0.860215040758471
Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done   9 out of  21 | elapsed:    3.5s remaining:    4.7s
[Parallel(n_jobs=-1)]: Done  12 out of  21 | elapsed:   11.2s remaining:    8.4s
[Parallel(n_jobs=-1)]: Done  15 out of  21 | elapsed:   16.8s remaining:    6.7s
[Parallel(n_jobs=-1)]: Done  18 out of  21 | elapsed:   20.4s remaining:    3.3s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:   23.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:   23.9s finished


Mejor Modelo:  LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
AUC:  0.8584408477294648
Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done   9 out of  21 | elapsed:    1.9s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done  12 out of  21 | elapsed:    5.2s remaining:    3.9s
[Parallel(n_jobs=-1)]: Done  15 out of  21 | elapsed:   11.3s remaining:    4.4s
[Parallel(n_jobs=-1)]: Done  18 out of  21 | elapsed:   12.4s remaining:    2.0s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:   13.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:   13.8s finished


Mejor Modelo:  LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
AUC:  0.8550905481109973
Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   9 out of  21 | elapsed:    1.7s remaining:    2.3s
[Parallel(n_jobs=-1)]: Done  12 out of  21 | elapsed:    3.9s remaining:    2.9s
[Parallel(n_jobs=-1)]: Done  15 out of  21 | elapsed:    7.9s remaining:    3.1s
[Parallel(n_jobs=-1)]: Done  18 out of  21 | elapsed:   11.1s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:   14.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:   14.4s finished


Mejor Modelo:  LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
AUC:  0.8518424116238552
Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done   9 out of  21 | elapsed:    1.5s remaining:    2.0s
[Parallel(n_jobs=-1)]: Done  12 out of  21 | elapsed:    3.2s remaining:    2.4s
[Parallel(n_jobs=-1)]: Done  15 out of  21 | elapsed:    7.0s remaining:    2.7s
[Parallel(n_jobs=-1)]: Done  18 out of  21 | elapsed:   10.3s remaining:    1.6s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:   15.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:   15.3s finished


Mejor Modelo:  LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
AUC:  0.8496556383987012


In [46]:
# Test-set ROC AUC of the tuned logistic regression, one value per entry of `under_samples`.
auc_us

[0.8482298052226152,
 0.8469715165716131,
 0.8444672094435568,
 0.8422681556757085,
 0.8414676549141089]

## La mejora parece darse con valores bajos de `sampling_strategy`... vamos a probar con más valores

In [60]:
# Finer grid of low ratios, just above the class ratio already present in the training
# labels (about 2.52% positives vs 97.48% negatives, i.e. roughly 0.0258), so very
# little of the majority class is discarded.
sampling_strategys = [0.026, 0.027, 0.028, 0.029, 0.03]

In [61]:
# Rebuild the undersampled training sets for the current `sampling_strategys` values.
# Each element is an [X, y] pair; appending keeps the list in step with the ratio list
# rather than relying on a fixed number of pre-created slots.
under_samples = []
for ratio in sampling_strategys:
    under_sampler = RandomUnderSampler(random_state=12, sampling_strategy=ratio)
    # Use `fit_resample`: the older `fit_sample` alias was removed from imbalanced-learn.
    X_train_sample, y_train_sample = under_sampler.fit_resample(X_train, y_train)
    under_samples.append([X_train_sample, y_train_sample])

In [62]:
# Print the class proportions obtained for each of the low ratios.
for sample in under_samples:
    sampled_labels = pd.Series(sample[1])
    print(sampled_labels.value_counts() / sampled_labels.count())

0    0.974659
1    0.025341
dtype: float64
0    0.97371
1    0.02629
dtype: float64
0    0.972763
1    0.027237
dtype: float64
0    0.971817
1    0.028183
dtype: float64
0    0.970874
1    0.029126
dtype: float64


In [63]:
# Fresh logistic-regression grid search over C (3-fold CV, ROC AUC scoring).
c_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
modelo_LR = GridSearchCV(
    LogisticRegression(),
    param_grid=c_grid,
    scoring='roc_auc',
    verbose=10,
    n_jobs=-1,
    cv=3,
)

In [64]:
# Repeat the logistic-regression search on each low-ratio training set and collect
# the ROC AUC measured on the test set.
auc_us = []
for X_sample, y_sample in under_samples:
    modelo_LR.fit(X_sample, y_sample)
    print("Mejor Modelo: ", modelo_LR.best_estimator_)
    # Printed value is the cross-validated score of the best candidate, not the test AUC.
    print("AUC: ", modelo_LR.best_score_)
    # Column 1 of predict_proba is the probability of the positive class.
    y_probs = modelo_LR.predict_proba(X_test)[:, 1]
    fpr_lr, tpr_lr, thr_lr = roc_curve(y_test, y_probs)
    auc_us.append(auc(fpr_lr, tpr_lr))

Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done   9 out of  21 | elapsed:   49.2s remaining:  1.1min
[Parallel(n_jobs=-1)]: Done  12 out of  21 | elapsed:  1.9min remaining:  1.4min
[Parallel(n_jobs=-1)]: Done  15 out of  21 | elapsed:  2.3min remaining:   55.0s
[Parallel(n_jobs=-1)]: Done  18 out of  21 | elapsed:  2.7min remaining:   27.4s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:  2.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:  2.9min finished


Mejor Modelo:  LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
AUC:  0.860899812258534
Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   12.2s
[Parallel(n_jobs=-1)]: Done   9 out of  21 | elapsed:   45.5s remaining:  1.0min
[Parallel(n_jobs=-1)]: Done  12 out of  21 | elapsed:  2.0min remaining:  1.5min
[Parallel(n_jobs=-1)]: Done  15 out of  21 | elapsed:  2.3min remaining:   55.9s
[Parallel(n_jobs=-1)]: Done  18 out of  21 | elapsed:  2.7min remaining:   27.1s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:  3.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:  3.0min finished


Mejor Modelo:  LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
AUC:  0.8607186627516812
Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done   9 out of  21 | elapsed:   46.7s remaining:  1.0min
[Parallel(n_jobs=-1)]: Done  12 out of  21 | elapsed:  1.8min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done  15 out of  21 | elapsed:  2.1min remaining:   50.0s
[Parallel(n_jobs=-1)]: Done  18 out of  21 | elapsed:  2.6min remaining:   25.6s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:  2.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:  2.7min finished


Mejor Modelo:  LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
AUC:  0.8609894388771515
Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done   9 out of  21 | elapsed:   39.8s remaining:   53.1s
[Parallel(n_jobs=-1)]: Done  12 out of  21 | elapsed:  1.5min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done  15 out of  21 | elapsed:  1.9min remaining:   45.7s
[Parallel(n_jobs=-1)]: Done  18 out of  21 | elapsed:  2.3min remaining:   23.2s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:  2.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:  2.7min finished


Mejor Modelo:  LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
AUC:  0.8606565801288717
Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done   9 out of  21 | elapsed:   31.5s remaining:   42.0s
[Parallel(n_jobs=-1)]: Done  12 out of  21 | elapsed:  1.6min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done  15 out of  21 | elapsed:  1.7min remaining:   41.9s
[Parallel(n_jobs=-1)]: Done  18 out of  21 | elapsed:  2.3min remaining:   22.7s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:  2.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed:  2.4min finished


Mejor Modelo:  LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
AUC:  0.8609055421117893


In [65]:
# Test-set ROC AUC of the tuned logistic regression for each of the low ratios.
auc_us

[0.8482368844379213,
 0.8483171225195126,
 0.8483453263054614,
 0.8482289589575442,
 0.8484152839947942]

## La mejora (si es que la hay y no se debe al azar) parece ser muy marginal... No parece que valga la pena correr los modelos más complejos con undersampling