# Sentiment Analysis: Ajuste de Hiperparámetros

In [1]:
# Auto-reload imported modules before each cell runs, so edits to local
# modules such as util.py take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from util import load_datasets

# Fetch the three splits once; the raw pairs stay available under their
# split names (train, dev, test).
train, dev, test = load_datasets()

# Each split is a two-element pair, unpacked here into inputs (X) and labels (y).
(X_train, y_train), (X_dev, y_dev), (X_test, y_test) = train, dev, test

## Estado del Arte Actual

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from util import print_eval

# Baseline model: binary (presence/absence) bag-of-words features feeding a
# logistic regression with a fixed seed. fit() returns the pipeline itself,
# so `pipeline` ends up bound to the fitted estimator.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(binary=True)),
    ('clf', LogisticRegression(random_state=0)),
]).fit(X_train, y_train)

# Report the evaluation on the development split.
print_eval(pipeline, X_dev, y_dev)

accuracy	0.85

             precision    recall  f1-score   support

        neg       0.85      0.88      0.86       162
        pos       0.85      0.82      0.83       138

avg / total       0.85      0.85      0.85       300

[[142  20]
 [ 25 113]]


## Vectorizador

Primero hagamos un estudio superficial para ver qué parámetros vale la pena analizar.

### Rango de n-gramas

In [4]:
# Same model as the baseline, except that the vectorizer extracts both
# unigrams and bigrams (ngram_range=(1, 2)).
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(binary=True, ngram_range=(1, 2))),
    ('clf', LogisticRegression(random_state=0)),
]).fit(X_train, y_train)

# Report the evaluation on the development split.
print_eval(pipeline, X_dev, y_dev)

accuracy	0.87

             precision    recall  f1-score   support

        neg       0.87      0.88      0.88       162
        pos       0.86      0.85      0.85       138

avg / total       0.87      0.87      0.87       300

[[143  19]
 [ 21 117]]


### Min Frequency

In [24]:
# Minimum document frequency experiment.
# NOTE(review): min_df is disabled below, so this cell currently fits exactly
# the baseline configuration; re-enable the line to measure its effect.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(
        binary=True,
        # min_df=5,
    )),
    ('clf', LogisticRegression(random_state=0)),
]).fit(X_train, y_train)

# Report the evaluation on the development split.
print_eval(pipeline, X_dev, y_dev)

accuracy	0.85

             precision    recall  f1-score   support

        neg       0.85      0.88      0.86       162
        pos       0.85      0.82      0.83       138

avg / total       0.85      0.85      0.85       300

[[142  20]
 [ 25 113]]


### Max Frequency

In [26]:
# Maximum document frequency experiment: with max_df=0.7 the vectorizer
# ignores terms that occur in more than 70% of the training documents.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(binary=True, max_df=0.7)),
    ('clf', LogisticRegression(random_state=0)),
]).fit(X_train, y_train)

# Report the evaluation on the development split.
print_eval(pipeline, X_dev, y_dev)

accuracy	0.85

             precision    recall  f1-score   support

        neg       0.85      0.86      0.86       162
        pos       0.84      0.83      0.83       138

avg / total       0.85      0.85      0.85       300

[[140  22]
 [ 24 114]]


### Stop words

In [92]:
# Stop-word experiment: drop scikit-learn's built-in English stop words
# before building the binary bag-of-words features.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(binary=True, stop_words='english')),
    ('clf', LogisticRegression(random_state=0)),
]).fit(X_train, y_train)

# Report the evaluation on the development split.
print_eval(pipeline, X_dev, y_dev)

accuracy	0.84

             precision    recall  f1-score   support

        neg       0.85      0.86      0.86       162
        pos       0.83      0.83      0.83       138

avg / total       0.84      0.84      0.84       300

[[139  23]
 [ 24 114]]


### Grid-Search en Development

Probemos muchas de las combinaciones posibles de valores.

In [90]:
from sklearn.model_selection import ParameterGrid

# Search space for the grid search. Keys follow the Pipeline convention
# "<step name>__<parameter>"; single-element lists pin a parameter to one value.
param_grid = {
    'vect__binary': [True],
    # n-gram ranges from unigrams only, (1, 1), up to 1- to 5-grams, (1, 5).
    'vect__ngram_range': [(1, upper) for upper in range(1, 6)],
    'vect__min_df': [1, 3, 5, 7],
    'vect__max_df': [0.95, 0.9, 0.7],
    'clf__random_state': [0],
}

# Materialize the grid as a list with one parameter dict per combination.
params_list = list(ParameterGrid(param_grid))

In [91]:
# Number of parameter combinations to evaluate.
# Only the last expression of a cell is displayed, so a bare `params_list[10]`
# placed before this line shows nothing; use `display(params_list[10])` to
# peek at a single combination.
len(params_list)

60

In [87]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics
# NOTE: this binds the name `eval` to util's helper, shadowing the builtin
# eval() in the notebook namespace from here on.
from util import eval

# A single pipeline object is reused for the whole search; set_params()
# reconfigures it in place for each grid point. LogisticRegression is
# already in scope from an earlier cell's import.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', LogisticRegression()),
])

results = []
for params in params_list:
    # TODO: add progress bar!
    # set_params() returns the pipeline, so configuring and fitting can be chained.
    pipeline.set_params(**params).fit(X_train, y_train)
    result = eval(pipeline, X_dev, y_dev)

    # One record per configuration: the evaluation results followed by the
    # parameters that produced them.
    results.append(dict(result, **params))

In [88]:
import pandas as pd

# One row per evaluated configuration.
results_df = pd.DataFrame(results)

# Show the ten best configurations, ranked by accuracy and then by F1.
results_df.sort_values(by=['acc', 'f1'], ascending=False).head(10)

Unnamed: 0,acc,f1,vect__binary,vect__max_df,vect__min_df,vect__ngram_range
9,0.886667,0.885192,True,0.95,3,"(1, 5)"
29,0.886667,0.885192,True,0.9,3,"(1, 5)"
8,0.883333,0.881904,True,0.95,3,"(1, 4)"
28,0.883333,0.881904,True,0.9,3,"(1, 4)"
4,0.88,0.878788,True,0.95,1,"(1, 5)"
24,0.88,0.878788,True,0.9,1,"(1, 5)"
7,0.88,0.878619,True,0.95,3,"(1, 3)"
13,0.88,0.878619,True,0.95,5,"(1, 4)"
14,0.88,0.878619,True,0.95,5,"(1, 5)"
27,0.88,0.878619,True,0.9,3,"(1, 3)"


¡Excelente!

Conclusiones:
1. Tenemos dos mejores configuraciones empatadas, que solo difieren en `max_df` (0.95 y 0.9).
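2. Para próximas búsquedas podemos descartar algunos valores: por ejemplo, `max_df=0.7`, `min_df=7` y los rangos de n-gramas `(1, 1)` y `(1, 2)` no aparecen entre las 10 mejores configuraciones.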

Elegimos la siguiente configuración:

In [83]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from util import print_eval

# Configuration selected from the grid search: binary features over 1- to
# 5-grams, keeping terms that appear in at least 3 documents and in at most
# 90% of them.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(binary=True, min_df=3, max_df=0.90, ngram_range=(1, 5))),
    ('clf', LogisticRegression(random_state=0)),
]).fit(X_train, y_train)

# Report the evaluation on the development split.
print_eval(pipeline, X_dev, y_dev)

accuracy	0.89

             precision    recall  f1-score   support

        neg       0.87      0.93      0.90       162
        pos       0.91      0.84      0.87       138

avg / total       0.89      0.89      0.89       300

[[150  12]
 [ 22 116]]


Evaluamos en test y guardamos el modelo:

In [89]:
from util import save_model

# Evaluate the fitted pipeline on the test split.
print_eval(pipeline, X_test, y_test)

# Save the fitted pipeline under a dated model name.
save_model(pipeline, '2018-07-27_count_logreg')

accuracy	0.84

             precision    recall  f1-score   support

        neg       0.84      0.86      0.85       257
        pos       0.84      0.82      0.83       243

avg / total       0.84      0.84      0.84       500

[[220  37]
 [ 43 200]]


### Grid-Search con Cross Validation

**¡Ejercicio!**

En lugar de hacer la búsqueda sobre dev, hacer 5-fold cross validation sobre la unión de train y dev.

## Clasificador

**¡Ejercicio!**

Ajustar hiperparámetros de la regresión logística.
