# Sentiment Analysis: Inspección de Modelos

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from util import load_datasets

# load_datasets() yields the three splits; each split is an (X, y) pair.
# The split-level names are kept bound alongside the unpacked X/y names.
train, dev, test = load_datasets()
(X_train, y_train), (X_dev, y_dev), (X_test, y_test) = train, dev, test

## Estado del Arte Actual

A partir de ahora la construcción del pipeline va en un módulo aparte:

In [4]:
from model import build_pipeline

# Build a fresh pipeline and train it on the training split.
pipeline = build_pipeline()
# fit() is the cell's last expression, so its return value is what gets displayed.
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=3,
        ngram_range=(1, 5), preprocessor=None, stop_words=None,
        strip_...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [5]:
from util import print_eval
# Evaluate the fitted pipeline on the dev split; per the output below this prints
# accuracy, a per-class precision/recall/f1 report and the confusion matrix.
print_eval(pipeline, X_dev, y_dev)

accuracy	0.89

             precision    recall  f1-score   support

        neg       0.87      0.93      0.90       162
        pos       0.91      0.84      0.87       138

avg / total       0.89      0.89      0.89       300

[[150  12]
 [ 22 116]]


## Parámetros del Modelo

Inspeccionemos el pipeline:

In [9]:
# List of (step name, estimator) pairs that make up the pipeline.
pipeline.steps

[('vect', CountVectorizer(analyzer='word', binary=True, decode_error='strict',
          dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=0.9, max_features=None, min_df=3,
          ngram_range=(1, 5), preprocessor=None, stop_words=None,
          strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
          tokenizer=None, vocabulary=None)),
 ('clf',
  LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
            intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
            penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
            verbose=0, warm_start=False))]

In [10]:
# Pull the two fitted stages out of the pipeline by their step names.
named = pipeline.named_steps
vect, clf = named['vect'], named['clf']

In [11]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2 in favour of get_feature_names_out(); use whichever this installation
# provides so the cell runs on both old and new versions.
get_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
# list() keeps `features` a plain list of str (the newer method returns an ndarray).
features = list(get_names())
# Peek at the first few n-grams only; the vocabulary is far too large to display.
features[:10]

['00',
 '00 for',
 '000',
 '000 by',
 '000 for',
 '000 in',
 '000 years',
 '007',
 '05',
 '10']

In [12]:
# Learned weights of the classifier. Per the output below the array has a single
# row with one column per vocabulary feature.
coef = clf.coef_
coef.shape

(1, 77207)

In [13]:
import pandas as pd

# Pair every n-gram with its learned weight and order the table from the most
# negative to the most positive coefficient. Built as a single chained expression
# (no inplace mutation) so re-running the cell always yields the same frame.
features_df = (
    pd.DataFrame({'name': features, 'coef': coef.ravel()})
    .sort_values('coef')
)

Veamos los coeficientes que más influyen, empezando por los más negativos:

In [14]:
# Ten most negative coefficients (features_df is sorted ascending by coef).
features_df.head(10)

Unnamed: 0,name,coef
7722,bad,-0.533431
53202,script,-0.375926
42856,nothing,-0.36321
76126,worst,-0.327793
48721,plot,-0.314007
9882,boring,-0.295886
16309,director,-0.275436
65169,then,-0.262407
58735,than,-0.25733
46185,only,-0.248938


In [19]:
# Ten most positive coefficients, i.e. the end of the ascending sort.
features_df.tail(10)

Unnamed: 0,name,coef
48001,performance,0.228033
2281,american,0.229602
48046,performances,0.239935
46849,others,0.243454
3694,and it,0.254433
71572,very,0.25557
10736,but it,0.25759
24881,great,0.263341
38768,many,0.275511
73155,well,0.304695


Interesante. ¿Qué conclusiones se pueden sacar?

## Stop Words


In [15]:
from model import build_pipeline
from util import print_eval

# NOTE: this rebinds `pipeline`. `vect`, `clf` and `features_df` from the cells
# above still refer to the baseline model trained without stop-word filtering.
pipeline = build_pipeline()
# Switch the vectorizer step to scikit-learn's built-in English stop-word list
# before fitting, then evaluate on the same dev split as the baseline.
pipeline.set_params(vect__stop_words='english')
pipeline.fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

accuracy	0.85

             precision    recall  f1-score   support

        neg       0.86      0.86      0.86       162
        pos       0.84      0.83      0.84       138

avg / total       0.85      0.85      0.85       300

[[140  22]
 [ 23 115]]


No mejora: la accuracy baja de 0.89 a 0.85. ¿Por qué? Veamos las stopwords:

In [25]:
# `pipeline` is now the stop-word variant, so this rebinds `vect` to its vectorizer.
vect = pipeline.named_steps['vect']
# Actually display the stop words the text above refers to: a sorted sample
# rather than a dump of the whole collection.
sorted(vect.get_stop_words())[:20]

Analicemos los coeficientes del modelo:

In [16]:
from analysis import coef_df
# Coefficient table for the stop-word pipeline. Judging by the outputs below it has
# `name`/`coef` columns sorted ascending by coef — confirm against analysis.coef_df.
df = coef_df(pipeline)

In [17]:
# First ten rows: the most negative coefficients, given the ordering seen in the output.
df[:10]

Unnamed: 0,name,coef
1602,bad,-0.821439
19460,script,-0.609183
16846,plot,-0.578206
24360,worst,-0.570389
2432,boring,-0.532745
21546,supposed,-0.479266
14074,maybe,-0.475238
5681,director,-0.451659
21327,stupid,-0.417517
14251,mess,-0.414792


In [20]:
# Last ten rows: the most positive coefficients, given the ordering seen in the output.
df[-10:]

Unnamed: 0,name,coef
5218,definitely,0.381941
9717,great,0.382732
16455,perfectly,0.389686
3285,change,0.395888
903,american,0.397135
16015,overall,0.401399
6909,enjoy,0.407785
16477,performances,0.452292
10444,hilarious,0.452333
14191,memorable,0.489459
