# Sentiment Analysis: Ajuste de Hiperparámetros 2

Veremos cómo entrenar un clasificador LinearSVC sobre features TF-IDF y evaluarlo con validación cruzada (`cross_val_score`).

In [None]:
%load_ext autoreload
%autoreload 2

In [2]:
import tarfile,sys
# Unpack the competition archive into the current working directory.
# The context manager closes the archive even if extraction raises.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only run this on an archive from a trusted source.
with tarfile.open('../input/review_polarity_competition.tgz') as tar:
    tar.extractall()

Extracted in Current Directory


In [None]:
#Lectura de datos de entrenamiento

In [4]:
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split

def load_datasets(data_dir='../working/review_polarity_competition/reviews_sentoken/',
                  test_size=0.1, dev_size=0.1, random_state=42):
    """Load the labelled reviews and split them into train/dev/test.

    The corpus is read with ``load_files`` (one sub-folder per class) and
    split twice with ``train_test_split``: first a test split is held out,
    then a dev split is carved out of what remains.

    Args:
        data_dir: Root folder of the labelled corpus.
        test_size: Fraction of the full corpus held out as the test split.
        dev_size: Fraction of the remaining (non-test) documents held out
            as the dev split.
        random_state: Seed used for both splits so they are repeatable.

    Returns:
        Three ``(docs, labels)`` pairs, in the order train, dev, test.
        Documents are returned exactly as ``load_files`` provides them
        (no ``encoding`` is passed, so they are not decoded here).
    """
    dataset = load_files(data_dir, shuffle=True)

    # First cut: hold out the test split from the full corpus.
    docs_traindev, docs_test, y_traindev, y_test = train_test_split(
        dataset.data, dataset.target,
        test_size=test_size, random_state=random_state)

    # Second cut: carve the dev split out of the remaining documents.
    docs_train, docs_dev, y_train, y_dev = train_test_split(
        docs_traindev, y_traindev,
        test_size=dev_size, random_state=random_state)

    train = docs_train, y_train
    dev = docs_dev, y_dev
    test = docs_test, y_test
    return train, dev, test

In [5]:
from sklearn import metrics  
def print_eval(model, X, y_true):
    """Print a full evaluation report for ``model`` on ``(X, y_true)``.

    Prints, in order: the accuracy, the per-class classification report
    (classes labelled 'neg' and 'pos') and the confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Inputs to classify.
        y_true: Gold labels aligned with ``X``.
    """
    predictions = model.predict(X)

    accuracy = metrics.accuracy_score(y_true, predictions)
    print('accuracy\t{:2.2f}\n'.format(accuracy))

    report = metrics.classification_report(
        y_true, predictions, target_names=['neg', 'pos'])
    print(report)

    print(metrics.confusion_matrix(y_true, predictions))


def eval(model, X, y_true):
    """Compute accuracy and macro-averaged F1 of ``model`` on ``(X, y_true)``.

    NOTE(review): this name shadows Python's built-in ``eval`` for the rest
    of the notebook; it is kept because other cells call it by this name.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Inputs to classify.
        y_true: Gold labels aligned with ``X``.

    Returns:
        dict with keys ``'acc'`` (accuracy) and ``'f1'`` (macro F1).
    """
    predictions = model.predict(X)
    return {
        'acc': metrics.accuracy_score(y_true, predictions),
        'f1': metrics.f1_score(y_true, predictions, average='macro'),
    }


def print_short_eval(model, X, y_true):
    """Print accuracy and macro F1 of ``model`` on one tab-separated line.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Inputs to classify.
        y_true: Gold labels aligned with ``X``.
    """
    # ``eval`` here is the notebook's own scoring helper, not the built-in.
    scores = eval(model, X, y_true)
    summary = 'accuracy\t{acc:2.2f}\tmacro f1\t{f1:2.2f}'.format(
        acc=scores['acc'], f1=scores['f1'])
    print(summary)

In [6]:
%%time

# Load the three (docs, labels) splits, then unpack each pair into the
# X_* / y_* names used by the rest of the notebook.
train, dev, test = load_datasets()
(X_train, y_train), (X_dev, y_dev), (X_test, y_test) = train, dev, test

CPU times: user 336 ms, sys: 228 ms, total: 564 ms
Wall time: 569 ms


In [7]:
# Cantidad total de datos etiquetados (train + dev + test)

In [8]:
# Total number of labelled documents across the three splits.
sum(len(docs) for docs, _ in (train, dev, test))

28070

In [9]:
#Entrenamiento del clasificador

In [10]:
%%time

from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score

# Feature extraction: binary TF-IDF over lowercased unigrams and bigrams,
# with min_df=4 / max_df=0.99 document-frequency cut-offs.
tfidf = TfidfVectorizer(
    binary=True,
    min_df=4,
    max_df=0.99,
    ngram_range=(1, 2),
    lowercase=True,
)

# Linear SVM with squared hinge loss and no intercept term; the seed is
# fixed so repeated fits are comparable.
svm = LinearSVC(
    loss='squared_hinge',
    random_state=0,
    max_iter=10000,
    fit_intercept=False,
)

pipeline = Pipeline([('vect', tfidf), ('clf', svm)])

# Mean ROC AUC over 25 cross-validation folds of the training split.
scores = cross_val_score(pipeline, X_train, y_train, cv=25, scoring='roc_auc')
print(scores.mean())

# Fit on the whole training split and report metrics on the dev split.
pipeline.fit(X_train, y_train)
print_eval(pipeline, X_dev, y_dev)

0.967747189042206
accuracy	0.92

             precision    recall  f1-score   support

        neg       0.93      0.91      0.92      1278
        pos       0.91      0.93      0.92      1249

avg / total       0.92      0.92      0.92      2527

[[1164  114]
 [  89 1160]]
CPU times: user 8min 54s, sys: 17.7 s, total: 9min 11s
Wall time: 9min 12s


In [11]:
# Casos de prueba simples

In [12]:
# Sanity check on a short positive phrase. The recorded run returned 1,
# which is 'pos' in the target_names order used by print_eval.
pipeline.predict(["good film"])

array([1])

In [13]:
# Sanity check on a negated positive phrase. The recorded run returned 0
# ('neg'), so the negation is handled for this input.
pipeline.predict(["not good film"])

array([0])

In [14]:
# Sanity check on a short negative phrase. The recorded run returned 0 ('neg').
pipeline.predict(["bad film"])

array([0])

In [15]:
# Sanity check on a negated negative phrase. NOTE: the recorded run returned
# 0 ('neg'), i.e. the model did not treat "not bad" as positive here.
pipeline.predict(["not bad film"])

array([0])

In [16]:
# Evaluación sobre los datos de test

In [17]:
%%time
# Report accuracy, the per-class report and the confusion matrix on the
# held-out test split (not used for fitting or cross-validation above).
print_eval(pipeline, X_test, y_test)

accuracy	0.91

             precision    recall  f1-score   support

        neg       0.90      0.90      0.90      1352
        pos       0.91      0.91      0.91      1455

avg / total       0.91      0.91      0.91      2807

[[1222  130]
 [ 133 1322]]
CPU times: user 1.37 s, sys: 0 ns, total: 1.37 s
Wall time: 1.42 s


In [18]:
# lectura de reviews no etiquetadas

In [19]:
import os
# Directory with the unlabelled competition reviews, relative to the
# current working directory (where the archive was extracted).
data_dir = "review_polarity_competition/test_reviews_sentoken/"

# Parallel lists: filenames[i] is the file that eval_data[i] was read from.
filenames = []
eval_data = []

# os.listdir returns entries in arbitrary order; sorting makes the document
# order (and therefore the rows of the submission file) reproducible.
for fname in sorted(os.listdir(data_dir)):
    # Decode explicitly instead of relying on the platform default encoding.
    # NOTE(review): assumes the reviews are UTF-8, which matches
    # TfidfVectorizer's default decoding of the training documents; confirm.
    with open(os.path.join(data_dir, fname), 'r', encoding='utf-8') as f:
        eval_data.append(f.read())
    filenames.append(fname)

In [20]:
# Display one unlabelled review to eyeball the raw text that will be classified.
eval_data[1]

"I mistakenly received two versions of Season 2 and both have serious defects in the v. same places . On the first disk of each version , Episode 3 can not be watched because of serious pixeling which turns the image into a mosaic really , and extremely garbled sound . Episode 4 lacks any form of intro and when an image finally does appear , one is well into the story . On the duplicate copy there 's a message on disk 3 that it can not not be played . I found a # for Amazon , finally , and will try to speak w/a person about their poor quality control for this series . Needless to say , I 'll hold off reordering any more copies of this series.I have Season 1 , and I think the entire series is being done on the cheap frankly . The packaging itself is rather poor and , as any owner knows , they squeezed an entire season using both sides of a single disk . The disks themselves have no images imprinted them ; they look like something one could buy by the dozen from any office supply store .

In [21]:
# Clasificación de reviews no etiquetadas

In [22]:
%%time
# Classify every unlabelled review with the fitted pipeline; predicted[i]
# is the label for eval_data[i] (and therefore for filenames[i]).
predicted = pipeline.predict(eval_data)
predicted

CPU times: user 192 ms, sys: 0 ns, total: 192 ms
Wall time: 255 ms


In [23]:
# Generación de matriz con documentos y predicciones

In [24]:
import numpy as np
# Build an (n_documents, 2) string matrix: column 0 is the file name and
# column 1 is the predicted label.
# Labels are converted to strings explicitly so both rows have the same
# type, rather than relying on NumPy's implicit coercion of mixed
# str/int input.
labels_as_text = [str(label) for label in predicted]
matrix = np.array([filenames, labels_as_text]).transpose()
# matrix

In [25]:
# Guardo matriz en archivo csv para subir a kaggle

In [28]:
import pandas as pd 
# Two-column submission table: file name as Id, predicted label as Category.
df = pd.DataFrame(matrix, columns=['Id', 'Category'])

# Write without the index so the CSV holds only the Id and Category columns.
df.to_csv("30__.csv", index=False)