# Machine learning Grid Search

## Imports

In [1]:
import sys
import cufflinks
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
import pickle

warnings.filterwarnings('ignore')

sys.path.append('..')
cufflinks.go_offline()

In [2]:
from Corpus.Corpus import get_corpus, filter_binary_pn, filter_corpus_small
from auxiliar.VectorizerHelper import vectorizer, vectorizerIdf, procesar_corpus
from auxiliar import parameters
from auxiliar.HtmlParser import HtmlParser

In [3]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import KFold
import copy

## Config

In [4]:
polarity_dim = 3
clasificadores=['lr', 'ls', 'mb', 'rf']
idf = False
target_names=['Neg', 'Pos']
kfolds = 10
base_dir = '2-clases' if polarity_dim == 2 else ('3-clases' if polarity_dim == 3 else '5-clases')
name = 'machine_learning/tweeter/grid_search'

## Get data

In [5]:
# cine = HtmlParser(200, "http://www.muchocine.net/criticas_ultimas.php", 1)
data_corpus = get_corpus('general-corpus', 'general-corpus', 1, None)

if polarity_dim == 2:
    data_corpus = filter_binary_pn(data_corpus)
#     cine = filter_binary_pn(cine.get_corpus())
elif polarity_dim == 3:
    data_corpus = filter_corpus_small(data_corpus)
#     cine = filter_corpus_small(cine.get_corpus())
elif polarity_dim == 5:
    cine = cine.get_corpus()
# used_data = cine[:5000]
used_data = pd.DataFrame(data_corpus)

#Intentando obtener datos del archivo csv...
/home/suampa/Documentos/SentimentAnalysis/Corpus/../data/general-corpus.csv
#Datos recuperados!


## Split data

In [6]:
def apply_prepro(data):
    return procesar_corpus(data, True, True, False, True)
used_data.content = used_data.content.apply(apply_prepro)

In [7]:
split = used_data.shape[0] * 0.7
train_corpus = used_data.loc[:split - 1 , :]
test_corpus = used_data.loc[split:, :]

## Initialize ML

In [8]:
vect = vectorizerIdf if idf else vectorizer
ls = CalibratedClassifierCV(LinearSVC()) if polarity_dim == 2 else OneVsRestClassifier(CalibratedClassifierCV(LinearSVC()))
lr = LogisticRegression(solver='lbfgs') if polarity_dim == 2 else OneVsRestClassifier(LogisticRegression())
mb = MultinomialNB() if polarity_dim == 2 else OneVsRestClassifier(MultinomialNB())
rf = RandomForestClassifier() if polarity_dim == 2 else OneVsRestClassifier(RandomForestClassifier())

In [9]:
pipeline_ls = Pipeline([
    ('vect', copy.deepcopy(vect)),
    ('ls', ls)
])
pipeline_lr = Pipeline([
    ('vect', copy.deepcopy(vect)),
    ('lr', lr)
])
pipeline_mb = Pipeline([
    ('vect', copy.deepcopy(vect)),
    ('mb', mb)
])
pipeline_rf = Pipeline([
    ('vect', copy.deepcopy(vect)),
    ('rf', rf)
])

In [10]:
pipelines = {
    'ls': pipeline_ls,
    'lr': pipeline_lr,
    'mb': pipeline_mb,
    'rf': pipeline_rf
}
pipelines_train = {
    'ls': ls,
    'lr': lr,
    'mb': mb,
    'rf': rf
}

In [11]:
params = parameters.parameters_bin if polarity_dim == 2 else parameters.parameters

## Train

In [12]:
kf = KFold(n_splits=kfolds, shuffle=True, random_state=None) # realización de k-foldskf = KFold(n_splits=kfolds, shuffle=True, random_state=None) # realización de k-folds
folds = kf.split(x_train)

NameError: name 'x_train' is not defined

In [12]:
folds = pd.read_pickle('folds.pkl') # k-folds precargados
folds = folds.values

In [13]:
results = {}
grids = {}
with tqdm(total=len(clasificadores) * 10) as pbar:
    for c in clasificadores:
        results[c] = { 'real': {}, 'predicted': {} }
        i = 0
        params[c].update(parameters.vect_params)
        param_grid = params[c]
        grid_search = GridSearchCV(pipelines[c], param_grid, verbose=2, scoring='accuracy', refit=True, cv=3)
        grid = grid_search.fit(train_corpus.content, train_corpus.polarity)
        grids[c] = grid
        best_parameters = grid.best_params_
        train_params = {}
        for param_name in sorted(parameters.vect_params.keys()):
            train_params.update({param_name[6:]: best_parameters[param_name]})
        vect.set_params(**train_params)
        x_vect = vect.fit_transform(train_corpus.content, train_corpus.polarity).toarray()
        for train_index, test_index in folds:
            train_x = x_vect[train_index]
            train_y = train_corpus.polarity[train_index]
            test_x = x_vect[test_index]
            test_y = train_corpus.polarity[test_index]

            pipelines_train[c].fit(train_x, train_y)

            predicted = pipelines_train[c].predict(test_x)
            
            results[c]['real'][i] = test_y.values.tolist()
            results[c]['predicted'][i] = predicted.tolist()
            i = i + 1

            pbar.update(1)

    

  0%|          | 0/40 [00:00<?, ?it/s][Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] lr__estimator__C=1.0, lr__estimator__fit_intercept=True, lr__estimator__max_iter=100, lr__estimator__solver=liblinear, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  lr__estimator__C=1.0, lr__estimator__fit_intercept=True, lr__estimator__max_iter=100, lr__estimator__solver=liblinear, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   1.0s
[CV] lr__estimator__C=1.0, lr__estimator__fit_intercept=True, lr__estimator__max_iter=100, lr__estimator__solver=liblinear, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV]  lr__estimator__C=1.0, lr__estimator__fit_intercept=True, lr__estimator__max_iter=100, lr__estimator__solver=liblinear, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   0.8s
[CV] lr__estimator__C=1.0, lr__estimator__fit_intercept=True, lr__estimator__max_iter=100, lr__estimator__solver=liblinear, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  lr__estimator__C=1.0, lr__estimator__fit_intercept=True, lr__estimator__max_iter=100, lr__estimator__solver=liblinear, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   0.8s
[CV] lr__estimator__C=1.0, lr__estimator__fit_intercept=True, lr__estimator__max_iter=100, lr__estimator__solver=lbfgs, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  lr__estimator__C=1.0, lr__estimator__fit_intercept=True, lr__estimator__max_iter=100, lr__estimator__solver=lbfgs, vect__max_df=0.5,

[CV]  lr__estimator__C=1.0, lr__estimator__fit_intercept=False, lr__estimator__max_iter=200, lr__estimator__solver=lbfgs, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   0.9s
[CV] lr__estimator__C=1.0, lr__estimator__fit_intercept=False, lr__estimator__max_iter=200, lr__estimator__solver=lbfgs, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  lr__estimator__C=1.0, lr__estimator__fit_intercept=False, lr__estimator__max_iter=200, lr__estimator__solver=lbfgs, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   0.9s
[CV] lr__estimator__C=1.0, lr__estimator__fit_intercept=False, lr__estimator__max_iter=200, lr__estimator__solver=lbfgs, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  lr__estimator__C=1.0, lr__estimator__fit_intercept=False, lr__estimator__max_iter=200, lr__estimator__solver=lbfgs, vect__max_df=0.5, vect__

[CV]  lr__estimator__C=0.5, lr__estimator__fit_intercept=False, lr__estimator__max_iter=100, lr__estimator__solver=lbfgs, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   1.3s
[CV] lr__estimator__C=0.5, lr__estimator__fit_intercept=False, lr__estimator__max_iter=200, lr__estimator__solver=liblinear, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  lr__estimator__C=0.5, lr__estimator__fit_intercept=False, lr__estimator__max_iter=200, lr__estimator__solver=liblinear, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   1.3s
[CV] lr__estimator__C=0.5, lr__estimator__fit_intercept=False, lr__estimator__max_iter=200, lr__estimator__solver=liblinear, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  lr__estimator__C=0.5, lr__estimator__fit_intercept=False, lr__estimator__max_iter=200, lr__estimator__solver=liblinear, vect__ma

[CV]  lr__estimator__C=1.5, lr__estimator__fit_intercept=False, lr__estimator__max_iter=100, lr__estimator__solver=liblinear, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   3.6s
[CV] lr__estimator__C=1.5, lr__estimator__fit_intercept=False, lr__estimator__max_iter=100, lr__estimator__solver=liblinear, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  lr__estimator__C=1.5, lr__estimator__fit_intercept=False, lr__estimator__max_iter=100, lr__estimator__solver=liblinear, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   5.4s
[CV] lr__estimator__C=1.5, lr__estimator__fit_intercept=False, lr__estimator__max_iter=100, lr__estimator__solver=lbfgs, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  lr__estimator__C=1.5, lr__estimator__fit_intercept=False, lr__estimator__max_iter=100, lr__estimator__solver=lbfgs, vect__max_df

[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:  3.1min finished
 25%|██▌       | 10/40 [03:22<04:10,  8.37s/it]  [Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] ls__estimator__base_estimator__C=1.0, ls__estimator__base_estimator__loss=hinge, ls__estimator__base_estimator__max_iter=1, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  ls__estimator__base_estimator__C=1.0, ls__estimator__base_estimator__loss=hinge, ls__estimator__base_estimator__max_iter=1, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   6.6s
[CV] ls__estimator__base_estimator__C=1.0, ls__estimator__base_estimator__loss=hinge, ls__estimator__base_estimator__max_iter=1, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.6s remaining:    0.0s


[CV]  ls__estimator__base_estimator__C=1.0, ls__estimator__base_estimator__loss=hinge, ls__estimator__base_estimator__max_iter=1, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   6.7s
[CV] ls__estimator__base_estimator__C=1.0, ls__estimator__base_estimator__loss=hinge, ls__estimator__base_estimator__max_iter=1, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  ls__estimator__base_estimator__C=1.0, ls__estimator__base_estimator__loss=hinge, ls__estimator__base_estimator__max_iter=1, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   2.7s
[CV] ls__estimator__base_estimator__C=1.0, ls__estimator__base_estimator__loss=hinge, ls__estimator__base_estimator__max_iter=100, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  ls__estimator__base_estimator__C=1.0, ls__estimator__base_estimator__loss=hinge, ls__estimator__base_estim

[CV]  ls__estimator__base_estimator__C=0.5, ls__estimator__base_estimator__loss=squared_hinge, ls__estimator__base_estimator__max_iter=1, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   1.0s
[CV] ls__estimator__base_estimator__C=0.5, ls__estimator__base_estimator__loss=squared_hinge, ls__estimator__base_estimator__max_iter=100, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  ls__estimator__base_estimator__C=0.5, ls__estimator__base_estimator__loss=squared_hinge, ls__estimator__base_estimator__max_iter=100, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   1.1s
[CV] ls__estimator__base_estimator__C=0.5, ls__estimator__base_estimator__loss=squared_hinge, ls__estimator__base_estimator__max_iter=100, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  ls__estimator__base_estimator__C=0.5, ls__estimator__base_estimator__l

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  1.6min finished
 50%|█████     | 20/40 [05:30<01:04,  3.22s/it][Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] mb__estimator__alpha=1.0, mb__estimator__fit_prior=True, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  mb__estimator__alpha=1.0, mb__estimator__fit_prior=True, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   6.4s
[CV] mb__estimator__alpha=1.0, mb__estimator__fit_prior=True, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.4s remaining:    0.0s


[CV]  mb__estimator__alpha=1.0, mb__estimator__fit_prior=True, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   6.3s
[CV] mb__estimator__alpha=1.0, mb__estimator__fit_prior=True, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  mb__estimator__alpha=1.0, mb__estimator__fit_prior=True, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   1.5s
[CV] mb__estimator__alpha=1.0, mb__estimator__fit_prior=False, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  mb__estimator__alpha=1.0, mb__estimator__fit_prior=False, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   0.8s
[CV] mb__estimator__alpha=1.0, mb__estimator__fit_prior=False, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  mb__estimator__alpha=1.0, mb__estimator__fit_prior=False, vect__

[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:   27.6s finished
 65%|██████▌   | 26/40 [05:59<01:49,  7.79s/it][Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 16 candidates, totalling 48 fits
[CV] rf__estimator__criterion=gini, rf__estimator__max_depth=10, rf__estimator__max_features=auto, rf__estimator__min_samples_split=2, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  rf__estimator__criterion=gini, rf__estimator__max_depth=10, rf__estimator__max_features=auto, rf__estimator__min_samples_split=2, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   1.0s
[CV] rf__estimator__criterion=gini, rf__estimator__max_depth=10, rf__estimator__max_features=auto, rf__estimator__min_samples_split=2, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV]  rf__estimator__criterion=gini, rf__estimator__max_depth=10, rf__estimator__max_features=auto, rf__estimator__min_samples_split=2, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   1.0s
[CV] rf__estimator__criterion=gini, rf__estimator__max_depth=10, rf__estimator__max_features=auto, rf__estimator__min_samples_split=2, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  rf__estimator__criterion=gini, rf__estimator__max_depth=10, rf__estimator__max_features=auto, rf__estimator__min_samples_split=2, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   1.0s
[CV] rf__estimator__criterion=gini, rf__estimator__max_depth=10, rf__estimator__max_features=auto, rf__estimator__min_samples_split=10, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  rf__estimator__criterion=gini, rf__estimator__max_depth=10, rf__estimator__max_feat

 75%|███████▌  | 30/40 [06:10<01:17,  7.79s/it]

[CV]  rf__estimator__criterion=gini, rf__estimator__max_depth=10, rf__estimator__max_features=10, rf__estimator__min_samples_split=2, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   2.5s
[CV] rf__estimator__criterion=gini, rf__estimator__max_depth=10, rf__estimator__max_features=10, rf__estimator__min_samples_split=10, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  rf__estimator__criterion=gini, rf__estimator__max_depth=10, rf__estimator__max_features=10, rf__estimator__min_samples_split=10, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   3.5s
[CV] rf__estimator__criterion=gini, rf__estimator__max_depth=10, rf__estimator__max_features=10, rf__estimator__min_samples_split=10, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  rf__estimator__criterion=gini, rf__estimator__max_depth=10, rf__estimator__max_features=1

[CV]  rf__estimator__criterion=entropy, rf__estimator__max_depth=10, rf__estimator__max_features=auto, rf__estimator__min_samples_split=2, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   1.0s
[CV] rf__estimator__criterion=entropy, rf__estimator__max_depth=10, rf__estimator__max_features=auto, rf__estimator__min_samples_split=10, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  rf__estimator__criterion=entropy, rf__estimator__max_depth=10, rf__estimator__max_features=auto, rf__estimator__min_samples_split=10, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   1.0s
[CV] rf__estimator__criterion=entropy, rf__estimator__max_depth=10, rf__estimator__max_features=auto, rf__estimator__min_samples_split=10, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  rf__estimator__criterion=entropy, rf__estimator__max_depth=10, rf__es

[CV]  rf__estimator__criterion=entropy, rf__estimator__max_depth=None, rf__estimator__max_features=10, rf__estimator__min_samples_split=2, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   1.2s
[CV] rf__estimator__criterion=entropy, rf__estimator__max_depth=None, rf__estimator__max_features=10, rf__estimator__min_samples_split=10, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  rf__estimator__criterion=entropy, rf__estimator__max_depth=None, rf__estimator__max_features=10, rf__estimator__min_samples_split=10, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1), total=   1.5s
[CV] rf__estimator__criterion=entropy, rf__estimator__max_depth=None, rf__estimator__max_features=10, rf__estimator__min_samples_split=10, vect__max_df=0.5, vect__max_features=500, vect__min_df=10, vect__ngram_range=(1, 1) 
[CV]  rf__estimator__criterion=entropy, rf__estimator__max_depth=None, rf__

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:  2.2min finished
100%|██████████| 40/40 [08:52<00:00,  3.51s/it]


In [14]:
results

{'lr': {'real': {0: [3,
    3,
    3,
    3,
    3,
    1,
    3,
    3,
    3,
    1,
    3,
    3,
    3,
    1,
    1,
    3,
    3,
    3,
    1,
    3,
    1,
    1,
    3,
    3,
    3,
    3,
    1,
    1,
    3,
    3,
    3,
    3,
    1,
    3,
    3,
    1,
    1,
    1,
    3,
    2,
    3,
    1,
    1,
    3,
    3,
    3,
    1,
    3,
    1,
    3,
    1,
    1,
    1,
    3,
    3,
    1,
    3,
    3,
    1,
    3,
    3,
    3,
    3,
    1,
    1,
    3,
    1,
    1,
    1,
    1,
    3,
    3,
    1,
    3,
    1,
    3,
    2,
    3,
    3,
    3,
    3,
    3,
    1,
    1,
    3,
    3,
    1,
    3,
    3,
    2,
    3,
    3,
    1,
    1,
    3,
    1,
    3,
    2,
    1,
    1,
    3,
    3,
    1,
    1,
    1,
    1,
    1,
    1,
    3,
    3,
    3,
    1,
    3,
    1,
    1,
    3,
    3,
    3,
    3,
    1,
    3,
    1,
    3,
    3,
    3,
    3,
    1,
    3,
    1,
    1,
    1,
    1,
    1,
    3,
    3,
    3,
    1,
    3,
    3,
    1,
   

In [15]:
pd.DataFrame(results).to_pickle('results/'+name+'/'+base_dir+'/results.pkl')

In [16]:
with open('results/'+name+'/'+base_dir+'/grid_results.pkl', 'wb') as fp:
    pickle.dump(grid, fp)

In [17]:
test_results = {}
with tqdm(total=len(clasificadores)) as pbar:
    for c in clasificadores:
        test_results[c] = { 'real': {}, 'predicted': {} }
        i = 0
        grid = grids[c]
        best_parameters = grid.best_params_
        train_params = {}
        for param_name in sorted(parameters.vect_params.keys()):
            train_params.update({param_name[6:]: best_parameters[param_name]})
        vect.set_params(**train_params)
        vect.fit(data_corpus.content, data_corpus.polarity)
        x_vect = vect.transform(train_corpus.content).toarray()
        x_vect_test = vect.transform(test_corpus.content).toarray()
        train_x = x_vect
        train_y = train_corpus.polarity
        test_x = x_vect_test
        test_y = test_corpus.polarity

        pipelines_train[c].fit(train_x, train_y)

        predicted = pipelines_train[c].predict(test_x)

        test_results[c]['real'][i] = test_y.values.tolist()
        test_results[c]['predicted'][i] = predicted.tolist()
        i = i + 1

        pbar.update(1)

100%|██████████| 4/4 [00:46<00:00, 11.14s/it]


In [18]:
pd.DataFrame(test_results).to_pickle('results/'+name+'/'+base_dir+'/test_results.pkl')