# Machine learning

## Imports

In [1]:
import os
import sys
import warnings

import cufflinks
import numpy as np
import pandas as pd
from tqdm import tqdm

# Suppress every warning for the rest of the session. NOTE(review): this is a
# blanket filter, so any warning raised by later cells is hidden as well.
warnings.filterwarnings('ignore')

# Add the parent directory to the module search path; presumably this is what
# lets the project-local packages imported in the next cell be resolved.
sys.path.append('./..')
# Put cufflinks in offline mode (assumed to enable local plotting - confirm
# against the cufflinks documentation).
cufflinks.go_offline()

In [16]:
from Corpus.Corpus import get_corpus, filter_binary_pn, filter_corpus_small
from auxiliar.VectorizerHelper import vectorizer, vectorizerIdf, procesar_corpus
from auxiliar import parameters
from auxiliar.HtmlParser import HtmlParser

In [3]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import KFold
import copy

## Config

In [35]:
# --- Experiment configuration -------------------------------------------------
# Number of polarity classes; drives corpus filtering, estimator wrapping and
# the name of the output sub-directory.
polarity_dim = 2
# When true the IDF vectorizer is selected instead of the plain one.
idf = False
# Number of splits handed to KFold.
kfolds = 10
# Keys of the estimators that the training loop iterates over.
clasificadores = ['lr', 'ls', 'mb', 'rf']
# Human-readable class labels (not referenced by the cells visible here).
target_names = ['Neg', 'Pos']
# Path fragments used to build the location of the results pickle.
name = 'machine_learning/tweeter/base_line'
# Any value other than 2 or 3 falls back to the five-class directory.
base_dir = {2: '2-clases', 3: '3-clases'}.get(polarity_dim, '5-clases')

## Get data

In [11]:
# Load the general corpus through the project helper. The alternative
# muchocine.net corpus (HtmlParser, formerly bound to `cine`) is disabled, so
# nothing in this cell may reference `cine`.
data_corpus = get_corpus('general-corpus', 'general-corpus', 1, None)

# Reduce the corpus to the configured number of polarity classes.
if polarity_dim == 2:
    data_corpus = filter_binary_pn(data_corpus)
elif polarity_dim == 3:
    data_corpus = filter_corpus_small(data_corpus)
# Any other value (the five-class setting) keeps the corpus unfiltered. The
# previous `cine = cine.get_corpus()` branch raised NameError because `cine`
# is never defined.

used_data = pd.DataFrame(data_corpus)
# Row at which the 70/30 train/test split happens. Kept as an int because it
# is used as a slice bound later; a fractional bound would leave the boundary
# row in neither partition.
split = int(used_data.shape[0] * 0.7)

#Intentando obtener datos del archivo csv...
/home/suampa/Documentos/SentimentAnalysis/Corpus/../data/general-corpus.csv
#Datos recuperados!


## Split data

In [20]:
def apply_prepro(data):
    """Pre-process a single entry of the ``content`` column.

    Thin wrapper around the project helper ``procesar_corpus`` that pins its
    four positional flags to ``True, True, False, True``. The meaning of each
    flag is defined by that helper and is not visible in this notebook.

    Returns whatever ``procesar_corpus`` returns for ``data``.
    """
    processed = procesar_corpus(data, True, True, False, True)
    return processed


# Replace the raw text column with its pre-processed version, entry by entry.
# NOTE: this overwrites the column in place, so re-running the cell
# pre-processes already processed text.
used_data['content'] = used_data['content'].apply(apply_prepro)

In [21]:
# Split the corpus at the configured row: everything before it is training
# data, everything from it onwards is test data.
# The bound is coerced to an int first. A fractional label bound either gets
# rejected or silently leaves the boundary row in neither partition, depending
# on the pandas version.
split_at = int(split)
# `.loc` slices are inclusive of the end label, hence the `- 1`.
# NOTE(review): this is label-based and assumes a default 0..n-1 index.
train_corpus = used_data.loc[:split_at - 1, :]
test_corpus = used_data.loc[split_at:, :]

## Initialize ML

In [22]:
# Pick the vectorizer according to the `idf` switch.
if idf:
    vect = vectorizerIdf
else:
    vect = vectorizer

# Binary problems use the estimators directly; every other class count wraps
# them in a one-vs-rest scheme.
if polarity_dim == 2:
    ls = CalibratedClassifierCV(LinearSVC())
    lr = LogisticRegression(solver='lbfgs')
    mb = MultinomialNB()
    rf = RandomForestClassifier()
else:
    ls = OneVsRestClassifier(CalibratedClassifierCV(LinearSVC()))
    # Note: unlike the binary case, no solver is set explicitly here.
    lr = OneVsRestClassifier(LogisticRegression())
    mb = OneVsRestClassifier(MultinomialNB())
    rf = OneVsRestClassifier(RandomForestClassifier())

In [23]:
def _build_pipeline(step_name, estimator):
    """Return a two-step Pipeline: a private copy of `vect`, then `estimator`.

    The vectorizer is deep-copied so that each pipeline owns an independent
    instance instead of sharing the module-level `vect` object.
    """
    steps = [
        ('vect', copy.deepcopy(vect)),
        (step_name, estimator),
    ]
    return Pipeline(steps)


# One pipeline per estimator; the step name matches the estimator key.
pipeline_ls = _build_pipeline('ls', ls)
pipeline_lr = _build_pipeline('lr', lr)
pipeline_mb = _build_pipeline('mb', mb)
pipeline_rf = _build_pipeline('rf', rf)

In [24]:
# Full text-to-prediction pipelines, keyed by estimator name.
pipelines = dict(
    ls=pipeline_ls,
    lr=pipeline_lr,
    mb=pipeline_mb,
    rf=pipeline_rf,
)
# Bare estimators under the same keys; the training loop fits these on
# features that were vectorized beforehand.
pipelines_train = dict(ls=ls, lr=lr, mb=mb, rf=rf)

## Train

In [9]:
# Build the k-fold partition of the training rows.
# random_state=None means the shuffle differs on every run; the following cell
# replaces `folds` with a pickled set of folds.
kf = KFold(n_splits=kfolds, shuffle=True, random_state=None)
# `x_train` was never defined in this notebook; split over the training corpus
# instead (only its number of rows matters to KFold). The generator is
# materialised into a list because the training loop iterates the folds once
# per classifier, and a generator would be empty after the first pass.
folds = list(kf.split(train_corpus))

In [29]:
# Pre-loaded k-folds: read the stored frame and keep only its underlying
# array, overriding whatever `folds` held before.
# NOTE(review): unpickling executes arbitrary code - only load trusted files.
folds = pd.read_pickle('data/pkls/folds.pkl').values

In [26]:
# Fit the vectorizer on the training texts (the labels are passed along as
# `y`) and convert the result to a plain array for positional indexing.
train_features = vect.fit_transform(train_corpus.content, train_corpus.polarity)
x_vect = train_features.toarray()

In [27]:
# Display the fitted vectorizer's vocabulary (term -> integer index). This is a
# bare expression, so the notebook renders the whole mapping as cell output.
vect.vocabulary_

{'gracias': 4265,
 'mar': 5606,
 'off': 6318,
 'pensando': 6707,
 'regalito': 7673,
 'sinde': 8337,
 'va': 9135,
 'sgae': 8272,
 'cuando': 2267,
 'van': 9173,
 'sus': 8580,
 'corruptos': 2155,
 'intento': 4888,
 'no': 6185,
 'sacar': 8034,
 'conclusiones': 1877,
 'conozco': 1947,
 'adicto': 194,
 'drama': 2988,
 'ja': 5010,
 'suena': 8518,
 'algo': 357,
 'toca': 8821,
 'grabación': 4259,
 'especial': 3479,
 'navideño': 6112,
 'mari': 5629,
 'crismas': 2227,
 'buen': 1173,
 'día': 3041,
 'primero': 7177,
 'mandar': 5558,
 'abrazo': 40,
 'grande': 4275,
 'miguel': 5815,
 'su': 8487,
 'familia': 3779,
 'hoy': 4553,
 'podría': 6925,
 'ser': 8228,
 'grandeza': 4277,
 'humana': 4565,
 'escaño': 3414,
 'listo': 5322,
 'empezar': 3208,
 'congreso': 1926,
 'buenos': 1189,
 'días': 3043,
 'em': 3167,
 'ira': 4958,
 'puente': 7349,
 'si': 8281,
 'vais': 9145,
 'dejeis': 2516,
 'llevar': 5375,
 'tableta': 8615,
 'pc': 6643,
 'luego': 5430,
 'orbyt': 6392,
 'momento': 5923,
 'digo': 2822,
 'más': 6

In [30]:
# Cross-validate every configured classifier on the pre-vectorized features.
# `results[c]['real'][i]` / `results[c]['predicted'][i]` hold the true and the
# predicted labels of fold `i` for classifier `c`.
results = {}
# Materialise the folds: they are traversed once per classifier, which would
# silently yield nothing after the first pass if `folds` were a generator.
fold_pairs = list(folds)
polarity = train_corpus.polarity
# Size the progress bar from the actual number of folds instead of a literal 10.
with tqdm(total=len(clasificadores) * len(fold_pairs)) as pbar:
    for c in clasificadores:
        model = pipelines_train[c]
        results[c] = {'real': {}, 'predicted': {}}
        for i, (train_index, test_index) in enumerate(fold_pairs):
            train_x = x_vect[train_index]
            test_x = x_vect[test_index]
            # Fold indices are positions into `x_vect`, so the labels must be
            # selected positionally too (`.iloc`), not by index label.
            train_y = polarity.iloc[train_index]
            test_y = polarity.iloc[test_index]

            model.fit(train_x, train_y)
            predicted = model.predict(test_x)

            results[c]['real'][i] = test_y.values.tolist()
            results[c]['predicted'][i] = predicted.tolist()

            pbar.update(1)

    

100%|██████████| 40/40 [03:41<00:00, 14.72s/it]


In [40]:
# Persist the per-fold results as a pickled DataFrame under
# results/<name>/<base_dir>/results.pkl.
output_dir = os.path.join('results', name, base_dir)
# Create the target directory first; to_pickle fails if it does not exist.
os.makedirs(output_dir, exist_ok=True)
pd.DataFrame(results).to_pickle(os.path.join(output_dir, 'results.pkl'))