# Machine learning

## Imports

In [1]:
import os
import sys
import warnings

import cufflinks
import numpy as np
import pandas as pd
from tqdm import tqdm

# Silence all Python warnings so they do not clutter the cell outputs.
warnings.filterwarnings('ignore')

# Put the parent directory on the import path so the project packages imported
# in the following cells can be resolved. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if './..' not in sys.path:
    sys.path.append('./..')

# Switch cufflinks/plotly to offline mode so `.iplot()` renders inside the notebook.
cufflinks.go_offline()

In [2]:
from Corpus.Corpus import get_corpus, filter_binary_pn, filter_corpus_small
from auxiliar.VectorizerHelper import vectorizer, vectorizerIdf, preprocessor
from auxiliar import parameters
from auxiliar.HtmlParser import HtmlParser

In [3]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import KFold
import copy

## Config

In [4]:
# Number of polarity classes used in this run; the data-loading cell handles 2, 3 and 5.
polarity_dim = 5

# Keys of the classifiers to evaluate (they index the estimator dictionaries built later).
clasificadores = ['lr', 'ls', 'mb', 'rf']

# When True the IDF-weighted vectorizer is selected instead of the plain one.
idf = False

# Class display names. NOTE(review): only two names although polarity_dim is 5;
# not used in the visible cells -- confirm before relying on it.
target_names = ['Neg', 'Pos']

# Intended number of CV folds. NOTE(review): the folds are actually read from a
# pickle later on, so this value is not consulted in the visible cells.
kfolds = 10

# Output sub-directory per class setting; anything other than 2 or 3 maps to '5-clases'.
base_dir = {2: '2-clases', 3: '3-clases'}.get(polarity_dim, '5-clases')

# Experiment path below the results directory.
name = 'machine_learning/tweeter/base_line'

## Get data

In [5]:
# Source of the "cine" corpus (presumably movie reviews from muchocine.net; the
# meaning of the numeric arguments is defined by the project's HtmlParser -- TODO confirm).
cine = HtmlParser(200, "http://www.muchocine.net/criticas_ultimas.php", 1)
# NOTE(review): the general corpus is loaded (and filtered) here but discarded
# at the end of this cell; only `cine` is used afterwards.
data_corpus = get_corpus('general-corpus', 'general-corpus', 1, None)

# Reduce both corpora to the configured number of polarity classes.
if polarity_dim == 2:
    data_corpus = filter_binary_pn(data_corpus)
    cine = filter_binary_pn(cine.get_corpus())
elif polarity_dim == 3:
    data_corpus = filter_corpus_small(data_corpus)
    cine = filter_corpus_small(cine.get_corpus())
elif polarity_dim == 5:
    cine = cine.get_corpus()

# Keep at most the first 5000 rows.
cine = cine[:5000]
used_data = cine

# Boundary between the training part (first 70%) and the test part. It must be
# an integer: a fractional boundary used as a slice bound would leave the row
# straddling it out of both the training and the test slice.
split = int(used_data.shape[0] * 0.7)

# Drop the reference to the general corpus so its memory can be reclaimed.
data_corpus = None

#Intentando obtener datos del archivo csv...
./../Corpus/../data/general-corpus.csv
#Datos recuperados!


## Split data

In [6]:
# Count the rows per polarity value and draw the class distribution as a bar chart.
polarity_counts = (
    used_data
    .reset_index()
    .groupby('polarity')
    .agg({'index': 'count'})
)
polarity_counts.iplot(kind='bar')

In [7]:
# Positional train/test split at the `split` boundary.
# `.iloc` with an integer boundary is used instead of label-based `.loc`:
#   * a fractional `split` would make `.loc[:split - 1]` / `.loc[split:]` skip
#     the row that falls between the two bounds;
#   * label slicing only matches row positions when the index is 0..n-1, while
#     the vectorised matrices built later are addressed by position.
split_pos = int(split)
train_corpus = used_data.iloc[:split_pos, :]
test_corpus = used_data.iloc[split_pos:, :]

## Initialize ML

In [8]:
# Choose the vectorizer according to the `idf` switch from the config cell.
vect = vectorizerIdf if idf else vectorizer

# Binary runs use the bare estimators; every other class setting wraps each
# estimator in a one-vs-rest scheme. LinearSVC is always wrapped in
# CalibratedClassifierCV.
if polarity_dim == 2:
    ls = CalibratedClassifierCV(LinearSVC())
    lr = LogisticRegression(solver='lbfgs')
    mb = MultinomialNB()
    rf = RandomForestClassifier()
else:
    ls = OneVsRestClassifier(CalibratedClassifierCV(LinearSVC()))
    lr = OneVsRestClassifier(LogisticRegression())
    mb = OneVsRestClassifier(MultinomialNB())
    rf = OneVsRestClassifier(RandomForestClassifier())

In [9]:
def _make_pipeline(step_name, estimator):
    """Return a two-step Pipeline: an independent copy of `vect`, then `estimator`.

    The vectorizer is deep-copied so that each pipeline owns its own instance
    and fitting one pipeline does not affect the others.
    """
    return Pipeline([
        ('vect', copy.deepcopy(vect)),
        (step_name, estimator),
    ])


# One text-to-prediction pipeline per classifier key.
pipeline_ls = _make_pipeline('ls', ls)
pipeline_lr = _make_pipeline('lr', lr)
pipeline_mb = _make_pipeline('mb', mb)
pipeline_rf = _make_pipeline('rf', rf)

In [10]:
# Full pipelines (vectorizer + estimator), addressable by classifier key.
pipelines = dict(
    ls=pipeline_ls,
    lr=pipeline_lr,
    mb=pipeline_mb,
    rf=pipeline_rf,
)

# Bare estimators, addressable by the same keys; these are the objects fitted
# in the training loop on the pre-vectorised matrices.
pipelines_train = dict(
    ls=ls,
    lr=lr,
    mb=mb,
    rf=rf,
)

## Train

In [11]:
# Preloaded k-folds: read the pickled fold table and keep its underlying array.
# Each entry is unpacked later as a (train indices, test indices) pair.
# NOTE: unpickling executes arbitrary code -- only load files from a trusted source.
folds = pd.read_pickle('../data/pkls/folds.pkl').values

In [12]:
# Fit the vectorizer on the training documents and transform them in one pass,
# then convert the result to a dense array so it can be row-indexed per fold.
train_matrix = vect.fit_transform(train_corpus.content, train_corpus.polarity)
x_vect = train_matrix.toarray()

In [13]:
# Project the whole cine corpus onto the vocabulary fitted on the training
# documents, then convert it to a dense array.
# NOTE(review): `train_corpus` is sliced from `used_data`, which is `cine`, so
# this matrix also contains the training rows -- confirm that is intended.
cine_matrix = vect.transform(cine.content)
cine_vect = cine_matrix.toarray()

In [14]:
# Display the vocabulary mapping held by the fitted vectorizer (term -> integer id).
# NOTE(review): this prints the entire mapping, which is very large; consider
# showing only its size or a small sample to keep the notebook readable.
vect.vocabulary_

{'El': 6278,
 'saqueo': 61430,
 'obras': 52240,
 'arte': 24731,
 'tiempos': 64820,
 'guerra': 43360,
 'siempre': 62129,
 'sido': 62124,
 'tema': 64446,
 'recurrente': 58752,
 'historia': 44117,
 'debido': 33002,
 'trata': 65713,
 'herramienta': 43928,
 'fundamental': 42380,
 'ser': 61919,
 'humano': 44427,
 'ya': 67817,
 'define': 33290,
 'sociedad': 62708,
 'través': 65781,
 'sus': 63909,
 'corrientes': 31853,
 'tendencias': 64528,
 'Con': 4321,
 'hecho': 43822,
 'pretende': 56379,
 'nación': 51508,
 'invasora': 46885,
 'algún': 22839,
 'modo': 50716,
 'adueñarse': 22043,
 'raíces': 58102,
 'identidad': 44595,
 'siendo': 62132,
 'acto': 21579,
 'represivo': 59722,
 'aparte': 23931,
 'delictivo': 33512,
 'evidentemente': 40289,
 'Sucedió': 18153,
 'ejércitos': 37102,
 'Asiria': 1652,
 'Egipto': 6240,
 'Grecia': 8240,
 'Roma': 16278,
 'pasando': 53698,
 'campañas': 27737,
 'napoleónicas': 51529,
 'colonización': 29711,
 'británica': 27015,
 'supuesto': 63858,
 'regímenes': 59133,
 'tota

In [15]:
# Cross-validate every configured classifier on the preloaded folds.
# For each fold the estimator is refit on the fold's training rows and then
# predicts (a) the fold's held-out rows and (b) the complete cine matrix.
# results[classifier][kind][fold] holds plain Python lists so the structure
# can be turned into a DataFrame and pickled afterwards.
results = {}

# The true cine labels do not depend on the fold; compute them once.
cine_real = cine.polarity.values.tolist()

# Size the progress bar from the number of folds actually loaded rather than
# from a hard-coded constant.
with tqdm(total=len(clasificadores) * len(folds)) as pbar:
    for c in clasificadores:
        results[c] = { 'real': {}, 'cine_real': {}, 'predicted': {}, 'cine_predicted': {} }
        model = pipelines_train[c]
        for i, (train_index, test_index) in enumerate(folds):
            # Fold indices address rows of x_vect by position, so the labels
            # must be selected by position as well (.iloc), not by index label.
            train_x = x_vect[train_index]
            train_y = train_corpus.polarity.iloc[train_index]
            test_x = x_vect[test_index]
            test_y = train_corpus.polarity.iloc[test_index]

            model.fit(train_x, train_y)

            predicted = model.predict(test_x)
            cine_pred = model.predict(cine_vect)

            results[c]['real'][i] = test_y.values.tolist()
            # Store an independent copy per fold, as the original loop did.
            results[c]['cine_real'][i] = list(cine_real)

            results[c]['predicted'][i] = predicted.tolist()
            results[c]['cine_predicted'][i] = cine_pred.tolist()

            pbar.update(1)

    

  0%|          | 0/40 [00:00<?, ?it/s]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/suampa/.local/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3296, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-f1d6516e7e1b>", line 15, in <module>
    cine_pred = pipelines_train[c].predict(cine_vect)
  File "/home/suampa/.local/lib/python3.7/site-packages/sklearn/multiclass.py", line 302, in predict
    pred = _predict_binary(e, X)
  File "/home/suampa/.local/lib/python3.7/site-packages/sklearn/multiclass.py", line 95, in _predict_binary
    score = np.ravel(estimator.decision_function(X))
  File "/home/suampa/.local/lib/python3.7/site-packages/sklearn/linear_model/base.py", line 273, in decision_function
    dense_output=True) + self.intercept_
  File "/home/suampa/.local/lib/python3.7/site-packages/sklearn/utils/extmath.py", line 142, in safe_sparse_dot
    return np.dot(a, b)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Tr

KeyboardInterrupt: 

In [16]:
# Persist the collected results as a pickled DataFrame (one column per classifier).
# to_pickle does not create missing parent directories, so make sure the
# output directory exists before writing.
results_dir = '../results/' + name + '/' + base_dir
os.makedirs(results_dir, exist_ok=True)
pd.DataFrame(results).to_pickle(results_dir + '/results.pkl')