# Oscar Esaú Peralta Rosales
## Tarea 1: Fundamentos de Minería de Texto


In [107]:
import csv
import math
import argparse

from collections import defaultdict

import numpy as np
import pandas as pd

from tqdm import tqdm
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk.tokenize import WordPunctTokenizer 
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_recall_fscore_support, roc_auc_score
from sklearn import metrics, preprocessing
from sklearn import svm, datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2

import matplotlib.pyplot as plt

%matplotlib inline


## Actividad 3: Detección de Agresividad con Análisis de Sentimiento Básico

### 2.1 Experimentos Parte 1

Carga de los datos

In [108]:
mex_corpus = CategorizedPlaintextCorpusReader('./data/corpus/', r'.*\.txt', cat_pattern=r'(\w+)/*')

In [109]:
tk = TweetTokenizer() 
stopw = stopwords.words('spanish') + stopwords.words('english')

In [110]:
x_train = [ 
            [token for token in tk.tokenize(tweet) if token not in stopw and len(token) > 2]
            for tweet in mex_corpus.raw('mex_train.txt').split('\n') if tweet
          ]
y_train = [int(label) for label in mex_corpus.raw('mex_train_labels.txt').split('\n') if label ]
x_val = [ 
            [token for token in tk.tokenize(tweet) if token not in stopw and len(token) > 2]
            for tweet in mex_corpus.raw('mex_val.txt').split('\n') if tweet
        ]
y_val = [int(label) for label in mex_corpus.raw('mex_val_labels.txt').split('\n') if label ]

#### 1. Utilice el recurso léxico del Consejo Nacional de Investigación de Canadá llamado "EmoLex" (https://www.saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm) para construir una "Bolsa de Emociones" de los Tweets de agresividad (Debe usar EmoLex en Español). Para esto, una estrategia sencilla sería enmascarar cada palabra con su emoción, y después construir la Bolsa de Emociones.


In [111]:
file_name = './data/NRC-Emotion-Lexicon-v0.92-In105Languages-Nov2017Translations.xlsx'

In [112]:
df = pd.read_excel(file_name, usecols='CI,DB:DK')

In [113]:
df.head()

Unnamed: 0,Spanish (es),Positive,Negative,Anger,Anticipation,Disgust,Fear,Joy,Sadness,Surprise,Trust
0,detrás,0,0,0,0,0,0,0,0,0,0
1,ábaco,0,0,0,0,0,0,0,0,0,1
2,abandonar,0,1,0,0,0,1,0,1,0,0
3,abandonado,0,1,1,0,0,1,0,1,0,0
4,abandono,0,1,1,0,0,1,0,1,1,0


In [114]:
spzip = zip(np.array([x.lower() for x in np.array(df['Spanish (es)'])]), 
            np.array(df['Positive']),
            np.array(df['Negative']),
            np.array(df['Anger']),
            np.array(df['Anticipation']),
            np.array(df['Disgust']),
            np.array(df['Fear']),
            np.array(df['Joy']),
            np.array(df['Sadness']),
            np.array(df['Surprise']),
            np.array(df['Trust']))

spanish_map = sorted(spzip, key=lambda item:item[0])
spanish_map[100:110]

[('absolución', 1, 0, 0, 0, 0, 0, 1, 0, 0, 1),
 ('absolutamente', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
 ('absoluto', 1, 0, 0, 0, 0, 0, 0, 0, 0, 0),
 ('absorbente', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
 ('absorbente', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
 ('absorbido', 1, 0, 0, 0, 0, 0, 0, 0, 0, 0),
 ('absorción', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
 ('absorto', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
 ('abstención', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
 ('abstenerse', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)]

In [145]:
def spanish_map_search(spanish_map, word):
    """Returns a array with the emotions for any word"""
    word = word.lower()
    i = 0
    j = len(spanish_map) - 1
    
    while i < j:
        m = int((i+j)/2)
        match = spanish_map[m][0].lower()
        if match == word:
            return np.array(spanish_map[m][1:])
        if word > match:
            i = m + 1
        else:
            j = m - 1
    
    return np.zeros(10)

In [147]:
spanish_map_search(spanish_map, 'absorbido')

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [117]:
def build_emotions_bow(docs, spanish_map, emotions=10):
    """ Build a emotions bag """
    bow = np.zeros((len(docs), emotions), dtype=float)
    
    for index, doc in enumerate(tqdm(docs)):
        for word in doc:
            w_emotions = spanish_map_search(spanish_map, word)
            bow[index] += w_emotions
        
    return bow

In [118]:
bow = build_emotions_bow(x_train, spanish_map)

100%|██████████| 5544/5544 [00:00<00:00, 8669.30it/s]


#### 2. Representa a los documentos y clasifica con SVM como en la Practica de Clase 3. Evalúa varias representaciones, y ponga una tabla comparativa a modo de resumen (e.g., binario, frecuencia, tfidf, etc.).

In [148]:
def classify(x_train, y_train, x_val, y_val, kbest=None):
    """ Clasificación con SVM, feature selection with chi2 """
    parameters = {'C': [.05, .12, .25, .5, 1, 2, 4]}
    
    if kbest:
        selectk = SelectKBest(chi2, k=kbest)
        selectk.fit(x_train, y_train)
        x_train = selectk.transform(x_train)
        x_val = selectk.transform(x_val)
    
    svr = svm.LinearSVC(class_weight='balanced')
    grid = GridSearchCV(estimator=svr, param_grid=parameters, n_jobs=8, scoring="f1_macro", cv=5)
    
    grid.fit(x_train, y_train) 

    y_pred = grid.predict(x_val)

    p, r, f, _ = precision_recall_fscore_support(y_val, y_pred, average='macro', pos_label=None)
    a = accuracy_score(y_val, y_pred)
    print(confusion_matrix(y_val, y_pred) )
    print(metrics.classification_report(y_val, y_pred))
    return p, r , f, a

metrics_hist = []

##### Bolsa de emociones binaria

In [120]:
def build_binary_bow(emotions_bow):
    """ Build a emotions binary bow """
    bow = emotions_bow.copy()
    bow[bow > 0] = 1
    return bow

In [121]:
nx_train = build_binary_bow(build_emotions_bow(x_train, spanish_map))
nx_val = build_binary_bow(build_emotions_bow(x_val, spanish_map))
metrics_hist.append(("Bolsa de emociones binaria", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=None)))

100%|██████████| 5544/5544 [00:00<00:00, 8327.33it/s]
100%|██████████| 616/616 [00:00<00:00, 9162.29it/s]


[[337  60]
 [155  64]]
              precision    recall  f1-score   support

           0       0.68      0.85      0.76       397
           1       0.52      0.29      0.37       219

    accuracy                           0.65       616
   macro avg       0.60      0.57      0.57       616
weighted avg       0.62      0.65      0.62       616



##### Bolsa de emociones frecuencias

In [122]:
def build_frecs_bow(emotions_bow, normalize=False):
    """ Build a emotions frequencies bow """
    # The bow already has the frequencies
    bow = emotions_bow.copy()
    if normalize:
        for row in bow:
            row /= np.linalg.norm(row) or 1.0
    return bow

In [123]:
nx_train = build_frecs_bow(build_emotions_bow(x_train, spanish_map))
nx_val = build_frecs_bow(build_emotions_bow(x_val, spanish_map))
metrics_hist.append(("Bolsa de emociones frecuencias", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=None)))

100%|██████████| 5544/5544 [00:00<00:00, 8688.39it/s]
100%|██████████| 616/616 [00:00<00:00, 9414.03it/s]


[[333  64]
 [158  61]]
              precision    recall  f1-score   support

           0       0.68      0.84      0.75       397
           1       0.49      0.28      0.35       219

    accuracy                           0.64       616
   macro avg       0.58      0.56      0.55       616
weighted avg       0.61      0.64      0.61       616



Bolsa de emociones de frecuencias normalizadas

In [124]:
nx_train = build_frecs_bow(build_emotions_bow(x_train, spanish_map), normalize=True)
nx_val = build_frecs_bow(build_emotions_bow(x_val, spanish_map), normalize=True)
metrics_hist.append(("Bolsa de emociones frecuencias norm", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=None)))

100%|██████████| 5544/5544 [00:00<00:00, 9044.83it/s]
100%|██████████| 616/616 [00:00<00:00, 9185.90it/s]


[[325  72]
 [145  74]]
              precision    recall  f1-score   support

           0       0.69      0.82      0.75       397
           1       0.51      0.34      0.41       219

    accuracy                           0.65       616
   macro avg       0.60      0.58      0.58       616
weighted avg       0.63      0.65      0.63       616



##### Bolsa de emociones tfidf

In [125]:
def build_tfidf_bow(emotions_bows, normalize=False):
    """ Build a emotions tfidf bow """
    bows = emotions_bows.copy()
    # compute tf
    bows /= len(bows[0])
    # Compute idf
    ndocs_terms = np.sum(emotions_bows > 0, axis=0)
    zeros = np.where(ndocs_terms == 0)[0]
    ndocs_terms[zeros] = 1
    for bow in bows:
        bow *= np.log(emotions_bows.shape[0] / ndocs_terms)
        bow[zeros] = 0.0
        if normalize:
            bow /= np.linalg.norm(bow) or 1.0
    return bows

In [126]:
nx_train = build_tfidf_bow(build_emotions_bow(x_train, spanish_map))
nx_val = build_tfidf_bow(build_emotions_bow(x_val, spanish_map))
metrics_hist.append(("Bolsa de emociones tfidf", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=None)))

100%|██████████| 5544/5544 [00:00<00:00, 7988.11it/s]
100%|██████████| 616/616 [00:00<00:00, 9006.33it/s]


[[336  61]
 [159  60]]
              precision    recall  f1-score   support

           0       0.68      0.85      0.75       397
           1       0.50      0.27      0.35       219

    accuracy                           0.64       616
   macro avg       0.59      0.56      0.55       616
weighted avg       0.61      0.64      0.61       616



Bolsa de emociones tfidf normalizada

In [127]:
nx_train = build_tfidf_bow(build_emotions_bow(x_train, spanish_map), normalize=True)
nx_val = build_tfidf_bow(build_emotions_bow(x_val, spanish_map), normalize=True)
metrics_hist.append(("Bolsa de emociones tfidf norm", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=None)))

100%|██████████| 5544/5544 [00:00<00:00, 8885.97it/s]
100%|██████████| 616/616 [00:00<00:00, 9012.08it/s]


[[333  64]
 [153  66]]
              precision    recall  f1-score   support

           0       0.69      0.84      0.75       397
           1       0.51      0.30      0.38       219

    accuracy                           0.65       616
   macro avg       0.60      0.57      0.57       616
weighted avg       0.62      0.65      0.62       616



##### Tabla comparativa

In [138]:
dataset = pd.DataFrame(data=metrics_hist, columns = ['Embedding', 'Precision', 'Recall', 'Fscore', 'Accuracy'])
dataset

Unnamed: 0,Embedding,Precision,Recall,Fscore,Accuracy
0,Bolsa de emociones binaria,0.600544,0.570552,0.565667,0.650974
1,Bolsa de emociones frecuencias,0.583104,0.558665,0.552326,0.63961
2,Bolsa de emociones frecuencias norm,0.599169,0.57827,0.577596,0.647727
3,Bolsa de emociones tfidf,0.587328,0.56016,0.553152,0.642857
4,Bolsa de emociones tfidf norm,0.596439,0.57008,0.566235,0.647727
