# Oscar Esaú Peralta Rosales
## Tarea 1: Fundamentos de Minería de Texto


In [1]:
import csv
import math
import argparse

from collections import defaultdict

import numpy as np
import pandas as pd
import unidecode

from tqdm import tqdm
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk.tokenize import WordPunctTokenizer 
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_recall_fscore_support, roc_auc_score
from sklearn import metrics, preprocessing
from sklearn import svm, datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2

import matplotlib.pyplot as plt

%matplotlib inline


## Actividad 3: Detección de Agresividad con Análisis de Sentimiento Básico

### Experimentos Parte 2

Carga de los datos

In [2]:
mex_corpus = CategorizedPlaintextCorpusReader('./data/corpus/', r'.*\.txt', cat_pattern=r'(\w+)/*')

In [3]:
tk = TweetTokenizer() 
stopw = stopwords.words('spanish') + stopwords.words('english')

In [4]:
x_train = [ 
            [token for token in tk.tokenize(tweet) if token not in stopw and len(token) > 2]
            for tweet in mex_corpus.raw('mex_train.txt').split('\n') if tweet
          ]
y_train = [int(label) for label in mex_corpus.raw('mex_train_labels.txt').split('\n') if label ]
x_val = [ 
            [token for token in tk.tokenize(tweet) if token not in stopw and len(token) > 2]
            for tweet in mex_corpus.raw('mex_val.txt').split('\n') if tweet
        ]
y_val = [int(label) for label in mex_corpus.raw('mex_val_labels.txt').split('\n') if label ]

#### 1. Utilice el recurso léxico llamado "Spanish Emotion Lexicon (SEL)" del Dr. Grigori Sidorov, profesor del Centro de Investigación en Computación (CIC) del Instituto Politecnico Nacional (http://www.cic.ipn.mx/∼sidorov/), para enmascarar cada palabra con su emoción, y después construir la Bolsa de Emociones con algún pesado (e.g., binario, tf, tfidf). Considere alguna estrategia para incorporar el "valor" del "Probability Factor of Affective use" en su representación vectorial del documento. Evalúa varias representaciones, y ponga una tabla comparativa a modo de resumen (e.g., binario, frecuencia, tfidf, etc.).

In [5]:
file_name = './data/SEL/SEL.csv'

with open(file_name) as fs:
    sel = [line.split(',') for line in fs if line]

sel_map = { unidecode.unidecode(item[1]).lower(): (item[7], float(item[6])) for item in sel[1:]}
sel_vocab = dict(zip(set([item[1][0] for item in sel_map.items()]), range(6)))

Construcción de la bolsa de emociones

In [6]:
def build_emotions_bow(docs, sel_map, sel_vocab):
    """ Build a emotions bag """
    bows = np.zeros((len(docs), len(sel_vocab)), dtype=float)
    
    for index, doc in enumerate(tqdm(docs)):
        for _word in doc:
            word = unidecode.unidecode(_word)
            if not word in sel_map:
                continue
            # Increase by pfa
            bows[index][sel_vocab[sel_map[word][0]]] += sel_map[word][1]
            
    return bows

In [7]:
bow = build_emotions_bow(x_train, sel_map, sel_vocab)
bow

100%|██████████| 5544/5544 [00:00<00:00, 55725.84it/s]


array([[0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
       ...,
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.   ],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.966]])

In [8]:
def classify(x_train, y_train, x_val, y_val, kbest=None):
    """ Clasificación con SVM, feature selection with chi2 """
    parameters = {'C': [.05, .12, .25, .5, 1, 2, 4]}
    
    if kbest:
        selectk = SelectKBest(chi2, k=kbest)
        selectk.fit(x_train, y_train)
        x_train = selectk.transform(x_train)
        x_val = selectk.transform(x_val)
    
    svr = svm.LinearSVC(class_weight='balanced')
    grid = GridSearchCV(estimator=svr, param_grid=parameters, n_jobs=8, scoring="f1_macro", cv=5)
    
    grid.fit(x_train, y_train) 

    y_pred = grid.predict(x_val)

    p, r, f, _ = precision_recall_fscore_support(y_val, y_pred, average='macro', pos_label=None)
    a = accuracy_score(y_val, y_pred)
    print(confusion_matrix(y_val, y_pred) )
    print(metrics.classification_report(y_val, y_pred))
    return p, r , f, a

metrics_hist = []

##### Bolsa de emociones binaria

In [9]:
def build_binary_bow(emotions_bow):
    """ Build a emotions binary bow """
    bow = emotions_bow.copy()
    bow[emotions_bow > 0] = 1
    return bow

In [10]:
nx_train = build_binary_bow(build_emotions_bow(x_train, sel_map, sel_vocab))
nx_val = build_binary_bow(build_emotions_bow(x_val, sel_map, sel_vocab))
metrics_hist.append(("Bolsa de emociones binaria", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=None)))

100%|██████████| 5544/5544 [00:00<00:00, 51841.10it/s]
100%|██████████| 616/616 [00:00<00:00, 89574.65it/s]


[[355  42]
 [177  42]]
              precision    recall  f1-score   support

           0       0.67      0.89      0.76       397
           1       0.50      0.19      0.28       219

    accuracy                           0.64       616
   macro avg       0.58      0.54      0.52       616
weighted avg       0.61      0.64      0.59       616



##### Bolsa de emociones frecuencias

In [11]:
def build_frecs_bow(emotions_bow, normalize=False):
    """ Build a emotions frequencies bow """
    # The bow already has the frequencies
    bow = emotions_bow.copy()
    if normalize:
        for row in bow:
            row /= np.linalg.norm(row) or 1.0
    return bow

In [12]:
nx_train = build_frecs_bow(build_emotions_bow(x_train, sel_map, sel_vocab))
nx_val = build_frecs_bow(build_emotions_bow(x_val, sel_map, sel_vocab))
metrics_hist.append(("Bolsa de emociones frecuencias", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=None)))

100%|██████████| 5544/5544 [00:00<00:00, 57548.50it/s]
100%|██████████| 616/616 [00:00<00:00, 94003.68it/s]


[[352  45]
 [176  43]]
              precision    recall  f1-score   support

           0       0.67      0.89      0.76       397
           1       0.49      0.20      0.28       219

    accuracy                           0.64       616
   macro avg       0.58      0.54      0.52       616
weighted avg       0.60      0.64      0.59       616



Bolsa de emociones de frecuencias normalizadas

In [13]:
nx_train = build_frecs_bow(build_emotions_bow(x_train, sel_map, sel_vocab), normalize=True)
nx_val = build_frecs_bow(build_emotions_bow(x_val, sel_map, sel_vocab), normalize=True)
metrics_hist.append(("Bolsa de emociones frecuencias norm", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=None)))

100%|██████████| 5544/5544 [00:00<00:00, 91230.25it/s]
100%|██████████| 616/616 [00:00<00:00, 87835.84it/s]


[[ 78 319]
 [ 18 201]]
              precision    recall  f1-score   support

           0       0.81      0.20      0.32       397
           1       0.39      0.92      0.54       219

    accuracy                           0.45       616
   macro avg       0.60      0.56      0.43       616
weighted avg       0.66      0.45      0.40       616



##### Bolsa de emociones tfidf

In [14]:
def build_tfidf_bow(emotions_bows, normalize=False):
    """ Build a emotions tfidf bow """
    bows = emotions_bows.copy()
    # Compute count of terms by document
    ndocs_terms = np.sum(emotions_bows > 0, axis=0)
    zeros = np.where(ndocs_terms == 0)[0]
    ndocs_terms[zeros] = 1
    for bow in bows:
        # compute tf
        bow /= np.sum(bow > 0) or 1
        bow *= np.log(emotions_bows.shape[0] / ndocs_terms)
        bow[zeros] = 0.0
        if normalize:
            bow /= np.linalg.norm(bow) or 1.0
    return bows

In [15]:
nx_train = build_tfidf_bow(build_emotions_bow(x_train, sel_map, sel_vocab))
nx_val = build_tfidf_bow(build_emotions_bow(x_val, sel_map, sel_vocab))
metrics_hist.append(("Bolsa de emociones tfidf", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=None)))

100%|██████████| 5544/5544 [00:00<00:00, 58897.99it/s]
100%|██████████| 616/616 [00:00<00:00, 78658.36it/s]


[[ 77 320]
 [ 19 200]]
              precision    recall  f1-score   support

           0       0.80      0.19      0.31       397
           1       0.38      0.91      0.54       219

    accuracy                           0.45       616
   macro avg       0.59      0.55      0.43       616
weighted avg       0.65      0.45      0.39       616





Bolsa de emociones tfidf normalizada

In [16]:
nx_train = build_tfidf_bow(build_emotions_bow(x_train, sel_map, sel_vocab), normalize=True)
nx_val = build_tfidf_bow(build_emotions_bow(x_val, sel_map, sel_vocab), normalize=True)
metrics_hist.append(("Bolsa de emociones tfidf norm", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=None)))

100%|██████████| 5544/5544 [00:00<00:00, 93223.94it/s]
100%|██████████| 616/616 [00:00<00:00, 86203.50it/s]


[[ 78 319]
 [ 18 201]]
              precision    recall  f1-score   support

           0       0.81      0.20      0.32       397
           1       0.39      0.92      0.54       219

    accuracy                           0.45       616
   macro avg       0.60      0.56      0.43       616
weighted avg       0.66      0.45      0.40       616



##### Tabla comparativa

In [17]:
dataset = pd.DataFrame(data=metrics_hist, columns = ['Embedding', 'Precision', 'Recall', 'Fscore', 'Accuracy'])
dataset

Unnamed: 0,Embedding,Precision,Recall,Fscore,Accuracy
0,Bolsa de emociones binaria,0.583647,0.542994,0.520745,0.644481
1,Bolsa de emociones frecuencias,0.577652,0.541498,0.520606,0.641234
2,Bolsa de emociones frecuencias norm,0.599519,0.557141,0.430204,0.452922
3,Bolsa de emociones tfidf,0.593349,0.553598,0.426823,0.449675
4,Bolsa de emociones tfidf norm,0.599519,0.557141,0.430204,0.452922


### En un comentario aparte, discuta sobre la estrategía que utilizó para incorporar el "Probability Factor of Affective use".

Para agregar el PFA a la bolsa de emociones se optó por no contabilizar cada match de una palabra con una emoción como 1 (es decir, realizar acumulaciones de uno en uno) sino que contar la influecia de cada match a una emoción mediante su PFA, es decir se acumulan los PFA de las palabras por cada match con una emoción. Así tratar de ponderar la influencia de cada emoción por tweet a través del PFA