# Oscar Esaú Peralta Rosales
## Tarea 1: Fundamentos de Minería de Texto


In [1]:
import csv
import math
import argparse

from collections import defaultdict

import numpy as np
import pandas as pd
import unidecode

from tqdm import tqdm
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk.tokenize import WordPunctTokenizer 
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_recall_fscore_support, roc_auc_score
from sklearn import metrics, preprocessing
from sklearn import svm, datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2

import matplotlib.pyplot as plt

%matplotlib inline


## Actividad 3: Detección de Agresividad con Análisis de Sentimiento Básico

### Experimentos Parte 3

Carga de los datos

In [2]:
mex_corpus = CategorizedPlaintextCorpusReader('./data/corpus/', r'.*\.txt', cat_pattern=r'(\w+)/*')

In [3]:
tk = TweetTokenizer() 
stopw = stopwords.words('spanish') + stopwords.words('english')

In [4]:
x_train = [ 
            [token for token in tk.tokenize(tweet) if token not in stopw and len(token) > 2]
            for tweet in mex_corpus.raw('mex_train.txt').split('\n') if tweet
          ]
y_train = [int(label) for label in mex_corpus.raw('mex_train_labels.txt').split('\n') if label ]
x_val = [ 
            [token for token in tk.tokenize(tweet) if token not in stopw and len(token) > 2]
            for tweet in mex_corpus.raw('mex_val.txt').split('\n') if tweet
        ]
y_val = [int(label) for label in mex_corpus.raw('mex_val_labels.txt').split('\n') if label ]

#### 1. Utilice el recurso léxico de la actividad de representación fonética de esta tarea para construir una Bolsa de Palabras-Fonéticas. Evalúa varias representaciones (al menos binario, tf y tfidf), y ponga una tabla comparativa a modo de resumen.

In [5]:
# Reading phonemes dict nd build vocab
file_name = './data/phonemes_dict/es.csv'

phonemes_map = {}
phonemes_vocab = {}
count = 0

with open(file_name, newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    start = False
    for index, row in enumerate(tqdm(spamreader)):
        if index == 0:
            continue
        word = unidecode.unidecode(row[0]).lower()
        phonemes_map[word] = [''.join(fn.split(' ')) for fn in row[1].split('ˈ') if fn]
        # Add phoneme to the vocab
        for pf in phonemes_map[word]:
            if not pf in phonemes_vocab:
                phonemes_vocab[pf] = count
                count += 1

51637it [00:00, 91213.58it/s]


In [6]:
len(phonemes_vocab)

27300

Construcción de la bolsa de emociones

In [7]:
def build_phonemes_bow(docs, phonemes_map, phonemes_vocab):
    """ Build a phonemes bag """
    bows = np.zeros((len(docs), len(phonemes_vocab)), dtype=float)
    
    for index, doc in enumerate(tqdm(docs)):
        for _word in doc:
            word = unidecode.unidecode(_word)
            if not word in phonemes_map:
                continue
                
            for phome in phonemes_map[word]:
                w_index = phonemes_vocab[phome]
                bows[index][w_index] += 1
    return bows

In [8]:
bow = build_phonemes_bow(x_train, phonemes_map, phonemes_vocab)
bow.shape

100%|██████████| 5544/5544 [00:00<00:00, 18274.33it/s]


(5544, 27300)

In [9]:
def classify(x_train, y_train, x_val, y_val, kbest=None):
    """ Clasificación con SVM, feature selection with chi2 """
    parameters = {'C': [.05, .12, .25, .5, 1, 2, 4]}
    
    if kbest:
        selectk = SelectKBest(chi2, k=kbest)
        selectk.fit(x_train, y_train)
        x_train = selectk.transform(x_train)
        x_val = selectk.transform(x_val)
    
    svr = svm.LinearSVC(class_weight='balanced')
    grid = GridSearchCV(estimator=svr, param_grid=parameters, n_jobs=8, scoring="f1_macro", cv=5)
    
    grid.fit(x_train, y_train) 

    y_pred = grid.predict(x_val)

    p, r, f, _ = precision_recall_fscore_support(y_val, y_pred, average='macro', pos_label=None)
    a = accuracy_score(y_val, y_pred)
    print(confusion_matrix(y_val, y_pred) )
    print(metrics.classification_report(y_val, y_pred))
    return p, r , f, a

metrics_hist = []

##### Bolsa de fonemas binaria

In [10]:
def build_binary_bow(x, phonemes_map, phonemes_vocab):
    """ Build a phonemes binary bow """
    bow = build_phonemes_bow(x, phonemes_map, phonemes_vocab)
    bow[bow > 0] = 1
    return bow

In [11]:
nx_train = build_binary_bow(x_train, phonemes_map, phonemes_vocab)
nx_val = build_binary_bow(x_val, phonemes_map, phonemes_vocab)
metrics_hist.append(("Bolsa de fonemas binaria", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=3000)))

100%|██████████| 5544/5544 [00:00<00:00, 25719.49it/s]
100%|██████████| 616/616 [00:00<00:00, 32402.26it/s]


[[311  86]
 [101 118]]
              precision    recall  f1-score   support

           0       0.75      0.78      0.77       397
           1       0.58      0.54      0.56       219

    accuracy                           0.70       616
   macro avg       0.67      0.66      0.66       616
weighted avg       0.69      0.70      0.69       616



##### Bolsa de emociones frecuencias

In [12]:
def build_frecs_bow(x, phonemes_map, phonemes_vocab, normalize=False):
    """ Build a emotions frequencies bow """
    # The bow already has the frequencies
    bow = build_phonemes_bow(x, phonemes_map, phonemes_vocab)
    if normalize:
        for row in bow:
            row /= np.linalg.norm(row) or 1.0
    return bow

In [13]:
nx_train = build_frecs_bow(x_train, phonemes_map, phonemes_vocab, normalize=False)
nx_val = build_frecs_bow(x_val, phonemes_map, phonemes_vocab, normalize=False)
metrics_hist.append(("Bolsa de fonemas frecuencias", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=3000)))

100%|██████████| 5544/5544 [00:00<00:00, 30158.69it/s]
100%|██████████| 616/616 [00:00<00:00, 31356.24it/s]


[[309  88]
 [102 117]]
              precision    recall  f1-score   support

           0       0.75      0.78      0.76       397
           1       0.57      0.53      0.55       219

    accuracy                           0.69       616
   macro avg       0.66      0.66      0.66       616
weighted avg       0.69      0.69      0.69       616



Bolsa de emociones de frecuencias normalizadas

In [14]:
nx_train = build_frecs_bow(x_train, phonemes_map, phonemes_vocab, normalize=True)
nx_val = build_frecs_bow(x_val, phonemes_map, phonemes_vocab, normalize=True)
metrics_hist.append(("Bolsa de fonemas frecuencias", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=3000)))

100%|██████████| 5544/5544 [00:00<00:00, 27195.35it/s]
100%|██████████| 616/616 [00:00<00:00, 15437.37it/s]


[[308  89]
 [ 98 121]]
              precision    recall  f1-score   support

           0       0.76      0.78      0.77       397
           1       0.58      0.55      0.56       219

    accuracy                           0.70       616
   macro avg       0.67      0.66      0.67       616
weighted avg       0.69      0.70      0.69       616



##### Bolsa de emociones tfidf

In [15]:
def build_tfidf_bow(x, phonemes_map, phonemes_vocab, normalize=False):
    """ Build a emotions tfidf bow """
    bows = build_phonemes_bow(x, phonemes_map, phonemes_vocab)
    
    # Compute count of terms aparitions on documents
    ndocs_terms = np.sum(bows > 0, axis=0)
    zeros = np.where(ndocs_terms == 0)[0]
    ndocs_terms[zeros] = 1

    for index, bow in enumerate(bows):
        # compute tf
        bow /= np.sum(bow > 0) or 1
        # compute tf*idf
        bow *= np.log(bows.shape[0] / ndocs_terms)
        bow[zeros] = 0.0
        if normalize:
            bow /= np.linalg.norm(bow) or 1.0
    return bows

In [16]:
nx_train = build_tfidf_bow(x_train, phonemes_map, phonemes_vocab, normalize=False)
nx_val = build_tfidf_bow(x_val, phonemes_map, phonemes_vocab, normalize=False)
metrics_hist.append(("Bolsa de phonemas tfidf", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=1000)))

100%|██████████| 5544/5544 [00:00<00:00, 31187.22it/s]
100%|██████████| 616/616 [00:00<00:00, 35422.63it/s]


[[317  80]
 [102 117]]
              precision    recall  f1-score   support

           0       0.76      0.80      0.78       397
           1       0.59      0.53      0.56       219

    accuracy                           0.70       616
   macro avg       0.68      0.67      0.67       616
weighted avg       0.70      0.70      0.70       616



Bolsa de emociones tfidf normalizada

In [17]:
nx_train = build_tfidf_bow(x_train, phonemes_map, phonemes_vocab, normalize=True)
nx_val = build_tfidf_bow(x_val, phonemes_map, phonemes_vocab, normalize=True)
metrics_hist.append(("Bolsa de phonemas tfidf norm", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=1000)))

100%|██████████| 5544/5544 [00:00<00:00, 26635.02it/s]
100%|██████████| 616/616 [00:00<00:00, 16330.25it/s]


[[315  82]
 [104 115]]
              precision    recall  f1-score   support

           0       0.75      0.79      0.77       397
           1       0.58      0.53      0.55       219

    accuracy                           0.70       616
   macro avg       0.67      0.66      0.66       616
weighted avg       0.69      0.70      0.69       616



##### Tabla comparativa

In [18]:
dataset = pd.DataFrame(data=metrics_hist, columns = ['Embedding', 'Precision', 'Recall', 'Fscore', 'Accuracy'])
dataset

Unnamed: 0,Embedding,Precision,Recall,Fscore,Accuracy
0,Bolsa de fonemas binaria,0.666643,0.661094,0.663385,0.696429
1,Bolsa de fonemas frecuencias,0.661278,0.656292,0.658369,0.691558
2,Bolsa de fonemas frecuencias,0.667406,0.664165,0.665613,0.696429
3,Bolsa de phonemas tfidf,0.675236,0.666368,0.66973,0.704545
4,Bolsa de phonemas tfidf norm,0.667773,0.659283,0.662472,0.698052
