# Oscar Esaú Peralta Rosales
## Tarea 1: Fundamentos de Minería de Texto


In [1]:
import csv
import math
import argparse

from collections import defaultdict

import numpy as np
import pandas as pd
import unidecode

from tqdm import tqdm
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk.tokenize import WordPunctTokenizer 
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_recall_fscore_support, roc_auc_score
from sklearn import metrics, preprocessing
from sklearn import svm, datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2

import matplotlib.pyplot as plt

%matplotlib inline


## Actividad 3: Detección de Agresividad con Análisis de Sentimiento Básico

### Experimentos Parte 4

**Carga de los datos**

In [2]:
mex_corpus = CategorizedPlaintextCorpusReader('./data/corpus/', r'.*\.txt', cat_pattern=r'(\w+)/*')

In [3]:
tk = TweetTokenizer() 
stopw = stopwords.words('spanish') + stopwords.words('english')

In [4]:
x_train = [ 
            [token for token in tk.tokenize(tweet) if token not in stopw and len(token) > 2]
            for tweet in mex_corpus.raw('mex_train.txt').split('\n') if tweet
          ]
y_train = [int(label) for label in mex_corpus.raw('mex_train_labels.txt').split('\n') if label ]
x_val = [ 
            [token for token in tk.tokenize(tweet) if token not in stopw and len(token) > 2]
            for tweet in mex_corpus.raw('mex_val.txt').split('\n') if tweet
        ]
y_val = [int(label) for label in mex_corpus.raw('mex_val_labels.txt').split('\n') if label ]

#### 1. Combine todo lo anterior en experimentos con una Bolsa de Palabras Tradicional con diferente pesado y observé si la clasificación mejora cuando se incorpora algo de lo anterior. Pruebe al menos tres pesados: binario, frecuencia normalizada y tfidf. Para construir la representación final del documento utilice la concatenación de todas representaciones anteriores (Bolsa de Palabras Normal + Bolsa de Sentimientos de Canada + Bolsa de Sentimientos de Grigori + Bolsa de PalabrasFoneticas), y aliméntelas a un SVM.

**Reading phonemes dict and build vocab**

In [5]:

file_name = './data/phonemes_dict/es.csv'

phonemes_map = {}
phonemes_vocab = {}
count = 0

with open(file_name, newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    start = False
    for index, row in enumerate(tqdm(spamreader)):
        if index == 0:
            continue
        word = unidecode.unidecode(row[0]).lower()
        phonemes_map[word] = [''.join(fn.split(' ')) for fn in row[1].split('ˈ') if fn]
        # Add phoneme to the vocab
        for pf in phonemes_map[word]:
            if not pf in phonemes_vocab:
                phonemes_vocab[pf] = count
                count += 1

51637it [00:00, 129344.11it/s]


**Carga de diccionario de emociones y vocab**

In [6]:
file_name = './data/NRC-Emotion-Lexicon-v0.92-In105Languages-Nov2017Translations.xlsx'
df = pd.read_excel(file_name, usecols='CI,DB:DK')

spzip = zip(np.array([x.lower() for x in np.array(df['Spanish (es)'])]), 
            np.array(df['Positive']),
            np.array(df['Negative']),
            np.array(df['Anger']),
            np.array(df['Anticipation']),
            np.array(df['Disgust']),
            np.array(df['Fear']),
            np.array(df['Joy']),
            np.array(df['Sadness']),
            np.array(df['Surprise']),
            np.array(df['Trust']))

spanish_map = sorted(spzip, key=lambda item:item[0])

def spanish_map_search(spanish_map, word):
    """Returns a array with the emotions for any word"""
    word = word.lower()
    i = 0
    j = len(spanish_map) - 1
    
    while i < j:
        m = int((i+j)/2)
        match = spanish_map[m][0].lower()
        if match == word:
            return np.array(spanish_map[m][1:])
        if word > match:
            i = m + 1
        else:
            j = m - 1
    
    return np.zeros(10)


**Carga de Emociones SEL y vocab**

In [7]:
file_name = './data/SEL/SEL.csv'

with open(file_name) as fs:
    sel = [line.split(',') for line in fs if line]

sel_map = { unidecode.unidecode(item[1]).lower(): (item[7], float(item[6])) for item in sel[1:]}
sel_vocab = dict(zip(set([item[1][0] for item in sel_map.items()]), range(6)))

**Vocabulario normal**

In [8]:
def build_vocabulary(docs):
    index = 0
    vocabulary = {}
    for doc in docs:
        for token in doc:
            if token not in vocabulary:
                vocabulary[token] = index
                index += 1
    return vocabulary

vocabulary = build_vocabulary(x_train)

**Construcción de la bolsa de emociones y fonemas**

In [9]:
def build_tradictional_bow(docs, _, vocab):
    """ Build a emotions bag """
    bow = np.zeros((len(docs), len(vocab)), dtype=float)
    
    for index, doc in enumerate(tqdm(docs)):
        for word in doc:
            if not word in vocab:
                continue
            bow[index][vocab[word]] += 1
    return bow


def build_emotions_bow(docs, spanish_map, _, emotions=10):
    """ Build a emotions bag """
    bow = np.zeros((len(docs), emotions), dtype=float)
    
    for index, doc in enumerate(tqdm(docs)):
        for word in doc:
            w_emotions = spanish_map_search(spanish_map, word)
            bow[index] += w_emotions
        
    return bow


def build_emotions_sel_bow(docs, sel_map, sel_vocab):
    """ Build a emotions bag """
    bows = np.zeros((len(docs), len(sel_vocab)), dtype=float)
    
    for index, doc in enumerate(tqdm(docs)):
        for _word in doc:
            word = unidecode.unidecode(_word)
            if not word in sel_map:
                continue
            # Increase by pfa
            bows[index][sel_vocab[sel_map[word][0]]] += sel_map[word][1]
            
    return bows


def build_phonemes_bow(docs, phonemes_map, phonemes_vocab):
    """ Build a phonemes bag """
    bows = np.zeros((len(docs), len(phonemes_vocab)), dtype=float)
    
    for index, doc in enumerate(tqdm(docs)):
        for _word in doc:
            word = unidecode.unidecode(_word)
            if not word in phonemes_map:
                continue
                
            for phome in phonemes_map[word]:
                w_index = phonemes_vocab[phome]
                bows[index][w_index] += 1
    return bows

**Construcción de bolsa {binaria, frecuencias, tfidf} de x tipo**

In [10]:
def build_binary_bow(x, bow_builder, token_map, vocab_map):
    """ Build a binary bow """
    bow = bow_builder(x, token_map, vocab_map)
    bow[bow > 0] = 1
    return bow


def build_frecs_bow(x, bow_builder, token_map, vocab_map, normalize=False):
    """ Build a frequencies bow """
    # The bow already has the frequencies
    bow = bow_builder(x, token_map, vocab_map)
    if normalize:
        for row in bow:
            row /= np.linalg.norm(row) or 1.0
    return bow


def build_tfidf_bow(x, bow_builder, token_map, vocab_map, normalize=False):
    """ Build a tfidf bow """
    bows = bow_builder(x, token_map, vocab_map)
    
    # Compute count of terms aparitions on documents
    ndocs_terms = np.sum(bows > 0, axis=0)
    zeros = np.where(ndocs_terms == 0)[0]
    ndocs_terms[zeros] = 1

    for index, bow in enumerate(bows):
        # compute tf
        bow /= np.sum(bow > 0) or 1
        # compute tf*idf
        bow *= np.log(bows.shape[0] / ndocs_terms)
        bow[zeros] = 0.0
        if normalize:
            bow /= np.linalg.norm(bow) or 1.0
    return bows

**Clasificación**

In [11]:
def classify(x_train, y_train, x_val, y_val, kbest=None):
    """ Clasificación con SVM, feature selection with chi2 """
    parameters = {'C': [.05, .12, .25, .5, 1, 2, 4]}
    
    if kbest:
        selectk = SelectKBest(chi2, k=kbest)
        selectk.fit(x_train, y_train)
        x_train = selectk.transform(x_train)
        x_val = selectk.transform(x_val)
    
    svr = svm.LinearSVC(class_weight='balanced')
    grid = GridSearchCV(estimator=svr, param_grid=parameters, n_jobs=8, scoring="f1_macro", cv=5)
    
    grid.fit(x_train, y_train) 

    y_pred = grid.predict(x_val)

    p, r, f, _ = precision_recall_fscore_support(y_val, y_pred, average='macro', pos_label=None)
    a = accuracy_score(y_val, y_pred)
    print(confusion_matrix(y_val, y_pred) )
    print(metrics.classification_report(y_val, y_pred))
    return p, r , f, a

metrics_hist = []

**Construcción de diferenties tipos de bolsas**

In [12]:
# Bolsas vocabulario tradicional
x_train_trad_binary = build_binary_bow(x_train, build_tradictional_bow, _, vocabulary)
x_val_trad_binary = build_binary_bow(x_val, build_tradictional_bow, _, vocabulary)

x_train_trad_frec = build_frecs_bow(x_train, build_tradictional_bow, _, vocabulary, normalize=False)
x_val_trad_frec = build_frecs_bow(x_val, build_tradictional_bow, _, vocabulary, normalize=False)

x_train_trad_frec_norm = build_frecs_bow(x_train, build_tradictional_bow, _, vocabulary, normalize=True)
x_val_trad_frec_norm = build_frecs_bow(x_val, build_tradictional_bow, _, vocabulary, normalize=True)

x_train_trad_tfidf = build_tfidf_bow(x_train, build_tradictional_bow, _, vocabulary, normalize=True)
x_val_trad_tfidf = build_tfidf_bow(x_val, build_tradictional_bow, _, vocabulary, normalize=True)

x_train_trad_tfidf_norm = build_tfidf_bow(x_train, build_tradictional_bow, _, vocabulary, normalize=True)
x_val_trad_tfidf_norm = build_tfidf_bow(x_val, build_tradictional_bow, _, vocabulary, normalize=True)

100%|██████████| 5544/5544 [00:00<00:00, 37144.06it/s]
100%|██████████| 616/616 [00:00<00:00, 54259.85it/s]
100%|██████████| 5544/5544 [00:00<00:00, 50991.45it/s]
100%|██████████| 616/616 [00:00<00:00, 55401.22it/s]
100%|██████████| 5544/5544 [00:00<00:00, 51769.43it/s]
100%|██████████| 616/616 [00:00<00:00, 16535.84it/s]
100%|██████████| 5544/5544 [00:00<00:00, 34884.47it/s]
100%|██████████| 616/616 [00:00<00:00, 12726.54it/s]
100%|██████████| 5544/5544 [00:00<00:00, 34101.18it/s]
100%|██████████| 616/616 [00:00<00:00, 15057.09it/s]


In [14]:
# Bolsas vocabulario emociones
x_train_emot_binary = build_binary_bow(x_train, build_emotions_bow, spanish_map, _)
x_val_emot_binary = build_binary_bow(x_val, build_emotions_bow, spanish_map, _)

x_train_emot_frec = build_frecs_bow(x_train, build_emotions_bow, spanish_map, _, normalize=False)
x_val_emot_frec = build_frecs_bow(x_val, build_emotions_bow, spanish_map, _, normalize=False)

x_train_emot_frec_norm = build_frecs_bow(x_train, build_emotions_bow, spanish_map, _, normalize=True)
x_val_emot_frec_norm = build_frecs_bow(x_val, build_emotions_bow, spanish_map, _, normalize=True)

x_train_emot_tfidf = build_tfidf_bow(x_train, build_emotions_bow, spanish_map, _, normalize=True)
x_val_emot_tfidf = build_tfidf_bow(x_val, build_emotions_bow, spanish_map, _, normalize=True)

x_train_emot_tfidf_norm = build_tfidf_bow(x_train, build_emotions_bow, spanish_map, _, normalize=True)
x_val_emot_tfidf_norm = build_tfidf_bow(x_val, build_emotions_bow, spanish_map, _, normalize=True)

100%|██████████| 5544/5544 [00:00<00:00, 8175.01it/s]
100%|██████████| 616/616 [00:00<00:00, 8788.93it/s]
100%|██████████| 5544/5544 [00:00<00:00, 8723.82it/s]
100%|██████████| 616/616 [00:00<00:00, 8993.16it/s]
100%|██████████| 5544/5544 [00:00<00:00, 8463.04it/s]
100%|██████████| 616/616 [00:00<00:00, 8688.94it/s]
100%|██████████| 5544/5544 [00:00<00:00, 8647.30it/s]
100%|██████████| 616/616 [00:00<00:00, 8695.61it/s]
100%|██████████| 5544/5544 [00:00<00:00, 8725.29it/s]
100%|██████████| 616/616 [00:00<00:00, 8824.53it/s]


In [15]:
# Bolsas vocabulario emociones SEL
x_train_emot_sel_binary = build_binary_bow(x_train, build_emotions_sel_bow, sel_map, sel_vocab)
x_val_emot_sel_binary = build_binary_bow(x_val, build_emotions_sel_bow, sel_map, sel_vocab)

x_train_emot_sel_frec = build_frecs_bow(x_train, build_emotions_sel_bow, sel_map, sel_vocab, normalize=False)
x_val_emot_sel_frec = build_frecs_bow(x_val, build_emotions_sel_bow, sel_map, sel_vocab, normalize=False)

x_train_emot_sel_frec_norm = build_frecs_bow(x_train, build_emotions_sel_bow, sel_map, sel_vocab, normalize=True)
x_val_emot_sel_frec_norm = build_frecs_bow(x_val, build_emotions_sel_bow, sel_map, sel_vocab, normalize=True)

x_train_emot_sel_tfidf = build_tfidf_bow(x_train, build_emotions_sel_bow, sel_map, sel_vocab, normalize=True)
x_val_emot_sel_tfidf = build_tfidf_bow(x_val, build_emotions_sel_bow, sel_map, sel_vocab, normalize=True)

x_train_emot_sel_tfidf_norm = build_tfidf_bow(x_train, build_emotions_sel_bow, sel_map, sel_vocab, normalize=True)
x_val_emot_sel_tfidf_norm = build_tfidf_bow(x_val, build_emotions_sel_bow, sel_map, sel_vocab, normalize=True)

100%|██████████| 5544/5544 [00:00<00:00, 53156.66it/s]
100%|██████████| 616/616 [00:00<00:00, 94007.10it/s]
100%|██████████| 5544/5544 [00:00<00:00, 91181.24it/s]
100%|██████████| 616/616 [00:00<00:00, 79179.04it/s]
100%|██████████| 5544/5544 [00:00<00:00, 91178.38it/s]
100%|██████████| 616/616 [00:00<00:00, 92915.14it/s]
100%|██████████| 5544/5544 [00:00<00:00, 93538.20it/s]
100%|██████████| 616/616 [00:00<00:00, 90764.11it/s]
100%|██████████| 5544/5544 [00:00<00:00, 95991.30it/s]
100%|██████████| 616/616 [00:00<00:00, 89593.29it/s]


In [27]:
# Bolsas vocabulario emociones SEL
x_train_phonemes_binary = build_binary_bow(x_train, build_phonemes_bow, phonemes_map, phonemes_vocab)
x_val_phonemes_binary = build_binary_bow(x_val, build_phonemes_bow, phonemes_map, phonemes_vocab)

x_train_phonemes_frec = build_frecs_bow(x_train, build_phonemes_bow, phonemes_map, phonemes_vocab, normalize=False)
x_val_phonemes_frec = build_frecs_bow(x_val, build_phonemes_bow, phonemes_map, phonemes_vocab, normalize=False)

x_train_phonemes_frec_norm = build_frecs_bow(x_train, build_phonemes_bow, phonemes_map, phonemes_vocab, normalize=True)
x_val_phonemes_frec_norm = build_frecs_bow(x_val, build_phonemes_bow, phonemes_map, phonemes_vocab, normalize=True)

x_train_phonemes_tfidf = build_tfidf_bow(x_train, build_phonemes_bow, phonemes_map, phonemes_vocab, normalize=True)
x_val_phonemes_tfidf = build_tfidf_bow(x_val, build_phonemes_bow, phonemes_map, phonemes_vocab, normalize=True)

x_train_phonemes_tfidf_norm = build_tfidf_bow(x_train, build_phonemes_bow, phonemes_map, phonemes_vocab, normalize=True)
x_val_phonemes_tfidf_norm = build_tfidf_bow(x_val, build_phonemes_bow, phonemes_map, phonemes_vocab, normalize=True)

100%|██████████| 5544/5544 [00:00<00:00, 29604.63it/s]
100%|██████████| 616/616 [00:00<00:00, 34423.97it/s]
100%|██████████| 5544/5544 [00:00<00:00, 34038.19it/s]
100%|██████████| 616/616 [00:00<00:00, 34614.91it/s]
100%|██████████| 5544/5544 [00:00<00:00, 33953.00it/s]
100%|██████████| 616/616 [00:00<00:00, 31362.33it/s]
100%|██████████| 5544/5544 [00:00<00:00, 26546.62it/s]
100%|██████████| 616/616 [00:00<00:00, 20655.32it/s]
100%|██████████| 5544/5544 [00:00<00:00, 28118.78it/s]
100%|██████████| 616/616 [00:00<00:00, 19979.52it/s]


**Bolsa tradicional + emociones, binaria**

In [16]:
nx_train = np.concatenate((x_train_trad_binary, x_train_emot_binary, x_train_emot_sel_binary), axis=1)
nx_val = np.concatenate((x_val_trad_binary, x_val_emot_binary, x_val_emot_sel_binary), axis=1)
metrics_hist.append(("Bolsa tradicional + emociones, binaria", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=1000)))

[[341  56]
 [ 68 151]]
              precision    recall  f1-score   support

           0       0.83      0.86      0.85       397
           1       0.73      0.69      0.71       219

    accuracy                           0.80       616
   macro avg       0.78      0.77      0.78       616
weighted avg       0.80      0.80      0.80       616



**Bolsa tradiciones + emociones, frecuencias**

In [17]:
nx_train = np.concatenate((x_train_trad_frec, x_train_emot_frec, x_train_emot_sel_frec), axis=1)
nx_val = np.concatenate((x_val_trad_frec, x_val_emot_frec, x_val_emot_sel_frec), axis=1)
metrics_hist.append(("Bolsa tradiciones + emociones, frecuencias", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=1000)))

[[339  58]
 [ 65 154]]
              precision    recall  f1-score   support

           0       0.84      0.85      0.85       397
           1       0.73      0.70      0.71       219

    accuracy                           0.80       616
   macro avg       0.78      0.78      0.78       616
weighted avg       0.80      0.80      0.80       616



**Bolsa tradiciones + emociones, frecuencias normalizadas**

In [18]:
nx_train = np.concatenate((x_train_trad_frec_norm, x_train_emot_frec_norm, x_train_emot_sel_frec_norm), axis=1)
nx_val = np.concatenate((x_val_trad_frec_norm, x_val_emot_frec_norm, x_val_emot_sel_frec_norm), axis=1)
metrics_hist.append(("Bolsa tradiciones + emociones, frecuencias norma", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=1000)))

[[335  62]
 [ 63 156]]
              precision    recall  f1-score   support

           0       0.84      0.84      0.84       397
           1       0.72      0.71      0.71       219

    accuracy                           0.80       616
   macro avg       0.78      0.78      0.78       616
weighted avg       0.80      0.80      0.80       616



**Bolsa tradiciones + emociones, tfidf**

In [19]:
nx_train = np.concatenate((x_train_trad_tfidf, x_train_emot_tfidf, x_train_emot_sel_tfidf), axis=1)
nx_val = np.concatenate((x_val_trad_tfidf, x_val_emot_tfidf, x_val_emot_sel_tfidf), axis=1)
metrics_hist.append(("Bolsa tradiciones + emociones, tfidf", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=1500)))

[[330  67]
 [ 63 156]]
              precision    recall  f1-score   support

           0       0.84      0.83      0.84       397
           1       0.70      0.71      0.71       219

    accuracy                           0.79       616
   macro avg       0.77      0.77      0.77       616
weighted avg       0.79      0.79      0.79       616



**Bolsa tradiciones + emociones, tfidf norm**

In [20]:
nx_train = np.concatenate((x_train_trad_tfidf_norm, x_train_emot_tfidf_norm, x_train_emot_sel_tfidf_norm), axis=1)
nx_val = np.concatenate((x_val_trad_tfidf_norm, x_val_emot_tfidf_norm, x_val_emot_sel_tfidf_norm), axis=1)
metrics_hist.append(("Bolsa tradiciones + emociones, tfidf norm", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=1500)))

[[330  67]
 [ 63 156]]
              precision    recall  f1-score   support

           0       0.84      0.83      0.84       397
           1       0.70      0.71      0.71       219

    accuracy                           0.79       616
   macro avg       0.77      0.77      0.77       616
weighted avg       0.79      0.79      0.79       616



**Bolsa phonemes + emociones, binaria**

In [22]:
nx_train = np.concatenate((x_train_phonemes_binary, x_train_emot_binary, x_train_emot_sel_binary), axis=1)
nx_val = np.concatenate((x_val_phonemes_binary, x_val_emot_binary, x_val_emot_sel_binary), axis=1)
metrics_hist.append(("Bolsa phonemes + emociones, binaria", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=2000)))

[[313  84]
 [105 114]]
              precision    recall  f1-score   support

           0       0.75      0.79      0.77       397
           1       0.58      0.52      0.55       219

    accuracy                           0.69       616
   macro avg       0.66      0.65      0.66       616
weighted avg       0.69      0.69      0.69       616



**Bolsa phonemes + emociones, frecuencias**

In [23]:
nx_train = np.concatenate((x_train_phonemes_frec, x_train_emot_frec, x_train_emot_sel_frec), axis=1)
nx_val = np.concatenate((x_val_phonemes_frec, x_val_emot_frec, x_val_emot_sel_frec), axis=1)
metrics_hist.append(("Bolsa phonemes + emociones, frecuencias", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=5000)))

[[308  89]
 [ 99 120]]
              precision    recall  f1-score   support

           0       0.76      0.78      0.77       397
           1       0.57      0.55      0.56       219

    accuracy                           0.69       616
   macro avg       0.67      0.66      0.66       616
weighted avg       0.69      0.69      0.69       616



**Bolsa phonemes + emociones, frecuencias normalizadas**

In [24]:
nx_train = np.concatenate((x_train_phonemes_frec_norm, x_train_emot_frec_norm, x_train_emot_sel_frec_norm), axis=1)
nx_val = np.concatenate((x_val_phonemes_frec_norm, x_val_emot_frec_norm, x_val_emot_sel_frec_norm), axis=1)
metrics_hist.append(("Bolsa phonemes + emociones, frecuencias norma", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=1000)))

[[308  89]
 [107 112]]
              precision    recall  f1-score   support

           0       0.74      0.78      0.76       397
           1       0.56      0.51      0.53       219

    accuracy                           0.68       616
   macro avg       0.65      0.64      0.65       616
weighted avg       0.68      0.68      0.68       616



**Bolsa phonemes + emociones, tfidf**

In [25]:
nx_train = np.concatenate((x_train_phonemes_tfidf, x_train_emot_tfidf, x_train_emot_sel_tfidf), axis=1)
nx_val = np.concatenate((x_val_phonemes_tfidf, x_val_emot_tfidf, x_val_emot_sel_tfidf), axis=1)
metrics_hist.append(("Bolsa phonemes + emociones, tfidf", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=1500)))

[[308  89]
 [ 96 123]]
              precision    recall  f1-score   support

           0       0.76      0.78      0.77       397
           1       0.58      0.56      0.57       219

    accuracy                           0.70       616
   macro avg       0.67      0.67      0.67       616
weighted avg       0.70      0.70      0.70       616



**Bolsa phonemes + emociones, tfidf norm**

In [28]:
nx_train = np.concatenate((x_train_phonemes_tfidf_norm, x_train_emot_tfidf_norm, x_train_emot_sel_tfidf_norm), axis=1)
nx_val = np.concatenate((x_val_phonemes_tfidf_norm, x_val_emot_tfidf_norm, x_val_emot_sel_tfidf_norm), axis=1)
metrics_hist.append(("Bolsa phonemes + emociones, tfidf norm", 
                     *classify(nx_train, y_train, nx_val, y_val, kbest=1500)))

[[308  89]
 [ 96 123]]
              precision    recall  f1-score   support

           0       0.76      0.78      0.77       397
           1       0.58      0.56      0.57       219

    accuracy                           0.70       616
   macro avg       0.67      0.67      0.67       616
weighted avg       0.70      0.70      0.70       616



**Tabla comparativa**

In [29]:
dataset = pd.DataFrame(data=metrics_hist, columns = ['Embedding', 'Precision', 'Recall', 'Fscore', 'Accuracy'])
dataset

Unnamed: 0,Embedding,Precision,Recall,Fscore,Accuracy
0,"Bolsa tradicional + emociones, binaria",0.781605,0.77422,0.777537,0.798701
1,"Bolsa tradiciones + emociones, frecuencias",0.782762,0.77855,0.78053,0.800325
2,"Bolsa tradiciones + emociones, frecuencias norma",0.778652,0.778079,0.778363,0.797078
3,"Bolsa tradiciones + emociones, tfidf",0.769623,0.771782,0.770663,0.788961
4,"Bolsa tradiciones + emociones, tfidf norm",0.769623,0.771782,0.770663,0.788961
5,"Bolsa phonemes + emociones, binaria",0.662281,0.654481,0.65743,0.693182
6,"Bolsa phonemes + emociones, frecuencias",0.66546,0.661882,0.663458,0.694805
7,"Bolsa phonemes + emociones, frecuencias norma",0.649691,0.643617,0.645977,0.681818
8,"Bolsa phonemes + emociones, tfidf",0.671282,0.668731,0.669902,0.699675
9,"Bolsa phonemes + emociones, tfidf norm",0.671282,0.668731,0.669902,0.699675


#### Conclusiones

Durante el uso de enmascaramiento con los recursos léxicos de emociones se observaron métricas más bajas que el solo usar la bolsa de palabras tradicional.
Se intuye que es debido a la amplitud del vocabulario usado de estos recursos, puesto que hay tweets cuyas palabras que no tienen ningun match con una emocíón y por tanto su bow contenía ceros, reduciendo la información bastante la información capturada.

Se implementaron algunas mejoras para mejorar las representaciones en bolsas de palabras como

1. Uso de stop words
2. Normalizar palabras a minúsculas
3. Remover tildes

Unas posibles mejoras a implementarse sería usar lemmatization o stemming y así poder reducir los ceros de los match con las emociones debido a alguna otra conjugación o una palabra y al mejor uso de la raices de estas.

También se implementó la selección de mejores caracteristicas usando CHI2 mediante la biblioteca de sklearn.

No se observaron muchas mejoras significativas con respecto a los ejercicios realizados en la práctica 3, en general integrar la bolsa de emociones junto con la representación tradicional de bolsa de palabras o la de fonemas no mejoro mucho. Pero se observa un mejor comportamiento usando la bolsa de palabras tradicional vs la bolsa de palabras de fonemas. Por otro lado a pesar de su simplicidad la bolsa de palabras binarias sigue teniando muy buenos resultados.
