In [5]:
import numpy as np
import pandas as pd
import re
import spacy
import nltk
import emoji
from nltk import word_tokenize
from nltk import SyllableTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression, RidgeCV, LassoCV, SGDRegressor
from sklearn.svm import SVC, SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.metrics import f1_score, confusion_matrix,classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import multilabel_confusion_matrix, ConfusionMatrixDisplay, mean_squared_error, classification_report
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MaxAbsScaler

In [6]:
# Load the small Spanish spaCy pipeline; later top-level cells call this module-level `nlp`.
nlp = spacy.load('es_core_news_sm')
# Stop words bundled with the spaCy Spanish language defaults, as a list.
stopwords_spacy = list(nlp.Defaults.stop_words)
# Stop words from the NLTK 'spanish' stop-word corpus.
stopwords_nltk = nltk.corpus.stopwords.words('spanish')
# spaCy stop words that are not already in the NLTK list.
morestpwords = list(set(stopwords_spacy) - set(stopwords_nltk) )
# Combined stop-word list passed to the Count/Tfidf vectorizers below.
STOPWORDS = stopwords_nltk + morestpwords

# Workflow

* Extracción de características
    * Textuales
        * Número de caracteres por documento
        * Número de dígitos
        * Número de palabras por documento
        * Número de caracteres por palabra
        * Número de mayúsculas utilizadas por documento
        * Número de caracteres especiales
        * Número de emoticones (:),:/, <3 etc
        * Número de emojis
        * FKGL
    * Características semánticas
        * Número de verbos
        * Número de adjetivos
        * Número de sustantivos
        * Número de pronombres
    * TfidfVectorizer
        * BoW
        * Bigramas de palabras
        * Bigramas de etiquetas POS


* Entrenamiento del Modelo      
    * Stratified K Fold
    * Logistic Regression
    * Random Forest



* Validación del Modelo

### Definición de funciones para extraer características

In [7]:
def add_feature(X, feature_to_add):
    """
    Append extra column(s) to a feature matrix and return the result as CSR.

    X is the existing feature matrix; feature_to_add must be an array with
    the same number of rows, and is converted to a sparse matrix before
    being stacked to the right of X.
    """
    from scipy import sparse

    extra_columns = sparse.csr_matrix(feature_to_add)
    return sparse.hstack([X, extra_columns], format='csr')

In [27]:
## Características del texto
def numcaracteres(docs):
    """
    Count the characters left in each document after text cleaning.

    Cleaning removes punctuation, digits and the MENTION/URL/HASHTAG
    placeholders, strips emojis, lower-cases the text, turns underscores and
    newlines into spaces and collapses runs of spaces.

    docs: iterable of strings.
    Returns an array of shape (n_docs, 1) with the cleaned length per document.
    """
    len_caracteres = []
    for doc in docs:
        # Raw literal: \w, \s and \d are regex escapes, not Python string escapes.
        raw = re.sub(r'[^\w\s]|\d|(MENTION)|(URL)|(HASHTAG)', '', doc)
        raw = emoji.replace_emoji(raw, replace='').lower().strip()
        # Underscores count as word characters for the regex above, so they
        # are turned into spaces explicitly; strip again in case one sat at an edge.
        raw = re.sub('_', ' ', raw).strip()
        raw = re.sub('\n', ' ', raw)
        raw = re.sub(' {2,}', ' ', raw)
        len_caracteres.append(len(raw))
    return np.array(len_caracteres).reshape(-1, 1)
def numdigitos(docs):
    """
    Count the digit characters in each document.

    docs: iterable of strings.
    Returns an array of shape (n_docs, 1) with one count per document.
    """
    # Raw literal so that \d reaches the regex engine without relying on
    # Python tolerating an unknown string escape.
    digit_re = re.compile(r'\d')
    len_digitos = np.array([len(digit_re.findall(doc)) for doc in docs]).reshape(-1, 1)
    return len_digitos
def palsxdoc(docs):
    """
    Count the spaCy tokens of each document after text cleaning.

    Cleaning removes punctuation, digits, the MENTION/URL/HASHTAG
    placeholders and emojis, lower-cases the text and normalises
    underscores, newlines and repeated spaces to single spaces.

    docs: iterable of strings.
    Returns an array of shape (n_docs, 1) with the token count per document.
    """
    nlp = spacy.load('es_core_news_sm')
    palsperdoc = []
    for doc in docs:
        # Raw literal: \w, \s and \d are regex escapes, not Python string escapes.
        raw = re.sub(r'[^\w\s]|\d|(MENTION)|(URL)|(HASHTAG)', '', doc)
        raw = emoji.replace_emoji(raw, replace='').lower().strip()
        raw = re.sub('_', ' ', raw).strip()
        raw = re.sub('\n', ' ', raw)
        raw = re.sub(' {2,}', ' ', raw)
        # len() of a spaCy Doc is its number of tokens.
        palsperdoc.append(len(nlp(raw)))
    return np.array(palsperdoc).reshape(-1, 1)
def charsxpal(docs):
    """
    Compute the mean token length (in characters) of each cleaned document.

    The text is cleaned like in the other textual features (punctuation,
    digits, placeholders and emojis removed; lower-cased; whitespace
    normalised) and tokenised with spaCy.

    docs: iterable of strings.
    Returns an array of shape (n_docs, 1); a document with no tokens gets 0.
    """
    nlp = spacy.load('es_core_news_sm')
    chars_per_token = []
    for doc in docs:
        # Raw literal: \w, \s and \d are regex escapes, not Python string escapes.
        raw = re.sub(r'[^\w\s]|\d|(MENTION)|(URL)|(HASHTAG)', '', doc)
        raw = emoji.replace_emoji(raw, replace='').lower().strip()
        raw = re.sub('_', ' ', raw).strip()
        raw = re.sub('\n', ' ', raw)
        raw = re.sub(' {2,}', ' ', raw)
        # Reduce each parsed document to its token lengths right away instead
        # of keeping every spaCy Doc alive until a second pass.
        token_lengths = [len(token) for token in nlp(raw)]
        chars_per_token.append(np.mean(token_lengths) if token_lengths else 0)
    return np.array(chars_per_token).reshape(-1, 1)
def UpperCase_doc(docs):
    """
    Count the upper-case characters in each document.

    Uses str.isupper() per character so that accented and other non-ASCII
    capitals (e.g. 'Á', 'Ñ') are counted, which an ASCII-only [A-Z] class misses.

    docs: iterable of strings.
    Returns an array of shape (n_docs, 1) with one count per document.

    TODO (from the original author): improve by counting the words written
    entirely in upper case.
    """
    upper_cnt = np.array([sum(1 for ch in doc if ch.isupper()) for doc in docs]).reshape(-1, 1)
    return upper_cnt
def UpperCase_compl(docs):
    """
    Compute, per document, the fraction of tokens written fully in upper case.

    The text is cleaned (punctuation, digits, MENTION/URL/HASHTAG
    placeholders and emojis removed; whitespace normalised) WITHOUT
    lower-casing, then tokenised with spaCy.

    docs: iterable of strings.
    Returns an array of shape (n_docs, 1); a document with no tokens gets 0.
    """
    nlp = spacy.load('es_core_news_sm')
    palsperdoc = []
    for doc in docs:
        # Raw literal: \w, \s and \d are regex escapes, not Python string escapes.
        raw = re.sub(r'[^\w\s]|\d|(MENTION)|(URL)|(HASHTAG)', '', doc)
        raw = emoji.replace_emoji(raw, replace='').strip()
        raw = re.sub('_', ' ', raw).strip()
        raw = re.sub('\n', ' ', raw)
        raw = re.sub(' {2,}', ' ', raw)
        # Parse once and reuse the result for both the count and the ratio.
        parsed = nlp(raw)
        if len(parsed) == 0:
            palsperdoc.append(0)
        else:
            palsperdoc.append(sum(token.text.isupper() for token in parsed) / len(parsed))
    # The computed column must be returned; a bare `return` would hand back None.
    return np.array(palsperdoc).reshape(-1, 1)
def specchar(docs):
    """
    Count the special characters (neither word characters nor whitespace)
    in each document.

    docs: iterable of strings.
    Returns an array of shape (n_docs, 1) with one count per document.

    TODO (from the original author): split the count by kind (commas,
    exclamation marks, parentheses, full stops, laughter such as
    "jajaja"/"hahaha"/"lol").
    """
    # Raw literal so that \w and \s are passed to the regex engine verbatim.
    special_re = re.compile(r'[^\w\s]')
    speccharcnt = np.array([len(special_re.findall(doc)) for doc in docs]).reshape(-1, 1)
    return speccharcnt
def cntemojis(docs):
    """
    Count the emojis in each document with emoji.emoji_count.

    docs: iterable of strings.
    Returns an array of shape (n_docs, 1) with one count per document.
    """
    counts = []
    for text in docs:
        counts.append(emoji.emoji_count(text))
    return np.array(counts).reshape(-1, 1)
def LexRich(docs):
    """
    Compute the lexical richness of each document as
    (number of distinct lemmas) / (number of tokens).

    The text is cleaned (punctuation, digits, MENTION/URL/HASHTAG
    placeholders and emojis removed; lower-cased; whitespace normalised)
    and lemmatised with spaCy.

    docs: iterable of strings.
    Returns an array of shape (n_docs, 1); a document with no tokens gets 0.
    """
    nlp = spacy.load('es_core_news_sm')
    LexRichperdoc = []
    for doc in docs:
        # Raw literal: \w, \s and \d are regex escapes, not Python string escapes.
        raw = re.sub(r'[^\w\s]|\d|(MENTION)|(URL)|(HASHTAG)', '', doc)
        raw = emoji.replace_emoji(raw, replace='').lower().strip()
        raw = re.sub('_', ' ', raw).strip()
        raw = re.sub('\n', ' ', raw)
        raw = re.sub(' {2,}', ' ', raw)
        # Parse once; running the pipeline is by far the most expensive step.
        parsed = nlp(raw)
        if len(parsed) == 0:
            LexRichperdoc.append(0)
        else:
            LexRichperdoc.append(len({token.lemma_ for token in parsed}) / len(parsed))
    return np.array(LexRichperdoc).reshape(-1, 1)

In [28]:
def FKGL(docs):
    """
    Compute a readability score per document:

        206.84 - 1.02 * (words / sentences) - 60 * (syllables / words)

    Words come from the cleaned, lower-cased text tokenised with spaCy;
    sentences come from spaCy's sentence segmentation of the ORIGINAL text
    (which still has its punctuation); syllables are counted word by word
    with NLTK's SyllableTokenizer.

    NOTE(review): the constants look like the Fernandez-Huerta adaptation of
    the Flesch score for Spanish rather than the Flesch-Kincaid grade level
    the name suggests - confirm.

    docs: iterable of strings.
    Returns an array of shape (n_docs, 1); a document with no words gets 0.
    """
    nlp = spacy.load('es_core_news_sm')
    # One tokenizer for all documents instead of a new one per document.
    tk = SyllableTokenizer()

    def fkgl(doc):
        # Raw literal: \w, \s and \d are regex escapes, not Python string escapes.
        text = re.sub(r'[^\w\s]|\d|(MENTION)|(URL)|(HASHTAG)', '', doc)
        text = emoji.replace_emoji(text, replace='').lower().strip()
        text = re.sub('_', ' ', text).strip()
        text = re.sub('\n', ' ', text)
        text = re.sub(' {2,}', ' ', text)
        words = [token.text for token in nlp(text)]
        if len(words) == 0:
            return 0
        sentences = list(nlp(doc).sents)
        # SyllableTokenizer.tokenize() expects a single word; text has to be
        # tokenised first, so syllables are summed word by word.
        n_silabas = sum(len(tk.tokenize(word)) for word in words)
        # Guard against a zero sentence count so the ratio is always defined.
        n_sentences = max(len(sentences), 1)
        return 206.84 - 1.02 * (len(words) / n_sentences) - 60 * (n_silabas / len(words))

    FKGL_perdoc = np.array([fkgl(doc) for doc in docs]).reshape(-1, 1)
    return FKGL_perdoc

In [33]:
## Caracteristicas semánicas  ## HACER EL RATIO LAS CARACTERÍSTICAS/#TOTAL DE PALABRAS
def POS_Vect(docs):
    """
    Extract part-of-speech features from each document with spaCy.

    The text is cleaned (punctuation, digits, MENTION/URL/HASHTAG
    placeholders and emojis removed; lower-cased; whitespace normalised)
    before tagging.

    docs: iterable of strings.
    Returns a list [VERBS, ADJS, NOUNS, PRONS, POS_tweets] where the first
    four are arrays of shape (n_docs, 1) with the per-document count of that
    coarse POS tag, and POS_tweets is a list with each document's POS tags
    joined by spaces.
    """
    nlp = spacy.load('es_core_news_sm')
    POS_tags = []
    for i in docs:
        # Raw literal: \w, \s and \d are regex escapes, not Python string escapes.
        text = re.sub(r'[^\w\s]|\d|(MENTION)|(URL)|(HASHTAG)', '', i)
        text = emoji.replace_emoji(text, replace='').lower().strip()
        text = re.sub('_', ' ', text).strip()
        text = re.sub('\n', ' ', text)
        text = re.sub(' {2,}', ' ', text)
        POS_tags.append([token.pos_ for token in nlp(text)])

    # One space-separated tag string per document, usable as input to a vectorizer.
    POS_tweets = [' '.join(tags) for tags in POS_tags]

    def count_tag(tag):
        # Column vector with the number of times `tag` occurs in each document.
        return np.array([tags.count(tag) for tags in POS_tags]).reshape(-1, 1)

    VERBS = count_tag('VERB')
    ADJS = count_tag('ADJ')
    NOUNS = count_tag('NOUN')
    PRONS = count_tag('PRON')

    return [VERBS, ADJS, NOUNS, PRONS, POS_tweets]

In [30]:
def BoW(docs):
    """Fit a unigram CountVectorizer (with STOPWORDS removed) on docs and return the count matrix."""
    return CountVectorizer(stop_words=STOPWORDS).fit_transform(docs)
def BigramWord(docs):
    """Fit a bigram-only CountVectorizer (with STOPWORDS removed) on docs and return the count matrix."""
    vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words=STOPWORDS)
    return vectorizer.fit_transform(docs)
def TfidfBoW(docs):
    """Fit a unigram TfidfVectorizer (with STOPWORDS removed) on docs and return the TF-IDF matrix."""
    return TfidfVectorizer(stop_words=STOPWORDS).fit_transform(docs)
def TfidfBigram(docs):
    """Fit a bigram-only TfidfVectorizer (with STOPWORDS removed) on docs and return the TF-IDF matrix."""
    vectorizer = TfidfVectorizer(ngram_range=(2, 2), stop_words=STOPWORDS)
    return vectorizer.fit_transform(docs)

## Lectura de datos

In [12]:
# Load the training split from the working directory and preview its first rows.
data_train = pd.read_csv('train.csv')
data_train.head(5)

Unnamed: 0,index,tweet,humor,prejudice_woman,prejudice_lgbtiq,prejudice_inmigrant_race,gordofobia,mean_prejudice
0,72157,Mi celular tiene una aplicación que te hace ve...,1,0,0,0,1,3.0
1,68084,"En esta vida me tocó tener mala suerte, espero...",1,0,0,0,1,2.8
2,69089,"Tu mamá es taaan taan obesa, que cuando pasa f...",1,0,0,0,1,3.6
3,69190,Mi tía me dijo: \n- tengo memoria de Elefante....,1,0,0,0,1,3.4
4,70474,"- Mamá, en el colegio me dicen gorda.\n- ¡Ay M...",1,0,0,0,1,3.0


In [14]:
# Task 1 inputs: tweet text as X and the `humor` column as target y.
X, y = data_train.tweet, data_train.humor

In [45]:
import scipy

## Generación de vectores

In [25]:
# Hand-crafted textual features for the training tweets; each call returns one column per tweet.
lendoc = numcaracteres(X)
numdigitos_ = numdigitos(X)
palsdoc = palsxdoc(X)
charspal = charsxpal(X)
mayusdoc = UpperCase_doc(X)
speccharcnt = specchar(X)
emojiscnt = cntemojis(X)
LexRich_ = LexRich(X)

In [31]:
# Readability score per training tweet.
flskGL = FKGL(X)



In [34]:
# Per-tweet counts of VERB/ADJ/NOUN/PRON tags plus each tweet's POS-tag sequence as a string.
VERBcnt, ADJcnt, NOUNcnt, PRONcnt, POSxtweet = POS_Vect(X)
# Count-based unigram and bigram matrices over the tweet text.
X_bow_vect = BoW(X)
bigram_BoW = BigramWord(X)
# Bigrams over the POS-tag strings instead of the words.
bigram_POS = BigramWord(POSxtweet)
# TF-IDF counterparts of the three matrices above.
X_TfidfBoW = TfidfBoW(X)
bigram_TfidfBoW = TfidfBigram(X)
bigram_TfidfPOS = TfidfBigram(POSxtweet)

In [47]:
# Hand-crafted feature columns, in the order they are stacked in the next cell.
features = [lendoc, numdigitos_,palsdoc,charspal,mayusdoc,speccharcnt,emojiscnt,
            LexRich_,flskGL,VERBcnt,ADJcnt,NOUNcnt,PRONcnt]
# Names of the columns above, position by position.
ft_idx = ['lendoc', 'numdigitos_','palsdoc','charspal','mayusdoc','speccharcnt','emojiscnt',
          'LexRich_','flskGL','VERBcnt','ADJcnt','NOUNcnt','PRONcnt']
# Sparse vectorizer outputs and the file-name stems used when they are saved to disk.
Bows = [X_bow_vect,bigram_BoW,bigram_POS,X_TfidfBoW,bigram_TfidfBoW,bigram_TfidfPOS]
Bows_idx = ['X_bow_vect_train','bigram_BoW_train','bigram_POS_train','X_TfidfBoW_train','bigram_TfidfBoW_train',
            'bigram_TfidfPOS_train']

In [41]:
# Stack the hand-crafted columns one at a time, keeping every intermediate
# matrix: entry k holds the first k + 2 features side by side.
z = add_feature(features[0], features[1])
Combined_ftures = [z]
for siguiente in features[2:13]:
    z = add_feature(z, siguiente)
    Combined_ftures.append(z)

In [48]:
# Persist the full hand-crafted matrix and every vectorizer matrix as .npz files.
scipy.sparse.save_npz('Features_train.npz', Combined_ftures[-1])
for nombre, matriz in zip(Bows_idx, Bows):
    scipy.sparse.save_npz(nombre +'.npz', matriz)

In [49]:
# Load the gold test split at its first point of use so that this cell also
# works on a fresh kernel (the notebook otherwise reads the file only further down).
data_test = pd.read_csv('test_gold.csv')
# Task 1 test inputs and labels.
X_test = data_test.tweet
y_test = data_test.humor

In [50]:
# Refit each vectorizer on the training data, this time keeping the fitted
# objects so the test set can be transformed with them (the helper functions
# BoW/BigramWord/TfidfBoW/TfidfBigram only return the matrices).

# Unigram counts over the tweets.
vect_bow = CountVectorizer(stop_words = STOPWORDS)
X_vect_bow = vect_bow.fit_transform(X)

# Bigram counts over the POS-tag strings.
bigram_vect = CountVectorizer(stop_words = STOPWORDS, ngram_range=(2, 2))
X_bigram_POS = bigram_vect.fit_transform(POSxtweet)

# Bigram counts over the tweets.
vect_bibow = CountVectorizer(stop_words = STOPWORDS, ngram_range=(2, 2))
X_vect_bibow = vect_bibow.fit_transform(X)

# Unigram TF-IDF over the tweets (reassigns X_TfidfBoW).
vect_TfidfBoW = TfidfVectorizer(stop_words = STOPWORDS)
X_TfidfBoW = vect_TfidfBoW.fit_transform(X)

# Bigram TF-IDF over the POS-tag strings.
bigram_vectfidf = TfidfVectorizer(stop_words = STOPWORDS, ngram_range=(2, 2))
bigram_tfidf_POS = bigram_vectfidf.fit_transform(POSxtweet)

# Bigram TF-IDF over the tweets.
bigram_vectfidf_bow = TfidfVectorizer(stop_words = STOPWORDS, ngram_range=(2, 2))
bigram_tfid = bigram_vectfidf_bow.fit_transform(X)

In [51]:
# Hand-crafted features for the test tweets (same functions as for training).
lendoc_test = numcaracteres(X_test)
numdigitos_test = numdigitos(X_test)
palsdoc_test = palsxdoc(X_test)
charspal_test = charsxpal(X_test)
mayusdoc_test = UpperCase_doc(X_test)
speccharcnt_test = specchar(X_test)
emojiscnt_test = cntemojis(X_test)
LexRich_test = LexRich(X_test)
flskGL_test = FKGL(X_test)
VERBcnt_test, ADJcnt_test, NOUNcnt_test, PRONcnt_test, POSxtweet_test = POS_Vect(X_test)

# Vectorizer features: every test matrix must come from the vectorizer that
# was fitted for the matching training matrix.
X_bow_vect_test = vect_bow.transform(X_test)
bigram_BoW_test = vect_bibow.transform(X_test)
bigram_POS_test = bigram_vect.transform(POSxtweet_test)
# TF-IDF matrices use the fitted TF-IDF vectorizers, not the count ones.
X_tfidfbow_test = vect_TfidfBoW.transform(X_test)
bigram_TfidfBoW_test = bigram_vectfidf_bow.transform(X_test)
bigram_tfidfPOS_test = bigram_vectfidf.transform(POSxtweet_test)



In [58]:
# Hand-crafted test columns, in the same order as the training `features` list.
features_test = [lendoc_test, numdigitos_test, palsdoc_test, charspal_test, mayusdoc_test, 
                 speccharcnt_test, emojiscnt_test, LexRich_test,flskGL_test,VERBcnt_test,ADJcnt_test,NOUNcnt_test,
                 PRONcnt_test]

# Names of the columns above, position by position.
ft_idx_test = ['lendoc_test', 'numdigitos_test','palsdoc_test','charspal_test','mayusdoc_test',
               'speccharcnt_test','emojiscnt_test','LexRich_test','flskGL_test','VERBcnt_test','ADJcnt_test',
               'NOUNcnt_test','PRONcnt_test']
# Test vectorizer matrices and the file-name stems used when they are saved.
Bows_test = [X_bow_vect_test,bigram_BoW_test,bigram_POS_test,X_tfidfbow_test,bigram_TfidfBoW_test,bigram_tfidfPOS_test]
Bows_test_idx = ['X_bow_vect_test','bigram_BoW_test','bigram_POS_test','X_tfidfbow_test','bigram_TfidfBoW_test','bigram_tfidfPOS_test']

In [60]:
# Stack the hand-crafted test columns one at a time, keeping every
# intermediate matrix: entry k holds the first k + 2 features side by side.
z = add_feature(features_test[0], features_test[1])
Combined_ftures_tst = [z]
for siguiente in features_test[2:13]:
    z = add_feature(z, siguiente)
    Combined_ftures_tst.append(z)

In [61]:
# Persist the full hand-crafted test matrix and every test vectorizer matrix as .npz files.
scipy.sparse.save_npz('Features_test.npz', Combined_ftures_tst[-1])
for nombre, matriz in zip(Bows_test_idx, Bows_test):
    scipy.sparse.save_npz(nombre +'.npz', matriz)

## Entrenamiento 

    * StratifiedKFold y LogisticRegression
    * StratifiedKFold y SVM
    * StratifiedKFold y RandomForest

In [13]:
# Task 1 classifiers; all three use class_weight='balanced'.
clfLR = LogisticRegression(solver = 'lbfgs', tol = 0.001, C = 0.01, class_weight = 'balanced')
clfSVC = SVC(C = 0.01, kernel = 'linear', class_weight = 'balanced')
clfRF = RandomForestClassifier(max_depth = 10, random_state = 0, class_weight = 'balanced')

In [14]:
def Resultados(X,y,clf):
    """
    Return the mean F1 score of `clf` over a 10-fold stratified cross-validation.

    X: feature matrix that supports row selection with an integer array
       (dense array or CSR matrix).
    y: labels, one per row of X (array-like or pandas Series).
    clf: estimator with fit/predict; it is refitted in place on every fold,
         so after the call it holds the fit of the last fold.

    The folds are shuffled with a fixed random_state, so repeated calls on
    the same data use the same splits.
    """
    # Convert once to a plain array so rows are always selected by POSITION;
    # indexing a pandas Series with [] selects by label, which only matches
    # positions when the index is the default 0..n-1 range.
    y_arr = np.asarray(y)
    skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 1)
    lst_f1_stratified = []
    for train_index, test_index in skf.split(X, y_arr):
        clf.fit(X[train_index], y_arr[train_index])
        # f1_score takes (y_true, y_pred), in that order.
        lst_f1_stratified.append(f1_score(y_arr[test_index], clf.predict(X[test_index])))
    return np.mean(lst_f1_stratified)

## Empezamos a probar con cada uno de los vectores

In [15]:
# All candidate feature blocks: 13 hand-crafted columns followed by the 6 vectorizer matrices.
features = [lendoc, numdigitos_,palsdoc,charspal,mayusdoc,speccharcnt,emojiscnt,LexRich_,flskGL,VERBcnt,ADJcnt,NOUNcnt,PRONcnt,
            X_bow_vect,bigram_BoW,bigram_POS,X_TfidfBoW,bigram_TfidfBoW,bigram_TfidfPOS]
# Names of the blocks above, position by position. There is no 'PalMayus'
# entry because no such feature is in `features`; an extra label would shift
# every following name by one.
ft_idx = ['lendoc', 'numdigitos_','palsdoc','charspal','mayusdoc','speccharcnt','emojiscnt','LexRich_','flskGL','VERBcnt','ADJcnt',
          'NOUNcnt','PRONcnt', 'X_bow_vect','bigram_BoW','bigram_POS','X_TfidfBoW','bigram_TfidfBoW','bigram_TfidfPOS']
assert len(features) == len(ft_idx), "every feature block needs exactly one name"

In [None]:
LR_per_ft = []
for i in range(len(features)):
    LR_per_ft.append((Resultados(features[i], y, clfLR), ft_idx[i]))

In [None]:
sorted(LR_per_ft, key=lambda x: x[0], reverse = True)

In [None]:
# Cross-validated F1 of the linear SVM on each feature block alone,
# stored as (score, name) pairs and shown best-first.
SVM_per_ft = [
    (Resultados(bloque, y, clfSVC), nombre)
    for bloque, nombre in zip(features, ft_idx)
]
sorted(SVM_per_ft, key=lambda par: par[0], reverse=True)

In [None]:
# Cross-validated F1 of the random forest on each feature block alone,
# stored as (score, name) pairs and shown best-first.
RF_per_ft = [
    (Resultados(bloque, y, clfRF), nombre)
    for bloque, nombre in zip(features, ft_idx)
]
sorted(RF_per_ft, key=lambda par: par[0], reverse=True)

## Resultados combinando las características

In [None]:
# Single scaler instance; the cells below call fit_transform on it once per feature combination.
MAScaler = MaxAbsScaler()

### LR

In [None]:
best_ftures_LR = [features[ft_idx.index(x[1])] for x in sorted(LR_per_ft, key=lambda x: x[0], reverse = True)]

In [None]:
LR_z = []
z = add_feature(best_ftures_LR[0], best_ftures_LR[1])
LR_z.append(z)

In [None]:
for i in range(2,19):
    z = add_feature(z, best_ftures_LR[i])
    LR_z.append(z)

In [None]:
LR_z_scaled =[]
for i in LR_z:
    LR_z_scaled.append(MAScaler.fit_transform(i))

In [None]:
Results_combined_LR = []
for i in LR_z:
    Results_combined_LR.append(Resultados(i, y, clfLR))
Results_combined_LR

In [None]:
Results_combined_LR

In [None]:
Results_combined_LR_scaled = []
for i in LR_z_scaled:
    Results_combined_LR_scaled.append(Resultados(i, y, clfLR))

In [None]:
Results_combined_LR_scaled

## SVM

In [None]:
# Feature blocks ordered from best to worst individual SVM score.
best_ftures_SVM = [
    features[ft_idx.index(nombre)]
    for _, nombre in sorted(SVM_per_ft, key=lambda par: par[0], reverse=True)
]

# Two best blocks first, then one more block per step; every intermediate matrix is kept.
SVMz = add_feature(best_ftures_SVM[0], best_ftures_SVM[1])
SVM_z = [SVMz]
for siguiente in best_ftures_SVM[2:19]:
    SVMz = add_feature(SVMz, siguiente)
    SVM_z.append(SVMz)

# Max-abs scaled copy of every combination.
SVM_z_scaled = [MAScaler.fit_transform(combinacion) for combinacion in SVM_z]

In [None]:
Results_combined_SVM = []
for i in SVM_z:
    Results_combined_SVM.append(Resultados(i, y, clfSVC))

In [None]:
Results_combined_SVM

In [None]:
Results_combined_SVM_scaled = []
for i in SVM_z_scaled:
    Results_combined_SVM_scaled.append(Resultados(i, y, clfLR))

In [None]:
Results_combined_SVM_scaled

## RF

In [None]:
# Feature blocks ordered from best to worst individual RF score.
best_ftures_RF = [
    features[ft_idx.index(nombre)]
    for _, nombre in sorted(RF_per_ft, key=lambda par: par[0], reverse=True)
]

# Two best blocks first, then one more block per step; every intermediate matrix is kept.
RFz = add_feature(best_ftures_RF[0], best_ftures_RF[1])
RF_z = [RFz]
for siguiente in best_ftures_RF[2:19]:
    RFz = add_feature(RFz, siguiente)
    RF_z.append(RFz)

# Max-abs scaled copy of every combination.
RF_z_scaled = [MAScaler.fit_transform(combinacion) for combinacion in RF_z]

In [None]:
Results_combined_RF = []
for i in RF_z:
    Results_combined_RF.append(Resultados(i, y, clfRF))

In [None]:
Results_combined_RF

In [None]:
Results_combined_RF_scaled = []
for i in RF_z_scaled:
    Results_combined_RF_scaled.append(Resultados(i, y, clfLR))

In [None]:
Results_combined_RF_scaled

In [None]:
# Position (wrapped in a one-element list) of the RF feature combination with the highest mean F1.
BestCombination = [Results_combined_RF.index(max(Results_combined_RF))]

## Evaluación conjunto de test

### Lectura conjunto de test

In [16]:
# Load the gold-labelled test split and preview its first rows.
# NOTE(review): cells further up already read `data_test`, so on a fresh
# kernel the file has to be loaded before they run.
data_test = pd.read_csv('test_gold.csv')
data_test.head(5)

Unnamed: 0,index,tweet,humor,prejudice_woman,prejudice_lgbtiq,prejudice_inmigrant_race,gordofobia,mean_prejudice
0,52830,-Mamá en la escuela me dicen gorda -Pobresilla...,1,1,0,0,1,1.6
1,78883,"No te sientas diferente, da igual si eres negr...",1,0,0,1,1,1.4
2,78926,Si esta asi.. SUPER SI.. y que se pongan celos...,1,1,0,0,1,1.3
3,61844,—Bebé ¿Me veo gorda con este vestido?\n—¡No mi...,1,1,0,0,1,2.3
4,78830,Las mujeres solo desean 2 cosas en la vida: co...,1,1,0,0,1,2.4


In [None]:
LR_per_ft_test = []
for i in range(len(features)):
    LR_per_ft_test.append((Resultados(features[i], y_test, clfLR), ft_idx[i]))

In [None]:
sorted(LR_per_ft_test, key=lambda x: x[0], reverse = True)

In [None]:
# Score every individual TEST feature block with the SVM (test matrices
# paired with the test labels), and show the test ranking just computed.
features_test_all = features_test + Bows_test
SVM_per_ft_test = []
for i in range(len(features_test_all)):
    SVM_per_ft_test.append((Resultados(features_test_all[i], y_test, clfSVC), ft_idx[i]))
sorted(SVM_per_ft_test, key=lambda x: x[0], reverse = True)

In [None]:
# Score every individual TEST feature block with the random forest. The
# results go to their own list so the training ranking in `RF_per_ft`, which
# later cells use to order the features, is not overwritten.
features_test_all = features_test + Bows_test
RF_per_ft_test = []
for i in range(len(features_test_all)):
    RF_per_ft_test.append((Resultados(features_test_all[i], y_test, clfRF), ft_idx[i]))
sorted(RF_per_ft_test, key=lambda x: x[0], reverse = True)

## Resultados combinando las características

### LR

In [None]:
best_ftures_LR_test = [features_test[ft_idx.index(x[1])] for x in sorted(LR_per_ft, key=lambda x: x[0], reverse = True)]

In [None]:
LR_z_test = []
z = add_feature(best_ftures_LR_test[0], best_ftures_LR_test[1])
LR_z_test.append(z)

In [None]:
for i in range(2,19):
    z = add_feature(z, best_ftures_LR_test[i])
    LR_z_test.append(z)

In [None]:
Results_combined_LR_z = []
for i in LR_z_test:
    Results_combined_LR_z.append(Resultados(i, y_test, clfLR))

In [None]:
Results_combined_LR_z

In [None]:
Resultados(LR_z_test, y_test, clfLR)

## SVM

In [None]:
# Test feature blocks ordered by the TRAINING ranking of the SVM scores.
# The ranking covers all 19 blocks, so the lookup list must hold the 13
# hand-crafted test columns AND the 6 test vectorizer matrices.
features_test_all = features_test + Bows_test
best_ftures_SVM_test = [features_test_all[ft_idx.index(x[1])] for x in sorted(SVM_per_ft, key=lambda x: x[0], reverse = True)]

# Two best blocks first, then one more block per step; every intermediate matrix is kept.
SVM_z_test = []
z = add_feature(best_ftures_SVM_test[0], best_ftures_SVM_test[1])
SVM_z_test.append(z)
for i in range(2, len(best_ftures_SVM_test)):
    z = add_feature(z, best_ftures_SVM_test[i])
    SVM_z_test.append(z)

In [None]:
Results_combined_SVM_test = []
for i in SVM_z_test:
    Results_combined_SVM_test.append(Resultados(i, y_test, clfSVC))

In [None]:
Results_combined_SVM_test

## RF

In [None]:
# Test feature blocks ordered by the ranking stored in RF_per_ft.
# The ranking covers all 19 blocks, so the lookup list must hold the 13
# hand-crafted test columns AND the 6 test vectorizer matrices.
features_test_all = features_test + Bows_test
best_ftures_RF_test = [features_test_all[ft_idx.index(x[1])] for x in sorted(RF_per_ft, key=lambda x: x[0], reverse = True)]

# Two best blocks first, then one more block per step; every intermediate matrix is kept.
RF_z_test = []
z = add_feature(best_ftures_RF_test[0], best_ftures_RF_test[1])
RF_z_test.append(z)
for i in range(2, len(best_ftures_RF_test)):
    z = add_feature(z, best_ftures_RF_test[i])
    RF_z_test.append(z)

In [None]:
Results_combined_RF_test = []
for i in RF_z_test:
    Results_combined_RF_test.append(Resultados(i, y_test, clfRF))

In [None]:
Results_combined_RF_test

In [None]:
# Hand-picked test feature stack, built one block at a time. Every name used
# here is a test matrix/column created in the test feature cells:
# `X_tfidfbow_test` is the unigram TF-IDF test matrix and `LexRich_test` the
# lexical-richness test column.
w_0 = add_feature(X_bow_vect_test, X_tfidfbow_test)
w_00 = add_feature(w_0, bigram_POS_test)
w_000 = add_feature(w_00, bigram_tfidfPOS_test)
w_1 = add_feature(w_000, lendoc_test)
w_2 = add_feature(w_1, palsdoc_test)
w_3 = add_feature(w_2, speccharcnt_test)
w_4 = add_feature(w_3, flskGL_test)
w_5 = add_feature(w_4, emojiscnt_test)
w_6 = add_feature(w_5, ADJcnt_test)
w_7 = add_feature(w_6, VERBcnt_test)
w_8 = add_feature(w_7, charspal_test)
w_9 = add_feature(w_8, LexRich_test)

#X_bow_vect
#bigram_POS
#lendoc
#palsdoc
#speccharcnt
#flskGL
#emojiscnt
#ADJcnt
#VERBcnt
#charspal
#LexRich

In [None]:
# NOTE(review): RF_pred is only assigned in the next cell, so on a fresh
# kernel this cell raises NameError; the same two statements are repeated
# after the predictions are computed.
ResultadosTarea1 = pd.DataFrame(data_test['index'])
ResultadosTarea1['PrediccionesT1'] = pd.Series(RF_pred)

In [None]:
# Task 1 predictions for the stacked test matrix w_9 with each classifier.
# NOTE(review): the classifiers are used as last fitted (inside Resultados);
# confirm that fit used the same columns as w_9.
LR_pred = clfLR.predict(w_9)
SVC_pred = clfSVC.predict(w_9)
RF_pred = clfRF.predict(w_9)

In [None]:
# Test-set ids next to the random-forest predictions.
ResultadosTarea1 = pd.DataFrame(data_test['index'])
ResultadosTarea1['PrediccionesT1'] = pd.Series(RF_pred)
ResultadosTarea1

In [None]:
# Same table with the SVM predictions, written to CSV without the DataFrame index.
ResultadosTarea1_2 = pd.DataFrame(data_test['index'])
ResultadosTarea1_2['PrediccionesT1'] = pd.Series(SVC_pred)
ResultadosTarea1_2.to_csv('ResultadosTarea1_2.csv', sep = ',', index = False)

## Task 2A:

Prejudice Target Detection:

Taking into account the minority groups analyzed (i.e., women and feminists, the LGBTIQ community, immigrants and racially discriminated people, and overweight people), participants are asked to identify the groups targeted in each tweet as a multilabel classification task.

The metric employed for the second task will be macro-F1.

In [None]:
## 'best combination': the training feature stack with the highest RF cross-validated F1
z_8 = RF_z[Results_combined_RF.index(max(Results_combined_RF))]

In [None]:
# Task 2A targets: one column per prejudice target group (multilabel).
y_prejudice = data_train[['prejudice_woman', 'prejudice_lgbtiq', 'prejudice_inmigrant_race', 'gordofobia']]

In [None]:
y_prejudice

In [None]:
# 80/20 hold-out split of the TRAINING data.
# NOTE: this rebinds x_test / y_test, which earlier cells used for the Task 1 test labels.
x_train, x_test, y_train, y_test = train_test_split( z_8, y_prejudice, test_size = 0.2, random_state=0)

In [None]:
def test_classifier(classifier, x_train, y_train, x_test, y_test):
    """
    Fit `classifier` as a multilabel model and print its macro-F1 on the test data.

    classifier: estimator with fit/predict; it is wrapped in a
        MultiOutputClassifier, which fits one copy per label column.
    x_train, y_train: training features and multilabel targets.
    x_test, y_test: evaluation features and multilabel targets.

    Prints the macro-averaged F1 score; returns None.
    """
    # One independent classifier per target column, fitted in parallel.
    multilabel_classifier = MultiOutputClassifier(classifier, n_jobs=-1)
    multilabel_classifier.fit(x_train, y_train)

    y_test_pred = multilabel_classifier.predict(x_test)

    # f1_score takes (y_true, y_pred), in that order.
    print(f1_score(y_test, y_test_pred, average = 'macro'))

In [None]:
# Macro-F1 on the hold-out split for each candidate classifier.
test_classifier(clfLR, x_train, y_train, x_test, y_test)

In [None]:
test_classifier(clfSVC, x_train, y_train, x_test, y_test)

In [None]:
# Linear SVM with C = 1 instead of the C = 0.01 used by clfSVC.
test_classifier(SVC(C = 1, kernel = 'linear', class_weight = 'balanced'), x_train, y_train, x_test, y_test)

In [None]:
test_classifier(clfRF, x_train, y_train, x_test, y_test)

## Evaluación en el test set

In [None]:
# Task 2A gold labels for the test set.
y_prejudice_test = data_test[['prejudice_woman', 'prejudice_lgbtiq', 'prejudice_inmigrant_race', 'gordofobia']]
y_prejudice_test.head()

In [None]:
# Test feature stack at the position of the best RF training combination.
# NOTE: this rebinds w_9.
w_9 = RF_z_test[Results_combined_RF.index(max(Results_combined_RF))]

In [None]:
# Logistic regression: fit on the hold-out training part, score macro-F1 on the gold test set.
# NOTE(review): f1_score is documented as (y_true, y_pred); the arguments here are in the opposite order.
multilabel_classifier = MultiOutputClassifier(clfLR, n_jobs=-1)
multilabel_classifier = multilabel_classifier.fit(x_train, y_train)
# Get predictions for test data
y_pred_tarea2 = multilabel_classifier.predict(w_9)
f1_score(y_pred_tarea2, y_prejudice_test, average = 'macro')

In [None]:
# Linear SVM (C = 1): same procedure.
multilabel_classifier = MultiOutputClassifier(SVC(C = 1, kernel = 'linear', class_weight = 'balanced'), n_jobs=-1)
multilabel_classifier = multilabel_classifier.fit(x_train, y_train)
# Get predictions for test data
y_pred_tarea2 = multilabel_classifier.predict(w_9)
f1_score(y_pred_tarea2, y_prejudice_test, average = 'macro')


In [None]:
# Random forest: same procedure.
multilabel_classifier = MultiOutputClassifier(clfRF, n_jobs=-1)
multilabel_classifier = multilabel_classifier.fit(x_train, y_train)
# Get predictions for test data
y_pred_tarea2 = multilabel_classifier.predict(w_9)
f1_score(y_pred_tarea2, y_prejudice_test, average = 'macro')

In [None]:
# Write the predictions currently held in y_pred_tarea2 (whichever of the
# three cells above ran last) to CSV, one column per target group.
Resultados_tarea2 = pd.DataFrame(y_pred_tarea2, columns = ['prejudice_woman', 'prejudice_lgbtiq', 'prejudice_inmigrant_race', 'gordofobia'])
Resultados_tarea2.to_csv('ResultadosTarea2.csv', sep = ',', index = False)

# Task 2B:

Degree of Prejudice Prediction:

The third subtask consists of predicting, on a continuous scale from 1 to 5, how prejudicial the message is on average among minority groups. Submitted predictions are evaluated with the Root Mean Squared Error.

In [None]:
hurtlex = pd.read_csv('hurtlex.txt', sep = '\t')

In [None]:
THlx = []
for i in range(len(hurtlex)):
    if len(hurtlex.lemma[i].split()) < 2:
        THlx.append((hurtlex.lemma[i], hurtlex.category[i]))
ExHlx_2 = []
for i in range(len(hurtlex)):
    if len(hurtlex.lemma[i].split()) == 2:
        ExHlx_2.append((hurtlex.lemma[i], hurtlex.category[i]))
ExHlx_3 = []
for i in range(len(hurtlex)):
    if len(hurtlex.lemma[i].split()) == 3:
        ExHlx_3.append((hurtlex.lemma[i], hurtlex.category[i]))

In [None]:
# Read the SHARE lexicon (one entry per line) and split its entries by word
# count: fewer than two words, exactly two, and everything longer.
with open('SHARE.txt', 'r', encoding = 'utf-8') as my_file:
    s = my_file.readlines()
TOf_S, ExOf_S_2, ExOf_S_3 = [], [], []
for linea in s:
    n_palabras = len(linea.split())
    entrada = linea.replace('\n', '')
    if n_palabras < 2:
        destino = TOf_S
    elif n_palabras == 2:
        destino = ExOf_S_2
    else:
        destino = ExOf_S_3
    destino.append(entrada)
# One extra single-word term added by hand.
TOf_S.append('bastardo')

In [None]:
# Extra terms, one per line, read from a local text file.
with open('palabras_nuevas.txt', 'r', encoding = 'utf-8') as my_file:
    palabras_nuevas = my_file.readlines()

In [None]:
# Drop the newline characters from each entry.
palabras_nuevas = [i.replace('\n', '') for i in palabras_nuevas]

In [None]:
# Extend the single-word term list with the extra terms.
# NOTE: re-running this cell appends the extra terms again.
TOf_S = TOf_S + palabras_nuevas

In [None]:
# Distinct lemmas of each lexicon group (the category of each pair is dropped).
THlx_unicos = list(set([lema for lema, _ in THlx]))
ExHlx_2_unicos = list(set([lema for lema, _ in ExHlx_2]))
ExHlx_3_unicos = list(set([lema for lema, _ in ExHlx_3]))

# Merge both lexicons without repeating entries. The "already present?" test
# runs against a set, so each check is O(1) instead of a scan of the whole list.
_share_1 = set(TOf_S)
_share_2 = set(ExOf_S_2)
_share_3 = set(ExOf_S_3)
PalAgresivas = [pal for pal in THlx_unicos if pal not in _share_1] + TOf_S
ExAgres_2 = [pal for pal in ExHlx_2_unicos if pal not in _share_2] + ExOf_S_2
ExAgres_3 = [pal for pal in ExHlx_3_unicos if pal not in _share_3] + ExOf_S_3

In [None]:
def NGramas(listaPalabras, n):
    """
    Return every contiguous window of n items of listaPalabras, in order.

    Each window is a slice of the input, so it has the input's sequence type.
    The result is an empty list when the input has fewer than n items.
    """
    ventanas = []
    ultimo_inicio = len(listaPalabras) - n
    for inicio in range(ultimo_inicio + 1):
        ventanas.append(listaPalabras[inicio:inicio + n])
    return ventanas

In [None]:
# Share of lexicon hits per training tweet, at three granularities:
# single tokens, token bigrams and token trigrams.
# Lexicon membership is tested against sets, so every lookup is O(1) instead
# of a scan over the whole lexicon list.
_pal_agresivas = set(PalAgresivas)
_expr_agresivas_2 = set(ExAgres_2)
_expr_agresivas_3 = set(ExAgres_3)

palAgresivas_TotPal = []
exprAgresivas2_totexpr2 = []
exprAgresivas3_totexpr3 = []
for i in data_train.tweet:
    # Same cleaning as the feature functions: drop punctuation, digits,
    # placeholders and emojis; lower-case; normalise whitespace.
    raw = re.sub(r'[^\w\s]|\d|(MENTION)|(URL)|(HASHTAG)', '', i)
    raw = emoji.replace_emoji(raw, replace = '').lower().strip()
    raw = re.sub('_', ' ', raw).strip()
    raw = re.sub('\n', ' ', raw)
    raw = re.sub(' {2,}', ' ', raw)
    tokens = [token.text for token in nlp(raw)]
    doc_bigrams = [' '.join(j) for j in NGramas(tokens, 2)]
    doc_trigrams = [' '.join(j) for j in NGramas(tokens, 3)]
    # For each granularity: hits / total, or 0 when the tweet has no such unit.
    for unidades, lexico, destino in (
        (tokens, _pal_agresivas, palAgresivas_TotPal),
        (doc_bigrams, _expr_agresivas_2, exprAgresivas2_totexpr2),
        (doc_trigrams, _expr_agresivas_3, exprAgresivas3_totexpr3),
    ):
        if unidades:
            destino.append(sum(unidad in lexico for unidad in unidades) / len(unidades))
        else:
            destino.append(0)

In [None]:
# Number of training tweets with at least one trigram found in the lexicon.
len([i for i in exprAgresivas3_totexpr3 if i > 0])

In [None]:
# z_8 is the original feature stack; the lexicon-based ratios are appended to it afterwards
z_9 = add_feature(z_8, np.array(palAgresivas_TotPal).reshape(-1, 1))
#z_10 = add_feature(z_8, np.array(exprAgresivas2_totexpr2).reshape(-1, 1))
#z_11 = add_feature(z_8, np.array(exprAgresivas3_totexpr3).reshape(-1, 1))

In [None]:
# Task 2B target: the `mean_prejudice` column of the training data.
y_meanprejudice = data_train.mean_prejudice

In [None]:
# Task 2B regressors.
# NOTE(review): the next cell repeats these definitions and adds clfSGDR; one of the two cells is redundant.
clfRidge = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1], cv = 10)
clfLasso = LassoCV( cv = 2)
clfSVM = SVR(C=1.0, epsilon=0.2, gamma= 'scale', kernel= 'linear')
clfDTR = DecisionTreeRegressor( random_state=0)

In [None]:
# Same regressors as above plus an SGD regressor.
clfRidge = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1], cv = 10)
clfLasso = LassoCV( cv = 2)
clfSVM = SVR(C=1.0, epsilon=0.2, gamma= 'scale', kernel= 'linear')
clfSGDR = SGDRegressor(max_iter=1000, tol=1e-3)
clfDTR = DecisionTreeRegressor( random_state=0)

In [None]:
def ResTarea3(clf, x_train, y_train, x_test, y_test):
    """
    Fit a regressor and print its MSE and RMSE on the evaluation data.

    clf: estimator with fit/predict; it is fitted in place on
        (x_train, y_train).
    x_test, y_test: data the errors are computed on.

    Prints both errors with two decimals; returns None.
    """
    clf.fit(x_train, y_train)
    error_cuadratico = mean_squared_error(y_test, clf.predict(x_test))

    print("MSE: %.2f" % error_cuadratico)
    # RMSE is the square root of the MSE.
    print("RMSE: %.2f" % (error_cuadratico ** 0.5))

In [None]:
# 80/20 hold-out split of the training data using the original feature stack (z_8).
x_train, x_test, y_train, y_test = train_test_split( z_8, y_meanprejudice, test_size = 0.2, random_state=0)

In [None]:
# MSE / RMSE of every regressor on the hold-out part.
ResTarea3(clfRidge, x_train, y_train, x_test, y_test)
ResTarea3(clfLasso, x_train, y_train, x_test, y_test)
ResTarea3(clfSVM, x_train, y_train, x_test, y_test)
ResTarea3(clfSGDR, x_train, y_train, x_test, y_test)
ResTarea3(clfDTR, x_train, y_train, x_test, y_test)

In [None]:
# Same split with the lexicon-ratio column added (z_9).
x_train, x_test, y_train, y_test = train_test_split( z_9, y_meanprejudice, test_size = 0.2, random_state=0)

In [None]:
ResTarea3(clfRidge, x_train, y_train, x_test, y_test)
ResTarea3(clfLasso, x_train, y_train, x_test, y_test)
ResTarea3(clfSVM, x_train, y_train, x_test, y_test)
ResTarea3(clfSGDR, x_train, y_train, x_test, y_test)
ResTarea3(clfDTR, x_train, y_train, x_test, y_test)

In [None]:
# Share of lexicon hits per TEST tweet, at three granularities:
# single tokens, token bigrams and token trigrams.
# Lexicon membership is tested against sets, so every lookup is O(1) instead
# of a scan over the whole lexicon list.
_pal_agresivas = set(PalAgresivas)
_expr_agresivas_2 = set(ExAgres_2)
_expr_agresivas_3 = set(ExAgres_3)

palAgresivas_TotPal_test = []
exprAgresivas2_totexpr2_test = []
exprAgresivas3_totexpr3_test = []
for i in data_test.tweet:
    # Same cleaning as the feature functions: drop punctuation, digits,
    # placeholders and emojis; lower-case; normalise whitespace.
    raw = re.sub(r'[^\w\s]|\d|(MENTION)|(URL)|(HASHTAG)', '', i)
    raw = emoji.replace_emoji(raw, replace = '').lower().strip()
    raw = re.sub('_', ' ', raw).strip()
    raw = re.sub('\n', ' ', raw)
    raw = re.sub(' {2,}', ' ', raw)
    tokens = [token.text for token in nlp(raw)]
    doc_bigrams = [' '.join(j) for j in NGramas(tokens, 2)]
    doc_trigrams = [' '.join(j) for j in NGramas(tokens, 3)]
    # For each granularity: hits / total, or 0 when the tweet has no such unit.
    for unidades, lexico, destino in (
        (tokens, _pal_agresivas, palAgresivas_TotPal_test),
        (doc_bigrams, _expr_agresivas_2, exprAgresivas2_totexpr2_test),
        (doc_trigrams, _expr_agresivas_3, exprAgresivas3_totexpr3_test),
    ):
        if unidades:
            destino.append(sum(unidad in lexico for unidad in unidades) / len(unidades))
        else:
            destino.append(0)

In [None]:
# Test feature stack with the lexicon-ratio column appended.
w_10 = add_feature(w_9, np.array(palAgresivas_TotPal_test).reshape(-1, 1))

In [None]:
# Task 2B gold values for the test set.
y_meanprejudice_test = data_test.mean_prejudice

In [None]:
# NOTE(review): this splits the TEST data 80/20, and ResTarea3 refits every
# regressor on the 80% part; the models fitted on the training data are not
# the ones being scored here.
x_train, x_test, y_train, y_test = train_test_split( w_10, y_meanprejudice_test, test_size = 0.2, random_state=0)

In [None]:
ResTarea3(clfRidge, x_train, y_train, x_test, y_test)
ResTarea3(clfLasso, x_train, y_train, x_test, y_test)
ResTarea3(clfSVM, x_train, y_train, x_test, y_test)
ResTarea3(clfSGDR, x_train, y_train, x_test, y_test)
ResTarea3(clfDTR, x_train, y_train, x_test, y_test)

## Combinando Embeddings + características

# FastText

In [None]:
# Tokenised, cleaned version of every gold test tweet.
# NOTE(review): `preprocess` is defined in a later cell, and `df`,
# `df_test_gold` and `simple_preprocess` are not defined or imported anywhere
# in the visible cells (presumably gensim.utils.simple_preprocess - confirm),
# so this cell cannot run on a fresh kernel as the notebook stands.
df_test_gold['clean'] = [simple_preprocess(preprocess(doc)) for doc in df_test_gold['tweet']]

In [None]:
# Same cleaning for the tweets in `df`.
df['clean'] = [simple_preprocess(preprocess(doc)) for doc in df['tweet']]

In [None]:
# 80/20 hold-out split of the cleaned tweets; the target is the `humor` column.
x_train, x_test, y_train, y_test = train_test_split( df['clean'], df['humor'], test_size=0.2, random_state = 0)

In [None]:
# NOTE(review): these imports sit in the middle of the notebook rather than in the import cell at the top.
import fasttext
import fasttext.util
##fasttext.util.download_model('es', if_exists='ignore')  # Spanish
# Load the fastText model from a local binary file in the working directory.
ft = fasttext.load_model('cc.es.300.bin')

In [None]:
# Length of the zero vector used as fallback.
# NOTE(review): presumably the dimensionality of the loaded model - confirm.
n_dim = 300

model = ft

def get_embedding(word):
    """
    Return the vector of `word` from the module-level `model`.

    When the model has no entry for the word (KeyError), a zero vector of
    length n_dim is returned instead. Any other exception is propagated
    rather than silently replaced with zeros, so a missing model or a wrong
    argument type is not hidden.
    """
    try:
        return model[word]
    except KeyError:
        return np.zeros((n_dim,))


In [None]:
def _mean_embedding(tok_sent):
    """Average the word vectors of one tokenised document; zeros when it has no tokens."""
    # np.mean over an empty list yields a NaN scalar, which would break the
    # fixed-length layout of the embedding matrix.
    if len(tok_sent) == 0:
        return np.zeros((n_dim,))
    return np.mean(np.array(list(map(get_embedding, tok_sent))), axis=0)

# One averaged vector per document for the hold-out test, training and gold test sets.
X_test_embeddings = [_mean_embedding(tok_sent) for tok_sent in x_test]
X_train_embeddings = [_mean_embedding(tok_sent) for tok_sent in x_train]

X_test_gold_embeddings = [_mean_embedding(tok_sent) for tok_sent in df_test_gold.clean]

In [None]:
# Removing the stop words
from gensim.parsing.preprocessing import remove_stopwords

def preprocess( doc):
    """
    Lower-case `doc` and replace every run of non-word characters
    (punctuation, whitespace, newlines, ...) with a single space.

    doc: string.
    Returns the normalised string; leading/trailing separators become a
    single leading/trailing space.
    """
    # Raw literal so that the regex escape reaches the regex engine verbatim.
    # Newlines are non-word characters, so they are already turned into
    # spaces here and need no separate removal step.
    return re.sub(r'\W+', ' ', doc.lower())

In [None]:
# Tokenised, cleaned version of every gold test tweet.
# NOTE(review): `simple_preprocess`, `df` and `df_test_gold` are not defined
# or imported in the visible cells - confirm where they come from.
df_test_gold['clean'] = [simple_preprocess(preprocess(doc)) for doc in df_test_gold['tweet']]

In [None]:
# Same cleaning for the tweets in `df`, followed by a display of the new column.
df['clean'] = [simple_preprocess(preprocess(doc)) for doc in df['tweet']]
df['clean']