In [8]:
import numpy as np
import pandas as pd
import re
import spacy
import nltk
import emoji
import tqdm
from nltk import word_tokenize
from nltk import SyllableTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression, RidgeCV, LassoCV, SGDRegressor
from sklearn.svm import SVC, SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split, KFold
from sklearn.metrics import f1_score, confusion_matrix,classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import multilabel_confusion_matrix, ConfusionMatrixDisplay, mean_squared_error
from sklearn.tree import DecisionTreeRegressor

In [2]:
nlp = spacy.load('es_core_news_sm')
stopwords_spacy = list(nlp.Defaults.stop_words)
stopwords_nltk = nltk.corpus.stopwords.words('spanish')
morestpwords = list(set(stopwords_spacy) - set(stopwords_nltk) )
STOPWORDS = stopwords_nltk + morestpwords

# Workflow

* Extracción de características
    * Textuales
        * Número de caracteres por documento
        * Número de dígitos
        * Número de palabras por documento
        * Número de caracteres por palabra
        * Numero de mayúsculas utilizadas por documento
        * Número de caracteres especiales
        * Número de emoticones (:),:/, <3 etc
        * Numero de emojis
        * FKGL
    * Características semánticas
        * Número de verbos
        * Número de adjetivos
        * Número de sustantivos
        * Número de pronombres
    * TfidfVectorizer
        * BoW
        * Bigramas de palabras
        * Bigramas de etiquetas POS


* Entrenamiento del Modelo      
    * Stratified K Fold
    * Logistic Regression
    * Random Forest



* Validación del Modelo

In [3]:
nlp = spacy.load('es_core_news_lg')

In [4]:
a = 'MENTION HASHTAG'
nlp = spacy.load('es_core_news_lg')
text = re.sub('[^\w\s]|\d|(MENTION)|(URL)|(HASHTAG)|', '', a)
text = emoji.replace_emoji(text, replace = '').lower().strip()
text = re.sub('_', ' ', text.lower().strip())
text = re.sub('\n', ' ', text.lower().strip())
text = re.sub(' {2,}', ' ', text.lower().strip())
text

''

In [6]:
len(nlp(text))

0

### Definición de funciones para extraer características

In [3]:
def add_feature(X, feature_to_add):
    """
    Returns sparse feature matrix with added feature.
    feature_to_add must be an array 
    """
    from scipy.sparse import csr_matrix, hstack
    return hstack([X, csr_matrix(feature_to_add)], 'csr')

In [4]:
## Características del texto
def numcaracteres(docs):
    len_caracteres = np.array([len(doc) for doc in docs]).reshape(-1, 1)
    return len_caracteres
def numdigitos(docs):
    len_digitos = np.array([len(re.findall('\d', doc)) for doc in docs]).reshape(-1, 1)
    return len_digitos
def palsxdoc(docs):
    palsperdoc =[]
    for doc in docs:
        raw = re.sub('[^\w\s]|\d|(MENTION)|(URL)|(HASHTAG)|', '', doc)
        raw = emoji.replace_emoji(raw, replace = '').lower().strip()
        raw = re.sub('_', ' ', raw.lower().strip())
        raw = re.sub('\n', ' ', raw.lower().strip())
        raw = re.sub(' {2,}', ' ', raw.lower().strip())
        palsperdoc.append(len(nltk.word_tokenize(raw)))
        #palsperdoc = np.array([len(nltk.word_tokenize(re.sub('[^\w\s]|\d]', '', doc.lower()))) for doc in docs]).reshape(-1, 1)
    return np.array(palsperdoc).reshape(-1, 1)
def charsxpal(docs):
    tokens =[]
    for doc in docs:
        raw = re.sub('[^\w\s]|\d|(MENTION)|(URL)|(HASHTAG)|', '', doc)
        raw = emoji.replace_emoji(raw, replace = '').lower().strip()
        raw = re.sub('_', ' ', raw.lower().strip())
        raw = re.sub('\n', ' ', raw.lower().strip())
        raw = re.sub(' {2,}', ' ', raw.lower().strip())
        tokens.append(nltk.word_tokenize(raw))
    #tokens = [nltk.word_tokenize(re.sub('[^\w\s]|\d', '', doc.lower())) for doc in docs]
    chars_per_token = []
    for i in range(len(tokens)):
        toks_p = []
        for token in tokens[i]:
            toks_p.append(len(token))
        chars_per_token.append(np.mean(toks_p))
    chars_per_token = np.array(chars_per_token).reshape(-1, 1)                        
    return chars_per_token
def UpperCase_doc(docs): ### MEJORAR CONTANDO EL TOTAL DE PALABRAS COMPLETAMENTE EN MAYÚSCULAS
    upper_cnt = np.array([len(re.findall('[A-Z]', doc)) for doc in docs]).reshape(-1, 1)
    return upper_cnt
def specchar(docs): ### MEJORAR SEPARAR ENTRE COMAS ADMIRACIÓN PARENTÉSIS PUNTOS RISAS JAJAJA HAHAHA LOL
    speccharcnt = np.array([len(re.findall('[^\w\s]', doc)) for doc in docs]).reshape(-1, 1)
    return speccharcnt
def cntemojis(docs):
    emojiscnt = np.array([emoji.emoji_count(doc) for doc in docs]).reshape(-1, 1)
    return emojiscnt
def LexRich(docs): ### MMEJORAR ESTA PARTE DE LA RIQUEZA LEXICA HACIENDO LEMATIZACIÓN
    LexRichperdoc = np.array([len(set(nltk.word_tokenize(re.sub('[^\w\s]|\d', '', doc.lower()))))/ len(nltk.word_tokenize(re.sub('[^\w\s]|\d', '', doc.lower()))) for doc in docs]).reshape(-1, 1)
    return LexRichperdoc

In [13]:
def FKGL(docs): #### HACER LA TOKENIZACIÓN MÁS LAS SENTENCIAS CON SPACY
    nlp = spacy.load('es_core_news_sm')
    def fkgl(doc):
        text = re.sub('[^\w\s]|\d|(MENTION)|(URL)|(HASHTAG)|', '', doc)
        text = emoji.replace_emoji(text, replace = '').lower().strip()
        text = re.sub('_', ' ', text.lower().strip())
        text = re.sub('\n', ' ', text.lower().strip())
        text = re.sub(' {2,}', ' ', text.lower().strip())
        #text = re.sub('[^\w\s]\d', '', doc.lower())
        #text = emoji.replace_emoji(text, replace='')
        tk = SyllableTokenizer()
        words = [token.text for token in nlp(text)]
        sentences = [sent for sent in nlp(doc).sents]
        silabas = tk.tokenize(text)
        if len(words) == 0:
            return 0
        else:
            fkgl_doc = 206.84 - 1.02 * (len(words)/len(sentences)) - 60 * (len(silabas)/len(words))
            return fkgl_doc
    FKGL_perdoc = np.array([fkgl(doc) for doc in docs]).reshape(-1, 1)
    return FKGL_perdoc

In [6]:
## Caracteristicas semánicas  ## HACER EL RATIO LAS CARACTERÍSTICAS/#TOTAL DE PALABRAS
def POS_Vect(docs): ### Devuelve VERB,ADJ, NOUN, PRON y POS_xTweet
    nlp = spacy.load('es_core_news_sm')
    POS_tags = []
    for i in docs:
        doc = nlp(re.sub('[^\w\s]|\d', '', i.lower()))
        POS_tags.append([token.pos_ for token in doc])
        
    POS_tweets = [' '.join(i) for i in POS_tags]
    
    VERBS = np.array([i.count('VERB') for i in POS_tags]).reshape(-1, 1)
    ADJS = np.array([i.count('ADJ') for i in POS_tags]).reshape(-1, 1)
    NOUNS = np.array([i.count('NOUN') for i in POS_tags]).reshape(-1, 1)
    PRONS = np.array([i.count('PRON') for i in POS_tags]).reshape(-1, 1)
    
    return [VERBS, ADJS, NOUNS, PRONS, POS_tweets]

In [7]:
def BoW(docs):
    vect_bow = CountVectorizer(stop_words = STOPWORDS)
    X_vect_bow = vect_bow.fit_transform(docs)
    return X_vect_bow
def BigramWord(docs):
    bigram_vect = CountVectorizer(stop_words = STOPWORDS, ngram_range=(2, 2))
    X_bigram_words = bigram_vect.fit_transform(docs)
    return X_bigram_words
def TfidfBoW(docs):
    vect_TfidfBoW = TfidfVectorizer(stop_words = STOPWORDS)
    X_TfidfBoW = vect_TfidfBoW.fit_transform(docs)
    return X_TfidfBoW
def TfidfBigram(docs):
    bigram_vectfidf = TfidfVectorizer(stop_words = STOPWORDS, ngram_range=(2, 2))
    X_bigram_tfidf = bigram_vectfidf.fit_transform(docs)
    return X_bigram_tfidf

## Lectura de datos

In [10]:
data_train = pd.read_csv('train.csv')
data_train.head(5)

Unnamed: 0,index,tweet,humor,prejudice_woman,prejudice_lgbtiq,prejudice_inmigrant_race,gordofobia,mean_prejudice
0,72157,Mi celular tiene una aplicación que te hace ve...,1,0,0,0,1,3.0
1,68084,"En esta vida me tocó tener mala suerte, espero...",1,0,0,0,1,2.8
2,69089,"Tu mamá es taaan taan obesa, que cuando pasa f...",1,0,0,0,1,3.6
3,69190,Mi tía me dijo: \n- tengo memoria de Elefante....,1,0,0,0,1,3.4
4,70474,"- Mamá, en el colegio me dicen gorda.\n- ¡Ay M...",1,0,0,0,1,3.0


In [11]:
X, y = data_train.tweet, data_train.humor

In [14]:
flskGL = FKGL(X)



## Generación de vectores

In [10]:
lendoc = numcaracteres(X)
numdigitos_ = numdigitos(X)
palsdoc = palsxdoc(X)
charspal = charsxpal(X)
mayusdoc = UpperCase_doc(X)
speccharcnt = specchar(X)
emojiscnt = cntemojis(X)
LexRich_ = LexRich(X)
flskGL = FKGL(X)
VERBcnt, ADJcnt, NOUNcnt, PRONcnt, POSxtweet = POS_Vect(X)
X_bow_vect = BoW(X)
bigram_BoW = BigramWord(X)
bigram_POS = BigramWord(POSxtweet)
X_TfidfBoW = TfidfBoW(X)
bigram_TfidfBoW = TfidfBigram(X)
bigram_TfidfPOS = TfidfBigram(POSxtweet)



## Entrenamiento 

    * StratifiedKFold y LogisticRegression
    * StratifiedKFold y SVM
    * StratifiedKFold y RandomForest

In [11]:
clfLR = LogisticRegression(solver = 'lbfgs', tol = 0.001, C = 0.01, class_weight = 'balanced')
clfSVC = SVC(C = 0.01, kernel = 'linear', class_weight = 'balanced')
clfRF = RandomForestClassifier(max_depth = 10, random_state = 0, class_weight = 'balanced')

In [12]:
def Resultados(X,y,clf):
    skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 1)
    lst_f1_stratified = []
    for train_index, test_index in skf.split(X, y):
        x_train_fold, x_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]
        clf.fit(x_train_fold, y_train_fold)
        lst_f1_stratified.append(f1_score(clf.predict(x_test_fold), y_test_fold))
    return(np.mean(lst_f1_stratified))
    #print(f'Promedio F1: {np.mean(lst_f1_stratified)}')
    #print(f'Varianza F1: {np.std(lst_f1_stratified)}')

## Empezamos a probar con cada uno de los vectores

In [13]:
features = [lendoc, numdigitos_,palsdoc,charspal,mayusdoc,speccharcnt,emojiscnt,LexRich_,flskGL,VERBcnt,ADJcnt,NOUNcnt,PRONcnt,
            X_bow_vect,bigram_BoW,bigram_POS,X_TfidfBoW,bigram_TfidfBoW,bigram_TfidfPOS]
ft_idx = ['lendoc', 'numdigitos_','palsdoc','charspal','mayusdoc','speccharcnt','emojiscnt','LexRich_','flskGL','VERBcnt','ADJcnt',
          'NOUNcnt','PRONcnt', 'X_bow_vect','bigram_BoW','bigram_POS','X_TfidfBoW','bigram_TfidfBoW','bigram_TfidfPOS']

In [14]:
LR_per_ft = []
for i in range(len(features)):
    LR_per_ft.append((Resultados(features[i], y, clfLR), ft_idx[i]))

In [15]:
sorted(LR_per_ft, key=lambda x: x[0], reverse = True)

[(0.672417600600383, 'X_bow_vect'),
 (0.6619437658941527, 'X_TfidfBoW'),
 (0.5566719938969658, 'bigram_POS'),
 (0.5480750645955403, 'flskGL'),
 (0.5478148419964768, 'bigram_TfidfPOS'),
 (0.5058975679300606, 'emojiscnt'),
 (0.49197943785638937, 'ADJcnt'),
 (0.48680241591297213, 'lendoc'),
 (0.48528640640756854, 'palsdoc'),
 (0.45870715269080947, 'charspal'),
 (0.45398935357480064, 'NOUNcnt'),
 (0.45190661457494985, 'VERBcnt'),
 (0.41828505343988825, 'speccharcnt'),
 (0.3876879253380573, 'LexRich_'),
 (0.37740858973104363, 'mayusdoc'),
 (0.3600446743742396, 'PRONcnt'),
 (0.32759744988956707, 'bigram_TfidfBoW'),
 (0.3223559846834301, 'bigram_BoW'),
 (0.19814262672377614, 'numdigitos_')]

In [16]:
SVM_per_ft = []
for i in range(len(features)):
    SVM_per_ft.append((Resultados(features[i], y, clfSVC), ft_idx[i]))
sorted(SVM_per_ft, key=lambda x: x[0], reverse = True)

[(0.6414375491687719, 'X_bow_vect'),
 (0.5683031413589866, 'bigram_POS'),
 (0.5522354497781581, 'flskGL'),
 (0.550145999441701, 'lendoc'),
 (0.5423544630316899, 'palsdoc'),
 (0.5214133708826127, 'NOUNcnt'),
 (0.4958985228200724, 'emojiscnt'),
 (0.49197943785638937, 'ADJcnt'),
 (0.4879765175006218, 'VERBcnt'),
 (0.45870715269080947, 'charspal'),
 (0.4538307544149787, 'bigram_TfidfPOS'),
 (0.4408439890491671, 'PRONcnt'),
 (0.3785571055931422, 'speccharcnt'),
 (0.29771056317278166, 'mayusdoc'),
 (0.19920369058162174, 'bigram_BoW'),
 (0.07974910995094983, 'numdigitos_'),
 (0.05128681177976953, 'X_TfidfBoW'),
 (0.049014084507042255, 'LexRich_'),
 (0.049014084507042255, 'bigram_TfidfBoW')]

In [77]:
RF_per_ft = []
for i in range(len(features)):
    RF_per_ft.append((Resultados(features[i], y, clfRF), ft_idx[i]))
sorted(RF_per_ft, key=lambda x: x[0], reverse = True)

[(0.6942975756889072, 'X_bow_vect'),
 (0.6882072992315476, 'X_TfidfBoW'),
 (0.5728351032429186, 'bigram_POS'),
 (0.5517639964010719, 'palsdoc'),
 (0.5469071897062637, 'bigram_TfidfPOS'),
 (0.5245080983914179, 'NOUNcnt'),
 (0.5244931291886379, 'lendoc'),
 (0.5110452685093199, 'flskGL'),
 (0.5060477277366158, 'emojiscnt'),
 (0.4957014049611724, 'PRONcnt'),
 (0.49041317517854244, 'speccharcnt'),
 (0.47881145693251803, 'VERBcnt'),
 (0.4633415542754025, 'ADJcnt'),
 (0.45144381933368394, 'mayusdoc'),
 (0.4502662890534679, 'charspal'),
 (0.343894436534793, 'LexRich_'),
 (0.2650373809749132, 'bigram_TfidfBoW'),
 (0.2649670102810389, 'bigram_BoW'),
 (0.1509526738038215, 'numdigitos_')]

## Resultados combinando las características

### LR

In [18]:
best_ftures_LR = [features[ft_idx.index(x[1])] for x in sorted(LR_per_ft, key=lambda x: x[0], reverse = True)]

In [19]:
LR_z = []
z = add_feature(best_ftures_LR[0], best_ftures_LR[1])
LR_z.append(z)

In [20]:
for i in range(2,19):
    z = add_feature(z, best_ftures_LR[i])
    LR_z.append(z)

In [21]:
Results_combined_LR = []
for i in LR_z:
    Results_combined_LR.append(Resultados(i, y, clfLR))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [22]:
Results_combined_LR

[0.6748724904734034,
 0.6464885733477115,
 0.6489502284236688,
 0.648266900077964,
 0.6494516268366552,
 0.6515506756373439,
 0.6475029088979538,
 0.6470650512112742,
 0.6470880718599229,
 0.6444814716384724,
 0.6431302667570284,
 0.6715567486286307,
 0.6727196655055416,
 0.6715523006338816,
 0.6741047628701426,
 0.6715633568209469,
 0.6719380710087354,
 0.6709642092580461]

## SVM

In [23]:
best_ftures_SVM = [features[ft_idx.index(x[1])] for x in sorted(SVM_per_ft, key=lambda x: x[0], reverse = True)]

SVM_z = []
SVMz = add_feature(best_ftures_SVM[0], best_ftures_SVM[1])
SVM_z.append(SVMz)
for i in range(2,19):
    SVMz = add_feature(SVMz, best_ftures_SVM[i])
    SVM_z.append(SVMz)

In [24]:
Results_combined_SVM = []
for i in SVM_z:
    Results_combined_SVM.append(Resultados(i, y, clfSVC))

In [25]:
Results_combined_SVM

[0.6595505799193175,
 0.662128771217862,
 0.6647473824567258,
 0.6637124715515139,
 0.6624528798165235,
 0.662944526146772,
 0.6600575266743524,
 0.6607135955739099,
 0.6600500718186268,
 0.6619741012493124,
 0.660602714448302,
 0.6814892192757898,
 0.680574557196953,
 0.688046172519733,
 0.6904062571695893,
 0.6914814814892066,
 0.6893489572286525,
 0.6915107229932419]

## RF

In [78]:
best_ftures_RF = [features[ft_idx.index(x[1])] for x in sorted(RF_per_ft, key=lambda x: x[0], reverse = True)]

RF_z = []
RFz = add_feature(best_ftures_RF[0], best_ftures_RF[1])
RF_z.append(RFz)
for i in range(2,19):
    RFz = add_feature(RFz, best_ftures_RF[i])
    RF_z.append(RFz)

In [79]:
Results_combined_RF = []
for i in RF_z:
    Results_combined_RF.append(Resultados(i, y, clfRF))

In [80]:
Results_combined_RF

[0.6876335535541284,
 0.6917149024762544,
 0.7042520148690695,
 0.662278033546722,
 0.664840867222803,
 0.6717229017289511,
 0.6889424575403911,
 0.66926459094955,
 0.6754626198502773,
 0.700553728936473,
 0.7107995017868894,
 0.7010541684267474,
 0.6953421410519717,
 0.6909760700421143,
 0.6945509096766094,
 0.6881331873118255,
 0.6756081594265674,
 0.6848234294803868]

In [29]:
BestCombination = [Results_combined_RF.index(max(Results_combined_RF))]

## Evaluación conjunto de test

### Lectura conjunto de test

In [36]:
data_test = pd.read_csv('test_gold.csv')
data_test

Unnamed: 0,index,tweet,humor,prejudice_woman,prejudice_lgbtiq,prejudice_inmigrant_race,gordofobia,mean_prejudice
0,52830,-Mamá en la escuela me dicen gorda -Pobresilla...,1,1,0,0,1,1.6
1,78883,"No te sientas diferente, da igual si eres negr...",1,0,0,1,1,1.4
2,78926,Si esta asi.. SUPER SI.. y que se pongan celos...,1,1,0,0,1,1.3
3,61844,—Bebé ¿Me veo gorda con este vestido?\n—¡No mi...,1,1,0,0,1,2.3
4,78830,Las mujeres solo desean 2 cosas en la vida: co...,1,1,0,0,1,2.4
...,...,...,...,...,...,...,...,...
773,9496,Decir que una mujer está soltera es de machist...,0,1,0,0,0,2.2
774,14026,¿cómo un aliado se atreve a chamuyar a una ant...,0,1,0,0,0,2.4
775,12393,"MENTION No hicieron nada por las mujeres, son ...",0,1,0,0,0,2.4
776,18723,Cuando llegará ese día en que las chicas organ...,0,1,0,0,0,1.8


In [31]:
X_test = data_test.tweet
y_test = data_test.humor

In [32]:
vect_bow = CountVectorizer(stop_words = STOPWORDS)
X_vect_bow = vect_bow.fit_transform(X)

bigram_vect = CountVectorizer(stop_words = STOPWORDS, ngram_range=(2, 2))
X_bigram_POS = bigram_vect.fit_transform(POSxtweet)

vect_bibow = CountVectorizer(stop_words = STOPWORDS, ngram_range=(2, 2))
X_vect_bibow = vect_bibow.fit_transform(X)

vect_TfidfBoW = TfidfVectorizer(stop_words = STOPWORDS)
X_TfidfBoW = vect_TfidfBoW.fit_transform(X)

bigram_vectfidf = TfidfVectorizer(stop_words = STOPWORDS, ngram_range=(2, 2))
bigram_tfidf_POS = bigram_vectfidf.fit_transform(POSxtweet)

bigram_vectfidf_bow = TfidfVectorizer(stop_words = STOPWORDS, ngram_range=(2, 2))
bigram_tfid = bigram_vectfidf_bow.fit_transform(X)

In [33]:
lendoc_test = numcaracteres(X_test)
numdigitos_test = numdigitos(X_test)
palsdoc_test = palsxdoc(X_test)
charspal_test = charsxpal(X_test)
mayusdoc_test = UpperCase_doc(X_test)
speccharcnt_test = specchar(X_test)
emojiscnt_test = cntemojis(X_test)
LexRich_test = LexRich(X_test)
flskGL_test = FKGL(X_test)
VERBcnt_test, ADJcnt_test, NOUNcnt_test, PRONcnt_test, POSxtweet_test = POS_Vect(X_test)
X_bow_vect_test = vect_bow.transform(X_test)
bigram_BoW_test = vect_bibow.transform(X_test) #####
bigram_POS_test = bigram_vect.transform(POSxtweet_test)
X_tfidfbow_test = vect_bow.transform(X_test)
bigram_TfidfBoW_test = bigram_vectfidf_bow.transform(X_test)#####
bigram_tfidfPOS_test = bigram_vect.transform(POSxtweet_test)



In [34]:
features_test = [lendoc_test, numdigitos_test, palsdoc_test, charspal_test, mayusdoc_test, speccharcnt_test, emojiscnt_test, LexRich_test,flskGL_test,VERBcnt_test,ADJcnt_test,NOUNcnt_test,PRONcnt_test,
            X_bow_vect_test,bigram_BoW_test,bigram_POS_test,X_tfidfbow_test,bigram_TfidfBoW_test,bigram_tfidfPOS_test]
ft_idx_test = ['lendoc_test', 'numdigitos_test','palsdoc_test','charspal_test','mayusdoc_test','speccharcnt_test','emojiscnt_test','LexRich_test','flskGL_test','VERBcnt_test','ADJcnt_test',
          'NOUNcnt_test','PRONcnt_test', 'X_bow_vect_test','bigram_BoW_test','bigram_POS_test','X_tfidfbow_vect_test','bigram_TfidfBoW_test','bigram_tfidfPOS_test']

In [37]:
LR_per_ft_test = []
for i in range(len(features)):
    LR_per_ft_test.append((Resultados(features_test[i], y_test, clfLR), ft_idx_test[i]))

In [38]:
sorted(LR_per_ft_test, key=lambda x: x[0], reverse = True)

[(0.7076013115867699, 'X_bow_vect_test'),
 (0.7076013115867699, 'X_tfidfbow_vect_test'),
 (0.5675499872502037, 'bigram_POS_test'),
 (0.5675499872502037, 'bigram_tfidfPOS_test'),
 (0.5321322746959495, 'flskGL_test'),
 (0.5315653955300872, 'palsdoc_test'),
 (0.52368014785823, 'NOUNcnt_test'),
 (0.5168806103205175, 'ADJcnt_test'),
 (0.5081837743784101, 'lendoc_test'),
 (0.47177301609876443, 'charspal_test'),
 (0.4560030133234716, 'mayusdoc_test'),
 (0.45013931566522114, 'VERBcnt_test'),
 (0.41831077027852503, 'speccharcnt_test'),
 (0.3900312308738266, 'PRONcnt_test'),
 (0.3656772030095043, 'bigram_TfidfBoW_test'),
 (0.36556814593754944, 'LexRich_test'),
 (0.34764463128002726, 'bigram_BoW_test'),
 (0.2271038189635751, 'numdigitos_test'),
 (0.09238310948801598, 'emojiscnt_test')]

In [39]:
SVM_per_ft_test = []
for i in range(len(features)):
    SVM_per_ft_test.append((Resultados(features_test[i], y_test, clfSVC), ft_idx_test[i]))
sorted(SVM_per_ft, key=lambda x: x[0], reverse = True)

[(0.6414375491687719, 'X_bow_vect'),
 (0.5683031413589866, 'bigram_POS'),
 (0.5522354497781581, 'flskGL'),
 (0.550145999441701, 'lendoc'),
 (0.5423544630316899, 'palsdoc'),
 (0.5214133708826127, 'NOUNcnt'),
 (0.4958985228200724, 'emojiscnt'),
 (0.49197943785638937, 'ADJcnt'),
 (0.4879765175006218, 'VERBcnt'),
 (0.45870715269080947, 'charspal'),
 (0.4538307544149787, 'bigram_TfidfPOS'),
 (0.4408439890491671, 'PRONcnt'),
 (0.3785571055931422, 'speccharcnt'),
 (0.29771056317278166, 'mayusdoc'),
 (0.19920369058162174, 'bigram_BoW'),
 (0.07974910995094983, 'numdigitos_'),
 (0.05128681177976953, 'X_TfidfBoW'),
 (0.049014084507042255, 'LexRich_'),
 (0.049014084507042255, 'bigram_TfidfBoW')]

In [81]:
RF_per_ft_test = []
for i in range(len(features)):
    RF_per_ft_test.append((Resultados(features_test[i], y_test, clfRF), ft_idx_test[i]))
sorted(RF_per_ft, key=lambda x: x[0], reverse = True)

[(0.6942975756889072, 'X_bow_vect'),
 (0.6882072992315476, 'X_TfidfBoW'),
 (0.5728351032429186, 'bigram_POS'),
 (0.5517639964010719, 'palsdoc'),
 (0.5469071897062637, 'bigram_TfidfPOS'),
 (0.5245080983914179, 'NOUNcnt'),
 (0.5244931291886379, 'lendoc'),
 (0.5110452685093199, 'flskGL'),
 (0.5060477277366158, 'emojiscnt'),
 (0.4957014049611724, 'PRONcnt'),
 (0.49041317517854244, 'speccharcnt'),
 (0.47881145693251803, 'VERBcnt'),
 (0.4633415542754025, 'ADJcnt'),
 (0.45144381933368394, 'mayusdoc'),
 (0.4502662890534679, 'charspal'),
 (0.343894436534793, 'LexRich_'),
 (0.2650373809749132, 'bigram_TfidfBoW'),
 (0.2649670102810389, 'bigram_BoW'),
 (0.1509526738038215, 'numdigitos_')]

## Resultados combinando las características

### LR

In [41]:
best_ftures_LR_test = [features_test[ft_idx.index(x[1])] for x in sorted(LR_per_ft, key=lambda x: x[0], reverse = True)]

In [42]:
LR_z_test = []
z = add_feature(best_ftures_LR_test[0], best_ftures_LR_test[1])
LR_z_test.append(z)

In [43]:
for i in range(2,19):
    z = add_feature(z, best_ftures_LR_test[i])
    LR_z_test.append(z)

In [44]:
Results_combined_LR_z = []
for i in LR_z_test:
    Results_combined_LR_z.append(Resultados(i, y_test, clfLR))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [45]:
Results_combined_LR_z

[0.711124831580936,
 0.6910848416926187,
 0.657938820386531,
 0.6618088489934018,
 0.6570079385040454,
 0.6577315126035365,
 0.6573690711651137,
 0.6575905017336716,
 0.658909195454472,
 0.6592789153047884,
 0.6497621351719205,
 0.6627910980449954,
 0.6616118412750753,
 0.6585358529544596,
 0.6619256834629341,
 0.6610289351890797,
 0.6562720922122629,
 0.6607741110783506]

## SVM

In [49]:
best_ftures_SVM_test = [features_test[ft_idx.index(x[1])] for x in sorted(SVM_per_ft, key=lambda x: x[0], reverse = True)]

SVM_z_test = []
z = add_feature(best_ftures_SVM_test[0], best_ftures_SVM_test[1])
SVM_z_test.append(z)
for i in range(2,19):
    z = add_feature(z, best_ftures_SVM_test[i])
    SVM_z_test.append(z)

In [50]:
Results_combined_SVM_test = []
for i in SVM_z_test:
    Results_combined_SVM_test.append(Resultados(i, y_test, clfSVC))

In [51]:
Results_combined_SVM_test

[0.6734638233145621,
 0.6744503335061338,
 0.6638578174000813,
 0.6617228435003348,
 0.6617228435003348,
 0.6647486371511284,
 0.6585241727938272,
 0.6546103130989654,
 0.6547391843905684,
 0.6563223044026574,
 0.6571742579200206,
 0.662900376093486,
 0.6637264663764787,
 0.6645915335127321,
 0.6670697351295836,
 0.6978451378240542,
 0.6978451378240542,
 0.6978451378240542]

## RF

In [82]:
best_ftures_RF_test = [features_test[ft_idx.index(x[1])] for x in sorted(RF_per_ft, key=lambda x: x[0], reverse = True)]

RF_z_test = []
z = add_feature(best_ftures_RF_test[0], best_ftures_RF_test[1])
RF_z_test.append(z)
for i in range(2,19):
    z = add_feature(z, best_ftures_RF_test[i])
    RF_z_test.append(z)

In [83]:
Results_combined_RF_test = []
for i in RF_z_test:
    Results_combined_RF_test.append(Resultados(i, y_test, clfRF))

In [84]:
Results_combined_RF_test

[0.73994418017811,
 0.7248932078882091,
 0.7276249443143685,
 0.6919550793684331,
 0.7135334721411958,
 0.7068563114100763,
 0.7089845675974774,
 0.6948348548629567,
 0.7143295794976122,
 0.7115668661775901,
 0.720811014551711,
 0.7089502974825219,
 0.7145178940344102,
 0.7353663924573046,
 0.7307529729552619,
 0.7032879820731671,
 0.6864888556265909,
 0.6943631132386239]

In [None]:
w_0 = add_feature(X_bow_vect_test, X_tfidfbow_vect_test)
w_00 = add_feature(w_0, bigram_POS_test)
w_000 = add_feature(w_00, bigram_tfidfPOS_test)
w_1 = add_feature(w_000, lendoc_test)
w_2 = add_feature(w_1, palsdoc_test)
w_3 = add_feature(w_2, speccharcnt_test)
w_4 = add_feature(w_3, flskGL_test)
w_5 = add_feature(w_4, emojiscnt_test)
w_6 = add_feature(w_5, ADJcnt_test)
w_7 = add_feature(w_6, VERBcnt_test)
w_8 = add_feature(w_7, charspal_test)
w_9 = add_feature(w_8, testrichlex)

#X_bow_vect
#bigram_POS
#lendoc
#palsdoc
#speccharcnt
#flskGL
#emojiscnt
#ADJcnt
#VERBcnt
#charspal
#LexRich

In [None]:
ResultadosTarea1 = pd.DataFrame(data_test['index'])
ResultadosTarea1['PrediccionesT1'] = pd.Series(RF_pred)

In [None]:
LR_pred = clfLR.predict(w_9)
SVC_pred = clfSVC.predict(w_9)
RF_pred = clfRF.predict(w_9)

In [None]:
ResultadosTarea1 = pd.DataFrame(data_test['index'])
ResultadosTarea1['PrediccionesT1'] = pd.Series(RF_pred)
ResultadosTarea1

In [None]:
ResultadosTarea1_2 = pd.DataFrame(data_test['index'])
ResultadosTarea1_2['PrediccionesT1'] = pd.Series(SVC_pred)
ResultadosTarea1_2.to_csv('ResultadosTarea1_2.csv', sep = ',', index = False)

## Task 2A:

Prejudice Target Detection:

Taking into account the minority groups analyzed, i.e, Women and feminists, LGBTIQ community and Immigrants, racially discriminated people, and overweight people,  participants are asked to identify the targeted groups on each tweet as a multilabel classification task.

The metric employed for the second task will be macro-F1.

In [None]:
## 'mejor combinacion'
z_8 = RF_z[Results_combined_RF.index(max(Results_combined_RF))]

In [None]:
y_prejudice = data_train[['prejudice_woman', 'prejudice_lgbtiq', 'prejudice_inmigrant_race', 'gordofobia']]

In [None]:
y_prejudice

In [None]:
x_train, x_test, y_train, y_test = train_test_split( z_8, y_prejudice, test_size = 0.2, random_state=0)

In [None]:
def test_classifier(classifier, x_train, y_train, x_test, y_test):
    # Make it an Multilabel classifier
    multilabel_classifier = MultiOutputClassifier(classifier, n_jobs=-1)

    # Fit the data to the Multilabel classifier
    multilabel_classifier = multilabel_classifier.fit(x_train, y_train)

    # Get predictions for test data
    y_test_pred = multilabel_classifier.predict(x_test)

    # Generate multiclass confusion matrices
    matrices = multilabel_confusion_matrix(y_test, y_test_pred)

    # Plotting matrices: code
    #cmd = ConfusionMatrixDisplay(matrices[0], display_labels=np.unique(y_test)).plot()
    #plt.title('Confusion Matrix for label 1 (type)')
    #plt.show()
    #cmd = ConfusionMatrixDisplay(matrices[1], display_labels=np.unique(y_test)).plot()
    #plt.title('Confusion Matrix for label 2 (color)')
    #plt.show()

    print(f1_score(y_test_pred, y_test, average = 'macro'))

In [None]:
test_classifier(clfLR, x_train, y_train, x_test, y_test)

In [None]:
test_classifier(clfSVC, x_train, y_train, x_test, y_test)

In [None]:
test_classifier(SVC(C = 1, kernel = 'linear', class_weight = 'balanced'), x_train, y_train, x_test, y_test)

In [None]:
test_classifier(clfRF, x_train, y_train, x_test, y_test)

## Evaluación en el test set

In [None]:
y_prejudice_test = data_test[['prejudice_woman', 'prejudice_lgbtiq', 'prejudice_inmigrant_race', 'gordofobia']]
y_prejudice_test.head()

In [None]:
w_9 = RF_z_test[Results_combined_RF.index(max(Results_combined_RF))]

In [None]:
multilabel_classifier = MultiOutputClassifier(clfLR, n_jobs=-1)
multilabel_classifier = multilabel_classifier.fit(x_train, y_train)
# Get predictions for test data
y_pred_tarea2 = multilabel_classifier.predict(w_9)
f1_score(y_pred_tarea2, y_prejudice_test, average = 'macro')

In [None]:
multilabel_classifier = MultiOutputClassifier(SVC(C = 1, kernel = 'linear', class_weight = 'balanced'), n_jobs=-1)
multilabel_classifier = multilabel_classifier.fit(x_train, y_train)
# Get predictions for test data
y_pred_tarea2 = multilabel_classifier.predict(w_9)
f1_score(y_pred_tarea2, y_prejudice_test, average = 'macro')


In [None]:
multilabel_classifier = MultiOutputClassifier(clfRF, n_jobs=-1)
multilabel_classifier = multilabel_classifier.fit(x_train, y_train)
# Get predictions for test data
y_pred_tarea2 = multilabel_classifier.predict(w_9)
f1_score(y_pred_tarea2, y_prejudice_test, average = 'macro')

In [None]:
Resultados_tarea2 = pd.DataFrame(y_pred_tarea2, columns = ['prejudice_woman', 'prejudice_lgbtiq', 'prejudice_inmigrant_race', 'gordofobia'])
Resultados_tarea2.to_csv('ResultadosTarea2.csv', sep = ',', index = False)

# Task 2B:

Degree of Prejudice Prediction:

The third subtask consists of predicting on a continuous scale from 1 to 5 to evaluate how prejudicial the message is on average among minority groups. We will evaluate the submitted predictions employing the Root Mean Squared Error. 

In [None]:
hurtlex = pd.read_csv('hurtlex.txt', sep = '\t')

In [None]:
THlx = []
for i in range(len(hurtlex)):
    if len(hurtlex.lemma[i].split()) < 2:
        THlx.append((hurtlex.lemma[i], hurtlex.category[i]))
ExHlx_2 = []
for i in range(len(hurtlex)):
    if len(hurtlex.lemma[i].split()) == 2:
        ExHlx_2.append((hurtlex.lemma[i], hurtlex.category[i]))
ExHlx_3 = []
for i in range(len(hurtlex)):
    if len(hurtlex.lemma[i].split()) == 3:
        ExHlx_3.append((hurtlex.lemma[i], hurtlex.category[i]))

In [None]:
with open('SHARE.txt', 'r', encoding = 'utf-8') as my_file:
    s = my_file.readlines()
TOf_S = []
ExOf_S_2 = []
ExOf_S_3 = []
for i in s:
    if len(i.split()) < 2:
        TOf_S.append(i.replace('\n', ''))
    elif len(i.split()) == 2:
        ExOf_S_2.append(i.replace('\n', ''))
    else:
        ExOf_S_3.append(i.replace('\n', ''))
TOf_S.append('bastardo')

In [None]:
with open('palabras_nuevas.txt', 'r', encoding = 'utf-8') as my_file:
    palabras_nuevas = my_file.readlines()

In [None]:
palabras_nuevas = [i.replace('\n', '') for i in palabras_nuevas]

In [None]:
TOf_S = TOf_S + palabras_nuevas

In [None]:
THlx_unicos = list(set([THlx[i][0] for i in range(len(THlx))]))
ExHlx_2_unicos = list(set([ExHlx_2[i][0] for i in range(len(ExHlx_2))]))
ExHlx_3_unicos = list(set([ExHlx_3[i][0] for i in range(len(ExHlx_3))]))
PalAgresivas = [pal for pal in THlx_unicos if pal not in TOf_S] + TOf_S
ExAgres_2 = [pal for pal in ExHlx_2_unicos if pal not in ExOf_S_2] + ExOf_S_2
ExAgres_3 = [pal for pal in ExHlx_3_unicos if pal not in ExOf_S_3] + ExOf_S_3

In [None]:
def NGramas(listaPalabras, n):
    return [listaPalabras[i:i+n] for i in range(len(listaPalabras)-(n-1))]

In [None]:
palAgresivas_TotPal = []
exprAgresivas2_totexpr2 = []
exprAgresivas3_totexpr3 = []
for i in data_train.tweet:
    raw = re.sub('[^\w\s]|\d|(MENTION)|(URL)|(HASHTAG)|', '', i)
    raw = emoji.replace_emoji(raw, replace = '').lower().strip()
    raw = re.sub('_', ' ', raw.lower().strip())
    raw = re.sub('\n', ' ', raw.lower().strip())
    raw = re.sub(' {2,}', ' ', raw.lower().strip())
    ofterm = 0
    no_ofterm = 0
    of_bigrams = 0
    no_ofbigrams = 0
    of_trigrams = 0
    no_oftrigrams = 0
    doc = nlp(raw)
    tot_pals = len([token.text for token in doc])
    doc_bigrams = [' '.join(j) for j in NGramas([token.text for token in doc],2)]
    tot_bigrams = len(doc_bigrams)
    doc_trigrams = [' '.join(j) for j in NGramas([token.text for token in doc],3)]
    tot_trigrams = len(doc_trigrams)
    if tot_pals != 0:
        for token in doc:
            if token.text in PalAgresivas:
                ofterm += 1
            else:
                no_ofterm += 1
        palAgresivas_TotPal.append(ofterm/tot_pals)
    else:
        palAgresivas_TotPal.append(0)
    if tot_bigrams != 0:
        for bigram in doc_bigrams:
            if bigram in ExAgres_2:
                of_bigrams += 1
            else:
                no_ofbigrams += 1
        exprAgresivas2_totexpr2.append(of_bigrams/tot_bigrams)
    else:
        exprAgresivas2_totexpr2.append(0)
    if tot_trigrams != 0:
        for trigram in doc_trigrams:
            if trigram in ExAgres_3:
                of_trigrams += 1
            else:
                no_oftrigrams += 1
        exprAgresivas3_totexpr3.append(of_trigrams/tot_trigrams)
    else:
        exprAgresivas3_totexpr3.append(0)

In [None]:
len([i for i in exprAgresivas3_totexpr3 if i > 0])

In [None]:
#z_8 El original despues sumas los indices agresivos
z_9 = add_feature(z_8, np.array(palAgresivas_TotPal).reshape(-1, 1))
#z_10 = add_feature(z_8, np.array(exprAgresivas2_totexpr2).reshape(-1, 1))
#z_11 = add_feature(z_8, np.array(exprAgresivas3_totexpr3).reshape(-1, 1))

In [None]:
y_meanprejudice = data_train.mean_prejudice

In [None]:
clfRidge = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1], cv = 10)
clfLasso = LassoCV( cv = 2)
clfSVM = SVR(C=1.0, epsilon=0.2, gamma= 'scale', kernel= 'linear')
clfDTR = DecisionTreeRegressor( random_state=0)

In [None]:
clfRidge = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1], cv = 10)
clfLasso = LassoCV( cv = 2)
clfSVM = SVR(C=1.0, epsilon=0.2, gamma= 'scale', kernel= 'linear')
clfSGDR = SGDRegressor(max_iter=1000, tol=1e-3)
clfDTR = DecisionTreeRegressor( random_state=0)

In [None]:
def ResTarea3(clf, x_train, y_train, x_test, y_test):
    clf.fit(x_train, y_train)
    # prediction
    y_pred = clf.predict(x_test)

    # accuracy check
    mse = mean_squared_error(y_test, y_pred)
    rmse = mse**(0.5)
    print("MSE: %.2f" % mse)
    print("RMSE: %.2f" % rmse)

In [None]:
x_train, x_test, y_train, y_test = train_test_split( z_8, y_meanprejudice, test_size = 0.2, random_state=0)

In [None]:
ResTarea3(clfRidge, x_train, y_train, x_test, y_test)
ResTarea3(clfLasso, x_train, y_train, x_test, y_test)
ResTarea3(clfSVM, x_train, y_train, x_test, y_test)
ResTarea3(clfSGDR, x_train, y_train, x_test, y_test)
ResTarea3(clfDTR, x_train, y_train, x_test, y_test)

In [None]:
x_train, x_test, y_train, y_test = train_test_split( z_9, y_meanprejudice, test_size = 0.2, random_state=0)

In [None]:
ResTarea3(clfRidge, x_train, y_train, x_test, y_test)
ResTarea3(clfLasso, x_train, y_train, x_test, y_test)
ResTarea3(clfSVM, x_train, y_train, x_test, y_test)
ResTarea3(clfSGDR, x_train, y_train, x_test, y_test)
ResTarea3(clfDTR, x_train, y_train, x_test, y_test)

In [None]:
palAgresivas_TotPal_test = []
exprAgresivas2_totexpr2_test = []
exprAgresivas3_totexpr3_test = []
for i in data_test.tweet:
    raw = re.sub('[^\w\s]|\d|(MENTION)|(URL)|(HASHTAG)|', '', i)
    raw = emoji.replace_emoji(raw, replace = '').lower().strip()
    raw = re.sub('_', ' ', raw.lower().strip())
    raw = re.sub('\n', ' ', raw.lower().strip())
    raw = re.sub(' {2,}', ' ', raw.lower().strip())
    ofterm = 0
    no_ofterm = 0
    of_bigrams = 0
    no_ofbigrams = 0
    of_trigrams = 0
    no_oftrigrams = 0
    doc = nlp(raw)
    tot_pals = len([token.text for token in doc])
    doc_bigrams = [' '.join(j) for j in NGramas([token.text for token in doc],2)]
    tot_bigrams = len(doc_bigrams)
    doc_trigrams = [' '.join(j) for j in NGramas([token.text for token in doc],3)]
    tot_trigrams = len(doc_trigrams)
    if tot_pals != 0:
        for token in doc:
            if token.text in PalAgresivas:
                ofterm += 1
            else:
                no_ofterm += 1
        palAgresivas_TotPal_test.append(ofterm/tot_pals)
    else:
        palAgresivas_TotPal_test.append(0)
    if tot_bigrams != 0:
        for bigram in doc_bigrams:
            if bigram in ExAgres_2:
                of_bigrams += 1
            else:
                no_ofbigrams += 1
        exprAgresivas2_totexpr2_test.append(of_bigrams/tot_bigrams)
    else:
        exprAgresivas2_totexpr2_test.append(0)
    if tot_trigrams != 0:
        for trigram in doc_trigrams:
            if trigram in ExAgres_3:
                of_trigrams += 1
            else:
                no_oftrigrams += 1
        exprAgresivas3_totexpr3_test.append(of_trigrams/tot_trigrams)
    else:
        exprAgresivas3_totexpr3_test.append(0)

In [None]:
w_10 = add_feature(w_9, np.array(palAgresivas_TotPal_test).reshape(-1, 1))

In [None]:
y_meanprejudice_test = data_test.mean_prejudice

In [None]:
x_train, x_test, y_train, y_test = train_test_split( w_10, y_meanprejudice_test, test_size = 0.2, random_state=0)

In [None]:
ResTarea3(clfRidge, x_train, y_train, x_test, y_test)
ResTarea3(clfLasso, x_train, y_train, x_test, y_test)
ResTarea3(clfSVM, x_train, y_train, x_test, y_test)
ResTarea3(clfSGDR, x_train, y_train, x_test, y_test)
ResTarea3(clfDTR, x_train, y_train, x_test, y_test)

In [None]:
clfSVM.fit(x_train, y_train)
# prediction
y_pred_Test = clfSVM.predict(w_10)

In [None]:
pred_tarea3 = (pd.DataFrame(y_pred_Test))

In [None]:
pred_tarea3.to_csv('ResultadosTarea3.csv', sep = ',', index = False)