En este notebook, entrenamos modelos de predicción de discursos de odio en base a distintos datasets, para poder predecir su uso en reddit.

In [1]:
import pickle

import pandas as pd
import spacy

from preprocessing_utils import preprocess_corpus


EMBEDDING = 'word2vec'  # valores posibles: 'lda', 'word2vec', 'fasttext'

TEXT_FILE_READ = 'docs/reddit_data_{}.csv'.format(EMBEDDING)

TEXT_SAVE_FILE = 'docs/reddit_data_hate_speech.csv'
TEXT_SAVE_FILE_POS_HATE_SPEECH = 'docs/test/reddit_data_hate_speech_pos.csv'
TEXT_SAVE_FILE_NEG_HATE_SPEECH = 'docs/test/reddit_data_hate_speech_neg.csv'

nlp = spacy.load("es_core_news_lg")

# cargamos vectorizadores y modelos entrenados

models = {}

for dataset in ['hateval', 'detoxis', 'meoffendmex']:
    # cargamos el vectorizador usado en el dataset
    
    
    for model in ['lg', 'rf', 'nb']:
        # cargamos cada modelo entrenado en el dataset
        with open('docs/models/{}_{}_model.pkl'.format(dataset, model), 'rb') as f:
            models['{}_{}'.format(dataset, model)] = pickle.load()

# cargamos vectorizadores y modelos entrenados

with open('docs/models/hateval_vectorizer.pkl', 'rb') as f:
    cv_hateval = pickle.load(f)
    
with open('docs/models/hateval_lg_model.pkl', 'rb') as f:
    lg_hateval = pickle.load(f)
    
with open('docs/models/hateval_rf_model.pkl', 'rb') as f:
    rf_hateval = pickle.load(f)
    
with open('docs/models/hateval_nb_model.pkl', 'rb') as f:
    nb_hateval = pickle.load(f)
    
    
with open('docs/models/detoxis_vectorizer.pkl', 'rb') as f:
    cv_detoxis = pickle.load(f)
    
with open('docs/models/detoxis_lg_model.pkl', 'rb') as f:
    nb_detoxis = pickle.load(f)
    
with open('docs/models/detoxis_rf_model.pkl', 'rb') as f:
    nb_detoxis = pickle.load(f)
    
with open('docs/models/detoxis_nb_model.pkl', 'rb') as f:
    nb_detoxis = pickle.load(f)
    
    
with open('docs/models/meoffendmex_vectorizer.pkl', 'rb') as f:
    cv_meoffendmex = pickle.load(f)
    
with open('docs/models/meoffendmex_lg_model.pkl', 'rb') as f:
    nb_meoffendmex = pickle.load(f)
    
with open('docs/models/meoffendmex_rf_model.pkl', 'rb') as f:
    nb_meoffendmex = pickle.load(f)
    
with open('docs/models/meoffendmex_nb_model.pkl', 'rb') as f:
    nb_meoffendmex = pickle.load(f)


In [2]:
# cargamos un vectorizador y un modelo entrenado

with open('docs/models/meoffendmex_vectorizer.pkl', 'rb') as f:
    cv_meoffendmex = pickle.load(f)
    
with open('docs/models/meoffendmex_nb_model.pkl', 'rb') as f:
    nb_meoffendmex = pickle.load(f)

# Prueba de modelos en Reddit con MeOffendMex

In [3]:
df = pd.read_csv(TEXT_FILE_READ)

In [4]:
reddit_corpus = preprocess_corpus(df['body'].astype('str'))
reddit_adapted = cv_meoffendmex.transform(reddit_corpus)

In [5]:
reddit_predictions = nb_meoffendmex.predict(reddit_adapted)
reddit_hs_proba = nb_meoffendmex.predict_proba(reddit_adapted)[:,1]
print(reddit_hs_proba)

[0.00569818 0.9816972  0.10259367 ... 0.47363154 0.02588299 0.41293737]


In [11]:
target_predict_proba = 0.8
hate_mask = reddit_hs_proba>=target_predict_proba
non_hate_mask = reddit_hs_proba < target_predict_proba
print(len(hate_mask))

27791


In [14]:
len(hate_mask)

27791

In [7]:
df['hate_speech'] = df.apply(lambda row: '-' , axis = 1) 

for index,row in enumerate(df['body']):
    if reddit_hs_proba[index] >= target_predict_proba :
        is_hate_speech = 'si'
    else:
        is_hate_speech = 'no'
    df['hate_speech'][index] = is_hate_speech

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hate_speech'][index] = is_hate_speech


In [8]:
df.to_csv(TEXT_SAVE_FILE)

In [9]:
df[hate_mask].to_csv(TEXT_SAVE_FILE_POS_HATE_SPEECH)

In [10]:
df[non_hate_mask].to_csv(TEXT_SAVE_FILE_NEG_HATE_SPEECH)

FIN