# Predicting with finetuned Robertuito for detecting Xenophobia

In [1]:
#transformers
from transformers import (
    AutoModelForSequenceClassification, AutoTokenizer)
from transformers import pipeline

#Data processing
import pandas as pd
from pysentimiento.preprocessing import preprocess_tweet

# Load and configure the model

In [2]:
# Local directory holding the fine-tuned checkpoint to load.
model_name = './robertuito/checkpoint-2472'

# Single definition of the class-id -> class-name mapping; the reverse
# mapping is derived from it so the two can never disagree.
id2label = {
    0: 'no xenofobo',
    1: 'xenofobo',
}
label2id = {v: k for k, v in id2label.items()}

# Load the sequence-classification model from the checkpoint (Hugging Face).
model = AutoModelForSequenceClassification.from_pretrained(
        model_name, return_dict=True, num_labels=2)
# The config gets its own copy so that later edits to the module-level
# `id2label` do not silently change what the pipeline reports.
model.config.id2label = dict(id2label)
model.config.label2id = label2id

# Load the tokenizer and cap the sequence length (based on the robertuito
# GitHub configuration).
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.model_max_length = 128

# Register the placeholder tokens produced by the tweet preprocessing and make
# the embedding matrix match the tokenizer size.
# NOTE(review): if the checkpoint's tokenizer does not already contain these
# tokens, the embedding rows created here are newly initialised rather than
# fine-tuned — confirm the saved tokenizer already includes them.
special_tokens = ['@usuario', 'url', 'hashtag', 'emoji']
tokenizer.add_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))

# `model_max_length` is only enforced when truncation is requested, so enable
# it here; otherwise an over-long text is passed to the model untruncated.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    truncation=True,
)

# Load and Prepare Data

In [3]:
# Read the rows to classify and make sure the human label is an integer.
predict_df = pd.read_csv('online_predictions.csv')
predict_df['label'] = predict_df['label'].astype(int)


def preprocess_fn(x):
    """Run pysentimiento's tweet preprocessing on one text with Spanish settings.

    Normalizes user mentions, laughs, urls and hashtag text.
    """
    return preprocess_tweet(x, lang='es', shorten=2, url_token='url', hashtag_token='hashtag')


# Cast every text to str, then preprocess the whole column.
predict_df['text'] = predict_df['text'].astype(str).apply(preprocess_fn)

In [4]:
# Display names for the integer labels stored in the data.
dict_labels = {0: 'no xenofobo', 1: 'xenofobo'}


def infer_text(df, index_=0, pipeline=None, dict_label=None):
    """Print one row's text, its human-assigned label and the model prediction.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing a `text` and a `label` column.
    index_ : int, default 0
        Positional index of the row to analyse (used with `.iloc`).
    pipeline : callable
        Called with the row's text. Its result is read as
        `result[0]['score']` and `result[0]['label']`.
    dict_label : mapping, optional
        Maps the stored label to a display name. When omitted, the raw label
        value is printed instead.

    Raises
    ------
    TypeError
        If `pipeline` is not callable (including the default `None`). Raised
        before anything is printed, so no partial report is emitted.
    KeyError
        If `dict_label` is given but has no entry for the row's label.
    """
    # Fail early and clearly instead of crashing halfway through the report.
    if not callable(pipeline):
        raise TypeError(
            'infer_text requires a callable `pipeline`, got {!r}'.format(pipeline))

    row = df.iloc[index_]
    text = str(row['text'])
    label = row['label']
    # Resolve the display name before printing so a missing key cannot leave
    # a half-written report behind.
    human_label = label if dict_label is None else dict_label[label]

    separator = '-' * 50
    print('El texto analizado es: \n {}'.format(text))
    print(separator)
    inference = pipeline(text)
    top = inference[0]
    print('La etiqueta asignada por un humano es: \n {}'.format(human_label))
    print(separator)
    print("La IA está {:.2f}% segura de que corresponde a la clase: {}".format(top['score']*100, top['label']))
    print(separator)

In [9]:
# Show the human label next to the model's prediction for the row at position 2.
infer_text(
    predict_df,
    index_=2,
    pipeline=pipe,
    dict_label=dict_labels,
)

El texto analizado es: 
 ¿Y a México? Que entre y salga quién le de la gana, ya casi un año con esto, ya le dieron en la madre a todo,solo falta el cierre de fronteras, que era de lo primero que debían hacer. ¡Dios, llévatelos! Pero no en avión
--------------------------------------------------
La etiqueta asignada por un humano es: 
 xenofobo
--------------------------------------------------
La IA está 59.79% segura de que corresponde a la clase: no xenofobo
--------------------------------------------------
