# 1. Procesado de datos

In [9]:
import pandas as pd
import numpy as np
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"


df = pd.read_csv('./data/train.csv')

In [10]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [11]:
en_stopwords = nlp.Defaults.stop_words

def remove_stop_words(text):
    cleanText = ''
    phrase = nlp(text)
    for token in phrase:
        if not token.is_stop and not token.is_punct and not token.like_url:
            cleanText += ' ' + token.text

    return cleanText

In [12]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [13]:
df['text_cleaned'] = df['text'].apply(preprocess)

In [14]:
df['text_cleaned']

0       Hi Roy hope you are ok, Trans people are not g...
1                   But fuckin' hell what even is biology
2                       Whose the nice looking clergyman?
3       AIDS ARE IN YOUR WAY, SIN HAS CONSEQUENCES AND...
4                                                to learn
                              ...                        
8143    Yeah...if we still alive at the time of the ne...
8144     There is something fundamentally wrong with this
8145    This always confused me. If homosexuality is "...
8146                                           disgusting
8147    Peter Sørensen Just a note on Peter's poor mat...
Name: text_cleaned, Length: 8148, dtype: object

In [15]:
from sklearn.model_selection import train_test_split

X = df['text_cleaned']
y = df['label']

X_train, X_test, y_train, y_test, = train_test_split(X, y, test_size=0.33, random_state=42)

In [16]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import torch

# Set the device to CUDA if available, otherwise use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='hate'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)
tokenizer.save_pretrained(MODEL)
model.save_pretrained(MODEL)

In [17]:
# Tokenize all texts in the 'text_cleaned' column
encoded_inputs = df['text'].apply(lambda x: tokenizer(x, return_tensors='pt', max_length=504, padding=True, truncation=True).to(device))

In [18]:
encoded_inputs

0       [input_ids, attention_mask]
1       [input_ids, attention_mask]
2       [input_ids, attention_mask]
3       [input_ids, attention_mask]
4       [input_ids, attention_mask]
                   ...             
8143    [input_ids, attention_mask]
8144    [input_ids, attention_mask]
8145    [input_ids, attention_mask]
8146    [input_ids, attention_mask]
8147    [input_ids, attention_mask]
Name: text, Length: 8148, dtype: object

In [19]:
encoded_inputs[0]

{'input_ids': tensor([[    0, 30086,  5470,  1034,    47,    32, 15983,     6,  5428,    82,
            32,    45,  5100,     4,    67,     5,   129,   631, 45365,   145,
           910, 21639,   159,   110, 14599,    16,   335,    59, 12768,     6,
         21852,     8,     5,  1762,    50,  4972,    14,   379,     6,   151,
         38187,   260,  6214,  3878,    33,   156,    10,  9461,    14,   114,
           782,    28,     6,    51,    40,  9802,    49,   301,    13,     5,
         12253,     9,   643,     4,  6142,    54,    16,   441,     7,  1962,
           831,   518,     7,   323,    49,   247,  4395,    75,    28, 33338,
            19,   215, 32051,     4,  5359,    55,  9819,     6,    55, 12352,
             8, 10502,     8,   172,   540,  9425,     4,     2]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   

In [20]:
threshold = 0.5
batch_size = 8
predictions = []

for i in range(0, len(encoded_inputs), batch_size):
    batch_inputs = encoded_inputs[i:i+batch_size]
    batch_outputs = [model(**inputs) for inputs in batch_inputs]
    for output in batch_outputs:        
        scores = output[0][0].detach().cpu().numpy()
        scores = softmax(scores)

        prediction = 1 if scores[1] > threshold else 0
        predictions.append(prediction)

In [21]:
predictions

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [22]:
len(predictions)

8148

In [23]:
from sklearn.metrics import confusion_matrix, classification_report

In [24]:
print(confusion_matrix(df['label'], predictions))

[[5053  119]
 [2665  311]]


In [25]:
print(classification_report(df['label'], predictions))

              precision    recall  f1-score   support

           0       0.65      0.98      0.78      5172
           1       0.72      0.10      0.18      2976

    accuracy                           0.66      8148
   macro avg       0.69      0.54      0.48      8148
weighted avg       0.68      0.66      0.56      8148



In [26]:
from sklearn import metrics
metrics.accuracy_score(df['label'], predictions)

0.658321060382916