# 1. Procesado de datos

In [1]:
import pandas as pd
import numpy as np
import os
# Set before any tokenizer is created; presumably read by the Hugging Face
# `tokenizers` backend to allow its internal parallelism -- TODO confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "true"


# Load the training data; later cells read its 'text' and 'label' columns.
df = pd.read_csv('./data/train.csv')

In [2]:
import spacy

# Load the 'en_core_web_sm' spaCy pipeline; `nlp` is what remove_stop_words()
# uses to tokenize each text.
nlp = spacy.load('en_core_web_sm')

In [3]:
# Stop-word collection taken from the loaded pipeline's language defaults.
# It is not referenced again in the visible cells (remove_stop_words relies on
# `token.is_stop` instead).
en_stopwords = nlp.Defaults.stop_words

def remove_stop_words(text):
    """Return `text` with stop words, punctuation and URL-like tokens removed.

    The text is tokenized with the module-level spaCy pipeline `nlp`. Every
    surviving token is emitted preceded by a single space, so a non-empty
    result always starts with ' ' (kept from the original implementation so
    that downstream outputs do not change).

    Args:
        text: The raw string to clean. Non-string values (for example the
            float NaN pandas uses for a missing cell) are treated as empty.

    Returns:
        str: The kept tokens as ' tok1 tok2 ...', or '' when nothing is kept.
    """
    # A missing CSV cell arrives as float NaN; spaCy only accepts strings.
    if not isinstance(text, str):
        return ''

    # Only lexical flags (is_stop / is_punct / like_url) are consulted, so the
    # tokenizer alone is sufficient: make_doc() builds the Doc without running
    # the remaining pipeline components on every text.
    phrase = nlp.make_doc(text)
    kept = [
        token.text
        for token in phrase
        if not (token.is_stop or token.is_punct or token.like_url)
    ]
    # join() instead of repeated `+=` avoids rebuilding the string per token.
    return ''.join(' ' + word for word in kept)

# Store the cleaned text as a new column; remove_stop_words is called once per
# row of df['text'].
df['text_cleaned'] = df['text'].apply(remove_stop_words)

In [4]:
# Inspect the cleaned column (pandas truncates the display to the first and
# last rows).
df['text_cleaned']

0        Hi Roy hope ok Trans people gay thing s ramme...
1                                     fuckin hell biology
2                                  nice looking clergyman
3                           AIDS WAY SIN CONSEQUENCES BAD
4                                                   learn
                              ...                        
8143     Yeah alive time election happen fairly soon U...
8144                                  fundamentally wrong
8145     confused homosexuality big deal proud normal ...
8146                                           disgusting
8147     Peter Sørensen note Peter poor maths 13 27 eq...
Name: text_cleaned, Length: 8148, dtype: object

In [5]:
from sklearn.model_selection import train_test_split

# Features are the cleaned texts, targets the labels; 33% of the rows are held
# out, with a fixed seed so the split is reproducible.
# NOTE(review): none of the four splits is referenced again in the visible
# cells -- the evaluation further down runs on the full `df`.
X, y = df['text_cleaned'], df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [6]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import torch

# Set the device to CUDA if available, otherwise use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Task names listed by the original author (presumably the values `task` may
# take in the two name templates below -- TODO confirm):
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='hate'
# Model identifier passed to `from_pretrained`; it is also reused at the end of
# this cell as the path given to `save_pretrained`.
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Download the label mapping for the task (needs network access).
# NOTE(review): `labels` is not read by any later visible cell; the prediction
# loop indexes the class scores by position (score[1]) instead.
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
# The reader wraps the in-memory list `html`, so it can still be consumed after
# the response has been closed. Rows with fewer than two tab-separated fields
# (e.g. blank lines) are skipped; the second field of each row is kept.
labels = [row[1] for row in csvreader if len(row) > 1]

# PyTorch model, moved to the device selected above.
model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)
# Both calls are given MODEL as the target path.
# NOTE(review): presumably a later `from_pretrained(MODEL)` run from the same
# working directory would then pick up that saved copy -- confirm this is
# the intended caching behavior.
tokenizer.save_pretrained(MODEL)
model.save_pretrained(MODEL)

In [7]:
def _encode_text(text):
    """Tokenize one cleaned text into PyTorch tensors placed on `device`."""
    encoding = tokenizer(
        text,
        return_tensors='pt',
        max_length=504,
        padding=True,
        truncation=True,
    )
    return encoding.to(device)


# One encoding per row of the 'text_cleaned' column; each call receives a
# single text, so every encoding holds exactly one sequence.
encoded_inputs = df['text_cleaned'].apply(_encode_text)

In [8]:
# Inspect the encodings; each entry displays the keys it holds
# (input_ids, attention_mask).
encoded_inputs

0       [input_ids, attention_mask]
1       [input_ids, attention_mask]
2       [input_ids, attention_mask]
3       [input_ids, attention_mask]
4       [input_ids, attention_mask]
                   ...             
8143    [input_ids, attention_mask]
8144    [input_ids, attention_mask]
8145    [input_ids, attention_mask]
8146    [input_ids, attention_mask]
8147    [input_ids, attention_mask]
Name: text_cleaned, Length: 8148, dtype: object

In [9]:
# Settings and result container for the prediction loop in the next cell.
# `batch_size` is the number of encodings taken per slice of `encoded_inputs`.
batch_size = 8  # Adjust as needed
# Collects one 0/1 prediction per text, filled by the loop in the next cell.
predictions = []

In [10]:
# Probability cut-off on class index 1: a text is labelled 1 when its softmax
# score for that class is strictly above this value, otherwise 0.
threshold = 0.5

# Start from an empty list so that re-running this cell never appends a second
# copy of the predictions; they must stay aligned one-to-one with df['label'].
predictions = []

# Inference only: without no_grad() every forward pass would also record an
# autograd graph, which costs memory and time and is never used here.
with torch.no_grad():
    for i in range(0, len(encoded_inputs), batch_size):
        # Explicit positional slice. Each element is the encoding of a single
        # text, so the model is still called once per text within the slice.
        batch_inputs = encoded_inputs.iloc[i:i+batch_size]
        for inputs in batch_inputs:
            logits = model(**inputs).logits.cpu().numpy()
            # Every encoding holds one sequence, so exactly one row of logits is
            # expected. Silently skipping anything else would leave
            # `predictions` shorter than the label column and misalign them.
            if logits.shape[0] != 1:
                raise ValueError(
                    f"expected logits for exactly one text, got shape {logits.shape}"
                )
            score = softmax(logits[0])
            predictions.append(1 if score[1] > threshold else 0)

In [11]:
# NOTE(review): this displays the whole list, one entry per predicted text;
# a slice or a count per class would keep the notebook output short.
predictions

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [12]:
from sklearn.metrics import confusion_matrix, classification_report

In [13]:
# Rows are the true labels from df['label'], columns the predicted labels
# (each row sums to that class's support in the classification report).
print(confusion_matrix(df['label'], predictions))

[[5099   73]
 [2778  198]]


In [14]:
# Per-class precision / recall / F1 of the thresholded predictions against
# df['label'], computed over every row of `df`.
print(classification_report(df['label'], predictions))

              precision    recall  f1-score   support

           0       0.65      0.99      0.78      5172
           1       0.73      0.07      0.12      2976

    accuracy                           0.65      8148
   macro avg       0.69      0.53      0.45      8148
weighted avg       0.68      0.65      0.54      8148



In [15]:
from sklearn import metrics
# Overall fraction of texts whose prediction equals df['label'] (the same
# figure as the 'accuracy' row of the classification report).
metrics.accuracy_score(df['label'], predictions)

0.6500981836033383