# [roberta-base-offensive](https://huggingface.co/cardiffnlp/roberta-base-offensive)

In [1]:
import pandas as pd
import numpy as np
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"


df = pd.read_csv('./data/train.csv')

In [2]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [3]:
en_stopwords = nlp.Defaults.stop_words

def remove_stop_words(text):
    cleanText = ''
    phrase = nlp(text)
    for token in phrase:
        if not token.is_stop and not token.is_punct and not token.like_url:
            cleanText += ' ' + token.text

    return cleanText

In [4]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [5]:
df['text_cleaned'] = df['text'].apply(preprocess)

In [6]:
# df['text_cleaned']

In [7]:
from sklearn.model_selection import train_test_split

X = df['text_cleaned']
y = df['label']

X_train, X_test, y_train, y_test, = train_test_split(X, y, test_size=0.33, random_state=42)

In [8]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import torch

# Set the device to CUDA if available, otherwise use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

MODEL = f"cardiffnlp/roberta-base-offensive"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/offensive/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL).to(device)
tokenizer.save_pretrained(MODEL)
model.save_pretrained(MODEL)

In [9]:
# Tokenize all texts in the 'text_cleaned' column
encoded_inputs = df['text'].apply(lambda x: tokenizer(x, return_tensors='pt', max_length=504, padding=True, truncation=True).to(device))

In [10]:
# encoded_inputs

In [11]:
# encoded_inputs[0]

In [12]:
threshold = 0.7
batch_size = 8
predictions = []

for i in range(0, len(encoded_inputs), batch_size):
    batch_inputs = encoded_inputs[i:i+batch_size]
    batch_outputs = [model(**inputs) for inputs in batch_inputs]
    for output in batch_outputs:        
        scores = output[0][0].detach().cpu().numpy()
        scores = softmax(scores)

        prediction = 1 if scores[1] > threshold else 0
        predictions.append(prediction)

In [13]:
# predictions

In [14]:
len(predictions)

8148

In [15]:
from sklearn.metrics import confusion_matrix, classification_report

In [16]:
print(confusion_matrix(df['label'], predictions))

[[4517  655]
 [1537 1439]]


In [17]:
print(classification_report(df['label'], predictions))

              precision    recall  f1-score   support

           0       0.75      0.87      0.80      5172
           1       0.69      0.48      0.57      2976

    accuracy                           0.73      8148
   macro avg       0.72      0.68      0.69      8148
weighted avg       0.72      0.73      0.72      8148



In [18]:
from sklearn import metrics
metrics.accuracy_score(df['label'], predictions)

0.7309769268532155