# [roberta-base-offensive](https://huggingface.co/cardiffnlp/roberta-base-offensive)

Fine-tuning of roberta-base-offensive

In [1]:
import pandas as pd
import numpy as np
import os
# Set TOKENIZERS_PARALLELISM in the process environment before any tokenizer is created.
os.environ.update(TOKENIZERS_PARALLELISM="true")

# Training CSV; later cells read its 'text' and 'label' columns.
df = pd.read_csv('./data/train.csv')

In [2]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The tokenizer is loaded by its hub identifier; the model weights are loaded
# from a local path (presumably a fine-tuned checkpoint directory).
# Plain string literals: there is nothing to interpolate, so no f-prefix.
TOKENIZER = "cardiffnlp/roberta-base-offensive"
MODEL = "g1-models/roberta-base-offensive-fine/checkpoint-4000"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)

# Download the label mapping: tab-separated lines, of which the second column
# is kept as the label.
mapping_link = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/offensive/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    # The whole response is read into memory here, so parsing can safely
    # happen after the connection is closed.
    html = f.read().decode('utf-8').split("\n")

csvreader = csv.reader(html, delimiter='\t')
# Rows with fewer than two columns (e.g. an empty trailing line) are skipped.
labels = [row[1] for row in csvreader if len(row) > 1]

# PyTorch model, loaded from local files only and moved to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(MODEL, local_files_only=True).to(device)

In [3]:
from sklearn.model_selection import train_test_split

# Features are the raw texts, targets are the labels.
X, y = df['text'], df['label']

# Hold out 33% of the rows with a fixed seed so the split is reproducible.
# NOTE(review): the later cells visible in this notebook tokenize and score
# the full `df` rather than X_test -- confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [4]:
# Tokenize each entry of the 'text' column on its own (one tokenizer call per
# row), requesting PyTorch tensors, and move the result to `device`.
encoded_inputs = df['text'].apply(
    lambda text: tokenizer(
        text,
        return_tensors='pt',
        padding="max_length",
        truncation=True,
    ).to(device)
)

In [5]:
threshold = 0.5  # class-1 probability must exceed this for a positive prediction
batch_size = 4   # number of pre-tokenized inputs taken per loop step
predictions = []

# Inference only: disable autograd so no computation graph is built and kept
# alive for every forward pass.
with torch.no_grad():
    for i in range(0, len(encoded_inputs), batch_size):
        # .iloc makes the positional slicing explicit, whatever the index is.
        batch_inputs = encoded_inputs.iloc[i:i+batch_size]
        # Each element is a separately tokenized text, so the model is called
        # once per text rather than once per batch.
        batch_outputs = [model(**inputs) for inputs in batch_inputs]
        for output in batch_outputs:
            # First output field, first row -- presumably the logits of the
            # single text in this call.
            scores = output[0][0].detach().cpu().numpy()
            scores = softmax(scores)

            prediction = 1 if scores[1] > threshold else 0
            predictions.append(prediction)

In [6]:
# Number of predictions collected by the inference loop.
len(predictions)

8148

In [7]:
from sklearn.metrics import confusion_matrix, classification_report

In [8]:
# Confusion matrix of the true labels against the predictions, over all of df.
print(confusion_matrix(y_true=df['label'], y_pred=predictions))

[[5172    0]
 [2976    0]]


In [9]:
# Per-class precision / recall / F1 over all of df.
# zero_division=0 reports 0.0 for a metric whose denominator is zero (e.g. a
# class that is never predicted) without emitting a warning for each one.
print(classification_report(df['label'], predictions, zero_division=0))

              precision    recall  f1-score   support

           0       0.63      1.00      0.78      5172
           1       0.00      0.00      0.00      2976

    accuracy                           0.63      8148
   macro avg       0.32      0.50      0.39      8148
weighted avg       0.40      0.63      0.49      8148



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
from sklearn import metrics
# Fraction of rows in df whose prediction equals the true label.
metrics.accuracy_score(y_true=df['label'], y_pred=predictions)

0.6347569955817378