In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from transformers import AutoTokenizer

# Evaluation data: a tab-separated file with a 'tweet' text column and a 'label' column.
# Other dataset files that can be swapped in on the read_csv line:
#   datasets/cleaned_OLID.tsv
#   datasets/cleaned_tr_offenseval_test.tsv
#   datasets/cleaned_hatespeech_offensive.tsv
df = pd.read_csv('datasets/cleaned_SOLIDtest6K_trainer.tsv', sep='\t')

# Pull both columns out as arrays. The whole file is used for evaluation;
# no train/validation split is performed in this cell.
labels_df = df['label'].values
tweets = df['tweet'].values

# Tokenizer matching the 'bert-base-uncased' vocabulary.
# Alternative: AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode every tweet in a single call, with truncation and padding enabled.
encodings = tokenizer(
    tweets.tolist(),
    padding=True,
    truncation=True,
)


In [3]:
import torch

class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: Mapping whose ``items()`` yields ``(name, column)`` pairs,
            where ``column[idx]`` is the encoded values for sample ``idx``.
        labels: Indexable sequence of labels; its length is the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is driven by the labels, not by the encodings.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, plus a 'labels' tensor."""
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the encoded tweets and their labels for evaluation.
# (A training counterpart would be TweetDataset(train_encodings, train_labels)
# with a shuffled loader; only the evaluation side is built here.)
val_dataset = TweetDataset(encodings=encodings, labels=labels_df)

# Batches of 12 in file order: shuffle stays off so predictions line up
# index-for-index with `tweets` and `labels_df`.
val_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=12,
    shuffle=False,
)


In [4]:
import torch
from torch import optim
from transformers import BertForSequenceClassification
# from transformers import AutoTokenizer, RobertaForSequenceClassification
import time
# Load the BERT sequence-classification model from the local path.
# Alternative: RobertaForSequenceClassification.from_pretrained('models/SOLID_semi_RoBERTa_2_cleaned')
model = BertForSequenceClassification.from_pretrained('models/OLID_BERT_1')

# AdamW over all model parameters with learning rate 1e-5.
# NOTE(review): no visible cell steps this optimizer — confirm it is still needed.
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Move the model to the chosen device; as the last expression of the cell
# this also displays the model structure.
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [6]:
import numpy as np
import time
# Evaluation: run the model over every batch of `val_loader`, collect the
# predicted class ids, and report accuracy against the batch labels.
model.eval()  # switch to evaluation mode (e.g. dropout layers become inactive)

correct = 0
total = 0
# Per-batch prediction arrays. They are joined once after the loop instead of
# calling np.append per batch, which re-copies the whole array every time.
batch_predictions = []

with torch.no_grad():
    test_start = time.time()
    print('start')
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # The predicted class is the index of the largest logit per sample.
        predictions = torch.argmax(outputs.logits, dim=1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)
        batch_predictions.append(predictions.detach().cpu().numpy())
    print('end')

    test_end = time.time()

# Integer class ids, in the same order as the loader yields samples.
# An empty loader gives an empty integer array rather than an error.
prediction_list = (
    np.concatenate(batch_predictions)
    if batch_predictions
    else np.array([], dtype=np.int64)
)

# Guard against an empty loader: report NaN instead of dividing by zero.
accuracy = correct / total if total else float('nan')
print(f'Validation Accuracy: {accuracy}')


start
end
Validation Accuracy: 0.923410645753379


Model: SOLID_finetuneHSO_1

Validation dataset: cleaned_SOLIDtest6K_trainer

In [None]:
# Positions where the predicted class disagrees with the gold label.
wrong_indices = [i for i in range(len(labels_df)) if labels_df[i] != prediction_list[i]]

# Print every misclassified tweet with its predicted and true class.
for index in wrong_indices:
    # The label is printed without a format spec: a spec such as ':f02' is not
    # a valid format specifier and raises ValueError.
    print(f'tweet: {tweets[index]}, prediction {prediction_list[index]}, label {labels_df[index]}')

In [33]:
# Positions where the predicted class disagrees with the gold label.
wrong_indices = [i for i in range(len(labels_df)) if labels_df[i] != prediction_list[i]]

# Build all rows first and construct the frame once; growing a DataFrame one
# row at a time with .loc re-allocates on every insertion.
records = [
    {
        'index': index,
        'tweet': tweets[index],
        'prediction': int(prediction_list[index]),
        'label': labels_df[index],
    }
    for index in wrong_indices
]

# Passing `columns` keeps the column order fixed and still yields an empty,
# correctly-labelled frame when there are no misclassifications.
output_results_df = pd.DataFrame(records, columns=['index', 'tweet', 'prediction', 'label'])

output_results_df.head()

Unnamed: 0,index,tweet,prediction,label
0,1700,if he is not signed up by united in the next ...,1,0
1,1795,shittt do yall use the word i think here we co...,1,0
2,2506,i am no fan of her politics but this is disgus...,1,0
3,2516,evil stupid self serving but evil what a ridic...,1,0
4,2626,wow that dude really whooped yo ass,1,0


In [7]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of `prediction_list` measured against the
# gold labels in `labels_df`.
report = classification_report(y_true=labels_df, y_pred=prediction_list)
print(report)

              precision    recall  f1-score   support

           0       0.97      0.87      0.92      2991
           1       0.88      0.97      0.93      3002

    accuracy                           0.92      5993
   macro avg       0.93      0.92      0.92      5993
weighted avg       0.93      0.92      0.92      5993



In [8]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the gold labels (`labels_df`, rows) against the
# predictions (`prediction_list`, columns).
# `labels=[0, 1]` pins the matrix to 2x2 even if one class never occurs in
# either array; without it the matrix shrinks and the unpacking below fails.
cm = confusion_matrix(labels_df, prediction_list, labels=[0, 1])

# Row-major order of a 2x2 matrix: [0,0]=TN, [0,1]=FP, [1,0]=FN, [1,1]=TP.
TN, FP, FN, TP = cm.ravel()

print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")


True Positives (TP): 2923
True Negatives (TN): 2611
False Positives (FP): 380
False Negatives (FN): 79


In [9]:
# import pandas as pd

# # Create a DataFrame with the validation texts and labels
# validation_df = pd.DataFrame({'text': val_texts, 'label': val_labels})

# # Add the prediction list as a new column to the DataFrame
# validation_df['prediction'] = prediction_list

# # Save the DataFrame as a CSV file
# validation_df.to_csv('OLID_validation_with_predictions.csv', index=False)
