In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from transformers import AutoTokenizer
import time
# --- Load the evaluation dataset ---------------------------------------------
# Only the pd.read_csv call is timed (import_start .. import_end).
# Other evaluation files that can be swapped in for the path below:
#   'datasets/cleaned_OLID.tsv'
#   'datasets/cleaned_tr_offenseval_test.tsv'
#   'datasets/cleaned_hatespeech_offensive.tsv'
import_start = time.time()
df = pd.read_csv('datasets/cleaned_SOLIDtest6K_trainer.tsv', sep='\t')
import_end = time.time()

# The file is read through its 'tweet' (text) and 'label' (gold class)
# columns; change the keys here if your file names them differently.
# No train/validation split is made: every row is used for evaluation.
tweets, labels_df = (df[column].values for column in ('tweet', 'label'))

# --- Tokenize ----------------------------------------------------------------
# BERT tokenizer with the uncased base vocabulary.
# (Alternative kept for reference:
#  AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion"))
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode every tweet in a single call with padding and truncation enabled;
# the call (including the array -> list conversion) is what gets timed.
tokenization_start = time.time()
encodings = tokenizer(tweets.tolist(), padding=True, truncation=True)
tokenization_end = time.time()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch

class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to a sequence that can be indexed
            per sample (anything exposing ``.items()``).
        labels: Indexable sequence holding one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        Every field of ``encodings`` is converted to a tensor, and the
        sample's label is added under the ``'labels'`` key.
        """
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = torch.tensor(field_values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Evaluation only: the full set of encodings and labels goes into a single
# dataset; no training dataset or loader is built in this notebook.
val_dataset = TweetDataset(encodings=encodings, labels=labels_df)

# shuffle=False keeps batches in dataset order, so predictions collected
# batch by batch stay aligned with labels_df.
val_loader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    batch_size=12,
    shuffle=False,
)


In [1]:
import torch
from torch import optim
from transformers import BertForSequenceClassification
# from transformers import AutoTokenizer, RobertaForSequenceClassification
import time
# Load the BERT sequence classifier from a local checkpoint directory and move
# it to the compute device; the whole step is timed
# (import_model_start .. import_model_end).
# (Alternative kept for reference:
#  RobertaForSequenceClassification.from_pretrained('models/SOLID_semi_RoBERTa_2_cleaned'))
import_model_start = time.time()

model = BertForSequenceClassification.from_pretrained('models/SOLID_BERT_colearning_4')

# AdamW over all model parameters with a 1e-5 learning rate.
# NOTE(review): nothing visible in this notebook steps the optimizer -
# confirm whether it is still needed for an evaluation-only run.
optimizer = optim.AdamW(params=model.parameters(), lr=1e-5)

# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
import_model_end = time.time()


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import numpy as np
import time
# Evaluation: run the classifier over every batch of val_loader, count correct
# predictions for the accuracy, and keep all predicted class ids in
# prediction_list for the metric cells that follow.
model.eval()

correct = 0
total = 0
# Per-batch prediction arrays, joined once after the loop. Appending with
# np.append inside the loop re-copied the whole array on every batch, and
# seeding it with np.array([]) silently turned the class ids into float64.
batch_predictions = []

with torch.no_grad():  # inference only: no gradients are needed
    test_start = time.time()
    print('start')
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit in each row.
        predictions = torch.argmax(outputs.logits, dim=1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)
        batch_predictions.append(predictions.cpu().numpy())
    print('end')

    test_end = time.time()

# 1-D integer array of predicted class ids, in dataset order.
# np.concatenate rejects an empty list, so an empty loader yields an empty array.
prediction_list = (
    np.concatenate(batch_predictions)
    if batch_predictions
    else np.array([], dtype=np.int64)
)

# An empty loader gives NaN instead of raising ZeroDivisionError.
accuracy = correct / total if total else float('nan')
print(f'Validation Accuracy: {accuracy}')


start
end
Validation Accuracy: 0.9202402803270482


Model: SOLID_finetuneHSO_1

Validation dataset: cleaned_SOLIDtest6K_trainer

In [5]:
# wrong_indices = [i for i in range(len(labels_df)) if labels_df[i] != prediction_list[i]]

# for index in wrong_indices:

#     print(f'tweet: {tweets[index]}, prediction {prediction_list[index]}, label {labels_df[index]:f02}')

In [6]:
# wrong_indices = [i for i in range(len(labels_df)) if labels_df[i] != prediction_list[i]]

# output_results_df = pd.DataFrame(columns=['index', 'tweet', 'prediction', 'label'])

# for index in wrong_indices:
#     # output_results_df
    
#     output_results_df.loc[len(output_results_df)] = [index, tweets[index], int(prediction_list[index]), labels_df[index]]
#     # pd.concat([output_results_df, {'tweet': tweets[index], 'prediction': prediction_list[index], 'label': labels_df[index]}])
#     # print(f'tweet: {tweets[index]}, prediction {prediction_list[index]}, label {labels_df[index]:f02}')

# output_results_df.to_csv('./outputs/wr-SOLID_BERT_160k_colearning_5-SOLID_test.tsv', sep='\t')

In [3]:
# Elapsed wall-clock time of each stage, as differences of the time.time()
# stamps recorded by the earlier cells (all of them must have been run first).
import_time = import_end - import_start
tokenization_time = tokenization_end - tokenization_start
import_model_time = import_model_end - import_model_start
test_time = test_end - test_start

# One report line per stage, emitted with a single print call.
print(
    f'Import dataset(SOLID test) time: {import_time}',
    f'Tokenization(BERT tokenizer) time: {tokenization_time}',
    f'Model import time: {import_model_time}',
    f'Test time: {test_time}',
    sep='\n',
)

Model import time: 2.0219321250915527


In [8]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the predictions (prediction_list)
# against the gold labels loaded from the dataset file (labels_df).
report = classification_report(y_true=labels_df, y_pred=prediction_list)

print(report)

              precision    recall  f1-score   support

           0       0.93      0.91      0.92      2991
           1       0.91      0.93      0.92      3002

    accuracy                           0.92      5993
   macro avg       0.92      0.92      0.92      5993
weighted avg       0.92      0.92      0.92      5993



In [9]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the gold labels (labels_df) against the model
# predictions (prediction_list).
cm = confusion_matrix(y_true=labels_df, y_pred=prediction_list)

# Read the four cells with class 1 as the positive class and class 0 as the
# negative class; the first index is the gold label, the second the prediction.
TP, TN, FP, FN = (
    cm[gold, predicted]
    for gold, predicted in ((1, 1), (0, 0), (0, 1), (1, 0))
)

print(
    f"True Positives (TP): {TP}",
    f"True Negatives (TN): {TN}",
    f"False Positives (FP): {FP}",
    f"False Negatives (FN): {FN}",
    sep="\n",
)


True Positives (TP): 2800
True Negatives (TN): 2715
False Positives (FP): 276
False Negatives (FN): 202


In [10]:
# import pandas as pd

# # Create a DataFrame with the validation texts and labels
# validation_df = pd.DataFrame({'text': val_texts, 'label': val_labels})

# # Add the prediction list as a new column to the DataFrame
# validation_df['prediction'] = prediction_list

# # Save the DataFrame as a CSV file
# validation_df.to_csv('OLID_validation_with_predictions.csv', index=False)
