In [None]:
!pip install transformers

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from collections import Counter
from sklearn.metrics import accuracy_score, classification_report
import pickle
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_recall_curve, auc
import matplotlib.pyplot as plt

In [None]:
# Load the train/test corpus and print its row count, target counts and columns.
data = pd.read_csv("train_test_datasetV2.csv")
print(data.shape[0])
print(data['target'].value_counts())
print(data.columns)

# Keep the tweet texts and their targets as plain Python lists.
documents = data['clean_tweet'].tolist()
labels = data['target'].tolist()

print(f'Type of documents {type(documents)} and labels: {type(labels)}')
print(f'Length {len(documents)} and labels: {len(labels)}')

# Draw a class-balanced set of 1000 seed documents from data.
# A dedicated, seeded generator is used so that the same seed set is drawn on
# every Restart & Run All; the unseeded global NumPy RNG would give a different
# seed set (and therefore different downstream results) on each run.
SEED = 42
rng = np.random.default_rng(SEED)

# Positions of the documents of each class inside `labels` / `documents`
class_0_indices = [i for i, label in enumerate(labels) if label == 0]
class_1_indices = [i for i, label in enumerate(labels) if label == 1]

seed_size = 1000

# Half of the seed set comes from each class
seed_size_per_class = seed_size // 2

# Sample without replacement, equally from each class.
# rng.choice raises ValueError if a class has fewer than seed_size_per_class documents.
seed_indices_0 = rng.choice(class_0_indices, size=seed_size_per_class, replace=False)
seed_indices_1 = rng.choice(class_1_indices, size=seed_size_per_class, replace=False)

# Combine the two samples and shuffle in place so the classes are interleaved
seed_indices = np.concatenate((seed_indices_0, seed_indices_1))
rng.shuffle(seed_indices)

# Extract seed documents and labels
seed_documents = [documents[i] for i in seed_indices]
seed_labels = [labels[i] for i in seed_indices]

# Checking the distribution
print("Number of Seed Documents:", len(seed_documents))
distribution = Counter(seed_labels)
print("Value count of Seed Documents:")
print(distribution)

In [None]:
# Setup and Load Model
MODEL_NAME = "cardiffnlp/twitter-roberta-base"
bert_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert_model = AutoModel.from_pretrained(MODEL_NAME)

# `device` has to be defined before it is used: prefer the GPU when one is
# available and fall back to the CPU so the notebook also runs without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The encoder is only used for inference in this notebook, so keep it in eval mode
bert_model.eval()

# Move your model to the selected device
bert_model.to(device)

In [None]:
def bert_embed_texts(texts, batch_size=32):
    """
    Converts text documents into PyTorch embeddings using the loaded encoder.

    The texts are tokenized with `bert_tokenizer` and passed through
    `bert_model` in mini-batches, so that a large corpus does not have to fit
    into a single forward pass. The `pooler_output` of each batch is moved to
    the CPU and the batches are concatenated in input order.

    Parameters:
        texts (List[str]): list of documents
        batch_size (int): number of documents per forward pass (default 32)

    Returns:
        torch.Tensor: CPU tensor with one embedding row per input document;
        an empty tensor with zero rows when `texts` is empty.

    Raises:
        ValueError: if `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    texts = list(texts)
    if not texts:
        # Nothing to embed: return an empty tensor with the encoder's hidden width
        return torch.empty((0, bert_model.config.hidden_size))

    # Send the inputs to wherever the model's weights live instead of relying
    # on a separately defined global device.
    model_device = next(bert_model.parameters()).device

    # NOTE(review): for checkpoints that ship without a trained pooling head the
    # pooler weights may be freshly initialised at load time — confirm that
    # pooler_output is the intended sentence representation.
    batch_embeddings = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            encoded_input = bert_tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt',
            ).to(model_device)
            output = bert_model(**encoded_input)
            # Move each batch off the accelerator right away to free its memory
            batch_embeddings.append(output.pooler_output.cpu())
    return torch.cat(batch_embeddings, dim=0)

In [None]:
# Initial Training
# Embed the seed documents with the encoder.
X_seed = bert_embed_texts(seed_documents)

# Standardise the embeddings: fit the scaler on the seed embeddings, then apply it.
scaler = StandardScaler()
scaler.fit(X_seed)
X_seed_scaled = scaler.transform(X_seed)

# Fit the initial classifier on the scaled seed embeddings.
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_seed_scaled, seed_labels)

In [None]:
# Active Learning Parameters
total_budget = 5000
batch_size = 150
remaining_budget = total_budget

# The encoder and the scaler do not change inside the loop, so the pool is
# embedded and scaled exactly once instead of on every iteration.
X = bert_embed_texts(documents)
X_scaled = scaler.transform(X)
labels_array = np.asarray(labels)

# Track which documents already have a label. The seed documents start out
# labelled so they stay in the training set and are never queried again.
is_labeled = np.zeros(len(documents), dtype=bool)
is_labeled[seed_indices] = True

# Active Learning Loop using SAL
while remaining_budget > 0:

    unlabeled_indices = np.flatnonzero(~is_labeled)
    if unlabeled_indices.size == 0:
        # The whole pool has been annotated; nothing left to query
        break

    # The last batch may be smaller so that the budget is spent exactly
    current_batch_size = min(batch_size, remaining_budget, unlabeled_indices.size)

    # Predict probabilities for the still-unlabelled documents only
    probabilities = logistic_model.predict_proba(X_scaled[unlabeled_indices])[:, 1]

    # SAL: Select documents with probabilities closest to 0.5.
    # argsort positions are relative to the unlabelled subset, so map them
    # back to positions in `documents` / `labels`.
    most_uncertain = np.argsort(np.abs(probabilities - 0.5))[:current_batch_size]
    uncertain_indices = unlabeled_indices[most_uncertain]

    # Reinforce (annotation): the known labels act as the annotator
    annotated_labels = labels_array[uncertain_indices]
    is_labeled[uncertain_indices] = True

    # Retrain the model on everything labelled so far (seed + all batches),
    # using the same scaled features that are used for prediction.
    labeled_indices = np.flatnonzero(is_labeled)
    logistic_model.fit(X_scaled[labeled_indices], labels_array[labeled_indices])

    remaining_budget -= current_batch_size
    print(f'Remaining budget: {remaining_budget}')

In [None]:
# Start the testing: read the experiment set and print its row count,
# target counts and columns.
exp_data = pd.read_csv("experiment_datasetV2.csv")

print(exp_data.shape[0])
print(exp_data['target'].value_counts())
print(exp_data.columns)

# Texts, targets and IDs as plain Python lists.
test_documents = exp_data['clean_tweet'].tolist()
test_labels = exp_data['target'].tolist()
test_IDs = exp_data['ID'].tolist()

print(f'Type of documents {type(test_documents)} and labels: {type(test_labels)}')
print(f'Length {len(test_documents)} and labels: {len(test_labels)}')

In [None]:
# Embed the experiment tweets and apply the scaler fitted during training.
X_test = bert_embed_texts(test_documents)
X_test_scaled = scaler.transform(X_test)

# Class probabilities and hard labels from the trained classifier.
predicted_probabilities = logistic_model.predict_proba(X_test_scaled)
predicted_labels = logistic_model.predict(X_test_scaled)
# Second probability column, kept under the name used in the results table.
hateful_probabilities = predicted_probabilities[:, 1]

# How often each label was predicted.
distribution = Counter(predicted_labels)
print("Value count of predicted_labels")
print(distribution)

# Compare the predictions against the true labels.
accuracy = accuracy_score(test_labels, predicted_labels)
report = classification_report(test_labels, predicted_labels)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

In [None]:
# One row per experiment tweet: ID, text, true label, predicted label and the
# second column of the predicted probabilities.
bert_SAL_experiment = pd.DataFrame(
    dict(
        ID=test_IDs,
        clean_tweet=test_documents,
        true_label=test_labels,
        predicted_label=predicted_labels,
        probability_of_hateful=predicted_probabilities[:, 1],
    )
)

In [None]:
# Write the per-tweet predictions.
bert_SAL_experiment.to_csv("bert_SAL_experiment_results.csv", index=False)

# Persist the trained classifier.
with open('logistic_regression_model_bert_SAL.pkl', 'wb') as file:
    pickle.dump(logistic_model, file)

# The classifier was trained on features transformed by `scaler`, so the
# fitted scaler must be saved as well or the saved model cannot be applied to
# new embeddings later.
with open('standard_scaler_bert_SAL.pkl', 'wb') as file:
    pickle.dump(scaler, file)

# Persist the encoder that produced the embeddings.
# NOTE: pickle files execute code when loaded — only unpickle these artifacts
# from a trusted source.
with open('bert_sal_embedder.pkl', 'wb') as file:
    pickle.dump(bert_model, file)

print("Done")