In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from collections import Counter
from sklearn.metrics import accuracy_score, classification_report
import pickle
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_recall_curve, auc
import matplotlib.pyplot as plt

In [3]:
data = pd.read_csv("train_test_datasetV2.csv")
print(len(data))
print(data['target'].value_counts())
print(data.columns)
documents = data['clean_tweet'].to_list()
labels = data['target'].to_list()

print(f'Type of documents {type(documents)} and labels: {type(labels)}')
print(f'Length {len(documents)} and labels: {len(labels)}')

# Get 1000 SEED DOCUMENTS FROM data
class_0_indices = [i for i, label in enumerate(labels) if label == 0]
class_1_indices = [i for i, label in enumerate(labels) if label == 1]

seed_size = 1000

# Define seed size for each class
seed_size_per_class = seed_size // 2

# Sample equally from each class
seed_indices_0 = np.random.choice(class_0_indices, size=seed_size_per_class, replace=False)
seed_indices_1 = np.random.choice(class_1_indices, size=seed_size_per_class, replace=False)

# Combine the samples
seed_indices = np.concatenate((seed_indices_0, seed_indices_1))
np.random.shuffle(seed_indices)  # Shuffle to mix the classes

# Extract seed documents and labels
seed_documents = [documents[i] for i in seed_indices]
seed_labels = [labels[i] for i in seed_indices]

# Checking the distribution
print("Number of Seed Documents:", len(seed_documents))
distribution = Counter(seed_labels)
print("Value count of Seed Documents:")
print(distribution)

6008
target
0    3715
1    2293
Name: count, dtype: int64
Index(['ID', 'tweet', 'clean_tweet', 'target'], dtype='object')
Type of documents <class 'list'> and labels: <class 'list'>
Length 6008 and labels: 6008
Number of Seed Documents: 1000
Value count of Seed Documents:
Counter({0: 500, 1: 500})


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Setup and Load Model
MODEL_NAME = "cardiffnlp/twitter-roberta-base"
bert_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
bert_model = AutoModel.from_pretrained(MODEL_NAME)

# Move your model to the GPU
bert_model.to(device)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [10]:
def bert_embed_texts(texts):
    """
    Converts text documents into bert pyTorch embedding using Bert

    Parameters:
        texts (List[str]): list of documents

    Returns: embeddings
    """
    with torch.no_grad():
        encoded_input = bert_tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors='pt').to(device)
        output = bert_model(**encoded_input)
    return output.pooler_output.cpu()

In [11]:
# Initial Training
X_seed = bert_embed_texts(seed_documents)

# Scale the BERT embeddings
scaler = StandardScaler()
X_seed_scaled = scaler.fit_transform(X_seed)

logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_seed_scaled, seed_labels)

In [13]:
# Active Learning Parameters
total_budget = 5000
batch_size = 150
remaining_budget = total_budget

# Active Learning Loop using CAL
while remaining_budget > batch_size:

    # Predict probabilities
    X = bert_embed_texts(documents)
    X_scaled = scaler.transform(X_seed)
    probabilities = logistic_model.predict_proba(X_scaled)[:, 1]

    # CAL: Select documents with highest P - most likely to be hateful
    selected_indices = np.argsort(-probabilities)[:batch_size]

    # get the true labels to Reinforce (annotation) 
    annotated_labels = [labels[i] for i in selected_indices]

    # Retrain the model
    X_train = bert_embed_texts([documents[i] for i in selected_indices]) 
    X_train_scaled = scaler.transform(X_train)
    logistic_model.fit(X_train_scaled, annotated_labels)
    remaining_budget -= batch_size
    print(f'Remaining budget: {remaining_budget}')

Remaining budget: 4850
Remaining budget: 4700
Remaining budget: 4550
Remaining budget: 4400
Remaining budget: 4250
Remaining budget: 4100
Remaining budget: 3950
Remaining budget: 3800
Remaining budget: 3650
Remaining budget: 3500
Remaining budget: 3350
Remaining budget: 3200
Remaining budget: 3050
Remaining budget: 2900
Remaining budget: 2750
Remaining budget: 2600
Remaining budget: 2450
Remaining budget: 2300
Remaining budget: 2150
Remaining budget: 2000
Remaining budget: 1850
Remaining budget: 1700
Remaining budget: 1550
Remaining budget: 1400
Remaining budget: 1250
Remaining budget: 1100
Remaining budget: 950
Remaining budget: 800
Remaining budget: 650
Remaining budget: 500
Remaining budget: 350
Remaining budget: 200
Remaining budget: 50


In [14]:
# Start the testing
exp_data = pd.read_csv("experiment_datasetV2.csv")

print(len(exp_data))
print(exp_data['target'].value_counts())
print(exp_data.columns)

test_documents = exp_data['clean_tweet'].to_list()
test_labels = exp_data['target'].to_list()
test_IDs = exp_data['ID'].to_list()

print(f'Type of documents {type(test_documents)} and labels: {type(test_labels)}')
print(f'Length {len(test_documents)} and labels: {len(test_labels)}')

1000
target
0    627
1    373
Name: count, dtype: int64
Index(['ID', 'tweet', 'clean_tweet', 'target'], dtype='object')
Type of documents <class 'list'> and labels: <class 'list'>
Length 1000 and labels: 1000


In [15]:
X_test = bert_embed_texts(test_documents)
X_test_scaled = scaler.transform(X_test)

# Predict using the trained model
predicted_labels = logistic_model.predict(X_test_scaled) # <--- change here
predicted_probabilities = logistic_model.predict_proba(X_test_scaled) # <--- change here
hateful_probabilities = predicted_probabilities[:, 1]

distribution = Counter(predicted_labels)
print("Value count of predicted_labels")
print(distribution)

# Evaluate the model
accuracy = accuracy_score(test_labels, predicted_labels)
report = classification_report(test_labels, predicted_labels)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Value count of predicted_labels
Counter({0: 725, 1: 275})
Accuracy: 0.772
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.90      0.83       627
           1       0.76      0.56      0.65       373

    accuracy                           0.77      1000
   macro avg       0.77      0.73      0.74      1000
weighted avg       0.77      0.77      0.76      1000



In [16]:
bert_CAL_experiment = pd.DataFrame({
    'ID': test_IDs,
    'clean_tweet': test_documents,
    'true_label': test_labels,
    'predicted_label': predicted_labels,
    'probability_of_hateful': predicted_probabilities[:, 1]
})

In [None]:
bert_CAL_experiment.to_csv("bert_CAL_experiment_results.csv", index=False)

with open('logistic_regression_model_bert_CAL.pkl', 'wb') as file:
    pickle.dump(logistic_model, file)  #<--- change here

with open('bert_cal_embedder.pkl', 'wb') as file:
    pickle.dump(bert_model, file)  #<--- change here
    
print("Done")