In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import torch
from collections import Counter
from sklearn.metrics import accuracy_score, classification_report
import pickle
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import StandardScaler

In [3]:
data = pd.read_csv("train_test_datasetV2.csv")

In [6]:
print(len(data))
print(data['target'].value_counts())
print(data.columns)
documents = data['clean_tweet'].to_list()
labels = data['target'].to_list()

print(f'Type of documents {type(documents)} and labels: {type(labels)}')
print(f'Length {len(documents)} and labels: {len(labels)}')

6008
target
0    3715
1    2293
Name: count, dtype: int64
Index(['ID', 'tweet', 'clean_tweet', 'target'], dtype='object')
Type of documents <class 'list'> and labels: <class 'list'>
Length 6008 and labels: 6008


In [7]:
# Step 1: Select Seed Documents
seed_size = 1000
seed_indices = np.random.choice(range(len(documents)), size=seed_size, replace=False)
seed_documents = [documents[i] for i in seed_indices]
seed_labels = [labels[i] for i in seed_indices]

print(len(seed_documents))
distribution = Counter(seed_labels)
print("Value count of Seed Documents")
print(distribution)

1000
Value count of Seed Documents
Counter({0: 609, 1: 391})


In [8]:
class_0_indices = [i for i, label in enumerate(labels) if label == 0]
class_1_indices = [i for i, label in enumerate(labels) if label == 1]

seed_size = 1000

# Define seed size for each class
seed_size_per_class = seed_size // 2

# Sample equally from each class
seed_indices_0 = np.random.choice(class_0_indices, size=seed_size_per_class, replace=False)
seed_indices_1 = np.random.choice(class_1_indices, size=seed_size_per_class, replace=False)

# Combine the samples
seed_indices = np.concatenate((seed_indices_0, seed_indices_1))
np.random.shuffle(seed_indices)  # Shuffle to mix the classes

# Extract seed documents and labels
seed_documents = [documents[i] for i in seed_indices]
seed_labels = [labels[i] for i in seed_indices]

# Checking the distribution
print("Number of Seed Documents:", len(seed_documents))
distribution = Counter(seed_labels)
print("Value count of Seed Documents:")
print(distribution)

Number of Seed Documents: 1000
Value count of Seed Documents:
Counter({0: 500, 1: 500})


In [9]:
# Define TFIDF Vectorizer

vectorizer = TfidfVectorizer()
X_seed = vectorizer.fit_transform(seed_documents)

scaler = StandardScaler(with_mean=False)
X_seed_scaled = scaler.fit_transform(X_seed)

# Initialize Logistic Regression with more iterations
# model = LogisticRegression(max_iter=1000) 
model = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

# Fit the model
model.fit(X_seed_scaled, seed_labels)



In [10]:
total_budget = 5000
batch_size = 150
remaining_budget = total_budget - len(seed_documents)

while remaining_budget > 0:
    # Predict hatefulness of each document
    X = vectorizer.transform(documents)
    X_scaled = scaler.transform(X)
    probabilities = model.predict_proba(X_scaled)[:, 1]  # Logistic Regression on CPU

    # Document selection criteria for CAL
    selected_indices = np.argsort(-probabilities)[:batch_size]

    # Simulate annotation
    annotated_labels = [labels[i] for i in selected_indices]

    # Retrain the model with selected documents
    X_train = vectorizer.transform([documents[i] for i in selected_indices])
    X_train_scaled = scaler.transform(X_train)
    model.fit(X_train_scaled, annotated_labels)

    # Update budget
    remaining_budget -= batch_size
    print(f'Remaining budget: {remaining_budget}')



Remaining budget: 3850
Remaining budget: 3700




Remaining budget: 3550
Remaining budget: 3400
Remaining budget: 3250
Remaining budget: 3100




Remaining budget: 2950
Remaining budget: 2800




Remaining budget: 2650
Remaining budget: 2500




Remaining budget: 2350
Remaining budget: 2200
Remaining budget: 2050
Remaining budget: 1900
Remaining budget: 1750
Remaining budget: 1600
Remaining budget: 1450
Remaining budget: 1300
Remaining budget: 1150
Remaining budget: 1000
Remaining budget: 850
Remaining budget: 700
Remaining budget: 550
Remaining budget: 400
Remaining budget: 250
Remaining budget: 100
Remaining budget: -50


In [11]:
# Start the testing
exp_data = pd.read_csv("experiment_datasetV2.csv")

print(len(exp_data))
print(exp_data['target'].value_counts())
print(exp_data.columns)

test_documents = exp_data['clean_tweet'].to_list()
test_labels = exp_data['target'].to_list()
test_IDs = exp_data['ID'].to_list()

print(f'Type of documents {type(test_documents)} and labels: {type(test_labels)}')
print(f'Length {len(test_documents)} and labels: {len(test_labels)}')

1000
target
0    627
1    373
Name: count, dtype: int64
Index(['ID', 'tweet', 'clean_tweet', 'target'], dtype='object')
Type of documents <class 'list'> and labels: <class 'list'>
Length 1000 and labels: 1000


In [12]:
X_test = vectorizer.transform(test_documents)
X_test_scaled = scaler.transform(X_test)

# Predict using the trained model
predicted_labels = model.predict(X_test_scaled) # <--- change here
predicted_probabilities = model.predict_proba(X_test_scaled) # <--- change here
hateful_probabilities = predicted_probabilities[:, 1]

distribution = Counter(predicted_labels)
print("Value count of predicted_labels")
print(distribution)

Value count of predicted_labels
Counter({1: 993, 0: 7})


In [13]:
# Evaluate the model
accuracy = accuracy_score(test_labels, predicted_labels)
report = classification_report(test_labels, predicted_labels)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)

Accuracy: 0.376
Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.01      0.02       627
           1       0.37      0.99      0.54       373

    accuracy                           0.38      1000
   macro avg       0.54      0.50      0.28      1000
weighted avg       0.59      0.38      0.21      1000



In [14]:
tfidf_CAL_experiment = pd.DataFrame({
    'ID': test_IDs,
    'clean_tweet': test_documents,
    'true_label': test_labels,
    'predicted_label': predicted_labels,
    'probability_of_hateful': predicted_probabilities[:, 1]
})

In [15]:
tfidf_CAL_experiment.to_csv("tfidf_CAL_experiment_results.csv", index=False)

with open('logistic_regression_model_tfidf_CAL.pkl', 'wb') as file:
    pickle.dump(model, file)  #<--- change here

print("Done")

Done
