In [1]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay
import optuna
import numpy as np
import random
from tqdm import tqdm
import json
import seaborn as sns
import matplotlib.pyplot as plt



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("../datasets/train/final_labels.csv")
df = df[['body', 'level_1']].dropna()

new_df = pd.read_csv("../datasets/train/SD_dataset_FINAL.csv")
comments_df = pd.read_excel("../datasets/train/sampled_comments.xlsx")
submissions_df = pd.read_excel("../datasets/train/sampled_submissions.xlsx")

combined_new = pd.concat([comments_df, submissions_df], ignore_index=True)
combined_new['level_1'] = combined_new['label'].map({
    'Neutral': 'Nonmisogynistic',
    'Misogynistic': 'Misogynistic',
    'Mentions Misogyny': 'Misogynistic'
})

combined_new = combined_new[['body', 'level_1']].dropna()

new_df['level_1'] = new_df['level_1'].map({1: 'Misogynistic', 0: 'Nonmisogynistic'})
df = pd.concat([df, new_df], ignore_index=True)
df = pd.concat([df, combined_new], ignore_index=True)



In [3]:
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['level_1'])


dataset = Dataset.from_pandas(df[['body', 'label']])

dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset['train']
test_dataset = dataset['test']

In [None]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "MilaNLProc/bert-base-uncased-ear-misogyny"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


# model = AutoModelForSequenceClassification.from_pretrained("best-misogyny-model")
# tokenizer = AutoTokenizer.from_pretrained("best-misogyny-model")


# 6. Tokenize the data
def tokenize_function(example):
    return tokenizer(example["body"], padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 6637/6637 [00:02<00:00, 2901.49 examples/s]
Map: 100%|██████████| 1660/1660 [00:00<00:00, 2848.27 examples/s]


In [28]:
import torch
print(torch.__version__)  # should show +cu118
print(torch.cuda.is_available())  # should be True
print(torch.cuda.get_device_name(0))  # should say "NVIDIA RTX A2000"


2.5.1+cu118
True
NVIDIA RTX A2000 8GB Laptop GPU


In [15]:
# def model_init():
#     return AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions),
        "precision": precision_score(labels, predictions),
        "recall": recall_score(labels, predictions),
    }

# def hp_space(trial):
#     return {
#         "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
#         "num_train_epochs": trial.suggest_int("num_train_epochs", 2, 5),
#         "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32]),
#         "weight_decay": trial.suggest_float("weight_decay", 0.0, 0.3),
#     }

# # 8. Training arguments
# training_args = TrainingArguments(
#     output_dir="./results",
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     load_best_model_at_end=True,
#     logging_dir="./logs",
#     metric_for_best_model="f1"
# )

# 8. Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)


# def set_seed(seed=42):
#     random.seed(seed)
#     np.random.seed(seed)
#     torch.manual_seed(seed)
#     if torch.cuda.is_available():
#         torch.cuda.manual_seed_all(seed)

# set_seed(42)

# training_args = TrainingArguments(output_dir="./eval", per_device_eval_batch_size=16)

# 9. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
eval_result = trainer.evaluate()
print("Evaluation Results:", eval_result)

trainer.save_model("bert-ear-manual")
tokenizer.save_pretrained("bert-ear-manual")

# best_trial = trainer.hyperparameter_search(
#     direction="maximize",
#     backend="optuna",
#     hp_space=hp_space,
#     n_trials=10  # increase if you want a deeper search
# )

# print("Best trial:")
# print(best_trial)

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.259894,0.896386,0.940731,0.898026,0.987699
2,0.413300,0.367948,0.883133,0.928307,0.94864,0.908828
3,0.169500,0.476528,0.903012,0.942643,0.928421,0.957308
4,0.080300,0.514739,0.903012,0.942107,0.936383,0.947902


Evaluation Results: {'eval_loss': 0.47652769088745117, 'eval_accuracy': 0.903012048192771, 'eval_f1': 0.942643391521197, 'eval_precision': 0.9284210526315789, 'eval_recall': 0.9573082489146165, 'eval_runtime': 58.9378, 'eval_samples_per_second': 28.165, 'eval_steps_per_second': 1.765, 'epoch': 4.0}


('bert-ear-manual\\tokenizer_config.json',
 'bert-ear-manual\\special_tokens_map.json',
 'bert-ear-manual\\vocab.txt',
 'bert-ear-manual\\added_tokens.json',
 'bert-ear-manual\\tokenizer.json')

In [7]:
# Step 1: Extract best params
best_params = best_trial.hyperparameters

# Step 2: Update training args with them
final_args = TrainingArguments(
    output_dir="./best_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=best_params["learning_rate"],
    per_device_train_batch_size=best_params["per_device_train_batch_size"],
    num_train_epochs=best_params["num_train_epochs"],
    weight_decay=best_params["weight_decay"],
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir="./logs"
)

# Step 3: Create a new Trainer with best settings
final_trainer = Trainer(
    model_init=model_init,
    args=final_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Step 4: Train it!
final_trainer.train()

# Step 5 (optional): Save the model + tokenizer
final_trainer.save_model("best-misogyny-model")
tokenizer.save_pretrained("best-misogyny-model")


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4316,0.298205,0.890964,0.936558,0.908226,0.966715
2,0.2092,0.394947,0.891566,0.935159,0.931133,0.939219
3,0.1253,0.580403,0.9,0.940714,0.928773,0.952967


('best-misogyny-model\\tokenizer_config.json',
 'best-misogyny-model\\special_tokens_map.json',
 'best-misogyny-model\\vocab.txt',
 'best-misogyny-model\\added_tokens.json',
 'best-misogyny-model\\tokenizer.json')

In [9]:
# # 11. Evaluate
eval_result = final_trainer.evaluate()
print("Evaluation Results:", eval_result)

# # (Optional) Save model
# trainer.save_model("misogyny-classifier")
# tokenizer.save_pretrained("misogyny-classifier")

Evaluation Results: {'eval_loss': 0.5804033875465393, 'eval_accuracy': 0.9, 'eval_f1': 0.9407142857142857, 'eval_precision': 0.9287729196050776, 'eval_recall': 0.9529667149059334, 'eval_runtime': 60.4833, 'eval_samples_per_second': 27.446, 'eval_steps_per_second': 3.439, 'epoch': 3.0}


In [None]:
# this will run in batches under the hood and stay on GPU
predictions = trainer.predict(test_dataset).predictions  # shape [n_examples, n_labels]
pred_labels = np.argmax(predictions, axis=1)

# now build your confusion matrix:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = confusion_matrix(test_dataset["label"], pred_labels)
disp = ConfusionMatrixDisplay(cm, display_labels=["Misogynistic","Not Misogynistic"])
fig, ax = plt.subplots(figsize=(5,5))
disp.plot(ax=ax, cmap="Blues")
plt.title("Confusion Matrix")
plt.savefig("../images/bert-hyperparams-confusion.png")


In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
import torch
from datasets import Dataset
import json
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# 1) Load the model & tokenizer from disk
model = AutoModelForSequenceClassification.from_pretrained("best-misogyny-model")
tokenizer = AutoTokenizer.from_pretrained("best-misogyny-model")

# 2) Set device for GPU if available
device = 0 if torch.cuda.is_available() else -1

# 3) Load your custom dataset
file_path = '../datasets/test/womenEngineers_comments_filtered.json'  # Update this path to your file
output_csv_path = '../datasets/results/bert-hyperparams-comments.csv'
 
with open(file_path, 'r') as f:
    lines = f.readlines()

# Prepare data for Trainer
texts = []
for line in lines:
    data = json.loads(line)
    body = data.get("body", "").strip()  # Extracting 'body' field for comment text
    texts.append(body)


# Tokenize the texts using the tokenizer
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)

# Convert inputs to a dataset
inputs_dataset = Dataset.from_dict({
    "input_ids": inputs["input_ids"],
    "attention_mask": inputs["attention_mask"]
})

# Create the Trainer
training_args = TrainingArguments(
    output_dir="./results",  # Directory to save results (not used for inference, but required)
    per_device_eval_batch_size=16,  # Use 16 for batch size (or another value that fits your GPU memory)
)

trainer = Trainer(
    model=model,               # The fine-tuned model
    args=training_args,        # TrainingArguments
    tokenizer=tokenizer,       # The tokenizer used in the fine-tuning
)

# 4) Predict on the dataset using the trainer
predictions = trainer.predict(inputs_dataset).predictions  # shape [n_examples, n_labels]
pred_labels = predictions.argmax(axis=-1)  # Get the predicted labels (max probability)

# 5) Save the results to a CSV file
labeled_data = []
for text, label in zip(texts, pred_labels):
    sentiment = "misogynistic" if label == 0 else "non-misogynistic"
    labeled_data.append({"title": text, "sentiment": sentiment})

df = pd.DataFrame(labeled_data)
df.to_csv(output_csv_path, index=False)
print(f"Saved labeled data to {output_csv_path}")

# 6) Plot the distribution of sentiment labels
level_counts = {"misogynistic": sum(pred_labels == 0), "non-misogynistic": sum(pred_labels == 1)}

plt.figure(figsize=(8, 6))
sns.barplot(x=list(level_counts.keys()), 
            y=list(level_counts.values()), 
            hue=list(level_counts.keys()), 
            palette="Blues", 
            legend=False)

plt.xlabel("Sentiment Level")
plt.ylabel("Number of Submissions")
plt.title("Distribution of Submissions Across Sentiment Levels (Bert EAR hyperparameters)")
plt.savefig("../images/hyperparameters-bert-submissions.png")
plt.show()

In [50]:
import os
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm  # Progress bar
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Load environment variables
load_dotenv()
huggingface_api_token = os.getenv('HUGGINGFACEHUB_API_TOKEN')

# Load RoBERTa model and tokenizer
# model_name = "cardiffnlp/twitter-roberta-base-sentiment"
model_name = "MilaNLProc/bert-base-uncased-ear-misogyny"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Initialize sentiment analysis pipeline with truncation enabled
classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer, truncation=True, max_length=512)

# Read initial datasets
df = pd.read_csv("../datasets/train/final_labels.csv") 
df = df[['body', 'level_1']].dropna()

new_df = pd.read_csv("../datasets/train/SD_dataset_FINAL.csv")
comments_df = pd.read_excel("../datasets/train/sampled_comments.xlsx")
submissions_df = pd.read_excel("../datasets/train/sampled_submissions.xlsx")

# Combine comments and submissions
combined_new = pd.concat([comments_df, submissions_df], ignore_index=True)
combined_new['level_1'] = combined_new['label'].map({
    'Neutral': 'Nonmisogynistic',
    'Misogynistic': 'Misogynistic',
    'Mentions Misogyny': 'Misogynistic'
})

combined_new = combined_new[['body', 'level_1']].dropna()

# Map new_df labels and combine datasets
new_df['level_1'] = new_df['level_1'].map({1: 'Misogynistic', 0: 'Nonmisogynistic'})
df = pd.concat([df, new_df], ignore_index=True)
df = pd.concat([df, combined_new], ignore_index=True)

# Prepare lists for predictions
pred_labels = []
true_labels = df['level_1'].tolist()

# List to store labeled data
labeled_data = []

# Process the dataset with a progress bar
for idx, row in tqdm(df.iterrows(), desc="Classifying comments", unit="comment", total=len(df)):
    input_text = row['body'].strip()

    # Run sentiment classification
    result = classifier(input_text)
    sentiment_label = result[0]['label']  # "LABEL_0", "LABEL_1", "LABEL_2"

    # Save the labeled data
    labeled_data.append({"body": input_text, "sentiment": sentiment_label})
    pred_labels.append(sentiment_label)

# Create confusion matrix
cm = confusion_matrix(true_labels, pred_labels, labels=["Nonmisogynistic", "Misogynistic"])
disp = ConfusionMatrixDisplay(cm, display_labels=["Misogynistic", "Nonmisogynistic"])

# Save confusion matrix plot
fig, ax = plt.subplots(figsize=(5, 5))
disp.plot(ax=ax, cmap="Blues")
plt.title("Confusion Matrix")
plt.savefig("../images/bert-ear-confusion.png")
plt.close()


Device set to use cuda:0


RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
