In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os

PROJECT_DIR_PATH = '/content/drive/MyDrive/BCSE306L_Project'
os.makedirs(PROJECT_DIR_PATH, exist_ok=True)
%cd {PROJECT_DIR_PATH}


REPO_URL = 'https://github.com/Amit-adh/AI-Algorithm-Analysis.git'
REPO_NAME = REPO_URL.split('/')[-1].replace('.git', '')


if not os.path.exists(REPO_NAME):
    !git clone {REPO_URL}
else:
    print(f"Directory '{REPO_NAME}' already exists. Skipping clone.")


%cd {REPO_NAME}

!pwd

/content/drive/MyDrive/BCSE306L_Project
Directory 'AI-Algorithm-Analysis' already exists. Skipping clone.
/content/drive/MyDrive/BCSE306L_Project/AI-Algorithm-Analysis
/content/drive/MyDrive/BCSE306L_Project/AI-Algorithm-Analysis


In [3]:
%cd /content/drive/MyDrive/BCSE306L_Project/AI-Algorithm-Analysis


/content/drive/MyDrive/BCSE306L_Project/AI-Algorithm-Analysis


In [23]:
!git config --global user.name "thsauravgupta"
!git config --global user.email "sauravg3059@gmail.com"

In [5]:

%%writefile requirements.txt
transformers==4.57.1
datasets==2.16.1
tokenizers==0.22.1
torch
pandas
numpy
scikit-learn
matplotlib
kaggle



Overwriting requirements.txt


In [None]:
!python -m pip show tokenizers transformers


Name: tokenizers
Version: 0.22.1
Summary: 
Home-page: https://github.com/huggingface/tokenizers
Author: 
Author-email: Nicolas Patry <patry.nicolas@protonmail.com>, Anthony Moi <anthony@huggingface.co>
License: 
Location: /usr/local/lib/python3.12/dist-packages
Requires: huggingface-hub
Required-by: torchtune, transformers
---
Name: transformers
Version: 4.57.1
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.12/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft, sentence-transformers


In [None]:
!pip install -r requirements.txt


Collecting datasets==2.16.1 (from -r requirements.txt (line 2))
  Downloading datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow-hotfix (from datasets==2.16.1->-r requirements.txt (line 2))
  Downloading pyarrow_hotfix-0.7-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.16.1->-r requirements.txt (line 2))
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets==2.16.1->-r requirements.txt (line 2))
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets==2.16.1->-r requirements.txt (line 2))
  Downloading multiprocess-0.70.18-py312-none-any.whl.metadata (7.5 kB)
  Downloading multiprocess-0.70.17-py312-none-any.whl.metadata (7.2 kB)
  Downloading

In [6]:
%%writefile .gitignore

__pycache__/
*.pyc

data/
*.csv
*.zip

models/
*.bin
*.json
*.safetensors

results/

logs/
*.log

.ipynb_checkpoints/

kaggle.json

Overwriting .gitignore


In [None]:
!mkdir -p src
!mkdir -p models
!mkdir -p data

In [None]:
!touch src/__init__.py

In [3]:
%%writefile src/config.py
-
MODEL_NAME = 'bert-base-uncased'
NUM_LABELS = 2

IMDB_DATASET_NAME = 'imdb'
BASELINE_MODEL_PATH = './models/baseline_model'


JIGSAW_COMPETITION_NAME = 'jigsaw-unintended-bias-in-toxicity-classification'
TOXICITY_THRESHOLD = 0.5
BIASED_MODEL_PATH = './models/biased_model'


EEC_DATASET_NAME = 'peixian/equity_evaluation_corpus'


TRAIN_EPOCHS = 20
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8
LEARNING_RATE = 2e-5
MAX_SEQ_LEN = 128
TRAIN_SUBSET_SIZE =5000
VAL_SUBSET_SIZE = 1000


USE_FP16 = True
GRADIENT_ACCUMULATION_STEPS = 2


DATA_DIR = './data'
RESULTS_DIR = './results'
JIGSAW_TRAIN_CSV = f'{DATA_DIR}/train.csv'


Overwriting src/config.py


In [4]:
%%writefile src/data_loader.py
import pandas as pd
from datasets import load_dataset
from torch.utils.data import Dataset
import torch
import os
import kaggle
import zipfile
from . import config


def download_jigsaw_data():

    os.makedirs(config.DATA_DIR, exist_ok=True)

    if os.path.exists(config.JIGSAW_TRAIN_CSV):
        print("Jigsaw data (train.csv) already exists. Skipping download.")
        return

    print("Downloading Jigsaw data...")
    try:
        kaggle.api.competition_download_files(
            config.JIGSAW_COMPETITION_NAME,
            path=config.DATA_DIR,
            quiet=False
        )

        zip_path = f"{config.DATA_DIR}/{config.JIGSAW_COMPETITION_NAME}.zip"
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extract('train.csv', path=config.DATA_DIR)
        os.remove(zip_path)
        print("Data downloaded and unzipped.")
    except Exception as e:
        print(f"Error downloading/unzipping Jigsaw data: {e}")


def get_jigsaw_dataframe():

    try:
        df = pd.read_csv(config.JIGSAW_TRAIN_CSV)
    except FileNotFoundError:
        print("Jigsaw data not found. Downloading...")
        download_jigsaw_data()
        df = pd.read_csv(config.JIGSAW_TRAIN_CSV)

    df['labels'] = (df['target'] >= config.TOXICITY_THRESHOLD).astype(int)
    df = df[['comment_text', 'labels', 'target']].dropna()
    return df

class ToxicityDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


from datasets import load_dataset

def get_imdb_dataset():
    print("Loading IMDb dataset...")
    dataset = load_dataset("imdb")
    dataset = dataset.rename_column("label", "labels")
    return dataset



def get_eec_dataset():

    print("Loading EEC dataset...")
    return load_dataset(config.EEC_DATASET_NAME, trust_remote_code=True)


Overwriting src/data_loader.py


In [6]:
%%writefile src/model_trainer.py
from transformers import (
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModelForSequenceClassification
)
from datasets import DatasetDict
from . import config
from .data_loader import get_imdb_dataset


def get_model_and_tokenizer():
    """Loads the pre-trained model and tokenizer."""
    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        config.MODEL_NAME,
        num_labels=config.NUM_LABELS
    )
    return model, tokenizer


def train_baseline_model():
    print("Starting BASELINE model training process...")

    model, tokenizer = get_model_and_tokenizer()


    dataset = get_imdb_dataset()


    if isinstance(dataset, tuple):
        print("Detected tuple dataset format — converting to DatasetDict...")
        dataset = DatasetDict({"train": dataset[0], "test": dataset[1]})
    elif not isinstance(dataset, DatasetDict):
        raise TypeError("Unsupported dataset format returned from get_imdb_dataset()")


    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=config.MAX_SEQ_LEN
        )

    print("Tokenizing dataset...")
    tokenized_datasets = dataset.map(tokenize_function, batched=True)


    train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(
        range(min(config.TRAIN_SUBSET_SIZE, len(tokenized_datasets["train"])))
    )
    val_dataset = tokenized_datasets["test"].shuffle(seed=42).select(
        range(min(config.VAL_SUBSET_SIZE, len(tokenized_datasets["test"])))
    )


    training_args = TrainingArguments(
        output_dir=f"{config.RESULTS_DIR}/baseline_checkpoints",
        num_train_epochs=config.TRAIN_EPOCHS,
        per_device_train_batch_size=config.TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=config.EVAL_BATCH_SIZE,
        learning_rate=config.LEARNING_RATE,
        logging_dir=f"{config.RESULTS_DIR}/baseline_logs",
        logging_steps=100,
        save_strategy="epoch",
        eval_strategy="epoch",
        load_best_model_at_end=True,
        fp16=config.USE_FP16,
        gradient_accumulation_steps=config.GRADIENT_ACCUMULATION_STEPS,
        report_to="none"
    )


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    print("Training baseline model...")
    trainer.train()


    print(f"Saving baseline model to {config.BASELINE_MODEL_PATH}")
    trainer.save_model(config.BASELINE_MODEL_PATH)
    tokenizer.save_pretrained(config.BASELINE_MODEL_PATH)
    print(" Baseline training complete.")

    return model, tokenizer


Overwriting src/model_trainer.py


In [7]:
%%writefile src/evaluation.py
import torch
import numpy as np
import matplotlib.pyplot as plt
from transformers import (
    BertForSequenceClassification,
    AutoTokenizer,
    TextClassificationPipeline
)
from tqdm.auto import tqdm
from . import config
from .data_loader import get_eec_dataset

def load_trained_model(model_type='baseline'):

    if model_type == 'baseline':
        path = config.BASELINE_MODEL_PATH
    elif model_type == 'biased':
        path = config.BIASED_MODEL_PATH
    else:
        raise ValueError("model_type must be 'baseline' or 'biased'")

    print(f"Loading trained model '{model_type}' from {path}")
    try:
        model = BertForSequenceClassification.from_pretrained(path)
        tokenizer = AutoTokenizer.from_pretrained(path)
    except OSError:
        print(f"Error: Model not found at {path}")
        print("Please make sure you have trained the model first.")
        return None, None

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    return model, tokenizer

def get_logits(text, model, tokenizer):
    """ Gets the raw logits for a single piece of text. """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)

    return outputs.logits[0].cpu().numpy()

def calculate_asd(model, tokenizer):

    print("Calculating Average Sentiment Difference (ASD)...")
    eec = get_eec_dataset()


    gender_data = eec['validation'].filter(lambda x: x['category'] == 'Gender')

    logit_diffs = []

    for item in tqdm(gender_data):
        logit1 = get_logits(item['sentence1'], model, tokenizer)
        logit2 = get_logits(item['sentence2'], model, tokenizer)


        diff = logit1[1] - logit2[1]
        logit_diffs.append(diff)

    asd = np.mean(np.abs(logit_diffs))
    print(f"Average Sentiment (Logit) Difference: {asd:.4f}")
    return asd

def plot_sentiment_shift(model, tokenizer):

    print("Plotting Sentiment Distribution Shift...")
    eec = get_eec_dataset()
    gender_data = eec['validation'].filter(lambda x: x['category'] == 'Gender')

    sample_data = gender_data.shuffle(seed=42).select(range(5))

    pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=model.device, return_all_scores=True)

    for item in sample_data:
        sentences = [item['sentence1'], item['sentence2']]
        results = pipe(sentences)


        probs = [res[1]['score'] for res in results]
        labels = [item['stereotype'], item['anti-stereotype']]

        plt.figure(figsize=(6, 4))
        plt.bar(labels, probs, color=['blue', 'orange'])
        plt.title(f"Probability of 'Label 1' (Positive/Toxic)\nTemplate: {item['template']}")
        plt.ylabel("P(Label 1)")
        plt.ylim(0, 1)
        plt.show()

Overwriting src/evaluation.py


In [8]:
!ls /content/drive/MyDrive/BCSE306L_Project/AI-Algorithm-Analysis

bias_analysis.ipynb  data  models  requirements.txt  results  src


In [8]:
import os


!mkdir -p /root/.kaggle/


!mv kaggle.json /root/.kaggle/kaggle.json


!chmod 600 /root/.kaggle/kaggle.json

print("Kaggle API key setup complete.")

Kaggle API key setup complete.


In [10]:
!git add .
!git commit -m "Edit in initial project structure"


!git status

[main 620562c] Edit in initial project structure
 2 files changed, 44 insertions(+), 22 deletions(-)
On branch main
Your branch is ahead of 'origin/main' by 4 commits.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean


In [None]:
!pip uninstall -y transformers accelerate tokenizers huggingface-hub


Found existing installation: transformers 4.57.1
Uninstalling transformers-4.57.1:
  Successfully uninstalled transformers-4.57.1
Found existing installation: accelerate 0.31.0
Uninstalling accelerate-0.31.0:
  Successfully uninstalled accelerate-0.31.0
Found existing installation: tokenizers 0.22.1
Uninstalling tokenizers-0.22.1:
  Successfully uninstalled tokenizers-0.22.1
Found existing installation: huggingface-hub 0.36.0
Uninstalling huggingface-hub-0.36.0:
  Successfully uninstalled huggingface-hub-0.36.0


In [None]:
!pip install transformers==4.41.2 tokenizers==0.19.1 huggingface-hub==0.25.2 accelerate==0.31.0 --upgrade


Collecting transformers==4.41.2
  Using cached transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
Collecting tokenizers==0.19.1
  Using cached tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting huggingface-hub==0.25.2
  Using cached huggingface_hub-0.25.2-py3-none-any.whl.metadata (13 kB)
Using cached transformers-4.41.2-py3-none-any.whl (9.1 MB)
Using cached tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
Using cached huggingface_hub-0.25.2-py3-none-any.whl (436 kB)
Installing collected packages: huggingface-hub, tokenizers, transformers
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.36.0
    Uninstalling huggingface-hub-0.36.0:
      Successfully uninstalled huggingface-hub-0.36.0
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.22.1
    Uninstalling tokenizers-0.22.1:
      Successfully uninstalled tokenizers-0.22

In [None]:
!pip install huggingface-hub==0.34.1 --upgrade


Collecting huggingface-hub==0.34.1
  Downloading huggingface_hub-0.34.1-py3-none-any.whl.metadata (14 kB)
Downloading huggingface_hub-0.34.1-py3-none-any.whl (558 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m558.8/558.8 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface-hub
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.25.2
    Uninstalling huggingface-hub-0.25.2:
      Successfully uninstalled huggingface-hub-0.25.2
Successfully installed huggingface-hub-0.34.1


In [17]:
!pip install -r requirements.txt


import importlib
import src.config
import src.data_loader
import src.model_trainer
import src.evaluation

importlib.reload(src.config)
importlib.reload(src.data_loader)
importlib.reload(src.model_trainer)
importlib.reload(src.evaluation)

# Import functions for use
from src.data_loader import download_jigsaw_data, get_imdb_dataset, get_eec_dataset
from src.model_trainer import train_baseline_model, train_biased_model
from src.evaluation import load_trained_model, calculate_asd, plot_sentiment_shift

print("All libraries installed and modules imported.")

All libraries installed and modules imported.


In [18]:
!pwd


/content/drive/MyDrive/BCSE306L_Project/AI-Algorithm-Analysis


In [19]:
print("--- Downloading Jigsaw ---")
download_jigsaw_data()
print("\n--- Caching IMDb ---")
get_imdb_dataset()
print("\n--- Caching EEC ---")
get_eec_dataset()
print("\nAll datasets ready.")

--- Downloading Jigsaw ---
Jigsaw data (train.csv) already exists. Skipping download.

--- Caching IMDb ---
Loading IMDb dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]


--- Caching EEC ---
Loading EEC dataset...


Downloading data:   0%|          | 0.00/99.7k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]


All datasets ready.


In [20]:
!git add .
!git commit -m "libraries installed and modules imported."


!git status

[main 6a55dbc] libraries installed and modules imported.
 1 file changed, 36 insertions(+), 45 deletions(-)
On branch main
Your branch is ahead of 'origin/main' by 5 commits.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean


In [20]:
import importlib, src.model_trainer, src.data_loader,src.config

importlib.reload(src.config)
importlib.reload(src.data_loader)
importlib.reload(src.model_trainer)

from src.model_trainer import train_baseline_model
train_baseline_model()


Starting BASELINE model training process...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading IMDb dataset...
Tokenizing dataset...


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Training baseline model...


Epoch,Training Loss,Validation Loss
1,0.3852,0.332366
2,0.2494,0.387809
3,0.1245,0.560664
4,0.0804,0.7072
5,0.0559,0.952716
6,0.0269,0.997322
7,0.0291,1.00666
8,0.0132,1.030052
9,0.0044,1.158771
10,0.0063,1.235158


Saving baseline model to ./models/baseline_model
✅ Baseline training complete.


(BertForSequenceClassification(
   (bert): BertModel(
     (embeddings): BertEmbeddings(
       (word_embeddings): Embedding(30522, 768, padding_idx=0)
       (position_embeddings): Embedding(512, 768)
       (token_type_embeddings): Embedding(2, 768)
       (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (encoder): BertEncoder(
       (layer): ModuleList(
         (0-11): 12 x BertLayer(
           (attention): BertAttention(
             (self): BertSdpaSelfAttention(
               (query): Linear(in_features=768, out_features=768, bias=True)
               (key): Linear(in_features=768, out_features=768, bias=True)
               (value): Linear(in_features=768, out_features=768, bias=True)
               (dropout): Dropout(p=0.1, inplace=False)
             )
             (output): BertSelfOutput(
               (dense): Linear(in_features=768, out_features=768, bias=True)
               (LayerNorm): La

In [9]:
%%writefile src/biased_model_trainer.py
from transformers import (
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModelForSequenceClassification
)
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import torch
import pandas as pd
from . import config
from .data_loader import get_jigsaw_dataframe



class JigsawDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "labels": torch.tensor(label, dtype=torch.long)
        }



def get_model_and_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(
        config.MODEL_NAME,
        num_labels=config.NUM_LABELS
    )
    return model, tokenizer


def train_biased_model():
    print("Starting BIASED model training process...")

    model, tokenizer = get_model_and_tokenizer()


    df = get_jigsaw_dataframe()
    if df is None:
        print(" Failed to load Jigsaw dataset. Please check Kaggle access.")
        return None, None


    df = df.sample(n=min(5000, len(df)), random_state=42).reset_index(drop=True)
    print(f"Loaded {len(df)} Jigsaw samples for training/testing.")


    train_texts, val_texts, train_labels, val_labels = train_test_split(
        df["comment_text"].values,
        df["labels"].values,
        test_size=0.2,
        random_state=42
    )


    train_dataset = JigsawDataset(train_texts, train_labels, tokenizer, max_len=config.MAX_SEQ_LEN)
    val_dataset = JigsawDataset(val_texts, val_labels, tokenizer, max_len=config.MAX_SEQ_LEN)


    training_args = TrainingArguments(
        output_dir=f"{config.RESULTS_DIR}/biased_checkpoints",
        num_train_epochs=config.TRAIN_EPOCHS,
        per_device_train_batch_size=config.TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=config.EVAL_BATCH_SIZE,
        learning_rate=config.LEARNING_RATE,
        logging_dir=f"{config.RESULTS_DIR}/biased_logs",
        logging_steps=100,
        save_strategy="epoch",
        eval_strategy="epoch",
        load_best_model_at_end=True,
        fp16=config.USE_FP16,
        gradient_accumulation_steps=config.GRADIENT_ACCUMULATION_STEPS,
        report_to="none"
    )


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    print("Training biased model...")
    trainer.train()


    print(f"Saving biased model to {config.BIASED_MODEL_PATH}")
    trainer.save_model(config.BIASED_MODEL_PATH)
    tokenizer.save_pretrained(config.BIASED_MODEL_PATH)
    print(" Biased model training complete.")

    return model, tokenizer


Overwriting src/biased_model_trainer.py


In [None]:
from src.biased_model_trainer import train_biased_model
model, tokenizer = train_biased_model()


Starting BIASED model training process...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded 5000 Jigsaw samples for training/testing.
Training biased model...


Epoch,Training Loss,Validation Loss
1,0.2142,0.18579
2,0.1328,0.200411
3,0.07,0.307903
4,0.0355,0.408735
5,0.0185,0.519166
6,0.0002,0.580282
7,0.0032,0.566209
8,0.0001,0.621616
9,0.0001,0.621887
10,0.0,0.655603


In [17]:
!git add .
!git commit -m "trained baseline (IMDb) and biased (Jigsaw) BERT models for 10 epochs with FP16 optimization"


!git status

[main ba0a7a7] trained baseline (IMDb) and biased (Jigsaw) BERT models for 10 epochs with FP16 optimization
 1 file changed, 1 insertion(+), 1 deletion(-)
On branch main
Your branch is ahead of 'origin/main' by 7 commits.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean


In [12]:
!git add .
!git commit -m "trained baseline (IMDb) and biased (Jigsaw) BERT models for 20 epochs with FP16 optimization"


!git status

[main 6750a17] trained baseline (IMDb) and biased (Jigsaw) BERT models for 20 epochs with FP16 optimization
 4 files changed, 45 insertions(+), 51 deletions(-)
On branch main
Your branch is ahead of 'origin/main' by 8 commits.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean


In [24]:
!git config --global credential.helper store


In [25]:
!git remote set-url origin https://sauravgupta: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@github.com/Amit-adh/AI-Algorithm-Analysis.git


In [26]:
!git push --set-upstream origin saurav-bias-model-training


Enumerating objects: 58, done.
Counting objects: 100% (58/58), done.
Delta compression using up to 2 threads
Compressing objects: 100% (51/51), done.
Writing objects: 100% (58/58), 14.55 KiB | 286.00 KiB/s, done.
Total 58 (delta 30), reused 12 (delta 2), pack-reused 0
remote: Resolving deltas: 100% (30/30), done.[K
remote: 
remote: Create a pull request for 'saurav-bias-model-training' on GitHub by visiting:[K
remote:      https://github.com/Amit-adh/AI-Algorithm-Analysis/pull/new/saurav-bias-model-training[K
remote: 
To https://github.com/Amit-adh/AI-Algorithm-Analysis.git
 * [new branch]      saurav-bias-model-training -> saurav-bias-model-training
Branch 'saurav-bias-model-training' set up to track remote branch 'saurav-bias-model-training' from 'origin'.


In [27]:
!gh pr create --title "Added Baseline & Biased Model Training " \
--body "This PR adds both baseline and biased BERT model training scripts with evaluation, optimization, and dataset sampling improvements." \
--base main --head saurav-bias-model-training


[?25l[K[36m⣾[0m[K[36m⣽[0m[?25h[K
Creating pull request for [0;36msaurav-bias-model-training[0m into [0;36mmain[0m in Amit-adh/AI-Algorithm-Analysis

[Khttps://github.com/Amit-adh/AI-Algorithm-Analysis/pull/2


In [28]:
!git status


On branch saurav-bias-model-training
Your branch is up to date with 'origin/saurav-bias-model-training'.

nothing to commit, working tree clean
