In [None]:
!pip install transformers torch datasets evaluate wandb scikit-learn
!nvidia-smi

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia

In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset archive from
# drive/MyDrive and copy checkpoints / saved models back to /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

ValueError: mount failed

In [None]:
import os
import json
import csv
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer
)
import wandb
import evaluate
from google.colab import drive

In [None]:
!tar -xvf drive/MyDrive/data.tar.gz

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
data/jsons/W7wE83Konv5XhTfB.json
data/jsons/hg0YWgTNSloD2khD.json
data/jsons/bXOvsaEElJwu9EYg.json
data/jsons/weWV7GizOKvY0HHY.json
data/jsons/e5d0Mhy3IdrEKkt8.json
data/jsons/CDqI6H3iLrt7Rf88.json
data/jsons/i1w6uiVjSP38UJxe.json
data/jsons/wOl2jPyZZnbBYBgw.json
data/jsons/GeJsiFblwUoUUHly.json
data/jsons/Yrv2fkJXh06kZs3A.json
data/jsons/zfwQdcn694t65oOD.json
data/jsons/4CKgY3id1T6McuFm.json
data/jsons/t1IhlUT78ENPOzHR.json
data/jsons/vZKgL8uK4BPBE0Qu.json
data/jsons/Q6x66RbxSrTX6fxS.json
data/jsons/bVDgeMVBpIT6ocWJ.json
data/jsons/7OfameeYJOgqxUWl.json
data/jsons/cR3U72dpisV7HQ7L.json
data/jsons/r0KAlQEMesOAlZRH.json
data/jsons/E1RTX0JXvJily8sN.json
data/jsons/d0B5Fh6hNSvY3M4s.json
data/jsons/gE5ApRKeH17HU009.json
data/jsons/deyKPVRPojnt9OkJ.json
data/jsons/253dYIrKBfR6pb48.json
data/jsons/eNFBcbHo1i75ZwuB.json
data/jsons/VKmhRaVpUNmP2SZo.json
data/jsons/fVt6OhXhRbXK6qHJ.json
data/jsons/FKxyeGmnAUH8dgNs.json
data/jsons/

In [None]:

# Notebook-wide configuration; every later cell reads these constants.

# Checkpoint name passed to the tokenizer / model `from_pretrained` calls.
MODEL_NAME = "roberta-base"
# Length passed to the tokenizer as `max_length` (truncation and padding target).
MAX_LENGTH = 512

# Directory holding one JSON file per article.
JSON_DIR = "./data/jsons"
# Sub-directories of ./data/splits to train on, one training run per entry.
SPLIT_TYPES = ["random"]

In [None]:
# Build an in-memory index of the articles: id_to_data maps each article's 'ID'
# field to the full parsed JSON object. Files that cannot be read or parsed, and
# files that lack a required field, are reported and left out of the index.
id_to_data = {}
missing_count = 0   # files that raised while being opened / parsed / indexed
skipped_count = 0   # files that parsed but are not a usable article record

REQUIRED_KEYS = ('ID', 'content_original', 'bias')

# os.listdir returns entries in arbitrary order; sorting makes the run deterministic
# (including which file wins if two files ever share an ID).
for filename in sorted(os.listdir(JSON_DIR)):
    if not filename.endswith(".json"):
        continue
    try:
        # Explicit UTF-8, matching how load_split reads the TSV files.
        with open(os.path.join(JSON_DIR, filename), 'r', encoding='utf-8') as f:
            data = json.load(f)
        # Valid JSON is not necessarily an object; guard before looking up keys.
        if isinstance(data, dict) and all(key in data for key in REQUIRED_KEYS):
            id_to_data[data['ID']] = data
        else:
            print(f"Skipping invalid JSON: {filename}")
            skipped_count += 1
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole load.
        print(f"Error loading {filename}: {e}")
        missing_count += 1

print(f"Successfully loaded {len(id_to_data)} articles")
if missing_count > 0:
    print(f"Warning: Failed to load {missing_count} files")
if skipped_count > 0:
    print(f"Warning: Skipped {skipped_count} files with missing fields")

Successfully loaded 37554 articles


In [None]:

def load_split(split_type, split_name):
    """Load article IDs and labels from ./data/splits/[split_type]/[split_name].tsv.

    The file is read as tab-separated text. Its first line is treated as a header
    and skipped. Every following row with exactly two columns contributes one
    (id, int(label)) pair; blank lines are ignored and any other row is skipped
    and counted in a warning.

    Args:
        split_type: Name of the sub-directory under ./data/splits.
        split_name: File name without the .tsv extension (e.g. 'train').

    Returns:
        A tuple (ids, labels) of two parallel lists: string IDs and integer labels.

    Raises:
        FileNotFoundError: If the TSV file does not exist.
        ValueError: If the second column of a two-column row is not an integer.
    """
    tsv_path = f"./data/splits/{split_type}/{split_name}.tsv"
    print(f"🔄 Looking for split file at: {tsv_path}")  # Debug path

    if not os.path.exists(tsv_path):
        raise FileNotFoundError(
            f" Missing split file! Verify these exist:\n"
            f"1. Directory structure: ./data/splits/{split_type}/\n"
            f"2. File name: {split_name}.tsv\n"
            f"3. File extension: .tsv (not .txt)"
        )

    ids, labels = [], []
    skipped_rows = 0
    # newline='' is what the csv module expects for files it parses itself.
    with open(tsv_path, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f, delimiter='\t')
        # Skip the header; the default keeps a completely empty file from
        # raising StopIteration.
        next(reader, None)
        for row in reader:
            if len(row) == 2:
                ids.append(row[0])
                labels.append(int(row[1]))
            elif row:
                # Non-blank row with an unexpected column count.
                skipped_rows += 1

    if skipped_rows:
        print(f"Warning: skipped {skipped_rows} malformed rows in {tsv_path}")
    print(f"✅ Loaded {len(ids)} samples from {tsv_path}")
    return ids, labels

In [None]:
class BiasDataset(Dataset):
    """Torch dataset that tokenizes one article per item for sequence classification.

    Args:
        ids: Article IDs of the split.
        labels: Integer labels, parallel to `ids`.
        id_to_data: Mapping from article ID to its record; the text is read from
            the record's 'content_original' field.
        tokenizer: Callable tokenizer invoked with truncation, `max_length`,
            `padding='max_length'` and `return_tensors='pt'`.
        max_length: Length every example is truncated / padded to.

    IDs that are absent from `id_to_data` are dropped up front (with a warning), so
    `__getitem__` always returns a complete example; the default batch collation
    cannot cope with `None` items.

    Raises:
        ValueError: If `ids` and `labels` differ in length.
    """

    def __init__(self, ids, labels, id_to_data, tokenizer, max_length):
        if len(ids) != len(labels):
            raise ValueError(
                f"ids and labels must have the same length ({len(ids)} != {len(labels)})"
            )

        # Keep only the (id, label) pairs whose article is actually available.
        available = [(id_, label) for id_, label in zip(ids, labels) if id_ in id_to_data]
        missing = len(ids) - len(available)
        if missing:
            print(f"Warning: {missing} IDs not found in data; dropping them from the dataset")

        self.ids = [id_ for id_, _ in available]
        self.labels = [label for _, label in available]
        self.id_to_data = id_to_data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples whose article is present in `id_to_data`."""
        return len(self.ids)

    def __getitem__(self, idx):
        """Return the `input_ids`, `attention_mask` and `labels` tensors for item `idx`."""
        article = self.id_to_data[self.ids[idx]]
        encoding = self.tokenizer(
            article['content_original'],
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )
        return {
            # The tokenizer output carries a leading batch axis; flatten removes it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [None]:
# Load the accuracy and F1 metrics once; compute_metrics reuses them on every call.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into accuracy, weighted F1 and macro F1.

    Predictions are the argmax over the last axis of the logits. If anything goes
    wrong while computing the metrics, the error is printed and an empty dict is
    returned instead.
    """
    try:
        scores, references = eval_pred
        predicted = np.argmax(scores, axis=-1)
        shared = {"predictions": predicted, "references": references}

        accuracy = accuracy_metric.compute(**shared)["accuracy"]
        weighted_f1 = f1_metric.compute(average="weighted", **shared)["f1"]
        macro_f1 = f1_metric.compute(average="macro", **shared)["f1"]
    except Exception as metric_error:
        print(f"Error computing metrics: {metric_error}")
        return {}

    return {
        "accuracy": accuracy,
        "f1_weighted": weighted_f1,
        "f1_macro": macro_f1,
    }

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

In [None]:
# Log in to Weights & Biases before any run is created. A failed login is
# reported and then re-raised, so the notebook stops here instead of training
# without tracking.
try:
    wandb.login()
except Exception as login_error:
    print(f"WandB login failed: {login_error}")
    raise

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33manshsingh[0m ([33manshsingh-srm-institute-of-science-and-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Fine-tune and evaluate one model per entry of SPLIT_TYPES.
# For each split: load the train/valid/test TSVs, build the tokenizer, model and
# datasets, train with per-epoch evaluation, save the model, then report test metrics.
for split_type in SPLIT_TYPES:
    print(f"\n{'='*40}\nTraining on {split_type} split\n{'='*40}")

    try:
        # Load splits
        train_ids, train_labels = load_split(split_type, 'train')
        val_ids, val_labels = load_split(split_type, 'valid')
        test_ids, test_labels = load_split(split_type, 'test')

        # Use the GPU when one is present; fall back to CPU instead of crashing.
        use_cuda = torch.cuda.is_available()
        device = 'cuda' if use_cuda else 'cpu'

        # Initialize model components
        id2label = {0: "left", 1: "center", 2: "right"}
        tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)
        model = RobertaForSequenceClassification.from_pretrained(
            MODEL_NAME,
            num_labels=len(id2label),
            id2label=id2label,
            # Store the inverse mapping too, so the saved config maps names back to ids.
            label2id={name: idx for idx, name in id2label.items()},
        ).to(device)

        # Create datasets
        train_dataset = BiasDataset(train_ids, train_labels, id_to_data, tokenizer, MAX_LENGTH)
        val_dataset = BiasDataset(val_ids, val_labels, id_to_data, tokenizer, MAX_LENGTH)
        test_dataset = BiasDataset(test_ids, test_labels, id_to_data, tokenizer, MAX_LENGTH)

        # Training setup
        training_args = TrainingArguments(
            # One output directory per split (the original string was missing its
            # f-prefix, which made the directory literally "results/{split_type}").
            output_dir=os.path.join("results", split_type),
            # NOTE(review): recent transformers releases name this `eval_strategy`;
            # confirm against the installed version before renaming.
            evaluation_strategy='epoch',
            save_strategy='epoch',
            learning_rate=2e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=4,
            weight_decay=0.01,
            fp16=use_cuda,                             # mixed precision only makes sense on a GPU
            load_best_model_at_end=True,
            metric_for_best_model='f1_macro',
            report_to="wandb",
            logging_steps=50,
            push_to_hub=False,
            warmup_steps=100,                          # Gradually ramp up LR(helps a bit with accuracy later)
            lr_scheduler_type="linear",
        )

        # Init trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
        )

        # WandB: one named run per split
        wandb.init(
            project="political-bias-detection",
            name=f"{MODEL_NAME}-{split_type}",
            config=training_args.to_dict()
        )

        # Training
        trainer.train()
        trainer.save_model()

        # Final evaluation
        test_results = trainer.evaluate(test_dataset)
        print(f"\nTest results ({split_type}):")
        print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
        print(f"Weighted F1: {test_results['eval_f1_weighted']:.4f}")
        print(f"Macro F1: {test_results['eval_f1_macro']:.4f}")

    except Exception as e:
        print(f"Error during {split_type} training: {e}")
        # Mark the W&B run as failed so it is not left open.
        wandb.finish(exit_code=1)
        raise
    else:
        # Close this split's run so the next split starts a fresh one.
        wandb.finish()





Training on random split
🔄 Looking for split file at: ./data/splits/random/train.tsv
✅ Loaded 27978 samples from ./data/splits/random/train.tsv
🔄 Looking for split file at: ./data/splits/random/valid.tsv
✅ Loaded 6996 samples from ./data/splits/random/valid.tsv
🔄 Looking for split file at: ./data/splits/random/test.tsv
✅ Loaded 1300 samples from ./data/splits/random/test.tsv


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1,0.2437,0.361264,0.919383,0.919431,0.919938
2,0.1878,0.263466,0.943825,0.943839,0.944013
3,0.2204,0.331606,0.937393,0.937409,0.937714
4,0.0381,0.336075,0.944826,0.944828,0.944883



Test results (random):
Accuracy: 0.9246
Weighted F1: 0.9246
Macro F1: 0.9274


In [None]:
!tar -cvf large.tar.gz results/random/checkpoint-13992

results/random/checkpoint-13992/
results/random/checkpoint-13992/scheduler.pt
results/random/checkpoint-13992/config.json
results/random/checkpoint-13992/optimizer.pt
results/random/checkpoint-13992/scaler.pt
results/random/checkpoint-13992/trainer_state.json
results/random/checkpoint-13992/training_args.bin
results/random/checkpoint-13992/model.safetensors
results/random/checkpoint-13992/rng_state.pth


In [None]:
# Copy the checkpoint archive into /content/drive/MyDrive.
!cp large.tar.gz /content/drive/MyDrive/

In [None]:

import os

# Write the model and its tokenizer into one folder on Google Drive.
# `split_type`, `model` and `tokenizer` are the values left over from the last
# iteration of the training loop.
save_dir = f"/content/drive/MyDrive/saved_models/{split_type}_model"
os.makedirs(save_dir, exist_ok=True)

for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

print(f"Model saved to: {save_dir}")

import wandb

# Optional: fetch the best model file from a W&B sweep.
# SWEEP_PATH must be filled in before this does anything; an empty path cannot be
# resolved by the W&B API, so the lookup is skipped instead of failing.
# NOTE(review): presumably of the form "<entity>/<project>/<sweep_id>" - confirm.
SWEEP_PATH = ""

# NOTE(review): nothing in this notebook logs a "val_acc" summary key or uploads
# "model.h5"; confirm both against the sweep this is meant to read.
BEST_MODEL_FILE = "model.h5"

if not SWEEP_PATH:
    print("No W&B sweep path configured; skipping best-run download")
else:
    api = wandb.Api()
    sweep = api.sweep(SWEEP_PATH)
    # Rank runs by validation accuracy, best first; runs without the key sort as 0.
    runs = sorted(sweep.runs,
      key=lambda run: run.summary.get("val_acc", 0), reverse=True)
    if not runs:
        print(f"Sweep {SWEEP_PATH} has no runs; nothing to download")
    else:
        val_acc = runs[0].summary.get("val_acc", 0)
        print(f"Best run {runs[0].name} with {val_acc}% validation accuracy")

        runs[0].file(BEST_MODEL_FILE).download(replace=True)
        # Report the file that was actually downloaded.
        print(f"Best model saved to {BEST_MODEL_FILE}")