In [1]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, TrainingArguments, Trainer
import sklearn.metrics as skm
from sklearn.model_selection import train_test_split
import torch

import os
os.environ["WANDB_DISABLED"] = "true"

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [None]:


file = "final_data.csv"

df = pd.read_csv(file)

print(df['bias'].value_counts())

df


bias
Left      2518
Center    2210
Right     2179
Name: count, dtype: int64


Unnamed: 0,id,bias,article
0,0,Left,Jammu:The National Highways Authority of India...
1,1,Right,"Jammu, August 30: At least three people have l..."
2,2,Right,At least three people were killed after a clou...
3,3,Right,A cloudburst in Jammu and Kashmir's Ramban reg...
4,4,Right,Eleven people lost their lives in two separate...
...,...,...,...
6902,6902,Center,"ROANOKE, Va. — No injuries were reported after..."
6903,6903,Center,ROANOKE — No injuries were reported after a co...
6904,6904,Center,A United flight from Dulles faced a close call...
6905,6905,Center,Airport officials say multiple flights Thursda...


In [None]:
model = "roberta-base"
output = "roberta_articles_classifier"
seed = 42
bias_label = {"Left":0, "Center":1, "Right":2}


df = df.sample(frac=1, random_state=seed).reset_index(drop=True)  # shuffle


# change bias to numbers
df['bias_lb'] = df['bias'].map(bias_label)

# split data as: 70% train, 30%-[60% val, 40% test]
train_df, temp_df = train_test_split(df, test_size=0.30, stratify=df['bias_lb'], random_state=seed)
val_df, test_df = train_test_split(temp_df, test_size=0.40, stratify=temp_df['bias_lb'], random_state=seed)


df.head(10)


Unnamed: 0,id,bias,article,bias_lb
0,2464,Left,Israelwon’t accept the presence ofTurkisharmed...,0
1,994,Left,Vice Chief of the Air Staff Air Marshal Narmde...,0
2,3910,Left,Germany plans to begin a scheme to slash energ...,0
3,5709,Right,Sen. Ted Cruz points to research from healthca...,2
4,5480,Left,FILE PHOTO: Rosneft's Russian-flagged crude oi...,0
5,1702,Left,The lapse in federal funding has left countles...,0
6,1718,Center,Tens of thousands of protesters braved monsoon...,1
7,6656,Left,Congress leader Rahul Gandhi on Sunday launche...,0
8,4466,Center,Sugarloaf TGIF Management has swooped to buy t...,1
9,1921,Center,The Supreme Court on Monday declined to overtu...,1


In [None]:

def df_to_dataset(d):
    ds = Dataset.from_pandas(d[['id','article','bias_lb']].rename(columns={'article': 'text', 'bias_id': id}))

    # df.rename(columns={'A': 'New_A', 'B': 'New_B', 'C': 'New_C'}, inplace=True)
    return ds

train_ds = df_to_dataset(train_df)
val_ds = df_to_dataset(val_df)
test_ds = df_to_dataset(test_df)

print(val_df.head())

# Tokenizer & tokenization
tokenizer = RobertaTokenizerFast.from_pretrained(model)
def tokenize_fn(batch):
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=512)


train_ds = train_ds.map(tokenize_fn, batched=True, remove_columns=['id','text'])
val_ds = val_ds.map(tokenize_fn, batched=True, remove_columns=['id','text'])
test_ds = test_ds.map(tokenize_fn, batched=True, remove_columns=['id','text'])

train_ds.set_format(type='torch', columns=['input_ids','attention_mask','bias_lb'])
val_ds.set_format(type='torch', columns=['input_ids','attention_mask','bias_lb'])
test_ds.set_format(type='torch', columns=['input_ids','attention_mask','bias_lb'])

# Model
model = RobertaForSequenceClassification.from_pretrained(model, num_labels=3)

# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = (preds == labels).mean()
    macro_f1 = skm.f1_score(labels, preds, average='macro', zero_division=0)
    f1s = skm.f1_score(labels, preds, average=None, zero_division=0)
    cm = skm.confusion_matrix(labels, preds)
    return {
        "accuracy": float(acc),
        "macro_f1": float(macro_f1),
        "f1_left": float(f1s[0]),
        "f1_center": float(f1s[1]),
        "f1_right": float(f1s[2]),
        "confusion_matrix": cm.tolist()
    }





KeyboardInterrupt: 

In [None]:
# Training args
training_args = TrainingArguments(
    output_dir=output,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    logging_dir=f"{output}/logs",
    seed=seed,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics
)

# Train
trainer.train()
print("Evaluating on test set...")
res = trainer.evaluate(eval_dataset=test_ds)
print(res)
# Save
trainer.save_model(output)
tokenizer.save_pretrained(output)

In [None]:
# plot_confusion.py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import json

# Load saved predictions or use the confusion matrix printed by Trainer as list
res = json.load(open("roberta_articles_classifier/checkpoint-last/eval_results.json", "r")) if False else None
# Alternatively, load confusion from last Trainer output if saved; here we assume you printed it manually.
# For convenience, you can run predict via Trainer.predict() and get labels/preds and then use the following:

# Example usage with numpy arrays preds, labels:
# preds = ...
# labels = ...
# cm = confusion_matrix(labels, preds)
# classes = ["Left","Center","Right"]

# Below is placeholder; replace with real preds/labels
# cm = np.array([[30,5,2],[6,40,4],[3,5,25]])
# classes = ["Left","Center","Right"]

# Plot function:
def plot_cm(cm, classes, normalize=False, title="Confusion Matrix"):
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
    disp.plot(cmap=plt.cm.Blues)
    plt.title(title)
    plt.show()
