In [1]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, TrainingArguments, Trainer
import sklearn.metrics as skm
from sklearn.model_selection import train_test_split
import torch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import json


import os
os.environ["WANDB_DISABLED"] = "true"

#2nd train :
from sklearn.model_selection import StratifiedKFold

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
file = "final_data.csv"

df = pd.read_csv(file, encoding='utf-8')

print(df['bias'].value_counts())

df

bias
Left      2518
Center    2210
Right     2179
Name: count, dtype: int64


Unnamed: 0,id,bias,article
0,0,Left,Jammu:The National Highways Authority of India...
1,1,Right,"Jammu, August 30: At least three people have l..."
2,2,Right,At least three people were killed after a clou...
3,3,Right,A cloudburst in Jammu and Kashmir's Ramban reg...
4,4,Right,Eleven people lost their lives in two separate...
...,...,...,...
6902,6902,Center,"ROANOKE, Va. — No injuries were reported after..."
6903,6903,Center,ROANOKE — No injuries were reported after a co...
6904,6904,Center,A United flight from Dulles faced a close call...
6905,6905,Center,Airport officials say multiple flights Thursda...


In [3]:
model_name = "roberta-large"
output = "roberta_articles_classifier"
seed = 42
bias_label = {"Left":0, "Center":1, "Right":2}


df = df.sample(frac=1, random_state=seed).reset_index(drop=True)  # shuffle


# change bias to numbers
df['bias_lb'] = df['bias'].map(bias_label)

# split data as: 70% train, 30%-[60% val, 40% test]
train_df, temp_df = train_test_split(df, test_size=0.30, stratify=df['bias_lb'], random_state=seed)
val_df, test_df = train_test_split(temp_df, test_size=0.40, stratify=temp_df['bias_lb'], random_state=seed)


df.head(10)

Unnamed: 0,id,bias,article,bias_lb
0,2464,Left,Israelwon’t accept the presence ofTurkisharmed...,0
1,994,Left,Vice Chief of the Air Staff Air Marshal Narmde...,0
2,3910,Left,Germany plans to begin a scheme to slash energ...,0
3,5709,Right,Sen. Ted Cruz points to research from healthca...,2
4,5480,Left,FILE PHOTO: Rosneft's Russian-flagged crude oi...,0
5,1702,Left,The lapse in federal funding has left countles...,0
6,1718,Center,Tens of thousands of protesters braved monsoon...,1
7,6656,Left,Congress leader Rahul Gandhi on Sunday launche...,0
8,4466,Center,Sugarloaf TGIF Management has swooped to buy t...,1
9,1921,Center,The Supreme Court on Monday declined to overtu...,1


In [4]:
def df_to_dataset(d):
    ds = Dataset.from_pandas(d[['id','article','bias_lb']].rename(columns={'article': 'text', 'bias_lb': 'labels'}))
    return ds

train_ds = df_to_dataset(train_df)
val_ds = df_to_dataset(val_df)
test_ds = df_to_dataset(test_df)

print(val_df.head())

# Tokenizer & tokenization
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
def tokenize_fn(batch):
    return tokenizer(batch['text'], truncation=True, padding='max_length', max_length=512)


train_ds = train_ds.map(tokenize_fn, batched=True, remove_columns=['id','text'])
val_ds = val_ds.map(tokenize_fn, batched=True, remove_columns=['id','text'])
test_ds = test_ds.map(tokenize_fn, batched=True, remove_columns=['id','text'])

train_ds.set_format(type='torch', columns=['input_ids','attention_mask','labels'])
val_ds.set_format(type='torch', columns=['input_ids','attention_mask','labels'])
test_ds.set_format(type='torch', columns=['input_ids','attention_mask','labels'])

# Model
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = (preds == labels).mean()
    macro_f1 = skm.f1_score(labels, preds, average='macro', zero_division=0)
    f1s = skm.f1_score(labels, preds, average=None, zero_division=0)
    cm = skm.confusion_matrix(labels, preds)
    return {
        "accuracy": float(acc),
        "macro_f1": float(macro_f1),
        "f1_left": float(f1s[0]),
        "f1_center": float(f1s[1]),
        "f1_right": float(f1s[2]),
        "confusion_matrix": cm.tolist()
    }

        id    bias                                            article  bias_lb
4623  4677  Center  Ontario’s Auditor General Shelley Spence took ...        1
6239  2938   Right  Thai Prime Minister Anutin Charnvirakul unveil...        2
5499  1301  Center  Two islands were affected by flash floods, inc...        1
5204  2604   Right  VENICE, Italy, Aug. 27 (Xinhua) -- The 82nd Ve...        2
5608  5670   Right  Expelled RJD leader Tej Pratap Yadav hit out a...        2


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████| 4834/4834 [00:02<00:00, 1873.69 examples/s]
Map: 100%|██████████| 1243/1243 [00:00<00:00, 2305.83 examples/s]
Map: 100%|██████████| 830/830 [00:00<00:00, 1340.89 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Training args
# training_args = TrainingArguments(
#     output_dir=output,
#     num_train_epochs=3,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=32,
#     eval_strategy="epoch",
#     save_strategy="epoch",
#     learning_rate=2e-5,
#     weight_decay=0.01,
#     load_best_model_at_end=True,
#     metric_for_best_model="macro_f1",
#     greater_is_better=True,
#     fp16=torch.cuda.is_available(),
#     logging_dir=f"{output}/logs",
#     seed=seed,
# )

#large
training_args = TrainingArguments(
    output_dir=output,
    num_train_epochs=3,
    per_device_train_batch_size=4,       # smaller than base (try 2-4)
    gradient_accumulation_steps=8,       # accumulate to get effective batch
    per_device_eval_batch_size=16,
    learning_rate=1e-5,                  # often lower for large; try 1e-5 or 5e-6
    weight_decay=0.01,
    fp16=True,                           # mixed precision (if GPU supports)
    gradient_checkpointing=True,         # if model supports it, saves memory at cost of compute
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    logging_steps=200,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
pp
K = 5                 # number of folds (change if you want)
OUT_ROOT = "kfold_out"  # folder to store per-fold models/metrics
os.makedirs(OUT_ROOT, exist_ok=True)

skf = StratifiedKFold(n_splits=K, shuffle=True, random_state=seed)
X = df['article'].values # Changed from 'text' to 'article'
y = df['bias_lb'].values # Changed from 'labels' to 'bias_lb'

fold_results = []
fold = 0

for train_idx, val_idx in skf.split(X, y):
    fold += 1
    print(f"\n===== Fold {fold}/{K} ===шки==")

    # build HF Datasets for this fold
    # Rename 'article' to 'text' and 'bias_lb' to 'labels' for the fold-specific dataframes
    train_df_fold = df.iloc[train_idx].reset_index(drop=True).rename(columns={'article': 'text', 'bias_lb': 'labels'})
    val_df_fold   = df.iloc[val_idx].reset_index(drop=True).rename(columns={'article': 'text', 'bias_lb': 'labels'})

    train_ds = Dataset.from_pandas(train_df_fold[['id','text','labels']])
    val_ds   = Dataset.from_pandas(val_df_fold[['id','text','labels']])

    # tokenize (your tokenize_fn should accept {'text': [...]})
    train_ds = train_ds.map(tokenize_fn, batched=True, remove_columns=['id','text'])
    val_ds   = val_ds.map(tokenize_fn, batched=True, remove_columns=['id','text'])

    # set format to torch (you indicated you used this)
    train_ds.set_format(type='torch', columns=['input_ids','attention_mask','labels'])
    val_ds.set_format(type='torch', columns=['input_ids','attention_mask','labels'])

    # fresh model for each fold
    model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)

    # trainer: reuse your training_args and compute_metrics
    trainer = Trainer(
        model=model,
        args=training_args,        # your pre-configured TrainingArguments
        train_dataset=train_ds,
        eval_dataset=val_ds,
        compute_metrics=compute_metrics
    )

    # train & evaluate
    trainer.train()
    metrics = trainer.evaluate(eval_dataset=val_ds)

    # optional: evaluate on your test set if you have `test_ds` defined (merged elsewhere)
    # Uncomment if you maintain a held-out test set variable `test_ds`
    # test_metrics = trainer.evaluate(eval_dataset=test_ds)
    # print("Test metrics:", test_metrics)

    # Save fold artifacts (best model already loaded if load_best_model_at_end=True)
    fold_dir = os.path.join(OUT_ROOT, f"fold_{fold}")
    trainer.save_model(fold_dir)

    # collect simple fold summary
    fold_summary = {
        "fold": fold,
        "val_metrics": metrics,
        # "test_metrics": test_metrics if 'test_metrics' in locals() else None,
        "model_dir": fold_dir
    }
    fold_results.append(fold_summary)

    # cleanup GPU memory before next fold
    del model
    del trainer
    torch.cuda.empty_cache()

# After loop: print summary
for r in fold_results:
    m = r['val_metrics']
    print(f"Fold {r['fold']}: val_loss={m.get('eval_loss'):.4f} val_acc={m.get('eval_accuracy'):.4f} val_macro_f1={m.get('eval_macro_f1'):.4f} saved:{r['model_dir']}")

# (Optional) Save summaries to CSV/JSON
import json
with open(os.path.join(OUT_ROOT, "kfold_summary.json"), "w") as f:
    json.dump(fold_results, f, indent=2)

In [6]:
'''
1st Train:
      [1815/1815 08:02, Epoch 3/3]
      Epoch
      Training Loss
      Validation Loss
      Accuracy
      Macro F1
      F1 Left
      F1 Center
      F1 Right
      Confusion Matrix

      1
      1.032500
      0.993994
      0.553500
      0.552076
      0.510363
      0.572906
      0.572959
      [[197, 188, 68], [48, 277, 73], [74, 104, 214]]


      2
      0.873900
      0.834432
      0.621078
      0.620729
      0.620513
      0.588095
      0.653580
      [[242, 124, 87], [47, 247, 104], [38, 71, 283]]


      3
      0.752300
      0.821503
      0.647627
      0.647352
      0.660173
      0.627879
      0.654003
      [[305, 102, 46], [81, 259, 58], [85, 66, 241]]

Trainer is attempting to log a value of "[[197, 188, 68], [48, 277, 73], [74, 104, 214]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[242, 124, 87], [47, 247, 104], [38, 71, 283]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[[305, 102, 46], [81, 259, 58], [85, 66, 241]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Evaluating on test set...

      [26/26 00:05]

    Trainer is attempting to log a value of "[[194, 78, 31], [60, 177, 28], [52, 40, 170]]" of type <class 'list'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
{'eval_loss': 0.7922804355621338, 'eval_accuracy': 0.6518072289156627, 'eval_macro_f1': 0.6539057440051189, 'eval_f1_left': 0.6371100164203612, 'eval_f1_center': 0.6321428571428571, 'eval_f1_right': 0.6924643584521385, 'eval_confusion_matrix': [[194, 78, 31], [60, 177, 28], [52, 40, 170]], 'eval_runtime': 5.574, 'eval_samples_per_second': 148.906, 'eval_steps_per_second': 4.665, 'epoch': 3.0}
('roberta_articles_classifier/tokenizer_config.json',
 'roberta_articles_classifier/special_tokens_map.json',
 'roberta_articles_classifier/vocab.json',
 'roberta_articles_classifier/merges.txt',
 'roberta_articles_classifier/added_tokens.json',
 'roberta_articles_classifier/tokenizer.json')
'''

# Train
trainer.train()
print("Evaluating on test set...")
res = trainer.evaluate(eval_dataset=test_ds)
print(res)
# Save
trainer.save_model(output)
tokenizer.save_pretrained(output)



KeyboardInterrupt: 

In [10]:
!zip -r roberta_large_out.zip roberta_articles_classifier
from google.colab import files
files.download('roberta_large_out.zip')


  adding: roberta_articles_classifier/ (stored 0%)
  adding: roberta_articles_classifier/runs/ (stored 0%)
  adding: roberta_articles_classifier/runs/Nov16_14-55-31_b4abd4bb6837/ (stored 0%)
  adding: roberta_articles_classifier/runs/Nov16_14-55-31_b4abd4bb6837/events.out.tfevents.1763304937.b4abd4bb6837.1526.0 (deflated 59%)
  adding: roberta_articles_classifier/runs/Nov16_14-55-31_b4abd4bb6837/events.out.tfevents.1763307064.b4abd4bb6837.1526.1 (deflated 34%)
  adding: roberta_articles_classifier/merges.txt (deflated 53%)
  adding: roberta_articles_classifier/tokenizer.json (deflated 82%)
  adding: roberta_articles_classifier/tokenizer_config.json (deflated 75%)
  adding: roberta_articles_classifier/special_tokens_map.json (deflated 52%)
  adding: roberta_articles_classifier/config.json (deflated 52%)
  adding: roberta_articles_classifier/vocab.json (deflated 59%)
  adding: roberta_articles_classifier/training_args.bin (deflated 53%)
  adding: roberta_articles_classifier/checkpoint-30

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# plot_confusion.py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import json

# Load saved predictions or use the confusion matrix printed by Trainer as list
res = json.load(open("roberta_articles_classifier/checkpoint-last/eval_results.json", "r")) if False else None
# Alternatively, load confusion from last Trainer output if saved; here we assume you printed it manually.
# For convenience, you can run predict via Trainer.predict() and get labels/preds and then use the following:

# Example usage with numpy arrays preds, labels:
# preds = ...
# labels = ...
# cm = confusion_matrix(labels, preds)
# classes = ["Left","Center","Right"]

# Below is placeholder; replace with real preds/labels
# cm = np.array([[30,5,2],[6,40,4],[3,5,25]])
# classes = ["Left","Center","Right"]

# Plot function:
def plot_cm(cm, classes, normalize=False, title="Confusion Matrix"):
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
    disp.plot(cmap=plt.cm.Blues)
    plt.title(title)
    plt.show()
