# First tests with RoBERTa on AG News

In [None]:
import os

# Normalise the working directory to the repository root ("bai-thesis-nlp")
# so that the relative paths used by the following cells resolve.
cwd_name = os.path.basename(os.getcwd())
if cwd_name == "src":
    # Started inside src/: the root is one level up.
    os.chdir("..")
elif cwd_name != "bai-thesis-nlp":
    # Neither src/ nor the root itself: go two levels up.
    os.chdir("../..")
    
import pandas as pd
from src._utils._helpers import get_generated_examples_df
from src._utils._run_multiclassRoBERTA import main_multiclassRoBERTA
from sklearn.model_selection import train_test_split 

In [None]:
# Real data: rename columns "2"/"3" to "text"/"label" and discard columns "0"/"1".
real_train_df = (
    pd.read_csv("real_data/train/agnewstrainAll.csv")
    .rename(columns={"2": "text", "3": "label"})
    .drop(columns=["0", "1"])
)

# Hold out 500 rows as the dev set, stratified by label (fixed seed 42);
# the remaining rows stay in real_train_df.
real_train_df, dev_df = train_test_split(
    real_train_df,
    test_size=500,
    random_state=42,
    stratify=real_train_df["label"],
)

# Synthetic data: the helper returns a pair, of which only the first element
# (the DataFrame) is kept.
syn_generic_df, _ = get_generated_examples_df(
    "synthetic_data/datasets/syn_agnews_baseline_500.json"
)
targeted_raw_df, _ = get_generated_examples_df(
    "synthetic_data/datasets/syn_agnews_targeted+tags_500.json"
)
# The targeted set carries an extra "phenomena" column that is removed here.
syn_targeted_df = targeted_raw_df.drop(columns=["phenomena"])

## 1. 500 Real

In [None]:
# Experiment 1: real data only (no synthetic frame, synth_ratio=0.0),
# with max_samples=500.
# NOTE: this cell also (re)initialises `results`; re-running it discards any
# rows already collected by the later experiment cells.
results = []
train_details = main_multiclassRoBERTA(
    real_df=real_train_df,
    synth_df=None,
    dev_df=dev_df,
    synth_ratio=0.0,
    max_samples=500,
    output_dir="src/agnews/experiments/exp_500real",
    log_dir="src/agnews/experiments/RoBERTA_log.json",
    generation_method=None,
)
display(train_details)

# Copy the dev metrics before tagging them: plain assignment would alias the
# nested dict and write the "method" key into train_details["metrics_dev"].
res = dict(train_details["metrics_dev"])
res["method"] = "500real"
results.append(res)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,F1 Macro
1,1.1811,0.749316,0.8,0.8,0.606807
2,0.4408,0.336836,0.896,0.896,0.824244
3,0.1758,0.301661,0.92,0.92,0.875554
4,0.0796,0.382388,0.906,0.906,0.859531
5,0.0299,0.447044,0.904,0.904,0.858877
6,0.0174,0.449899,0.916,0.916,0.867496
7,0.0186,0.466036,0.908,0.908,0.863223
8,0.0138,0.461195,0.91,0.91,0.86563


💾 experiment saved to: src/agnews/experiments/exp_real_only
{'accuracy': '0.9200', 'precision': '0.9175', 'recall': '0.9200', 'f1': '0.9158'}


{'experiment_name': 'exp_real_only',
 'experiment_dir': 'src/agnews/experiments/exp_real_only',
 'generation_method': None,
 'timestamp': '2025-03-16T00:48:45.082964',
 'model': 'roberta-base',
 'train_size': 500,
 'dev_size': 500,
 'synthetic_ratio': 0.0,
 'train_time_seconds': 102.648193359375,
 'metrics_dev': {'accuracy': 0.92,
  'precision': 0.9175385836773579,
  'recall': 0.92,
  'f1': 0.9158177814691767}}

## 2. 250 Real + 250 Generic Augmentation

In [None]:
# Experiment 2: real data mixed with the generic synthetic set at
# synth_ratio=0.5, with max_samples=500.
train_details = main_multiclassRoBERTA(
    real_df=real_train_df,
    synth_df=syn_generic_df, # Generic Augmentation
    dev_df=dev_df,
    synth_ratio=0.5, # 50% of the data is synthetic
    max_samples=500,
    output_dir="src/agnews/experiments/exp_250real_250generic",
    log_dir="src/agnews/experiments/RoBERTA_log.json",
    generation_method="generic augmentation",
)
display(train_details)

method_name = "250real_250generic"
# Copy the dev metrics before tagging them: plain assignment would alias the
# nested dict and write the "method" key into train_details["metrics_dev"].
res = dict(train_details["metrics_dev"])
res["method"] = method_name
# Drop any row left by an earlier run of this cell so that re-running it does
# not add a duplicate entry to the results table.
results = [row for row in results if row.get("method") != method_name]
results.append(res)

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,F1 Macro
1,1.2318,0.719169,0.814,0.814,0.64489
2,0.536,0.359767,0.884,0.884,0.826884
3,0.352,0.391397,0.866,0.866,0.810246
4,0.2265,0.357997,0.898,0.898,0.844499
5,0.1623,0.36579,0.904,0.904,0.862773
6,0.1199,0.401643,0.904,0.904,0.858529
7,0.1057,0.40126,0.9,0.9,0.853468
8,0.0819,0.410169,0.9,0.9,0.85141


💾 experiment saved to: src/agnews/experiments/exp_random_aug_50
{'accuracy': '0.9040', 'precision': '0.9045', 'recall': '0.9040', 'f1': '0.9042'}


{'experiment_name': 'exp_random_aug_50',
 'experiment_dir': 'src/agnews/experiments/exp_random_aug_50',
 'generation_method': 'generic augmentation',
 'timestamp': '2025-03-16T00:50:34.463362',
 'model': 'roberta-base',
 'train_size': 500,
 'dev_size': 500,
 'synthetic_ratio': 0.5,
 'train_time_seconds': 101.09387874603271,
 'metrics_dev': {'accuracy': 0.904,
  'precision': 0.9044560352560352,
  'recall': 0.904,
  'f1': 0.9042082137853499}}

## 3. 250 Real + 250 Targeted Augmentation

In [None]:
# Experiment 3: real data mixed with the targeted synthetic set at
# synth_ratio=0.5, with max_samples=500.
train_details = main_multiclassRoBERTA(
    real_df=real_train_df,
    synth_df=syn_targeted_df, # Targeted Augmentation
    dev_df=dev_df,
    synth_ratio=0.5, # 50% of the data is synthetic
    max_samples=500,
    output_dir="src/agnews/experiments/exp_250real_250targeted",
    log_dir="src/agnews/experiments/RoBERTA_log.json",
    generation_method="targeted augmentation",
)
display(train_details)

method_name = "250real_250targeted"
# Copy the dev metrics before tagging them: plain assignment would alias the
# nested dict and write the "method" key into train_details["metrics_dev"].
res = dict(train_details["metrics_dev"])
res["method"] = method_name
# Drop any row left by an earlier run of this cell so that re-running it does
# not add a duplicate entry to the results table.
results = [row for row in results if row.get("method") != method_name]
results.append(res)

## 4. 500 Generic Augmentation

In [None]:
# Experiment 4: generic synthetic set only (no real frame, synth_ratio=1.0),
# with max_samples=500.
train_details = main_multiclassRoBERTA(
    real_df=None,
    synth_df=syn_generic_df, # Generic Augmentation
    dev_df=dev_df,
    synth_ratio=1.0, # 100% of the data is synthetic
    max_samples=500,
    output_dir="src/agnews/experiments/exp_500generic",
    log_dir="src/agnews/experiments/RoBERTA_log.json",
    generation_method="generic augmentation",
)
display(train_details)

method_name = "500generic"
# Copy the dev metrics before tagging them: plain assignment would alias the
# nested dict and write the "method" key into train_details["metrics_dev"].
res = dict(train_details["metrics_dev"])
res["method"] = method_name
# Drop any row left by an earlier run of this cell so that re-running it does
# not add a duplicate entry to the results table.
results = [row for row in results if row.get("method") != method_name]
results.append(res)

## 5. 500 Targeted Augmentation

In [None]:
# Experiment 5: targeted synthetic set only (no real frame, synth_ratio=1.0),
# with max_samples=500.
train_details = main_multiclassRoBERTA(
    real_df=None,
    synth_df=syn_targeted_df, # Targeted Augmentation
    dev_df=dev_df,
    synth_ratio=1.0, # 100% of the data is synthetic
    max_samples=500,
    output_dir="src/agnews/experiments/exp_500targeted",
    log_dir="src/agnews/experiments/RoBERTA_log.json",
    generation_method="targeted augmentation",
)
display(train_details)

method_name = "500targeted"
# Copy the dev metrics before tagging them: plain assignment would alias the
# nested dict and write the "method" key into train_details["metrics_dev"].
res = dict(train_details["metrics_dev"])
res["method"] = method_name
# Drop any row left by an earlier run of this cell so that re-running it does
# not add a duplicate entry to the results table.
results = [row for row in results if row.get("method") != method_name]
results.append(res)

## Results

In [None]:
# Collect the per-experiment metric dicts into one table, one row per entry
# in `results`.
df_results = pd.DataFrame(results)

# Put the "method" label in the first column; the remaining columns keep
# their existing relative order.
metric_cols = [name for name in df_results.columns if name != "method"]
df_results = df_results[["method", *metric_cols]]

# Write the table to disk (without the index) and show it in the notebook.
df_results.to_csv("src/agnews/experiments/results_RoBERTA.csv", index=False)
display(df_results)