In [7]:
from echr import *
from classifier import *
import pandas as pd
import matplotlib.pyplot as plt
from functools import partial
from datasets import DatasetDict
from scipy.special import expit as sigmoid  
import json
import time  
from transformers import EarlyStoppingCallback
from sklearn.model_selection import train_test_split

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load data

In [8]:
all_datasets = {
    'test': {
        'chamber': pd.read_csv('datasets/test_chamber.csv'),
        'grand_chamber': pd.read_csv('datasets/test_grand_chamber.csv'),
    },
    'train': {
        'chamber': {idx: pd.read_csv('datasets/train_chamber_'+str(idx)+'.csv') for idx in range(0,7)},    
        'grand_chamber': {idx: pd.read_csv('datasets/train_grand_chamber_'+str(idx)+'.csv') for idx in range(0,7)},    
    }
}

In [9]:
all_datasets['train']['grand_chamber'][0].head(5)

Unnamed: 0,unique_id,id,body,text,year,violation,sample_weight,judgment_info
0,147,001-58593,Grand Chamber,7.The applicant was born in August 1982. On 12...,1999,1,1.0,"FOR THESE REASONS, THE COURT\n1.\tDISMISSES UN..."
1,8,001-57980,Grand Chamber,11. The applicant was arrested by police offic...,1996,0,1.0,"FOR THESE REASONS, THE COURT\n1. HOLDS BY FO..."
2,120,001-59454,Grand Chamber,13.The complaints raised in this application a...,2001,0,1.0,"FOR THESE REASONS, THE COURT REJECTS THE APPLI..."
3,111,001-61075,Grand Chamber,11.The applicant was born in 1940 and lives in...,2003,0,1.0,"FOR THESE REASONS, THE COURT\n1. HOLDS UNANIM..."
4,26,001-58847,Grand Chamber,"9.The applicant, who was born in 1958 in Tunis...",2000,0,1.0,"FOR THESE REASONS, THE COURT\nHOLDS BY FIFTEEN..."


In [10]:
build_training_set(all_datasets, 0, method='mexpall').head(5)

Unnamed: 0,unique_id,id,body,text,year,violation,sample_weight,judgment_info,votes_for
0,682,001-101173,Chamber,5.The applicant was born in 1973 and is curren...,2010,0,1.0,"FOR THESE REASONS, THE COURT UNANIMOUSLY\n1. ...",0.0
1,2007,001-60424,Chamber,8.The applicants teach as temporary staff at t...,2002,1,1.0,"FOR THESE REASONS, THE COURT UNANIMOUSLY\n1. ...",7.0
2,354,001-105353,Chamber,6.The applicants were born in 1957 and 1986 re...,2011,0,1.0,"FOR THESE REASONS, THE COURT\n1. DECLARES UNA...",0.0
3,5918,001-95191,Chamber,5.The applicant was born in 1961 and lives in ...,2009,1,1.0,"FOR THESE REASONS, THE COURT UNANIMOUSLY\n1. ...",7.0
4,4967,001-58102,Chamber,6.The facts of the case as established by the ...,1997,0,1.0,"FOR THESE REASONS, THE COURT\n1.\tDISMISSES UN...",0.0


#### Use longformer

In [6]:
# Parameters
num_splits = len(all_datasets['train']['chamber'])
methods =  ['wcorr', 'obs', 'obs_ip', 'nn', 'mexpall', 'mexpmax', 'mexpmin', 'mexpagr', 'mexpavg']
# methods =  ['wcorr']
early_stopping_patience = 3
run = 'epochs_3_batch_2/'
output_dir = 'results/' + run 
os.makedirs(output_dir, exist_ok=True)

# Training arguments
training_args = TrainingArguments(
    output_dir='./longformer_grandchamber',
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=2,   
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    fp16=True,
    report_to='none' 
)

In [7]:
# # Tokenizer and model
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
tokenize_fn = partial(preprocess_and_tokenize, tokenizer=tokenizer)
model = LongformerForSequenceClassification.from_pretrained('allenai/longformer-base-4096', num_labels=2)

Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Test sets
c_test_dataset = prepare_dataset(all_datasets['test']['chamber'])
gc_test_dataset = prepare_dataset(all_datasets['test']['grand_chamber'])

# Tokenize
c_test_dataset = c_test_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])
gc_test_dataset = gc_test_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

#Rename labels
c_test_dataset = c_test_dataset.rename_column('violation', 'labels')
gc_test_dataset = gc_test_dataset.rename_column('violation', 'labels')

# Format incl labels
c_test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
gc_test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/871 [00:00<?, ? examples/s]

Map:   0%|          | 0/34 [00:00<?, ? examples/s]

In [9]:
# Run experiment
for split_id in range(num_splits):
    for method in methods:
        if 'obs' not in method: continue
        if split_id == 0: continue

        # Increase num epochs for obs as it has less instances
        og_epochs = training_args.num_train_epochs
        if 'obs' in method:
            training_args.num_train_epochs *= 10
        start_time = time.time()

        print(f"\n🧪 {'='*50}")
        print(f"🔁 Running experiment | Split: {split_id} | Method: {method}")
        print(f"{'='*55}")

        # Prepare folders to store results
        model_path = 'results/'+run+'models/' + method + '_' + str(split_id) + '/'
        os.makedirs(model_path, exist_ok=True)
        training_args.output_dir = model_path

        # Prepare raw dataset and split off 15% for validation
        raw_dataset = build_training_set(all_datasets, split_id, method=method)
        train_df, val_df = train_test_split(raw_dataset, test_size=0.15, stratify=raw_dataset['violation'], random_state=42)
        
        # Convert to HF Datasets
        raw_train = prepare_dataset(train_df)
        raw_val = prepare_dataset(val_df)
        print(f"📊 Dataset sizes → Train: {len(raw_train)}, Val: {len(raw_val)}")

        # Tokenize train and val 
        train_dataset = raw_train.map(tokenize_fn, batched=True, remove_columns=["text"])
        train_dataset = train_dataset.rename_column('violation', 'labels')
        train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
        
        val_dataset = raw_val.map(tokenize_fn, batched=True, remove_columns=["text"])
        val_dataset = val_dataset.rename_column('violation', 'labels')
        val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
        print(f"✅ Val and train dataset tokenized and formatted.")


        # Initialize trainer with weights
        trainer = WeightedTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            method=method,
        )
        
        print("🚀 Starting training...")
        trainer.train()
        print("\n✨ Training complete.")

        print(f"🔍 Evaluating on Chamber test set ({len(all_datasets['test']['chamber'])})...")
        c_preds = trainer.predict(c_test_dataset)
        print_metrics("🏛 Chamber Test Set", c_preds)
        
        print(f"🔍 Evaluating on Grand Chamber test set ({len(all_datasets['test']['grand_chamber'])})...")
        gc_preds = trainer.predict(gc_test_dataset)
        print_metrics("⚖️ Grand Chamber Test Set", gc_preds)
        
        # Extract probabilities for class 1 using sigmoid
        c_pred_probs = get_class1_probs(c_preds.predictions)
        gc_pred_probs = get_class1_probs(gc_preds.predictions)
        
        # Prepare results dict to save
        results = {
            "split_id": split_id,
            "method": method,
            "chamber_test_metrics": c_preds.metrics,
            "grand_chamber_test_metrics": gc_preds.metrics,
            "chamber_test_predictions": {
                "predictions": c_pred_probs.tolist(),
                "labels": c_preds.label_ids.tolist()
            },
            "grand_chamber_test_predictions": {
                "predictions": gc_pred_probs.tolist(),
                "labels": gc_preds.label_ids.tolist()
            }
        }

        # Save results as JSON per method per split
        filename = f"results_split_{split_id}_method_{method}.json"
        filepath = os.path.join(output_dir, filename)
        with open(filepath, 'w') as f:
            json.dump(results, f, indent=4)

        elapsed = time.time() - start_time
        minutes, seconds = divmod(elapsed, 60)
        print(f"⏱️ Duration for split {split_id} | method {method}: {int(minutes)}m {seconds:.2f}s")
        print(f"💾 Saved results to {filepath}")
        training_args.num_train_epochs = og_epochs

print("🎉 All experiments completed successfully and results saved to disk.")


🔁 Running experiment | Split: 1 | Method: obs
📊 Dataset sizes → Train: 98, Val: 18


Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

✅ Val and train dataset tokenized and formatted.
🚀 Starting training...


Initializing global attention on CLS token...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Mcc
1,0.7063,0.703599,0.5,0.666667,0.0
2,0.6619,0.638697,0.611111,0.461538,0.267261
3,0.5067,0.742988,0.555556,0.636364,0.124035
4,0.3478,1.803755,0.666667,0.5,0.447214
5,0.5407,2.205014,0.444444,0.375,-0.113961
6,0.372,2.069381,0.555556,0.428571,0.124035
7,0.3049,1.85675,0.666667,0.5,0.447214
8,0.3329,2.263031,0.444444,0.444444,-0.111111
9,0.3184,1.880319,0.666667,0.5,0.447214
10,0.2357,2.065619,0.5,0.470588,0.0



✨ Training complete.
🔍 Evaluating on Chamber test set (871)...



📊 🏛 Chamber Test Set Results:
  test_loss           : 0.6424
  test_accuracy       : 0.6980
  test_f1             : 0.8222
  test_mcc            : 0.0000
  test_runtime        : 138.2950
  test_samples_per_second: 6.2980
  test_steps_per_second: 3.1530
🔍 Evaluating on Grand Chamber test set (34)...



📊 ⚖️ Grand Chamber Test Set Results:
  test_loss           : 0.7153
  test_accuracy       : 0.4706
  test_f1             : 0.6400
  test_mcc            : 0.0000
  test_runtime        : 5.4103
  test_samples_per_second: 6.2840
  test_steps_per_second: 3.1420
⏱️ Duration for split 1 | method obs: 18m 49.54s
💾 Saved results to results/epochs_3_batch_2/results_split_1_method_obs.json

🔁 Running experiment | Split: 1 | Method: obs_ip
📊 Dataset sizes → Train: 98, Val: 18


Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

✅ Val and train dataset tokenized and formatted.
🚀 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Mcc
1,0.6179,0.740983,0.444444,0.615385,-0.242536
2,0.4858,0.946695,0.611111,0.461538,0.267261
3,0.3962,1.476834,0.444444,0.5,-0.113961
4,0.4498,2.460961,0.555556,0.428571,0.124035
5,0.443,3.401363,0.333333,0.333333,-0.333333
6,0.361,2.684638,0.555556,0.428571,0.124035
7,0.3866,2.549512,0.444444,0.545455,-0.124035
8,0.2723,2.702503,0.444444,0.375,-0.113961
9,0.2111,2.777943,0.5,0.4,0.0
10,0.26,2.750486,0.444444,0.444444,-0.111111



✨ Training complete.
🔍 Evaluating on Chamber test set (871)...



📊 🏛 Chamber Test Set Results:
  test_loss           : 0.6427
  test_accuracy       : 0.6648
  test_f1             : 0.7935
  test_mcc            : -0.0155
  test_runtime        : 138.9285
  test_samples_per_second: 6.2690
  test_steps_per_second: 3.1380
🔍 Evaluating on Grand Chamber test set (34)...



📊 ⚖️ Grand Chamber Test Set Results:
  test_loss           : 0.7147
  test_accuracy       : 0.4706
  test_f1             : 0.6400
  test_mcc            : 0.0000
  test_runtime        : 5.4459
  test_samples_per_second: 6.2430
  test_steps_per_second: 3.1220
⏱️ Duration for split 1 | method obs_ip: 18m 56.40s
💾 Saved results to results/epochs_3_batch_2/results_split_1_method_obs_ip.json

🔁 Running experiment | Split: 2 | Method: obs
📊 Dataset sizes → Train: 98, Val: 18


Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

✅ Val and train dataset tokenized and formatted.
🚀 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Mcc
1,0.5435,0.788888,0.555556,0.636364,0.124035
2,0.4101,0.919381,0.722222,0.705882,0.447214
3,0.2663,1.604568,0.666667,0.666667,0.333333
4,0.3758,1.69357,0.722222,0.705882,0.447214
5,0.1938,2.135516,0.666667,0.666667,0.333333
6,0.2001,2.293466,0.666667,0.666667,0.333333
7,0.1935,2.205514,0.611111,0.631579,0.223607
8,0.1338,2.2875,0.611111,0.666667,0.235702
9,0.1976,2.465287,0.611111,0.631579,0.223607
10,0.1058,2.530825,0.666667,0.7,0.341882



✨ Training complete.
🔍 Evaluating on Chamber test set (871)...



📊 🏛 Chamber Test Set Results:
  test_loss           : 1.3890
  test_accuracy       : 0.4569
  test_f1             : 0.4875
  test_mcc            : 0.0266
  test_runtime        : 139.0429
  test_samples_per_second: 6.2640
  test_steps_per_second: 3.1360
🔍 Evaluating on Grand Chamber test set (34)...



📊 ⚖️ Grand Chamber Test Set Results:
  test_loss           : 1.2144
  test_accuracy       : 0.5294
  test_f1             : 0.2000
  test_mcc            : 0.0215
  test_runtime        : 5.4414
  test_samples_per_second: 6.2480
  test_steps_per_second: 3.1240
⏱️ Duration for split 2 | method obs: 18m 55.40s
💾 Saved results to results/epochs_3_batch_2/results_split_2_method_obs.json

🔁 Running experiment | Split: 2 | Method: obs_ip
📊 Dataset sizes → Train: 98, Val: 18


Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

✅ Val and train dataset tokenized and formatted.
🚀 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Mcc
1,0.1829,1.910905,0.5,0.571429,0.0
2,0.3012,2.526557,0.611111,0.631579,0.223607
3,0.2556,2.360919,0.611111,0.666667,0.235702
4,0.2017,2.371544,0.666667,0.7,0.341882
5,0.2331,2.213712,0.666667,0.625,0.341882
6,0.1777,2.766551,0.555556,0.5,0.113961
7,0.1648,2.722055,0.611111,0.588235,0.223607
8,0.1871,2.578583,0.555556,0.6,0.113961
9,0.2123,2.605166,0.611111,0.631579,0.223607
10,0.1273,2.747991,0.666667,0.7,0.341882



✨ Training complete.
🔍 Evaluating on Chamber test set (871)...



📊 🏛 Chamber Test Set Results:
  test_loss           : 2.8518
  test_accuracy       : 0.5017
  test_f1             : 0.6018
  test_mcc            : -0.0426
  test_runtime        : 143.3487
  test_samples_per_second: 6.0760
  test_steps_per_second: 3.0420
🔍 Evaluating on Grand Chamber test set (34)...



📊 ⚖️ Grand Chamber Test Set Results:
  test_loss           : 1.9302
  test_accuracy       : 0.6176
  test_f1             : 0.6061
  test_mcc            : 0.2357
  test_runtime        : 5.6935
  test_samples_per_second: 5.9720
  test_steps_per_second: 2.9860
⏱️ Duration for split 2 | method obs_ip: 19m 2.61s
💾 Saved results to results/epochs_3_batch_2/results_split_2_method_obs_ip.json

🔁 Running experiment | Split: 3 | Method: obs
📊 Dataset sizes → Train: 98, Val: 18


Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

✅ Val and train dataset tokenized and formatted.
🚀 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Mcc
1,1.0382,0.280319,0.944444,0.941176,0.894427
2,0.6709,0.725889,0.888889,0.875,0.797724
3,0.4527,0.395218,0.944444,0.941176,0.894427
4,0.467,0.120915,0.944444,0.941176,0.894427
5,0.3263,0.384289,0.944444,0.941176,0.894427
6,0.2577,0.337407,0.944444,0.941176,0.894427
7,0.2891,0.814186,0.888889,0.875,0.797724
8,0.2443,0.405653,0.944444,0.941176,0.894427
9,0.2266,0.731253,0.888889,0.875,0.797724
10,0.3055,0.36045,0.944444,0.941176,0.894427



✨ Training complete.
🔍 Evaluating on Chamber test set (871)...



📊 🏛 Chamber Test Set Results:
  test_loss           : 2.4917
  test_accuracy       : 0.5775
  test_f1             : 0.6979
  test_mcc            : -0.0044
  test_runtime        : 145.7645
  test_samples_per_second: 5.9750
  test_steps_per_second: 2.9910
🔍 Evaluating on Grand Chamber test set (34)...



📊 ⚖️ Grand Chamber Test Set Results:
  test_loss           : 1.8608
  test_accuracy       : 0.6176
  test_f1             : 0.6829
  test_mcc            : 0.2986
  test_runtime        : 5.5544
  test_samples_per_second: 6.1210
  test_steps_per_second: 3.0610
⏱️ Duration for split 3 | method obs: 19m 46.78s
💾 Saved results to results/epochs_3_batch_2/results_split_3_method_obs.json

🔁 Running experiment | Split: 3 | Method: obs_ip
📊 Dataset sizes → Train: 98, Val: 18


Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

✅ Val and train dataset tokenized and formatted.
🚀 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Mcc
1,0.6276,0.2468,0.944444,0.941176,0.894427
2,0.5757,0.78265,0.888889,0.875,0.797724
3,0.5679,0.31237,0.944444,0.941176,0.894427
4,0.4488,0.402793,0.944444,0.941176,0.894427
5,0.3723,0.295225,0.944444,0.941176,0.894427
6,0.3055,0.483577,0.888889,0.875,0.797724
7,0.5281,0.419077,0.888889,0.888889,0.777778
8,0.3885,0.289387,0.944444,0.941176,0.894427
9,0.3027,0.356743,0.944444,0.941176,0.894427
10,0.2153,0.373168,0.944444,0.941176,0.894427



✨ Training complete.
🔍 Evaluating on Chamber test set (871)...



📊 🏛 Chamber Test Set Results:
  test_loss           : 2.7635
  test_accuracy       : 0.5683
  test_f1             : 0.6824
  test_mcc            : 0.0102
  test_runtime        : 143.3515
  test_samples_per_second: 6.0760
  test_steps_per_second: 3.0410
🔍 Evaluating on Grand Chamber test set (34)...



📊 ⚖️ Grand Chamber Test Set Results:
  test_loss           : 2.4543
  test_accuracy       : 0.6176
  test_f1             : 0.6486
  test_mcc            : 0.2568
  test_runtime        : 5.5589
  test_samples_per_second: 6.1160
  test_steps_per_second: 3.0580
⏱️ Duration for split 3 | method obs_ip: 19m 14.70s
💾 Saved results to results/epochs_3_batch_2/results_split_3_method_obs_ip.json

🔁 Running experiment | Split: 4 | Method: obs
📊 Dataset sizes → Train: 98, Val: 18


Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

✅ Val and train dataset tokenized and formatted.
🚀 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Mcc
1,0.9355,0.890011,0.833333,0.823529,0.67082
2,0.5961,0.780132,0.888889,0.888889,0.777778
3,0.3308,0.916977,0.833333,0.842105,0.67082
4,0.4173,0.767771,0.888889,0.888889,0.777778
5,0.2172,1.107087,0.833333,0.842105,0.67082
6,0.1556,0.955651,0.833333,0.823529,0.67082
7,0.2421,1.482851,0.777778,0.8,0.569803
8,0.3067,0.91636,0.888889,0.888889,0.777778
9,0.1705,0.932207,0.888889,0.888889,0.777778
10,0.1729,1.03196,0.888889,0.888889,0.777778



✨ Training complete.
🔍 Evaluating on Chamber test set (871)...



📊 🏛 Chamber Test Set Results:
  test_loss           : 2.5351
  test_accuracy       : 0.5545
  test_f1             : 0.6638
  test_mcc            : 0.0096
  test_runtime        : 138.4766
  test_samples_per_second: 6.2900
  test_steps_per_second: 3.1490
🔍 Evaluating on Grand Chamber test set (34)...



📊 ⚖️ Grand Chamber Test Set Results:
  test_loss           : 2.4566
  test_accuracy       : 0.4706
  test_f1             : 0.5000
  test_mcc            : -0.0493
  test_runtime        : 5.4228
  test_samples_per_second: 6.2700
  test_steps_per_second: 3.1350
⏱️ Duration for split 4 | method obs: 19m 30.88s
💾 Saved results to results/epochs_3_batch_2/results_split_4_method_obs.json

🔁 Running experiment | Split: 4 | Method: obs_ip
📊 Dataset sizes → Train: 98, Val: 18


Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

✅ Val and train dataset tokenized and formatted.
🚀 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Mcc
1,0.4139,1.928325,0.777778,0.8,0.569803
2,0.4697,1.52338,0.777778,0.8,0.569803
3,0.4167,1.183244,0.777778,0.8,0.569803
4,0.2711,0.895724,0.888889,0.888889,0.777778
5,0.2937,1.184157,0.833333,0.842105,0.67082
6,0.1732,1.041871,0.833333,0.842105,0.67082
7,0.1098,1.20963,0.833333,0.842105,0.67082
8,0.2434,1.04298,0.833333,0.842105,0.67082
9,0.168,0.987764,0.888889,0.888889,0.777778
10,0.1282,1.076059,0.888889,0.888889,0.777778



✨ Training complete.
🔍 Evaluating on Chamber test set (871)...



📊 🏛 Chamber Test Set Results:
  test_loss           : 2.2516
  test_accuracy       : 0.6005
  test_f1             : 0.7225
  test_mcc            : 0.0118
  test_runtime        : 143.6670
  test_samples_per_second: 6.0630
  test_steps_per_second: 3.0350
🔍 Evaluating on Grand Chamber test set (34)...



📊 ⚖️ Grand Chamber Test Set Results:
  test_loss           : 1.7460
  test_accuracy       : 0.6176
  test_f1             : 0.6977
  test_mcc            : 0.3343
  test_runtime        : 5.5791
  test_samples_per_second: 6.0940
  test_steps_per_second: 3.0470
⏱️ Duration for split 4 | method obs_ip: 19m 7.57s
💾 Saved results to results/epochs_3_batch_2/results_split_4_method_obs_ip.json

🔁 Running experiment | Split: 5 | Method: obs
📊 Dataset sizes → Train: 98, Val: 18


Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

✅ Val and train dataset tokenized and formatted.
🚀 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Mcc
1,0.6529,1.546974,0.777778,0.8,0.569803
2,0.4232,0.726262,0.888889,0.888889,0.777778
3,0.293,1.026647,0.833333,0.842105,0.67082
4,0.2527,1.014393,0.833333,0.842105,0.67082
5,0.2577,0.826974,0.888889,0.888889,0.777778
6,0.2207,0.761632,0.888889,0.888889,0.777778
7,0.2199,1.20841,0.833333,0.842105,0.67082
8,0.1755,0.851771,0.833333,0.842105,0.67082
9,0.1777,0.902658,0.833333,0.842105,0.67082
10,0.2011,1.020301,0.833333,0.842105,0.67082



✨ Training complete.
🔍 Evaluating on Chamber test set (871)...



📊 🏛 Chamber Test Set Results:
  test_loss           : 2.3274
  test_accuracy       : 0.6062
  test_f1             : 0.7258
  test_mcc            : 0.0293
  test_runtime        : 139.1548
  test_samples_per_second: 6.2590
  test_steps_per_second: 3.1330
🔍 Evaluating on Grand Chamber test set (34)...



📊 ⚖️ Grand Chamber Test Set Results:
  test_loss           : 2.2530
  test_accuracy       : 0.5588
  test_f1             : 0.6667
  test_mcc            : 0.2251
  test_runtime        : 5.4337
  test_samples_per_second: 6.2570
  test_steps_per_second: 3.1290
⏱️ Duration for split 5 | method obs: 19m 42.76s
💾 Saved results to results/epochs_3_batch_2/results_split_5_method_obs.json

🔁 Running experiment | Split: 5 | Method: obs_ip
📊 Dataset sizes → Train: 98, Val: 18


Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

✅ Val and train dataset tokenized and formatted.
🚀 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Mcc
1,0.5879,1.335952,0.833333,0.842105,0.67082
2,0.7248,1.023181,0.833333,0.842105,0.67082
3,0.2652,1.155486,0.833333,0.842105,0.67082
4,0.2922,2.104617,0.722222,0.761905,0.471405
5,0.2797,1.280615,0.777778,0.777778,0.555556
6,0.2281,0.991502,0.777778,0.777778,0.555556
7,0.1945,1.106034,0.833333,0.842105,0.67082
8,0.2341,0.835301,0.888889,0.888889,0.777778
9,0.208,1.123541,0.833333,0.842105,0.67082
10,0.2035,1.466143,0.777778,0.777778,0.555556



✨ Training complete.
🔍 Evaluating on Chamber test set (871)...



📊 🏛 Chamber Test Set Results:
  test_loss           : 2.9698
  test_accuracy       : 0.5580
  test_f1             : 0.6695
  test_mcc            : 0.0062
  test_runtime        : 143.8098
  test_samples_per_second: 6.0570
  test_steps_per_second: 3.0320
🔍 Evaluating on Grand Chamber test set (34)...



📊 ⚖️ Grand Chamber Test Set Results:
  test_loss           : 1.5659
  test_accuracy       : 0.7059
  test_f1             : 0.7368
  test_mcc            : 0.4497
  test_runtime        : 5.6270
  test_samples_per_second: 6.0420
  test_steps_per_second: 3.0210
⏱️ Duration for split 5 | method obs_ip: 19m 9.78s
💾 Saved results to results/epochs_3_batch_2/results_split_5_method_obs_ip.json

🔁 Running experiment | Split: 6 | Method: obs
📊 Dataset sizes → Train: 98, Val: 18


Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

✅ Val and train dataset tokenized and formatted.
🚀 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Mcc
1,0.55,0.003639,1.0,1.0,1.0
2,0.2366,0.003262,1.0,1.0,1.0
3,0.1164,0.00301,1.0,1.0,1.0
4,0.2193,0.001922,1.0,1.0,1.0
5,0.2185,0.002332,1.0,1.0,1.0
6,0.1305,0.002136,1.0,1.0,1.0
7,0.1167,0.00239,1.0,1.0,1.0
8,0.0805,0.002837,1.0,1.0,1.0
9,0.1137,0.001902,1.0,1.0,1.0
10,0.0909,0.001983,1.0,1.0,1.0



✨ Training complete.
🔍 Evaluating on Chamber test set (871)...



📊 🏛 Chamber Test Set Results:
  test_loss           : 2.9672
  test_accuracy       : 0.5327
  test_f1             : 0.6356
  test_mcc            : -0.0016
  test_runtime        : 139.1153
  test_samples_per_second: 6.2610
  test_steps_per_second: 3.1340
🔍 Evaluating on Grand Chamber test set (34)...



📊 ⚖️ Grand Chamber Test Set Results:
  test_loss           : 1.2947
  test_accuracy       : 0.7353
  test_f1             : 0.7568
  test_mcc            : 0.4993
  test_runtime        : 5.4449
  test_samples_per_second: 6.2440
  test_steps_per_second: 3.1220
⏱️ Duration for split 6 | method obs: 19m 37.63s
💾 Saved results to results/epochs_3_batch_2/results_split_6_method_obs.json

🔁 Running experiment | Split: 6 | Method: obs_ip
📊 Dataset sizes → Train: 98, Val: 18


Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

✅ Val and train dataset tokenized and formatted.
🚀 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Mcc
1,0.3848,0.002729,1.0,1.0,1.0
2,0.438,0.0035,1.0,1.0,1.0
3,0.1878,0.004277,1.0,1.0,1.0
4,0.0908,0.003303,1.0,1.0,1.0
5,0.0511,0.002961,1.0,1.0,1.0
6,0.0717,0.002309,1.0,1.0,1.0
7,0.0829,0.002081,1.0,1.0,1.0
8,0.0596,0.002531,1.0,1.0,1.0
9,0.0758,0.001619,1.0,1.0,1.0
10,0.076,0.001677,1.0,1.0,1.0



✨ Training complete.
🔍 Evaluating on Chamber test set (871)...



📊 🏛 Chamber Test Set Results:
  test_loss           : 2.9376
  test_accuracy       : 0.5591
  test_f1             : 0.6718
  test_mcc            : 0.0036
  test_runtime        : 138.4610
  test_samples_per_second: 6.2910
  test_steps_per_second: 3.1490
🔍 Evaluating on Grand Chamber test set (34)...



📊 ⚖️ Grand Chamber Test Set Results:
  test_loss           : 2.1913
  test_accuracy       : 0.6471
  test_f1             : 0.7143
  test_mcc            : 0.3841
  test_runtime        : 5.4202
  test_samples_per_second: 6.2730
  test_steps_per_second: 3.1360
⏱️ Duration for split 6 | method obs_ip: 18m 46.96s
💾 Saved results to results/epochs_3_batch_2/results_split_6_method_obs_ip.json
🎉 All experiments completed successfully and results saved to disk.
