In [1]:
pip install datasets

Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m47.7/47.7 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 19.0.1
    Uninstalling pyarrow-19.0.1:
      Successfully uninstalled pyarrow-19.0.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.12.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
pylibcudf-cu12 25.2.2 requires pyarrow<20.0.0a0,>=14.0.0; platform_machine == "x86_64", but you hav

In [2]:
# ============================================
# CORE SCIENTIFIC COMPUTING
# ============================================
import numpy as np
import pandas as pd

# ============================================
# VISUALIZATION
# ============================================
import matplotlib.pyplot as plt
import seaborn as sns

# ============================================
# MACHINE LEARNING - SKLEARN
# ============================================
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import ClassifierChain
from sklearn.utils import resample

# Metrics
from sklearn.metrics import (
    f1_score, 
    hamming_loss, 
    accuracy_score, 
    jaccard_score,
    precision_score, 
    recall_score, 
    classification_report
)

# ============================================
# XGBOOST
# ============================================
from xgboost import XGBClassifier

# ============================================
# DEEP LEARNING & EMBEDDINGS
# ============================================
import torch
import gensim.downloader as api

# ============================================
# DATASETS & DATA HANDLING
# ============================================
from datasets import load_dataset

# ============================================
# UTILITIES
# ============================================
import requests
import json
from time import sleep
import joblib
import os

## perform datasets impoortation 

In [3]:
ds = load_dataset("lex_glue", "eurlex")

README.md: 0.00B [00:00, ?B/s]

eurlex/train-00000-of-00001.parquet:   0%|          | 0.00/167M [00:00<?, ?B/s]

eurlex/test-00000-of-00001.parquet:   0%|          | 0.00/24.3M [00:00<?, ?B/s]

eurlex/validation-00000-of-00001.parquet:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/55000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [4]:
print(ds),print(type(ds))

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 55000
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 5000
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 5000
    })
})
<class 'datasets.dataset_dict.DatasetDict'>


(None, None)

In [5]:
ds['train']['text'][0]

'COUNCIL DECISION\nof 7 June 2005\nabrogating Decision 2005/136/EC on the existence of an excessive deficit in the Netherlands\n(2005/729/EC)\nTHE COUNCIL OF THE EUROPEAN UNION,\nHaving regard to the Treaty establishing the European Community, and in particular Article 104(12) thereof,\nHaving regard to the recommendation from the Commission,\nWhereas:\n(1)\nBy Decision 2005/136/EC (1) following a recommendation from the Commission in accordance with Article 104(6) of the Treaty, the Council decided that an excessive deficit existed in the Netherlands.\n(2)\nIn accordance with Article 104(7) of the Treaty, the Council made a Recommendation on 2 June 2004 addressed to the Netherlands with a view to bringing the excessive deficit situation to an end. This Recommendation, in conjunction with Article 3(4) of Council Regulation (EC) No 1467/97 of 7 July 1997 on speeding up and clarifying the implementation of the excessive deficit procedure (2), established a deadline of 2005 at the latest 

In [6]:
label_info = ds["train"].features["labels"]
label_names = label_info.feature.names

print(label_names[:20])   # print first 20 labels
print(len(label_names))   # number of labels

['100163', '100168', '100169', '100170', '100171', '100172', '100173', '100174', '100175', '100176', '100177', '100179', '100180', '100183', '100184', '100185', '100186', '100187', '100189', '100190']
100


In [7]:
print("Train size:", len(ds["train"]))
print("Validation size:", len(ds["validation"]))
print("Test size:", len(ds["test"]))

Train size: 55000
Validation size: 5000
Test size: 5000


In [8]:
# Get the int2str function from the ClassLabel feature
id2label = ds["train"].features["labels"].feature.int2str

# Loop over all labels
for i in range(len(ds["train"].features["labels"].feature.names)):
    if i%10==0:
        print("\n")
    print(i, "‚Üí", id2label(i),end="||")



0 ‚Üí 100163||1 ‚Üí 100168||2 ‚Üí 100169||3 ‚Üí 100170||4 ‚Üí 100171||5 ‚Üí 100172||6 ‚Üí 100173||7 ‚Üí 100174||8 ‚Üí 100175||9 ‚Üí 100176||

10 ‚Üí 100177||11 ‚Üí 100179||12 ‚Üí 100180||13 ‚Üí 100183||14 ‚Üí 100184||15 ‚Üí 100185||16 ‚Üí 100186||17 ‚Üí 100187||18 ‚Üí 100189||19 ‚Üí 100190||

20 ‚Üí 100191||21 ‚Üí 100192||22 ‚Üí 100193||23 ‚Üí 100194||24 ‚Üí 100195||25 ‚Üí 100196||26 ‚Üí 100197||27 ‚Üí 100198||28 ‚Üí 100199||29 ‚Üí 100200||

30 ‚Üí 100201||31 ‚Üí 100202||32 ‚Üí 100204||33 ‚Üí 100205||34 ‚Üí 100206||35 ‚Üí 100207||36 ‚Üí 100212||37 ‚Üí 100214||38 ‚Üí 100215||39 ‚Üí 100220||

40 ‚Üí 100221||41 ‚Üí 100222||42 ‚Üí 100223||43 ‚Üí 100224||44 ‚Üí 100226||45 ‚Üí 100227||46 ‚Üí 100229||47 ‚Üí 100230||48 ‚Üí 100231||49 ‚Üí 100232||

50 ‚Üí 100233||51 ‚Üí 100234||52 ‚Üí 100235||53 ‚Üí 100237||54 ‚Üí 100238||55 ‚Üí 100239||56 ‚Üí 100240||57 ‚Üí 100241||58 ‚Üí 100242||59 ‚Üí 100243||

60 ‚Üí 100244||61 ‚Üí 100245||62 ‚Üí 100246||63 ‚Üí 100247||64 ‚Üí 100248||65 ‚Üí 100249||66 ‚Üí

In [9]:
label2name = ds["train"].features["labels"].feature.names
def get_eurovoc_label(concept_id):
    """Fetch label from EuroVoc SPARQL endpoint"""
    sparql_endpoint = "http://publications.europa.eu/webapi/rdf/sparql"
    
    query = f"""
    PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
    SELECT ?label
    WHERE {{
      <http://eurovoc.europa.eu/{concept_id}> skos:prefLabel ?label .
      FILTER(lang(?label) = 'en')
    }}
    """
    
    try:
        response = requests.get(
            sparql_endpoint,
            params={'query': query, 'format': 'json'},
            timeout=10
        )
        if response.status_code == 200:
            data = response.json()
            if data['results']['bindings']:
                return data['results']['bindings'][0]['label']['value']
    except Exception as e:
        print(f"Error for {concept_id}: {e}")
    
    return concept_id

# Build mapping with rate limiting
label2description = {}
for i, concept_id in enumerate(label2name):
    label2description[concept_id] = get_eurovoc_label(concept_id)
    print(f"{concept_id}: {label2description[concept_id]}")
    if i % 10 == 0:  # Add delay every 10 requests
        sleep(1)

100163: 0406 political framework
100168: 0431 politics and public safety
100169: 0436 executive power and public service
100170: 0806 international affairs
100171: 0811 cooperation policy
100172: 0816 international security
100173: 0821 defence
100174: 1006 EU institutions and European civil service
100175: 1011 European Union law
100176: 1016 European construction
100177: 1021 EU finance
100179: 1211 civil law
100180: 1216 criminal law
100183: 1231 international law
100184: 1236 rights and freedoms
100185: 1606 economic policy
100186: 1611 economic conditions
100187: 1616 regions and regional policy
100189: 1626 national accounts
100190: 1631 economic analysis
100191: 2006 trade policy
100192: 2011 tariff policy
100193: 2016 trade
100194: 2021 international trade
100195: 2026 consumption
100196: 2031 marketing
100197: 2036 distributive trades
100198: 2406 monetary relations
100199: 2411 monetary economics
100200: 2416 financial institutions and credit
100201: 2421 free movement of cap

In [10]:
label2description,len(label2description)

({'100163': '0406 political framework',
  '100168': '0431 politics and public safety',
  '100169': '0436 executive power and public service',
  '100170': '0806 international affairs',
  '100171': '0811 cooperation policy',
  '100172': '0816 international security',
  '100173': '0821 defence',
  '100174': '1006 EU institutions and European civil service',
  '100175': '1011 European Union law',
  '100176': '1016 European construction',
  '100177': '1021 EU finance',
  '100179': '1211 civil law',
  '100180': '1216 criminal law',
  '100183': '1231 international law',
  '100184': '1236 rights and freedoms',
  '100185': '1606 economic policy',
  '100186': '1611 economic conditions',
  '100187': '1616 regions and regional policy',
  '100189': '1626 national accounts',
  '100190': '1631 economic analysis',
  '100191': '2006 trade policy',
  '100192': '2011 tariff policy',
  '100193': '2016 trade',
  '100194': '2021 international trade',
  '100195': '2026 consumption',
  '100196': '2031 marketi

In [11]:
# High-level category aggregation for EuroVoc classes
high_level_categories = {
    "Politics & Government": {
        "100163", "100168", "100169", "100174", "100175", "100176", "100177"
    },
    
    "International Affairs & Defense": {
        "100170", "100171", "100172", "100173", "100183", "100285"
    },
    
    "Law & Justice": {
        "100179", "100180", "100184"
    },
    
    "Economics & Finance": {
        "100185", "100186", "100189", "100190", "100198", "100199", "100200", 
        "100201", "100202", "100204", "100205", "100206", "100207"
    },
    
    "Trade & Business": {
        "100191", "100192", "100193", "100194", "100195", "100196", "100197",
        "100226", "100227", "100229", "100230", "100231"
    },
    
    "Employment & Labor": {
        "100232", "100233", "100234", "100235"
    },
    
    "Social Affairs & Health": {
        "100212", "100214", "100215"
    },
    
    "Technology & Science": {
        "100220", "100221", "100222", "100223", "100224", "100261", "100262"
    },
    
    "Transportation": {
        "100237", "100238", "100239", "100240", "100241"
    },
    
    "Environment": {
        "100242", "100243", "100244"
    },
    
    "Agriculture & Food": {
        "100245", "100246", "100247", "100248", "100249", "100250", "100252",
        "100253", "100254", "100255", "100256", "100257", "100258", "100259"
    },
    
    "Energy & Resources": {
        "100263", "100264", "100265", "100266"
    },
    
    "Industry & Manufacturing": {
        "100260", "100268", "100269", "100270", "100271", "100272", "100273",
        "100274", "100275", "100276"
    },
    
    "Geography & Regional": {
        "100187", "100277", "100278", "100279", "100280", "100281", "100282",
        "100283", "100284"
    },
}

In [12]:
ds["train"]["labels"]

Column([[28, 32, 33, 91, 96, 97], [4, 21, 23, 68], [9, 15, 16, 39], [20, 28, 61, 62], [20, 71, 72]])

In [13]:
id_code_mapping={
    i: id2label(i) for i in range(100)
}

In [14]:
categories_id={
    key:i for i,key in enumerate(high_level_categories.keys())
}
categories_id

{'Politics & Government': 0,
 'International Affairs & Defense': 1,
 'Law & Justice': 2,
 'Economics & Finance': 3,
 'Trade & Business': 4,
 'Employment & Labor': 5,
 'Social Affairs & Health': 6,
 'Technology & Science': 7,
 'Transportation': 8,
 'Environment': 9,
 'Agriculture & Food': 10,
 'Energy & Resources': 11,
 'Industry & Manufacturing': 12,
 'Geography & Regional': 13}

In [15]:
X_train ,X_test,X_val= ds["train"]["text"], ds["test"]["text"], ds["validation"]["text"]

In [16]:
#lower casing 
#limmitization 
#stop word 
#remove urls,gmails,...
#.............

In [17]:
y_train ,y_test ,y_val= ds["train"]["labels"], ds["test"]["labels"], ds["validation"]["labels"]

In [18]:
def check_category_name(id_,high_level_categories=high_level_categories,id_code_mapping=id_code_mapping):
    for key in high_level_categories.keys():
        if id_code_mapping[id_] in high_level_categories.get(key):
            return key

def generate_new_label(y):
    new_y=[]
    for target in y:
        new_target=[categories_id[check_category_name(t)] for t in target]
        new_target_unique=list(set(new_target))
        new_y.append(new_target_unique)
    return new_y

In [19]:
y_train_h = generate_new_label(y_train)
y_test_h = generate_new_label(y_test)
y_val_h = generate_new_label(y_val)

In [20]:
#label representation 
def label_to_fixed_vector(labels):
    mlb = MultiLabelBinarizer(classes=range(0, 14))  # specify 0‚Äì99 classes
    Y = mlb.fit_transform(labels)
    
    return Y

In [21]:
fixed_size_y_train ,fixed_size_y_test ,fixed_size_y_val= label_to_fixed_vector(y_train_h),label_to_fixed_vector(y_test_h),label_to_fixed_vector(y_val_h)

In [22]:
label_freq = fixed_size_y_train.mean(axis=0)   # (14,)
label_weights = 1 / (label_freq + 1e-6)

sample_weights = (fixed_size_y_train * label_weights).sum(axis=1)
sample_weights = sample_weights / sample_weights.mean()

In [23]:
tfidf = TfidfVectorizer(
    max_features=50000,      
    ngram_range=(1,2),        
    min_df=2,                 
    stop_words="english"      
)

vectorized_X_train = tfidf.fit_transform(X_train)

In [24]:
vectorized_X_test  = tfidf.transform(X_test)
vectorized_X_val  = tfidf.transform(X_val)

In [25]:
vectorized_X_val.shape

(5000, 50000)

In [26]:
# Reduce 50,000 sparse features to 300 dense features
svd = TruncatedSVD(n_components=300, random_state=42)
X_train_svd = svd.fit_transform(vectorized_X_train)
X_test_svd = svd.transform(vectorized_X_test)
X_val_svd = svd.transform(vectorized_X_val)

## XGBoost Model

In [27]:
model = XGBClassifier(
    n_estimators=500,
    max_depth=15,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    
    # --- GPU Acceleration ---
    tree_method="hist",
    device="cuda",
    
    # --- REGULARIZATION TERMS ---
    reg_alpha=10,       
    reg_lambda=10,      
    gamma=0.2,           
    min_child_weight=2,  
    
    random_state=42
)

#model = ClassifierChain(base_model, order='random', random_state=42)
model.fit(X_train_svd, fixed_size_y_train)

In [28]:
def evaluate_set(model, X_data, y_true, dataset_name, category_names=None):
    """
    Evaluates the model and prints detailed metrics for the given dataset.
    """
    print(f"\n{'='*60}")
    print(f"üìä {dataset_name.upper()} SET EVALUATION")
    print(f"{'='*60}")
    
    # 1. Generate Predictions
    print(f"Generating predictions for {dataset_name}...")
    y_pred = model.predict(X_data)
    print("Predictions complete!")
    
    # 2. Calculate Overall Metrics
    subset_acc = accuracy_score(y_true, y_pred)
    h_loss = hamming_loss(y_true, y_pred)
    micro_f1 = f1_score(y_true, y_pred, average='micro')
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    weighted_f1 = f1_score(y_true, y_pred, average='weighted')
    
    print(f"\n--- Overall Metrics ---")
    print(f"Subset Accuracy:     {subset_acc:.4f}")
    print(f"Hamming Loss:        {h_loss:.4f}")
    print(f"Micro F1 Score:      {micro_f1:.4f}")
    print(f"Macro F1 Score:      {macro_f1:.4f}")

    # 3. Per-Label Breakdown (Now enabled for ALL sets if names are provided)
    if category_names:
        print(f"\n--- Per-Label F1 Scores ({dataset_name}) ---")
        per_label_f1 = f1_score(y_true, y_pred, average=None)
        
        # Header for the table
        print(f"{'Category':<40} | {'F1 Score'}")
        print("-" * 55)
        
        for category, f1 in zip(category_names, per_label_f1):
            # Highlight weak classes in RED (if supported) or with a marker
            marker = "‚ö†Ô∏è" if f1 < 0.5 else " "
            print(f"{category:<40} | {f1:.4f} {marker}")
            
    # Return metrics for the final diagnosis
    metrics = {
        "micro_f1": micro_f1,
        "macro_f1": macro_f1,
        "per_label_f1": per_label_f1 if category_names else None
    }
    return y_pred, metrics

def print_final_diagnosis(train_metrics, val_metrics, test_metrics, category_names):
    """
    Compares Train vs Test to give specific advice per class.
    """
    print(f"\n{'#'*60}")
    print(f"üè• AUTOMATED MODEL DIAGNOSIS")
    print(f"{'#'*60}")
    
    # 1. Generalization Gap
    gap = train_metrics['micro_f1'] - test_metrics['micro_f1']
    print(f"üìâ Overall Overfitting Gap: {gap:.4f}")
    
    if gap > 0.10:
        print("‚ö†Ô∏è  High Overfitting detected overall.")
    else:
        print("‚úÖ  Good Generalization overall.")

    # 2. Per-Class Overfitting Analysis
    print(f"\nüîç Detailed Class Analysis (Train vs Test F1):")
    print(f"{'Category':<40} | {'Train':<7} | {'Test':<7} | {'Gap'}")
    print("-" * 75)
    
    train_scores = train_metrics['per_label_f1']
    test_scores = test_metrics['per_label_f1']
    
    for i, cat in enumerate(category_names):
        t_score = train_scores[i]
        v_score = test_scores[i]
        diff = t_score - v_score
        
        # Flag problematic classes
        status = ""
        if v_score < 0.4: status = "‚ùå POOR"
        elif diff > 0.2:  status = "‚ö†Ô∏è OVERFIT"
        
        print(f"{cat:<40} | {t_score:.4f}  | {v_score:.4f}  | {diff:+.4f} {status}")

# ============================================
# EXECUTE EVALUATION
# ============================================

# Ensure category names are sorted by ID (0..13)
sorted_categories = sorted(categories_id.keys(), key=lambda k: categories_id[k])

# 1. Evaluate Train (Pass sorted_categories to see per-class scores)
_, train_metrics = evaluate_set(model, X_train_svd, fixed_size_y_train, "Train", sorted_categories)

# 2. Evaluate Validation
_, val_metrics = evaluate_set(model, X_val_svd, fixed_size_y_val, "Validation", sorted_categories)

# 3. Evaluate Test
_, test_metrics = evaluate_set(model, X_test_svd, fixed_size_y_test, "Test", sorted_categories)

# 4. Final Comparison
print_final_diagnosis(train_metrics, val_metrics, test_metrics, sorted_categories)


üìä TRAIN SET EVALUATION
Generating predictions for Train...


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Predictions complete!

--- Overall Metrics ---
Subset Accuracy:     0.9767
Hamming Loss:        0.0019
Micro F1 Score:      0.9956
Macro F1 Score:      0.9869

--- Per-Label F1 Scores (Train) ---
Category                                 | F1 Score
-------------------------------------------------------
Politics & Government                    | 0.9961  
International Affairs & Defense          | 0.9944  
Law & Justice                            | 0.9602  
Economics & Finance                      | 0.9965  
Trade & Business                         | 0.9980  
Employment & Labor                       | 0.9766  
Social Affairs & Health                  | 0.9917  
Technology & Science                     | 0.9937  
Transportation                           | 0.9838  
Environment                              | 0.9840  
Agriculture & Food                       | 0.9985  
Energy & Resources                       | 0.9520  
Industry & Manufacturing                 | 0.9941  
Geography & Regional

In [29]:
# 1. Calculate Support (Instance Count) for each class per split
train_support = fixed_size_y_train.sum(axis=0)
val_support = fixed_size_y_val.sum(axis=0)
test_support = fixed_size_y_test.sum(axis=0)

# 2. Retrieve F1 Scores (Assuming metrics were captured in previous step)
train_f1 = train_metrics['per_label_f1']
val_f1 = val_metrics['per_label_f1']
test_f1 = test_metrics['per_label_f1']

# 3. Create a Consolidated DataFrame
df_comparison = pd.DataFrame({
    'Category': sorted_categories,
    
    # Train Data
    'Train_Count': train_support.astype(int),
    'Train_F1': train_f1,
    
    # Validation Data
    'Val_Count': val_support.astype(int),
    'Val_F1': val_f1,
    
    # Test Data
    'Test_Count': test_support.astype(int),
    'Test_F1': test_f1
})

# 4. Calculate Percentage Representation 
total_train = fixed_size_y_train.shape[0]
df_comparison['Train_Freq_%'] = (df_comparison['Train_Count'] / total_train) * 100

# Reorder columns for readability
cols = ['Category', 'Train_Freq_%', 
        'Train_Count', 'Train_F1', 
        'Val_Count', 'Val_F1', 
        'Test_Count', 'Test_F1']
df_comparison = df_comparison[cols]

# 5. Display the DataFrame sorted by Test Count 
print("\n" + "="*80)
print("üìä CLASS FREQUENCY vs. PERFORMANCE ANALYSIS")
print("="*80)
print(df_comparison.sort_values(by='Test_Count', ascending=False).to_string(index=False, float_format="%.4f"))

# 6. Statistical Correlation Analysis
print("\n" + "="*80)
print("üìâ CORRELATION ANALYSIS (Count vs. F1 Score)")
print("="*80)
corr_train = df_comparison['Train_Count'].corr(df_comparison['Train_F1'])
corr_val = df_comparison['Val_Count'].corr(df_comparison['Val_F1'])
corr_test = df_comparison['Test_Count'].corr(df_comparison['Test_F1'])

print(f"Correlation (Train): {corr_train:.4f}  (High +ve means frequent classes score better)")
print(f"Correlation (Val):   {corr_val:.4f}")
print(f"Correlation (Test):  {corr_test:.4f}")

if corr_test > 0.5:
    print("‚ö†Ô∏è  STRONG BIAS: The model significantly favors frequent classes.")
elif corr_test > 0.2:
    print("‚ö†Ô∏è  MODERATE BIAS: Frequent classes tend to perform better.")
else:
    print("‚úÖ  LOW BIAS: The model handles rare classes relatively well.")


üìä CLASS FREQUENCY vs. PERFORMANCE ANALYSIS
                       Category  Train_Freq_%  Train_Count  Train_F1  Val_Count  Val_F1  Test_Count  Test_F1
               Trade & Business       60.9800        33539    0.9980       2924  0.8522        2994   0.7853
             Agriculture & Food       67.7364        37255    0.9985       2958  0.9762        2513   0.9616
           Geography & Regional       42.4473        23346    0.9975       2105  0.8909        2260   0.8749
          Politics & Government       26.3255        14479    0.9961       1726  0.7610        1516   0.7153
International Affairs & Defense       16.6636         9165    0.9944       1163  0.7464        1315   0.7576
            Economics & Finance       25.8200        14201    0.9965       1530  0.7275        1221   0.5515
           Technology & Science       12.8618         7074    0.9937        809  0.6339         935   0.6793
       Industry & Manufacturing       12.4436         6844    0.9941        644  

## First approach to address the imbalanced data issue: Class weighting 

In [30]:
# Calculate weight for each label: Total / (Num_Classes * Count)
label_counts = fixed_size_y_train.sum(axis=0)
class_weights = len(fixed_size_y_train) / (len(label_counts) * label_counts)

# Assign the highest weight of any label present in the sample
# If a doc has "Politics" (freq) and "Labor" (rare), it gets the "Labor" weight.
sample_weights = []
for row in fixed_size_y_train:
    indices = np.where(row == 1)[0]
    if len(indices) > 0:
        weight = np.max(class_weights[indices]) # Weight by the rarest label in the doc
    else:
        weight = 1.0
    sample_weights.append(weight)

# Fit with weights
model.fit(X_train_svd, fixed_size_y_train, sample_weight=sample_weights)

In [31]:

# Ensure category names are sorted by ID (0..13)
sorted_categories = sorted(categories_id.keys(), key=lambda k: categories_id[k])

# 1. Evaluate Train (Pass sorted_categories to see per-class scores)
_, train_metrics = evaluate_set(model, X_train_svd, fixed_size_y_train, "Train", sorted_categories)

# 2. Evaluate Validation
_, val_metrics = evaluate_set(model, X_val_svd, fixed_size_y_val, "Validation", sorted_categories)

# 3. Evaluate Test
_, test_metrics = evaluate_set(model, X_test_svd, fixed_size_y_test, "Test", sorted_categories)

# 4. Final Comparison
print_final_diagnosis(train_metrics, val_metrics, test_metrics, sorted_categories)


üìä TRAIN SET EVALUATION
Generating predictions for Train...
Predictions complete!

--- Overall Metrics ---
Subset Accuracy:     0.8312
Hamming Loss:        0.0144
Micro F1 Score:      0.9659
Macro F1 Score:      0.9613

--- Per-Label F1 Scores (Train) ---
Category                                 | F1 Score
-------------------------------------------------------
Politics & Government                    | 0.9475  
International Affairs & Defense          | 0.9345  
Law & Justice                            | 0.9620  
Economics & Finance                      | 0.9449  
Trade & Business                         | 0.9790  
Employment & Labor                       | 0.9722  
Social Affairs & Health                  | 0.9378  
Technology & Science                     | 0.9569  
Transportation                           | 0.9713  
Environment                              | 0.9756  
Agriculture & Food                       | 0.9860  
Energy & Resources                       | 0.9800  
Industry 

In [32]:
booster = model.get_booster()


trees_df = booster.trees_to_dataframe()

n_trees = trees_df['Tree'].max() + 1
n_nodes = len(trees_df)
n_leaves = len(trees_df[trees_df['Feature'] == 'Leaf'])
n_splits = n_nodes - n_leaves

print("="*40)
print("üìä COMPLEXIT√â DU MOD√àLE XGBOOST")
print("="*40)
print(f"Nombre total d'arbres :    {n_trees}")
print(f"Nombre total de n≈ìuds :    {n_nodes} (Param√®tres structurels)")
print(f"Nombre total de feuilles : {n_leaves} (Poids w appris)")
print(f"Nombre de d√©cisions (splits): {n_splits}")
print("-" * 40)
print(f"Moyenne de feuilles par arbre : {n_leaves / n_trees:.1f}")
print("="*40)

üìä COMPLEXIT√â DU MOD√àLE XGBOOST
Nombre total d'arbres :    7000
Nombre total de n≈ìuds :    838744 (Param√®tres structurels)
Nombre total de feuilles : 422872 (Poids w appris)
Nombre de d√©cisions (splits): 415872
----------------------------------------
Moyenne de feuilles par arbre : 60.4


In [33]:
filename = "mon_modele_xgboost.pkl"
joblib.dump(model, filename)

size_bytes = os.path.getsize(filename)
size_mb = size_bytes / (1024 * 1024)

print(f"üíæ Taille physique du mod√®le sur disque : {size_mb:.2f} Mo")

if size_mb < 100:
    print("‚úÖ Mod√®le l√©ger (Facile √† d√©ployer sur des petits serveurs/CPU)")
else:
    print("‚ö†Ô∏è Mod√®le lourd (N√©cessite beaucoup de RAM)")

üíæ Taille physique du mod√®le sur disque : 31.60 Mo
‚úÖ Mod√®le l√©ger (Facile √† d√©ployer sur des petits serveurs/CPU)


## Second approach : data augmentation 

In [69]:
# ==========================================
# 1. DEFINE AUGMENTATION FUNCTION
# ==========================================
def balance_multilabel_data(X, y, min_samples=3000):
    """
    Iterates through each class. If a class has fewer than 'min_samples',
    it randomly duplicates samples containing that label until the count is met.
    """
    print(f"üîÑ Starting Augmentation (Target: {min_samples} samples per class)...")
    
    # We work with copies to avoid messing up original data
    X_aug = X.copy()
    y_aug = y.copy()
    
    label_counts = y.sum(axis=0)
    n_classes = y.shape[1]
    
    # Check which classes need help
    rare_classes = [i for i in range(n_classes) if label_counts[i] < min_samples]
    
    if not rare_classes:
        print("‚úÖ No classes are below the threshold. No augmentation needed.")
        return X, y

    print(f"   Found {len(rare_classes)} rare classes needing augmentation.")

    for label_idx in rare_classes:
        current_count = y_aug[:, label_idx].sum() # Recalculate as we grow the dataset
        needed = min_samples - current_count
        
        if needed <= 0:
            continue
            
        print(f"   - Class {label_idx} ({id_code_mapping[label_idx] if 'id_code_mapping' in globals() else 'ID '+str(label_idx)}): Adding {int(needed)} samples...")
        
        # Find indices of samples that have this specific label
        # (Note: These samples might ALSO have frequent labels, that's okay)
        rare_indices = np.where(y_aug[:, label_idx] == 1)[0]
        
        # If the class is empty in training (shouldn't happen), skip
        if len(rare_indices) == 0:
            continue
            
        # Randomly sample with replacement
        new_indices = resample(rare_indices, n_samples=int(needed), replace=True, random_state=42)
        
        # Add the new samples to the big pile
        X_new = X_aug[new_indices]
        y_new = y_aug[new_indices]
        
        X_aug = np.vstack((X_aug, X_new))
        y_aug = np.vstack((y_aug, y_new))
        
    print(f"‚úÖ Augmentation Complete.")
    print(f"   Original Size: {X.shape[0]}")
    print(f"   New Size:      {X_aug.shape[0]} (+{X_aug.shape[0] - X.shape[0]} samples)")
    
    return X_aug, y_aug

# ==========================================
# 2. APPLY AUGMENTATION
# ==========================================
# Threshold: 2000 ensures even the smallest class (150 samples) gets a 10x boost
X_train_aug, y_train_aug = balance_multilabel_data(X_train_svd, fixed_size_y_train, min_samples=2000)

# ==========================================
# 3. RETRAIN MODEL ON AUGMENTED DATA
# ==========================================
print("\nüöÄ Retraining XGBoost on Augmented Data...")

# Re-initialize model with optimized parameters
model_aug = XGBClassifier(
    n_estimators=300,        # Good number for augmented data
    max_depth=10,            
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    
    # GPU Parameters
    tree_method="hist",
    device="cuda",
    
    # Regularization 
    reg_alpha=10,
    reg_lambda=10,
    min_child_weight=3,
    
    random_state=42
)

model_aug.fit(X_train_aug, y_train_aug)
print("Training Complete!")

# ==========================================
# 4. EVALUATE IMPROVEMENT
# ==========================================
# We use the evaluate_set function you defined earlier
# IMPORTANT: Test on the ORIGINAL Test set, never augment the Test set!
y_pred_aug, metrics_aug = evaluate_set(model_aug, X_test_svd, fixed_size_y_test, "Test (Augmented)", sorted_categories)

# Print Final Comparison
print("\n" + "="*50)
print("üîÑ IMPACT OF AUGMENTATION (Test Set F1)")
print("="*50)
# Compare old metrics (test_metrics) vs new metrics (metrics_aug)
if 'test_metrics' in globals():
    gap = metrics_aug['micro_f1'] - test_metrics['micro_f1']
    print(f"Original Micro F1:  {test_metrics['micro_f1']:.4f}")
    print(f"Augmented Micro F1: {metrics_aug['micro_f1']:.4f}")
    print(f"Improvement:        {gap:+.4f}")
    if gap > 0:
        print("‚úÖ Augmentation improved the model!")
    else:
        print("‚ö†Ô∏è No overall improvement (Common if duplicates cause overfitting). Check Macro F1.")

üîÑ Starting Augmentation (Target: 2000 samples per class)...
   Found 3 rare classes needing augmentation.
   - Class 2 (100169): Adding 682 samples...
   - Class 5 (100172): Adding 113 samples...
   - Class 11 (100179): Adding 880 samples...
‚úÖ Augmentation Complete.
   Original Size: 55000
   New Size:      56675 (+1675 samples)

üöÄ Retraining XGBoost on Augmented Data...
Training Complete!

üìä TEST (AUGMENTED) SET EVALUATION
Generating predictions for Test (Augmented)...
Predictions complete!

--- Overall Metrics ---
Subset Accuracy:     0.2566
Hamming Loss:        0.0916
Micro F1 Score:      0.7841
Macro F1 Score:      0.6614

--- Per-Label F1 Scores (Test (Augmented)) ---
Category                                 | F1 Score
-------------------------------------------------------
Politics & Government                    | 0.7013  
International Affairs & Defense          | 0.7495  
Law & Justice                            | 0.2876 ‚ö†Ô∏è
Economics & Finance                   

## current architecture process every class as an independent one, what is not the case, the labels can well correlate between each other 

In [72]:


# 1. Use Classifier Chains to model Label Correlations
# This respects that "Politics" and "International Relations" are related.
chain_model = ClassifierChain(
    XGBClassifier(
        n_estimators=500,
        max_depth=15,             
        learning_rate=0.05,
        objective="binary:logistic",
        tree_method="hist",
        device="cuda",
        
        # Regularization is still key
        reg_alpha=10,
        reg_lambda=10,
        min_child_weight=3,
        random_state=42
    ),
    order='random',
    random_state=42
)

print("üîó Training Classifier Chain (Learning Label Dependencies)...")
chain_model.fit(X_train_svd, fixed_size_y_train)
print("Training Complete!")

# 2. Predict Probabilities instead of hard 0/1
# We need probabilities to tune the thresholds later
y_prob_test = chain_model.predict_proba(X_test_svd)

# 3. Apply "Task-Aware" Thresholds
# Instead of 0.5 for everything, we can be more lenient for rare classes
thresholds = [0.5] * 14


# 'Law & Justice' was weak, so we lower the bar.
thresholds[2] = 0.35  
thresholds[5] = 0.35  # Employment & Labor

print("üéØ Applying Per-Class Thresholds...")
y_pred_optimized = []
for sample_probs in y_prob_test:
    # Apply threshold i to class i
    row_pred = [1 if prob >= thresh else 0 for prob, thresh in zip(sample_probs, thresholds)]
    y_pred_optimized.append(row_pred)

y_pred_optimized = np.array(y_pred_optimized)

# 4. Evaluate



# Ensure categories are sorted correctly
sorted_categories = sorted(categories_id.keys(), key=lambda k: categories_id[k])

def evaluate_chain_performance(y_true, y_pred, category_names):
    """
    Comprehensive evaluation for the Classifier Chain model.
    Highlights the performance of classes where we lowered the threshold.
    """
    print("\n" + "="*60)
    print("üîó CLASSIFIER CHAIN + OPTIMIZED THRESHOLDS EVALUATION")
    print("="*60)

    # 1. Overall Metrics
    subset_acc = accuracy_score(y_true, y_pred)
    h_loss = hamming_loss(y_true, y_pred)
    micro_f1 = f1_score(y_true, y_pred, average='micro')
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    
    print(f"Subset Accuracy:     {subset_acc:.4f}  (Strict Exact Match)")
    print(f"Hamming Loss:        {h_loss:.4f}")
    print(f"Micro F1 Score:      {micro_f1:.4f}  (Overall Performance)")
    print(f"Macro F1 Score:      {macro_f1:.4f}  (Average per Class - WATCH THIS)")

    # 2. Per-Class Analysis with Threshold Tracking
    print(f"\nüîç Detailed Per-Class Performance:")
    print(f"{'Category':<40} | {'F1 Score'} | {'Threshold Used'}")
    print("-" * 75)

    per_label_f1 = f1_score(y_true, y_pred, average=None)
    
    # Reconstruct the thresholds list for display purposes
    display_thresholds = [0.5] * 14
    display_thresholds[2] = 0.35 # Law & Justice
    display_thresholds[5] = 0.35 # Employment & Labor

    for i, (cat, f1) in enumerate(zip(category_names, per_label_f1)):
        thresh = display_thresholds[i]
        
        # Add visual markers
        thresh_marker = "üîß LOW" if thresh < 0.5 else "  STD"
        score_marker = "‚ö†Ô∏è" if f1 < 0.4 else "‚úÖ" if f1 > 0.7 else " "
        
        print(f"{cat:<40} | {f1:.4f} {score_marker} | {thresh:.2f} {thresh_marker}")

    return micro_f1, macro_f1

# ==========================================
# EXECUTE EVALUATION
# ==========================================
chain_micro, chain_macro = evaluate_chain_performance(
    fixed_size_y_test, 
    y_pred_optimized, 
    sorted_categories
)

print("\n" + "#"*60)
print("üßê FINAL ANALYSIS")
print("#"*60)
print("1. Did Classifier Chains help?")
print("   - Check if 'Subset Accuracy' is higher than your previous ~0.2616.")
print("   - Chains are designed to fix inconsistent label combinations.")

print("\n2. Did Lower Thresholds help Weak Classes?")
print("   - Look at 'Law & Justice' and 'Employment & Labor'.")
print("   - If their F1 is still < 0.4, try lowering thresholds further (e.g., to 0.25).")

üîó Training Classifier Chain (Learning Label Dependencies)...
Training Complete!
üéØ Applying Per-Class Thresholds...

üîó CLASSIFIER CHAIN + OPTIMIZED THRESHOLDS EVALUATION
Subset Accuracy:     0.2558  (Strict Exact Match)
Hamming Loss:        0.0984
Micro F1 Score:      0.7695  (Overall Performance)
Macro F1 Score:      0.6556  (Average per Class - WATCH THIS)

üîç Detailed Per-Class Performance:
Category                                 | F1 Score | Threshold Used
---------------------------------------------------------------------------
Politics & Government                    | 0.6797   | 0.50   STD
International Affairs & Defense          | 0.7448 ‚úÖ | 0.50   STD
Law & Justice                            | 0.3736 ‚ö†Ô∏è | 0.35 üîß LOW
Economics & Finance                      | 0.6506   | 0.50   STD
Trade & Business                         | 0.8633 ‚úÖ | 0.50   STD
Employment & Labor                       | 0.4211   | 0.35 üîß LOW
Social Affairs & Health                  | 

In [39]:
# This will download GloVe 
# It only downloads ONCE, then caches it
print("üîÑ Downloading GloVe model... (this may take 2-3 minutes)")
print("   Don't worry, this only happens once!")

glove_model = api.load('glove-wiki-gigaword-300') 

print("\n‚úÖ Download complete!")
print(f"   Model has {len(glove_model):,} words in vocabulary")
print(f"   Each word is represented by {glove_model.vector_size} numbers")

üîÑ Downloading GloVe model... (this may take 2-3 minutes)
   Don't worry, this only happens once!
‚úÖ Download complete!
   Model has 400,000 words in vocabulary
   Each word is represented by 300 numbers


In [41]:
EMBEDDING_DIM = glove_model.vector_size

def text_to_glove(text, model):
    words = text.lower().split()
    vectors = [model[w] for w in words if w in model]

    if not vectors:
        return np.zeros(EMBEDDING_DIM)

    return np.mean(vectors, axis=0)

In [42]:
X_train_glove = np.vstack([
    text_to_glove(text, glove_model)
    for text in X_train
])

X_val_glove = np.vstack([
    text_to_glove(text, glove_model)
    for text in X_val
])

X_test_glove = np.vstack([
    text_to_glove(text, glove_model)
    for text in X_test
])

In [46]:
print("Train:", X_train_glove.shape)
print("Val  :", X_val_glove.shape)
print("Test :", X_test_glove.shape)

Train: (55000, 300)
Val  : (5000, 300)
Test : (5000, 300)


In [56]:
model = XGBClassifier(
    n_estimators=300,       
    max_depth=6,             
    learning_rate=0.05,      
    subsample=0.7,           
    colsample_bytree=0.7,   

    reg_alpha=0.1,           
    reg_lambda=1.5,          
    min_child_weight=3,    

    # GPU Params 
    device="cuda",
    tree_method="hist", 
    objective="binary:logistic",
    random_state=42
)

#model = ClassifierChain(base_model, order='random', random_state=42)
model.fit(X_train_glove, fixed_size_y_train)

In [57]:
print("="*100)
print("üéØ COMPREHENSIVE EVALUATION: GloVe Embeddings")
print("="*100)

print(f"\nüìä Dataset Shapes:")
print(f"   Train: {X_train_glove.shape}")
print(f"   Val  : {X_val_glove.shape}")
print(f"   Test : {X_test_glove.shape}")

# ============================================
# GENERATE PREDICTIONS FOR ALL SETS
# ============================================
print("\nüîÑ Generating predictions for all sets...")

y_train_pred = model.predict(X_train_glove)
y_val_pred = model.predict(X_val_glove)
y_test_pred = model.predict(X_test_glove)

print("‚úÖ Predictions complete!")

# ============================================
# EVALUATION FUNCTION
# ============================================
def evaluate_set(y_true, y_pred, set_name):
    """Comprehensive evaluation metrics"""
    
    print("\n" + "="*100)
    print(f"üìà {set_name.upper()} SET EVALUATION")
    print("="*100)
    
    # ============================================
    # OVERALL METRICS
    # ============================================
    print(f"\nüìä Overall Metrics:")
    print("-"*100)
    
    # Subset accuracy (exact match)
    subset_acc = accuracy_score(y_true, y_pred)
    
    # Hamming loss
    hamming = hamming_loss(y_true, y_pred)
    
    # F1 Scores
    micro_f1 = f1_score(y_true, y_pred, average='micro', zero_division=0)
    macro_f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)
    weighted_f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    samples_f1 = f1_score(y_true, y_pred, average='samples', zero_division=0)
    
    # Precision
    micro_precision = precision_score(y_true, y_pred, average='micro', zero_division=0)
    macro_precision = precision_score(y_true, y_pred, average='macro', zero_division=0)
    weighted_precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)
    
    # Recall
    micro_recall = recall_score(y_true, y_pred, average='micro', zero_division=0)
    macro_recall = recall_score(y_true, y_pred, average='macro', zero_division=0)
    weighted_recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    
    # Jaccard
    jaccard_samples = jaccard_score(y_true, y_pred, average='samples', zero_division=0)
    jaccard_macro = jaccard_score(y_true, y_pred, average='macro', zero_division=0)
    jaccard_micro = jaccard_score(y_true, y_pred, average='micro', zero_division=0)
    
    # Print metrics table
    print(f"{'Metric':<30} | {'Micro':<12} | {'Macro':<12} | {'Weighted':<12} | {'Samples':<12}")
    print("-"*100)
    print(f"{'F1 Score':<30} | {micro_f1:<12.4f} | {macro_f1:<12.4f} | {weighted_f1:<12.4f} | {samples_f1:<12.4f}")
    print(f"{'Precision':<30} | {micro_precision:<12.4f} | {macro_precision:<12.4f} | {weighted_precision:<12.4f} | {'-':<12}")
    print(f"{'Recall':<30} | {micro_recall:<12.4f} | {macro_recall:<12.4f} | {weighted_recall:<12.4f} | {'-':<12}")
    print(f"{'Jaccard Score':<30} | {jaccard_micro:<12.4f} | {jaccard_macro:<12.4f} | {'-':<12} | {jaccard_samples:<12.4f}")
    
    print("\n" + "-"*100)
    print(f"{'Subset Accuracy (Exact Match)':<30} | {subset_acc:<12.4f}")
    print(f"{'Hamming Loss':<30} | {hamming:<12.4f}")
    
    # Label-level statistics
    avg_true_labels = y_true.sum(axis=1).mean()
    avg_pred_labels = y_pred.sum(axis=1).mean()
    std_true_labels = y_true.sum(axis=1).std()
    std_pred_labels = y_pred.sum(axis=1).std()
    
    print(f"\nüìä Label Statistics:")
    print("-"*100)
    print(f"Average labels per sample (True):     {avg_true_labels:.2f} ¬± {std_true_labels:.2f}")
    print(f"Average labels per sample (Predicted): {avg_pred_labels:.2f} ¬± {std_pred_labels:.2f}")
    
    # ============================================
    # PER-CLASS METRICS
    # ============================================
    print(f"\nüéØ Per-Class Detailed Metrics:")
    print("-"*100)
    
    category_names = list(categories_id.keys())
    
    per_class_f1 = f1_score(y_true, y_pred, average=None, zero_division=0)
    per_class_precision = precision_score(y_true, y_pred, average=None, zero_division=0)
    per_class_recall = recall_score(y_true, y_pred, average=None, zero_division=0)
    per_class_jaccard = jaccard_score(y_true, y_pred, average=None, zero_division=0)
    
    # Support
    support_true = y_true.sum(axis=0)
    support_pred = y_pred.sum(axis=0)
    
    print(f"{'Category':<40} | {'Support':<15} | {'Precision':<10} | {'Recall':<10} | {'F1':<10} | {'Jaccard':<10}")
    print(f"{'':40} | {'True':<7}/{' Pred':<7} |")
    print("-"*115)
    
    for i, cat_name in enumerate(category_names):
        print(f"{cat_name:<40} | {int(support_true[i]):>5} / {int(support_pred[i]):>5}   | "
              f"{per_class_precision[i]:<10.4f} | {per_class_recall[i]:<10.4f} | "
              f"{per_class_f1[i]:<10.4f} | {per_class_jaccard[i]:<10.4f}")
    
    # ============================================
    # PERFORMANCE CATEGORIES
    # ============================================
    print(f"\nüìã Performance Summary:")
    print("-"*100)
    
    excellent = sum(per_class_f1 >= 0.8)
    good = sum((per_class_f1 >= 0.6) & (per_class_f1 < 0.8))
    moderate = sum((per_class_f1 >= 0.4) & (per_class_f1 < 0.6))
    poor = sum(per_class_f1 < 0.4)
    
    print(f"üåü Excellent (F1 ‚â• 0.8):    {excellent:2d}/14 classes")
    print(f"‚úÖ Good (0.6 ‚â§ F1 < 0.8):   {good:2d}/14 classes")
    print(f"‚ö†Ô∏è  Moderate (0.4 ‚â§ F1 < 0.6): {moderate:2d}/14 classes")
    print(f"‚ùå Poor (F1 < 0.4):         {poor:2d}/14 classes")
    
    # Return metrics for comparison
    return {
        'set_name': set_name,
        'subset_accuracy': subset_acc,
        'hamming_loss': hamming,
        'micro_f1': micro_f1,
        'macro_f1': macro_f1,
        'weighted_f1': weighted_f1,
        'samples_f1': samples_f1,
        'micro_precision': micro_precision,
        'macro_precision': macro_precision,
        'micro_recall': micro_recall,
        'macro_recall': macro_recall,
        'jaccard_samples': jaccard_samples,
        'per_class_f1': per_class_f1
    }

# ============================================
# EVALUATE ALL SETS
# ============================================
train_results = evaluate_set(fixed_size_y_train, y_train_pred, "Training")
val_results = evaluate_set(fixed_size_y_val, y_val_pred, "Validation")
test_results = evaluate_set(fixed_size_y_test, y_test_pred, "Test")

# ============================================
# CROSS-SET COMPARISON
# ============================================
print("\n" + "="*100)
print("üîç CROSS-SET COMPARISON")
print("="*100)

comparison_metrics = [
    ('Subset Accuracy', 'subset_accuracy'),
    ('Hamming Loss', 'hamming_loss'),
    ('Micro F1', 'micro_f1'),
    ('Macro F1', 'macro_f1'),
    ('Weighted F1', 'weighted_f1'),
    ('Samples F1', 'samples_f1'),
    ('Micro Precision', 'micro_precision'),
    ('Macro Precision', 'macro_precision'),
    ('Micro Recall', 'micro_recall'),
    ('Macro Recall', 'macro_recall'),
    ('Jaccard (Samples)', 'jaccard_samples'),
]

print(f"\n{'Metric':<25} | {'Training':<12} | {'Validation':<12} | {'Test':<12} | {'Train-Val':<12} | {'Val-Test':<12}")
print("-"*100)

for metric_name, metric_key in comparison_metrics:
    train_val = train_results[metric_key]
    val_val = val_results[metric_key]
    test_val = test_results[metric_key]
    
    train_val_gap = abs(train_val - val_val)
    val_test_gap = abs(val_val - test_val)
    
    print(f"{metric_name:<25} | {train_val:<12.4f} | {val_val:<12.4f} | {test_val:<12.4f} | "
          f"{train_val_gap:<12.4f} | {val_test_gap:<12.4f}")

# ============================================
# OVERFITTING CHECK
# ============================================
print("\n" + "="*100)
print("‚ö†Ô∏è  OVERFITTING ANALYSIS")
print("="*100)

train_test_gap = train_results['macro_f1'] - test_results['macro_f1']
train_val_gap = train_results['macro_f1'] - val_results['macro_f1']

print(f"\nMacro F1 Gaps:")
print(f"  Train - Validation: {train_val_gap:+.4f}")
print(f"  Train - Test:       {train_test_gap:+.4f}")
print(f"  Validation - Test:  {val_results['macro_f1'] - test_results['macro_f1']:+.4f}")

if train_test_gap > 0.10:
    print(f"\nüî¥ SEVERE OVERFITTING DETECTED!")
    print(f"   Model performs {train_test_gap:.1%} better on training than test")
    print(f"   Recommendation: Increase regularization")
elif train_test_gap > 0.05:
    print(f"\n‚ö†Ô∏è  MODERATE OVERFITTING")
    print(f"   Model performs {train_test_gap:.1%} better on training than test")
    print(f"   Recommendation: Consider adding regularization")
elif train_test_gap < -0.05:
    print(f"\n‚ö†Ô∏è  UNDERFITTING")
    print(f"   Test performance exceeds training by {abs(train_test_gap):.1%}")
    print(f"   Recommendation: Increase model complexity")
else:
    print(f"\n‚úÖ GOOD GENERALIZATION")
    print(f"   Train-Test gap is reasonable ({train_test_gap:+.1%})")

# ============================================
# PER-CLASS COMPARISON ACROSS SETS
# ============================================
print("\n" + "="*100)
print("üìä PER-CLASS F1 COMPARISON ACROSS SETS")
print("="*100)

category_names = list(categories_id.keys())

print(f"\n{'Category':<40} | {'Train':<10} | {'Val':<10} | {'Test':<10} | {'Best':<6} | {'Trend'}")
print("-"*100)

for i, cat_name in enumerate(category_names):
    train_f1 = train_results['per_class_f1'][i]
    val_f1 = val_results['per_class_f1'][i]
    test_f1 = test_results['per_class_f1'][i]
    
    best = max(train_f1, val_f1, test_f1)
    best_set = 'Train' if train_f1 == best else 'Val' if val_f1 == best else 'Test'
    
    # Determine trend
    if test_f1 > val_f1 > train_f1:
        trend = "üìà Improving"
    elif test_f1 < val_f1 < train_f1:
        trend = "üìâ Declining"
    elif abs(test_f1 - val_f1) < 0.02:
        trend = "‚û°Ô∏è Stable"
    else:
        trend = "üîÑ Mixed"
    
    print(f"{cat_name:<40} | {train_f1:<10.4f} | {val_f1:<10.4f} | {test_f1:<10.4f} | {best_set:<6} | {trend}")

# ============================================
# FINAL SUMMARY
# ============================================
print("\n" + "="*100)
print("‚úÖ EVALUATION COMPLETE")
print("="*100)

print(f"\nüéØ Key Takeaways:")
print(f"   Test Macro F1:       {test_results['macro_f1']:.4f}")
print(f"   Test Subset Accuracy: {test_results['subset_accuracy']:.4f}")
print(f"   Generalization Gap:  {train_test_gap:+.4f}")
print(f"   Classes with F1>0.8: {sum(test_results['per_class_f1'] >= 0.8)}/14")

üéØ COMPREHENSIVE EVALUATION: GloVe Embeddings

üìä Dataset Shapes:
   Train: (55000, 300)
   Val  : (5000, 300)
   Test : (5000, 300)

üîÑ Generating predictions for all sets...
‚úÖ Predictions complete!

üìà TRAINING SET EVALUATION

üìä Overall Metrics:
----------------------------------------------------------------------------------------------------
Metric                         | Micro        | Macro        | Weighted     | Samples     
----------------------------------------------------------------------------------------------------
F1 Score                       | 0.9249       | 0.9080       | 0.9228       | 0.9233      
Precision                      | 0.9522       | 0.9595       | 0.9515       | -           
Recall                         | 0.8991       | 0.8650       | 0.8991       | -           
Jaccard Score                  | 0.8603       | 0.8360       | -            | 0.8802      

---------------------------------------------------------------------------------