In [9]:
import accelerate, transformers
print("accelerate:", accelerate.__version__)
print("transformers:", transformers.__version__)

accelerate: 1.12.0
transformers: 4.57.3


In [23]:
import pandas as pd
import numpy as np
import json
import torch , os
import matplotlib.pyplot as plt
import seaborn as sns
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef
from transformers import ( RobertaTokenizerFast, RobertaForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer )

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cpu


### ÎNCĂRCARE DATE PROCESATE

 1. Încărcare DataFrames
 2. Încărcare Label Mappings
 3. Încărcare Config
 4. Tokenizer
 5. Dataset Class
 6. Creare Datasets
 7. Data Collator


In [None]:
train_df_oversampled = pd.read_csv('data/processed/train_oversampled.csv')
val_df = pd.read_csv('data/processed/val.csv')
test_df = pd.read_csv('data/processed/test.csv')

print("DONE -- data uploaded ")
print(f"   Train: {len(train_df_oversampled):,}")
print(f"   Val:   {len(val_df):,}")
print(f"   Test:  {len(test_df):,}")

with open('data/processed/label_mappings.json', 'r') as f:
    mappings = json.load(f)

label_to_id = mappings['label_to_id']
id_to_label = {int(k): v for k, v in mappings['id_to_label'].items()}
label_list = mappings['label_list']
NUMBER_OF_LABELS = mappings['num_labels']

print(f"DONE -- Labels: {label_list} -- ")

with open('data/processed/config.json', 'r') as f:
    config = json.load(f)

MODEL_NAME = config['model_name']
MAX_LENGTH = config['max_length']
BATCH_SIZE = config['batch_size']

print(f"DONE -- Config: model={MODEL_NAME}, max_len={MAX_LENGTH}, batch={BATCH_SIZE} -- ")

tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)
print(f"DONE -- Tokenizer încărcat -- ")

class EmotionDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.texts = dataframe['text'].tolist()
        self.labels = dataframe['label_id'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer( text, truncation=True, max_length=self.max_length, return_tensors=None )
        
        return { 'input_ids': encoding['input_ids'], 'attention_mask': encoding['attention_mask'], 'labels': label }


train_dataset = EmotionDataset(train_df_oversampled, tokenizer, MAX_LENGTH)
val_dataset = EmotionDataset(val_df, tokenizer, MAX_LENGTH)
test_dataset = EmotionDataset(test_df, tokenizer, MAX_LENGTH)

print(f"DONE -- Datasets create:")
print(f"   Train: {len(train_dataset):,}")
print(f"   Val:   {len(val_dataset):,}")
print(f"   Test:  {len(test_dataset):,}")


data_collator = DataCollatorWithPadding( tokenizer=tokenizer, padding=True, return_tensors='pt' )
print(f"DONE -- DataCollator configurat")


DONE -- data uploaded 
   Train: 32,454
   Val:   2,000
   Test:  2,000
DONE -- Labels: ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise'] -- 
DONE -- Config: model=roberta-base, max_len=128, batch=16 -- 
 DONE -- Tokenizer încărcat -- 
 DONE -- Datasets create:
   Train: 32,454
   Val:   2,000
   Test:  2,000
DONE -- DataCollator configurat


In [28]:
BASELINE_CONFIGURATION = {
    'model_name': 'roberta-base',
    'learning_rate': 1e-5,
    'batch_size': 16,
    'num_epochs': 10,
    'dropout': 0.3,
    'weight_decay': 0.01,
    'warmup_ratio': 0.1,
    'max_length': 128,
    'num_labels': 6
}

device = "cuda" if torch.cuda.is_available() else "cpu"

id2label = {0: 'anger', 1: 'fear', 2: 'joy', 3: 'love', 4: 'sadness', 5: 'surprise'}
label2id = {v: k for k, v in id2label.items()}

os.makedirs('models/baseline_roberta', exist_ok=True)
os.makedirs('reports', exist_ok=True)

model = RobertaForSequenceClassification.from_pretrained(
    BASELINE_CONFIGURATION['model_name'],
    num_labels=BASELINE_CONFIGURATION['num_labels'],
    hidden_dropout_prob=BASELINE_CONFIGURATION['dropout'],
    attention_probs_dropout_prob=BASELINE_CONFIGURATION['dropout'],
    id2label=id2label,
    label2id=label2id
).to(device)

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"\nDONE -- Model încărcat pe {device}")
print(f"   Total parametri: {total_params:,}")
print(f"   Parametri antrenabili: {trainable_params:,}")



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



DONE -- Model încărcat pe cpu
   Total parametri: 124,650,246
   Parametri antrenabili: 124,650,246


In [24]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import numpy as np

def compute_metrics(eval_pred):
    """Calculează metrici pentru evaluare"""
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support( labels, predictions, average='weighted' )
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support( labels, predictions, average='macro' )
    mcc = matthews_corrcoef(labels, predictions)
    
    return { 'accuracy': accuracy, 'precision_weighted': precision, 'recall_weighted': recall, 'f1_weighted': f1,
        'precision_macro': precision_macro, 'recall_macro': recall_macro, 'f1_macro': f1_macro, 'mcc': mcc
    }


In [25]:
training_args = TrainingArguments(
    output_dir='./results/baseline_roberta',

    # Training params
    num_train_epochs=BASELINE_CONFIGURATION['num_epochs'],
    per_device_train_batch_size=BASELINE_CONFIGURATION['batch_size'],
    per_device_eval_batch_size=BASELINE_CONFIGURATION['batch_size'],
    learning_rate=BASELINE_CONFIGURATION['learning_rate'],
    weight_decay=BASELINE_CONFIGURATION['weight_decay'],
    warmup_ratio=BASELINE_CONFIGURATION['warmup_ratio'],

    # Evaluation & Saving
    eval_strategy='epoch',      
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1_weighted',
    greater_is_better=True,

    # Logging
    logging_dir='./logs/baseline_roberta',
    logging_steps=100,
    logging_first_step=True,

    # Optimizări
    use_cpu=True,
    fp16=False,
    dataloader_num_workers=0,   # Windows

    # Altele
    report_to='none',
    seed=42,

    # Salvare
    save_total_limit=2
)


In [26]:
trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics )

print(f"\nDONE -- Training info:")
print(f"   Train samples: {len(train_dataset):,}")
print(f"   Val samples: {len(val_dataset):,}")
print(f"   Batches per epoch: {len(train_dataset) // BASELINE_CONFIGURATION['batch_size']}")
print(f"   Total steps: {len(train_dataset) // BASELINE_CONFIGURATION['batch_size'] * BASELINE_CONFIGURATION['num_epochs']}")


DONE -- Training info:
   Train samples: 32,454
   Val samples: 2,000
   Batches per epoch: 2028
   Total steps: 20280


  trainer = Trainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset, tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics )


In [None]:
# Antrenare
print("\n" + "=" * 60)
print(" Antrenare BASELINE RoBERTa")
print("=" * 60)

print(f"   Device: {device}")
print(f"   Epochs: {BASELINE_CONFIGURATION['num_epochs']}")
print(f"   Batch size: {BASELINE_CONFIGURATION['batch_size']}")
print(f"   Learning rate: {BASELINE_CONFIGURATION['learning_rate']}")
print("=" * 60)

train_result = trainer.train()

print("\nDONE -- Antrenare completă!")
print(f"   Training Loss: {train_result.training_loss:.4f}")
print(f"   Training Time: {train_result.metrics['train_runtime']:.1f}s")

# Salvare Model
trainer.save_model('./models/baseline_roberta')
tokenizer.save_pretrained('./models/baseline_roberta')

print("DONE -- Model salvat în ./models/baseline_roberta/")

# Evaluare pe Validation Set
print("\n" + "=" * 60)
print("Evalare pe validation set")
print("=" * 60)
val_results = trainer.evaluate(val_dataset)

print("\nRezultate Validation:")
for key, value in val_results.items():
    if isinstance(value, float):
        print(f"   {key}: {value:.4f}")

# Evaluare pe Test Set
print("\n" + "=" * 60)
print("Evalare pe test set")
print("=" * 60)

test_results = trainer.evaluate(test_dataset)

print("\nRezultate Test:")
for key, value in test_results.items():
    if isinstance(value, float):
        print(f"   {key}: {value:.4f}")

# Predicții detaliate pe Test Set
print("\n" + "=" * 60)
print("Analiza detaliată pe test set")
print("=" * 60)

predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = np.array([test_dataset[i]['labels'] for i in range(len(test_dataset))])

# Classification Report
print("\nClassification Report:")
print("-" * 60)
print(classification_report(y_true, y_pred, target_names=label_list))