In [1]:
#%pip install transformers datasets torch scikit-learn pandas matplotlib seaborn imbalanced-learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import os 
import re

from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, matthews_corrcoef
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

# Check GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
def load_emotional_data(file_path, sep=';'):
    texts, labels = [], []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue

            parts = line.split(sep, 1)  
            if len(parts) != 2:
                continue

            texts.append(parts[0].strip())
            labels.append(parts[1].strip())

    df = pd.DataFrame({'text': texts, 'label_str': labels})

    label_map = {lab: i for i, lab in enumerate(sorted(df['label_str'].unique()))}
    df['label'] = df['label_str'].map(label_map).astype(int)

    return df, label_map

try:
    train_df, train_label_map = load_emotional_data(r"../database_used_by_paper/train.txt")
    val_df, _ = load_emotional_data(r"../database_used_by_paper/val.txt")
    test_df, _ = load_emotional_data(r"../database_used_by_paper/test.txt")

    dataframe = pd.concat([train_df, val_df, test_df], ignore_index=True)
    print("Data loaded successfully , having shape:", dataframe.shape)
except Exception as e:
    print("Error loading data:", e)


Data loaded successfully , having shape: (20000, 3)


In [None]:
print(f"\nTotal samples: {len(dataframe)}")
print(f"\nColoane: {dataframe.columns.tolist()}")
print(f"\nPrimele 5 exemple:")
print(dataframe.head())
print("\nVerificare valori lipsa pe coloane:")
print(dataframe.isnull().sum())


Total samples: 20000

Coloane: ['text', 'label_str', 'label']

Primele 5 exemple:
                                                text label_str  label
0                            i didnt feel humiliated   sadness      4
1  i can go from feeling so hopeless to so damned...   sadness      4
2   im grabbing a minute to post i feel greedy wrong     anger      0
3  i am ever feeling nostalgic about the fireplac...      love      3
4                               i am feeling grouchy     anger      0

Verificare valori lipsa pe coloane:
text         0
label_str    0
label        0
dtype: int64


In [None]:
print("=" * 50)
print("Distributia claselor din setul complet de date:")
print("=" * 50)

class_distribution = dataframe['label_str'].value_counts()
class_percentages = (class_distribution / len(dataframe)) * 100

for label, count in class_distribution.items():
    percentage = class_percentages[label]
    print(f"Clasa: {label} | NumÄƒr de exemple: {count} | Procentaj: {percentage:.2f}%")

dataframe['text_length'] = dataframe['text'].apply(len)
dataframe['word_count'] = dataframe['text'].apply(lambda x: len(x.split()))

print("\nStatistici despre lungimea textelor:")
print(dataframe[['text_length', 'word_count']].describe())


Distributia claselor din setul complet de date:
Clasa: joy | NumÄƒr de exemple: 6761 | Procentaj: 33.80%
Clasa: sadness | NumÄƒr de exemple: 5797 | Procentaj: 28.98%
Clasa: anger | NumÄƒr de exemple: 2709 | Procentaj: 13.54%
Clasa: fear | NumÄƒr de exemple: 2373 | Procentaj: 11.87%
Clasa: love | NumÄƒr de exemple: 1641 | Procentaj: 8.21%
Clasa: surprise | NumÄƒr de exemple: 719 | Procentaj: 3.60%

Statistici despre lungimea textelor:
        text_length    word_count
count  20000.000000  20000.000000
mean      96.670050     19.135050
std       55.777923     10.972016
min        7.000000      2.000000
25%       53.000000     11.000000
50%       86.000000     17.000000
75%      129.000000     25.000000
max      300.000000     66.000000


In [None]:
def clean_text(text):
    """
    CurÄƒÈ›Äƒ textul conform specificaÈ›iilor din paper:
    - EliminÄƒ URL-uri, mentions, caractere speciale
    - PÄƒstreazÄƒ punctuaÈ›ia emoÈ›ionalÄƒ (!, ?, ...)
    - NormalizeazÄƒ whitespace
    """
    if not isinstance(text, str):
        return ""
        
    # EliminÄƒm URL-uri
    text = re.sub(r'http\S+|www\.\S+', '', text)
    
    # EliminÄƒm mentions (@username)
    text = re.sub(r'@\w+', '', text)
    
    # EliminÄƒm hashtags dar pÄƒstreazÄƒ textul (ex: #happy -> happy)
    text = re.sub(r'#(\w+)', r'\1', text)
    
    # EliminÄƒm caractere speciale DAR pÄƒstreazÄƒ punctuaÈ›ia emoÈ›ionalÄƒ
    # PÄƒstrÄƒm: litere, cifre, spaÈ›ii, È™i punctuaÈ›ia emoÈ›ionalÄƒ (! ? . , ' ")
    text = re.sub(r"[^a-zA-Z0-9\s!?.,;:'\"\-]", '', text)
    
    # NormalizeazÄƒ punctuaÈ›ia repetatÄƒ (!!!! -> !!)
    text = re.sub(r'([!?.]){2,}', r'\1\1', text)
    
    # EliminÄƒm whitespace excesiv
    text = re.sub(r'\s+', ' ', text)
    
    # Strip leading/trailing whitespace
    text = text.strip()
    
    return text

test_texts = [
    "I'm SO happy!!! ðŸ˜Š https://example.com @friend #blessed",
    "This is    terrible...   I hate it!!!",
    "Why would you do that?? @someone #confused",
    "Normal text with some punctuation, like this.",
    "ANGRY!!! ðŸ˜¡ðŸ˜¡ðŸ˜¡ I can't believe this @#$%^"
]

print("Test Text Cleaning:")
print("-" * 60)
for text in test_texts:
    cleaned = clean_text(text)
    print(f"Original: {text}")
    print(f"Cleaned:  {cleaned}")
    print()

Test Text Cleaning:
------------------------------------------------------------
Original: I'm SO happy!!! ðŸ˜Š https://example.com @friend #blessed
Cleaned:  I'm SO happy!! blessed

Original: This is    terrible...   I hate it!!!
Cleaned:  This is terrible.. I hate it!!

Original: Why would you do that?? @someone #confused
Cleaned:  Why would you do that?? confused

Original: Normal text with some punctuation, like this.
Cleaned:  Normal text with some punctuation, like this.

Original: ANGRY!!! ðŸ˜¡ðŸ˜¡ðŸ˜¡ I can't believe this @#$%^
Cleaned:  ANGRY!! I can't believe this



In [None]:
print("Aplicare text cleaning pe dataset")

df = dataframe.copy()
df['text_original'] = df['text']  # PÄƒstrÄƒm originalul pentru referinÈ›Äƒ
df['text'] = df['text'].apply(clean_text)

# EliminÄƒ rÃ¢ndurile cu text gol dupÄƒ curÄƒÈ›are
empty_before = len(df)
df = df[df['text'].str.len() > 0].reset_index(drop=True)
empty_after = len(df)

print(f"DONE -- Text cleaning complet!")
print(f"   RÃ¢nduri eliminate (text gol): {empty_before - empty_after}")
print(f"   RÃ¢nduri rÄƒmase: {len(df)}")

Aplicare text cleaning pe dataset...
âœ… Text cleaning complet!
   RÃ¢nduri eliminate (text gol): 0
   RÃ¢nduri rÄƒmase: 20000


In [None]:
print("\nExemple Ã®nainte/dupÄƒ curÄƒÈ›are:")
print("=" * 70)

df['changed'] = df['text'] != df['text_original']
changed_examples = df[df['changed']].head(10)

for idx, row in changed_examples.iterrows():
    print(f"Original: {row['text_original'][:120]}...")
    print(f"Cleaned:  {row['text'][:120]}...")
    print(f"Label:    {row['label']}")
    print("-" * 70)


Exemple Ã®nainte/dupÄƒ curÄƒÈ›are:
Original: i was feeling very energetic yesterday i decided to start the a href https www...
Cleaned:  i was feeling very energetic yesterday i decided to start the a href www...
Label:    2
----------------------------------------------------------------------
Original: i feel so honoured to receive this from krista know to the blogger world as a href https www...
Cleaned:  i feel so honoured to receive this from krista know to the blogger world as a href www...
Label:    2
----------------------------------------------------------------------
Original: i got this very sexy latex outfit from their lucky chair it made me feel very naughty the hair is called hungover and it...
Cleaned:  i got this very sexy latex outfit from their lucky chair it made me feel very naughty the hair is called hungover and it...
Label:    3
----------------------------------------------------------------------
Original: im feeling generous lets make it a a href https www..

In [None]:
label_list = sorted(df['label_str'].unique())
label_to_id = { label: idx for idx, label in enumerate(label_list) }
id_to_label = { idx: label for label, idx in label_to_id.items() }
NUMBER_OF_LABELS = len(label_list)

print("Label Mapping:")
print("-" * 30)
for label, idx in label_to_id.items():
    count = len(df[df['label_str'] == label])
    print(f"  {idx}: {label:10s} ({count} samples)")

df['label_id'] = df['label_str'].map(label_to_id)

Label Mapping:
------------------------------
  0: anger      (2709 samples)
  1: fear       (2373 samples)
  2: joy        (6761 samples)
  3: love       (1641 samples)
  4: sadness    (5797 samples)
  5: surprise   (719 samples)


In [None]:
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df['label_id'], random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label_id'], random_state=42)

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

print(f"   Train: {len(train_df):,} ({len(train_df)/len(df)*100:.1f}%)")
print(f"   Val:   {len(val_df):,} ({len(val_df)/len(df)*100:.1f}%)")
print(f"   Test:  {len(test_df):,} ({len(test_df)/len(df)*100:.1f}%)")
print(f"   Total: {len(train_df) + len(val_df) + len(test_df):,}")

   Train: 16,000 (80.0%)
   Val:   2,000 (10.0%)
   Test:  2,000 (10.0%)
   Total: 20,000


In [None]:
strat_check = pd.DataFrame({
    'Original (%)': (df['label'].value_counts(normalize=True) * 100).round(2),
    'Train (%)': (train_df['label'].value_counts(normalize=True) * 100).round(2),
    'Val (%)': (val_df['label'].value_counts(normalize=True) * 100).round(2),
    'Test (%)': (test_df['label'].value_counts(normalize=True) * 100).round(2)
})

print(strat_check)
print("\nDONE -- DistribuÈ›iile sunt similare - stratificarea a funcÈ›ionat!")

       Original (%)  Train (%)  Val (%)  Test (%)
label                                            
2             33.80      33.81    33.80     33.80
4             28.98      28.99    29.00     28.95
0             13.54      13.54    13.55     13.55
1             11.86      11.86    11.85     11.90
3              8.20       8.21     8.20      8.20
5              3.60       3.59     3.60      3.60

âœ… DistribuÈ›iile sunt similare - stratificarea a funcÈ›ionat!


In [None]:
train_dist_before = train_df['label'].value_counts()
print(train_dist_before)
print(f"\nTotal: {len(train_df)}")

label
2    5409
4    4638
0    2167
1    1898
3    1313
5     575
Name: count, dtype: int64

Total: 16000


In [None]:
ros = RandomOverSampler(random_state=42)

# Reshape pentru sklearn (necesitÄƒ 2D array)
X_train = train_df['text'].values.reshape(-1, 1)
y_train = train_df['label_id'].values

# Fit È™i resample
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# CreeazÄƒ DataFrame nou cu datele oversampled
train_df_oversampled = pd.DataFrame({
    'text': X_resampled.flatten(),
    'label_id': y_resampled
})

# AdaugÄƒ Ã®napoi label-ul text
train_df_oversampled['label'] = train_df_oversampled['label_id'].map(id_to_label)

In [None]:
train_dist_after = train_df_oversampled['label'].value_counts()
print(train_dist_after)
print(f"\nTotal: {len(train_df_oversampled)}")

print(f"\n CreÈ™tere: {len(train_df)} â†’ {len(train_df_oversampled)} (+{len(train_df_oversampled) - len(train_df)} samples)")

label
fear        5409
joy         5409
sadness     5409
love        5409
anger       5409
surprise    5409
Name: count, dtype: int64

Total: 32454

ðŸ“ˆ CreÈ™tere: 16000 â†’ 32454 (+16454 samples)


In [None]:
oversampling_stats = {
    'before': {
        'total': len(train_df),
        'distribution': train_dist_before.to_dict()
    },
    'after': {
        'total': len(train_df_oversampled),
        'distribution': train_dist_after.to_dict()
    },
    'increase': len(train_df_oversampled) - len(train_df),
    'increase_pct': (len(train_df_oversampled) - len(train_df)) / len(train_df) * 100
}

print("\nOversampling Stats:")
print(oversampling_stats)



Oversampling Stats:
{'before': {'total': 16000, 'distribution': {2: 5409, 4: 4638, 0: 2167, 1: 1898, 3: 1313, 5: 575}}, 'after': {'total': 32454, 'distribution': {'fear': 5409, 'joy': 5409, 'sadness': 5409, 'love': 5409, 'anger': 5409, 'surprise': 5409}}, 'increase': 16454, 'increase_pct': 102.8375}


In [None]:
from transformers import RobertaTokenizerFast

MODEL_NAME = 'roberta-base'
MAX_LENGTH = 128 
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)

print(f"Tokenizer Ã®ncÄƒrcat: {MODEL_NAME}")
print(f"  Vocab size: {tokenizer.vocab_size:,}")
print(f"  Max length: {MAX_LENGTH}")
print(f"  Padding token: {tokenizer.pad_token}")
print(f"  Special tokens: {tokenizer.special_tokens_map}")

Tokenizer Ã®ncÄƒrcat: roberta-base
  Vocab size: 50,265
  Max length: 128
  Padding token: <pad>
  Special tokens: {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}


In [None]:
print("\nTest tokenizare:")
print("-" * 60)

test_texts = [
    "I am so happy today!",
    "This makes me really angry and frustrated.",
    "I'm kinda scared about what's gonna happen..."
]

for text in test_texts:
    tokens = tokenizer.tokenize(text)
    encoding = tokenizer(text, truncation=True, max_length=MAX_LENGTH)
    
    print(f"Text: {text}")
    print(f"Tokens ({len(tokens)}): {tokens}")
    print(f"Input IDs: {encoding['input_ids'][:15]}...")
    print()


Test tokenizare:
------------------------------------------------------------
Text: I am so happy today!
Tokens (6): ['I', 'Ä am', 'Ä so', 'Ä happy', 'Ä today', '!']
Input IDs: [0, 100, 524, 98, 1372, 452, 328, 2]...

Text: This makes me really angry and frustrated.
Tokens (8): ['This', 'Ä makes', 'Ä me', 'Ä really', 'Ä angry', 'Ä and', 'Ä frustrated', '.']
Input IDs: [0, 713, 817, 162, 269, 5800, 8, 8164, 4, 2]...

Text: I'm kinda scared about what's gonna happen...
Tokens (10): ['I', "'m", 'Ä kinda', 'Ä scared', 'Ä about', 'Ä what', "'s", 'Ä gonna', 'Ä happen', '...']
Input IDs: [0, 100, 437, 24282, 8265, 59, 99, 18, 6908, 1369, 734, 2]...



In [None]:
print("\nAnalizÄƒ lungime Ã®n tokeni (pe train oversampled):")
print("-" * 60)

token_lengths = []
for text in train_df_oversampled['text'].values[:5000]:
    tokens = tokenizer.tokenize(text)
    token_lengths.append(len(tokens))

token_lengths = np.array(token_lengths)

print(f"  Mean:   {token_lengths.mean():.1f} tokens")
print(f"  Median: {np.median(token_lengths):.1f} tokens")
print(f"  Max:    {token_lengths.max()} tokens")
print(f"  Min:    {token_lengths.min()} tokens")
print(f"  Std:    {token_lengths.std():.1f} tokens")

exceeds = (token_lengths > MAX_LENGTH - 2).sum()
print(f"\n  DepÄƒÈ™esc {MAX_LENGTH} tokens: {exceeds} ({exceeds/len(token_lengths)*100:.2f}%)")

p95 = np.percentile(token_lengths, 95)
p99 = np.percentile(token_lengths, 99)
print(f"\n  Percentila 95: {p95:.0f} tokens")
print(f"  Percentila 99: {p99:.0f} tokens")
print(f"\nDONE -- MAX_LENGTH={MAX_LENGTH} acoperÄƒ {(token_lengths <= MAX_LENGTH-2).sum()/len(token_lengths)*100:.1f}% din texte")


AnalizÄƒ lungime Ã®n tokeni (pe train oversampled):
------------------------------------------------------------
  Mean:   19.9 tokens
  Median: 18.0 tokens
  Max:    67 tokens
  Min:    2 tokens
  Std:    11.4 tokens

  DepÄƒÈ™esc 128 tokens: 0 (0.00%)

  Percentila 95: 42 tokens
  Percentila 99: 53 tokens

âœ… MAX_LENGTH=128 acoperÄƒ 100.0% din texte


In [None]:
class EmotionDataset(Dataset):
    """
    PyTorch Dataset pentru emotion classification.
    Tokenizarea se face lazy (la __getitem__) pentru eficienÈ›Äƒ.
    """
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.texts = dataframe['text'].tolist()
        self.labels = dataframe['label_id'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        # Tokenizare - NU facem padding aici (se face Ã®n DataCollator)
        encoding = self.tokenizer( text, truncation=True, max_length=self.max_length,
            return_tensors=None  # ReturneazÄƒ liste, nu tensori
        )
        
        return { 'input_ids': encoding['input_ids'], 'attention_mask': encoding['attention_mask'], 'labels': label }

In [None]:
train_dataset = EmotionDataset(train_df_oversampled, tokenizer, MAX_LENGTH)
val_dataset = EmotionDataset(val_df, tokenizer, MAX_LENGTH)
test_dataset = EmotionDataset(test_df, tokenizer, MAX_LENGTH)

print(f"\DONE -- Datasets create:")
print(f"   Train (oversampled): {len(train_dataset):,}")
print(f"   Val:                 {len(val_dataset):,}")
print(f"   Test:                {len(test_dataset):,}")


âœ… Datasets create:
   Train (oversampled): 32,454
   Val:                 2,000
   Test:                2,000


In [None]:
sample = train_dataset[0]
print("\nSample din train_dataset:")
print(f"  input_ids ({len(sample['input_ids'])} tokens): {sample['input_ids'][:10]}...")
print(f"  attention_mask: {sample['attention_mask'][:10]}...")
print(f"  label: {sample['labels']} ({id_to_label[sample['labels']]})")
print(f"\n  Decoded: {tokenizer.decode(sample['input_ids'])}")


Sample din train_dataset:
  input_ids (31 tokens): [0, 118, 524, 259, 456, 2157, 10985, 9, 99, 16]...
  attention_mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]...
  label: 1 (fear)

  Decoded: <s>i am here again feeling confused of what is happening around me looking for a plane to grasp a reality to settle that feels like it is my own</s>


In [None]:
data_collator = DataCollatorWithPadding( tokenizer=tokenizer, padding=True, return_tensors='pt' )

print("DONE --  DataCollatorWithPadding configurat!")
print("   - Padding: dynamic (la max length din batch)")
print("   - Return: PyTorch tensors")

âœ… DataCollatorWithPadding configurat!
   - Padding: dynamic (la max length din batch)
   - Return: PyTorch tensors


In [None]:
BATCH_SIZE = 16  

train_loader = DataLoader( train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=data_collator, num_workers=0, pin_memory=True if torch.cuda.is_available() else False )
val_loader = DataLoader( val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=data_collator, num_workers=0, pin_memory=True if torch.cuda.is_available() else False )
test_loader = DataLoader( test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=data_collator, num_workers=0, pin_memory=True if torch.cuda.is_available() else False )

print(f"\nDONE -- DataLoaders create (batch_size={BATCH_SIZE}):")
print(f"   Train: {len(train_loader)} batches")
print(f"   Val:   {len(val_loader)} batches")
print(f"   Test:  {len(test_loader)} batches")


âœ… DataLoaders create (batch_size=16):
   Train: 2029 batches
   Val:   125 batches
   Test:  125 batches


In [None]:
print("\nTest Dynamic Padding:")
print("-" * 60)

for i, batch in enumerate(train_loader):
    if i >= 3:
        break
    print(f"Batch {i+1}:")
    print(f"  input_ids shape:      {batch['input_ids'].shape}")
    print(f"  attention_mask shape: {batch['attention_mask'].shape}")
    print(f"  labels shape:         {batch['labels'].shape}")
    print(f"  Max seq length Ã®n batch: {batch['input_ids'].shape[1]}")
    print()



Test Dynamic Padding:
------------------------------------------------------------
Batch 1:
  input_ids shape:      torch.Size([16, 48])
  attention_mask shape: torch.Size([16, 48])
  labels shape:         torch.Size([16])
  Max seq length Ã®n batch: 48

Batch 2:
  input_ids shape:      torch.Size([16, 38])
  attention_mask shape: torch.Size([16, 38])
  labels shape:         torch.Size([16])
  Max seq length Ã®n batch: 38

Batch 3:
  input_ids shape:      torch.Size([16, 44])
  attention_mask shape: torch.Size([16, 44])
  labels shape:         torch.Size([16])
  Max seq length Ã®n batch: 44

âœ… Dynamic padding funcÈ›ioneazÄƒ! (dimensiunea variazÄƒ per batch)


In [None]:
import json
import os

os.makedirs('data/processed', exist_ok=True)
os.makedirs('reports', exist_ok=True)

train_df_oversampled.to_csv('data/processed/train_oversampled.csv', index=False)
train_df.to_csv('data/processed/train_original.csv', index=False)
val_df.to_csv('data/processed/val.csv', index=False)
test_df.to_csv('data/processed/test.csv', index=False)

mappings = {
    'label_to_id': label_to_id,
    'id_to_label': {str(k): v for k, v in id_to_label.items()},
    'num_labels': NUMBER_OF_LABELS,
    'label_list': label_list
}

with open('data/processed/label_mappings.json', 'w') as f:
    json.dump(mappings, f, indent=2)

config = {
    'model_name': MODEL_NAME,
    'max_length': MAX_LENGTH,
    'batch_size': BATCH_SIZE,
    'random_state': 42,
    'num_labels': NUMBER_OF_LABELS,
    'split_sizes': {
        'train_original': len(train_df),
        'train_oversampled': len(train_df_oversampled),
        'val': len(val_df),
        'test': len(test_df)
    }
}

with open('data/processed/config.json', 'w') as f:
    json.dump(config, f, indent=2)

with open('reports/oversampling_stats.json', 'w') as f:
    json.dump(oversampling_stats, f, indent=2)

print("âœ… Date salvate!")

âœ… Date salvate!
