In [1]:
#%pip install transformers datasets torch scikit-learn pandas matplotlib seaborn imbalanced-learn

In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import os 
import re
import json
import torch

from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, matthews_corrcoef
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding, RobertaTokenizerFast

# Check GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Using device: cpu


## Configuration

In [48]:
DATA_DIR = DATA_PATH = "../database_used_by_paper"
TRAIN_PATH = os.path.join(DATA_DIR, 'train.txt')
TEST_PATH = os.path.join(DATA_DIR, 'test.txt')
VAL_PATH = os.path.join(DATA_DIR, 'val.txt')

MODEL_NAME = "roberta-base"
MAX_LENGTH = 128
BATCH_SIZE = 16
RANDOM_STATE = 42

TFIDF_TRASHHOLD = 3.5
MIN_TOKENS = 4

os.makedirs('data/processed', exist_ok=True)
os.makedirs('reports', exist_ok=True)

## Loading data 

In [6]:
def load_emotional_data(file_path, sep=';'):

    texts, labels = [], []

    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line: continue

            parts = line.split(sep, 1)  
            if len(parts) != 2:
                continue

            texts.append(parts[0].strip())
            labels.append(parts[1].strip())

    return pd.DataFrame({'text': texts, 'label_str': labels})

try:
    print("Loading data...")
    train_df_raw = load_emotional_data(TRAIN_PATH)
    val_df_raw = load_emotional_data(VAL_PATH)
    test_df_raw = load_emotional_data(TEST_PATH)

    print("Data loaded successfully.")
    print(f"   Train set size:, {len(train_df_raw):,}samples")
    print(f"   Validation set size:, {len(val_df_raw):,} samples")
    print(f"   Test set size:, {len(test_df_raw):,} samples")
    print(f"\nTotal samples:, {len(train_df_raw) + len(val_df_raw) + len(test_df_raw):,} samples")
except Exception as e:
    print("Error loading data:", e)


Loading data...
Data loaded successfully.
   Train set size:, 16,000samples
   Validation set size:, 2,000 samples
   Test set size:, 2,000 samples

Total samples:, 20,000 samples


## Slang-Aware cleaning + text cleaning

In [7]:
SLANG_MAP = {
    # Common abbreviations
    "u": "you", "ur": "your", "r": "are", "n": "and", "b": "be", "c": "see", "y": "why",
    "k": "okay", "ok": "okay", "pls": "please", "plz": "please", "thx": "thanks", "thnx": "thanks",
    "ty": "thank you", "bc": "because", "cuz": "because", "bcuz": "because", "bcoz": "because",
    "cos": "because", "coz": "because","w": "with", "wo": "without", "abt": "about", "bout": "about",
    "bf": "boyfriend", "gf": "girlfriend", "bff": "best friend", "rn": "right now",
    "imo": "in my opinion", "imho": "in my humble opinion", "tbh": "to be honest",
    "fyi": "for your information", "btw": "by the way", "afaik": "as far as i know",
    "irl": "in real life", "jk": "just kidding", "omw": "on my way", "brb": "be right back",
    "gtg": "got to go", "g2g": "got to go", "ttyl": "talk to you later", "nvm": "never mind",
    "idk": "i do not know", "idc": "i do not care", "idgaf": "i do not care", "dm": "direct message",
    "rt": "retweet", "fb": "facebook", "ig": "instagram", "yt": "youtube", "smh": "shaking my head",
    "fomo": "fear of missing out", "yolo": "you only live once", "bae": "babe", "fam": "family",
    "lit": "exciting", "slay": "amazing", "goat": "greatest of all time", "af": "very",
    "asf": "very", "lowkey": "somewhat", "highkey": "very", "srsly": "seriously", "tho": "though",
    "thru": "through", "kinda": "kind of", "sorta": "sort of", "gonna": "going to",
    "wanna": "want to", "gotta": "got to", "coulda": "could have", "shoulda": "should have",
    "woulda": "would have", "musta": "must have", "lemme": "let me", "gimme": "give me",
    "dunno": "do not know", "whatcha": "what are you", "gotcha": "got you", "outta": "out of", "lotta": "lot of",
    "lotsa": "lots of", "kinda": "kind of", "innit": "is it not", "aint": "is not", "ain't": "is not",
    "yall": "you all", "y'all": "you all",
    
    # Emotion-related expressions 
    "lol": "laughing", "lmao": "laughing", "lmfao": "laughing", "rofl": "laughing",
    "roflmao": "laughing", "haha": "laughing", "hahaha": "laughing", "hehe": "laughing",
    "hihi": "laughing", "xd": "laughing", "xD": "laughing", "omg": "oh my god", "omfg": "oh my god",
    "wtf": "what the heck", "wth": "what the heck", "stfu": "shut up", "ugh": "frustrated",
    "meh": "indifferent", "oof": "ouch", "yikes": "shocked", "eww": "disgusted", "aww": "touched",
    "yay": "excited", "woohoo": "excited", "woah": "surprised", "whoa": "surprised",
    "wow": "surprised", "damn": "frustrated", "dammit": "frustrated", "damnit": "frustrated",
    
    # Missing apostrophe contractions
    "im": "i am", "ive": "i have", "id": "i would", "ill": "i will", "youre": "you are",
    "youve": "you have", "youd": "you would", "youll": "you will", "hes": "he is",
    "shes": "she is", "thats": "that is", "whats": "what is", "whos": "who is",
    "wheres": "where is", "heres": "here is", "theres": "there is", "theyre": "they are",
    "theyve": "they have", "theyd": "they would", "theyll": "they will", "weve": "we have",
    "wed": "we would", "dont": "do not", "doesnt": "does not", "didnt": "did not",
    "wont": "will not", "wouldnt": "would not", "couldnt": "could not", "shouldnt": "should not",
    "cant": "cannot", "cannot": "can not", "isnt": "is not", "arent": "are not",
    "wasnt": "was not", "werent": "were not", "hasnt": "has not", "havent": "have not",
    "hadnt": "had not", "mustnt": "must not", "lets": "let us",
}

# Text emoticons to emotion words 
EMOTICON_MAP = {
    ":)": " happy ", ":(": " sad ", ":D": " very happy ", ":-)": " happy ", ":-(": " sad ",
    ";)": " playful ", ";-)": " playful ", ":p": " playful ", ":P": " playful ", ":-p": " playful ",
    ":-P": " playful ", ":o": " surprised ", ":O": " surprised ", ":-o": " surprised ",
    ":-O": " surprised ", ":/": " unsure ", ":-/": " unsure ", ":\\": " unsure ", ":-\\": " unsure ",
    "<3": " love ", "</3": " heartbroken ", "xo": " love ", "xoxo": " love ",
}

print(f"Loaded {len(SLANG_MAP)} slang mappings")
print(f"Loaded {len(EMOTICON_MAP)} emoticon mappings")

Loaded 157 slang mappings
Loaded 23 emoticon mappings


In [8]:
def normalize_elongated_words(text):
    """
    Reduce elongated characters to max 2 repetitions.
    'soooooo' -> 'soo', 'happyyy' -> 'happyy'
    This preserves some emphasis while normalizing.
    """

    return re.sub(r'(.)\1{2,}', r'\1\1', text)

def replace_slang(text):
    """
    Replace slang/abbreviations with full forms.
    Uses word boundaries to avoid partial replacements.
    """

    words = text.split()
    result = []
    for word in words:
        word_lower = word.lower()
        if word_lower in SLANG_MAP:
            result.append(SLANG_MAP[word_lower])
        else:
            result.append(word)
    return ' '.join(result)

def replace_emoticons(text):
    """Replace text emoticons with emotion words."""

    for emoticon, replacement in EMOTICON_MAP.items():
        text = text.replace(emoticon, replacement)
    return text

def replace_emoticons(text):
    """Replace text emoticons with emotion words."""

    for emoticon, replacement in EMOTICON_MAP.items():
        text = text.replace(emoticon, replacement)
    return text

def clean_text(text):
    """Comprehensive text cleaning for social media data."""
    
    if not isinstance(text, str):
        return ""
    
    # 1. Replace emoticons with emotion words (before stripping special chars)
    text = replace_emoticons(text)
    
    # 2. Remove URLs
    text = re.sub(r'http\S+|www\.\S+', '', text)
    
    # 3. Remove mentions (@username)
    text = re.sub(r'@\w+', '', text)
    
    # 4. Convert hashtags to words (#happy -> happy)
    text = re.sub(r'#(\w+)', r'\1', text)
    
    # 5. Normalize elongated words (soooo -> soo)
    text = normalize_elongated_words(text)
    
    # 6. Replace slang and abbreviations
    text = replace_slang(text)
    
    # 7. Remove special characters but keep emotional punctuation
    # Keep: letters, numbers, spaces, and ! ? . , ' - "
    text = re.sub(r"[^a-zA-Z0-9\s!?.,;:'\"\-]", '', text)
    
    # 8. Normalize repeated punctuation (!!!! -> !!)
    text = re.sub(r'([!?.]){2,}', r'\1\1', text)
    
    # 9. Normalize whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # 10. Strip and lowercase
    text = text.strip().lower()
    
    return text

In [9]:
test_texts = [
    "I'm SO happy!!! ðŸ˜Š https://example.com @friend #blessed",
    "omg u r soooo funny lol :D",
    "idk why im feeling sad rn tbh :(",
    "cant believe this happened smh",
    "gonna miss u bf </3 ttyl",
    "thx for the help! ur the best <3",
    "ANGRY!!! ðŸ˜¡ I can't believe this wtf",
    "im lowkey scared ngl",
    "yall shouldnt have done that",
    "Normal text with some punctuation, like this."
]

print("=" * 70)
print("SLANG-AWARE TEXT CLEANING EXAMPLES")
print("=" * 70)
for text in test_texts:
    cleaned = clean_text(text)
    print(f"\nOriginal: {text}")
    print(f"Cleaned:  {cleaned}")

SLANG-AWARE TEXT CLEANING EXAMPLES

Original: I'm SO happy!!! ðŸ˜Š https://example.com @friend #blessed
Cleaned:  i'm so happy!! unsure example.com blessed

Original: omg u r soooo funny lol :D
Cleaned:  oh my god you are soo funny laughing very happy

Original: idk why im feeling sad rn tbh :(
Cleaned:  i do not know why i am feeling sad right now to be honest sad

Original: cant believe this happened smh
Cleaned:  cannot believe this happened shaking my head

Original: gonna miss u bf </3 ttyl
Cleaned:  going to miss you boyfriend heartbroken talk to you later

Original: thx for the help! ur the best <3
Cleaned:  thanks for the help! your the best love

Original: ANGRY!!! ðŸ˜¡ I can't believe this wtf
Cleaned:  angry!! i can't believe this what the heck

Original: im lowkey scared ngl
Cleaned:  i am somewhat scared ngl

Original: yall shouldnt have done that
Cleaned:  you all should not have done that

Original: Normal text with some punctuation, like this.
Cleaned:  normal text with

## Label Mapping

In [11]:
train_labels = sorted(train_df_raw['label_str'].unique())

label_to_id = {label: idx for idx, label in enumerate(train_labels)}
id_to_label = {idx: label for label, idx in label_to_id.items()}
NUM_LABELS = len(train_labels)

print("Label Mapping (built from TRAIN only):")
print("-" * 40)
for label, idx in label_to_id.items():
    count = len(train_df_raw[train_df_raw['label_str'] == label])
    print(f"  {idx}: {label:10s} ({count:,} train samples)")

val_labels = set(val_df_raw['label_str'].unique())
test_labels = set(test_df_raw['label_str'].unique())
train_label_set = set(train_labels)

unknown_val = val_labels - train_label_set
unknown_test = test_labels - train_label_set


print(f"\nâœ… All val/test labels exist in train - mapping is consistent!")

Label Mapping (built from TRAIN only):
----------------------------------------
  0: anger      (2,159 train samples)
  1: fear       (1,937 train samples)
  2: joy        (5,362 train samples)
  3: love       (1,304 train samples)
  4: sadness    (4,666 train samples)
  5: surprise   (572 train samples)

âœ… All val/test labels exist in train - mapping is consistent!


In [13]:
def apply_label_mapping(df, label_to_id):

    df = df.copy()    
    df['label'] = df['label_str'].map(label_to_id)
    return df

train_df_raw = apply_label_mapping(train_df_raw, label_to_id)
val_df_raw = apply_label_mapping(val_df_raw, label_to_id)
test_df_raw = apply_label_mapping(test_df_raw, label_to_id)

print("âœ… Consistent label mapping applied to all splits!")

print("\nSample from each split:")
print(f"Train[0]: '{train_df_raw.iloc[0]['label_str']}' -> {train_df_raw.iloc[0]['label']}")
print(f"Val[0]:   '{val_df_raw.iloc[0]['label_str']}' -> {val_df_raw.iloc[0]['label']}")
print(f"Test[0]:  '{test_df_raw.iloc[0]['label_str']}' -> {test_df_raw.iloc[0]['label']}")

âœ… Consistent label mapping applied to all splits!

Sample from each split:
Train[0]: 'sadness' -> 4
Val[0]:   'sadness' -> 4
Test[0]:  'sadness' -> 4


## Apply Text Cleaning

In [None]:
def apply_cleaning(df):
    """Apply text cleaning and remove empty results."""
    
    df = df.copy()
    df['text_original'] = df['text']
    df['text'] = df['text'].apply(clean_text)
    
    before = len(df)
    df = df[df['text'].str.len() > 0].reset_index(drop=True)
    after = len(df)
    
    return df, before - after

print("Applying slang-aware text cleaning...")

train_df, train_removed = apply_cleaning(train_df_raw)
val_df, val_removed = apply_cleaning(val_df_raw)
test_df, test_removed = apply_cleaning(test_df_raw)

print(f"\nâœ… Text cleaning complete!")
print(f"   Train: {len(train_df):,} samples (removed {train_removed})")
print(f"   Val:   {len(val_df):,} samples (removed {val_removed})")
print(f"   Test:  {len(test_df):,} samples (removed {test_removed})")

Applying slang-aware text cleaning...

âœ… Text cleaning complete!
   Train: 16,000 samples (removed 0)
   Val:   2,000 samples (removed 0)
   Test:  2,000 samples (removed 0)


In [15]:
train_df['changed'] = train_df['text'] != train_df['text_original'].apply(lambda x: x.lower())
changed = train_df[train_df['changed']].head(10)

print("Examples of cleaned text:")
print("=" * 70)
for idx, row in changed.iterrows():
    print(f"\nOriginal: {row['text_original'][:80]}...")
    print(f"Cleaned:  {row['text'][:80]}...")
    print(f"Label:    {row['label_str']}")

Examples of cleaned text:

Original: i didnt feel humiliated...
Cleaned:  i did not feel humiliated...
Label:    sadness

Original: im grabbing a minute to post i feel greedy wrong...
Cleaned:  i am grabbing a minute to post i feel greedy wrong...
Label:    anger

Original: ive been feeling a little burdened lately wasnt sure why that was...
Cleaned:  i have been feeling a little burdened lately was not sure why that was...
Label:    sadness

Original: ive been taking or milligrams or times recommended amount and ive fallen asleep ...
Cleaned:  i have been taking or milligrams or times recommended amount and i have fallen a...
Label:    surprise

Original: i didnt really feel that embarrassed...
Cleaned:  i did not really feel that embarrassed...
Label:    sadness

Original: i already feel like i fucked up though because i dont usually eat at all in the ...
Cleaned:  i already feel like i fucked up though because i do not usually eat at all in th...
Label:    anger

Original: i feel so

## Check Class Distribution

In [17]:
print("Class Distribution Across Splits:")
print("=" * 60)

dist_df = pd.DataFrame({
    'Train': train_df['label_str'].value_counts(),
    'Val': val_df['label_str'].value_counts(),
    'Test': test_df['label_str'].value_counts()
})

dist_df['Train %'] = (dist_df['Train'] / len(train_df) * 100).round(2)
dist_df['Val %'] = (dist_df['Val'] / len(val_df) * 100).round(2)
dist_df['Test %'] = (dist_df['Test'] / len(test_df) * 100).round(2)

print(dist_df[['Train', 'Train %', 'Val', 'Val %', 'Test', 'Test %']])

Class Distribution Across Splits:
           Train  Train %  Val  Val %  Test  Test %
label_str                                          
joy         5362    33.51  704  35.20   695   34.75
sadness     4666    29.16  550  27.50   581   29.05
anger       2159    13.49  275  13.75   275   13.75
fear        1937    12.11  212  10.60   224   11.20
love        1304     8.15  178   8.90   159    7.95
surprise     572     3.58   81   4.05    66    3.30


## Oversampling (Train Only)

In [18]:
print("Train distribution BEFORE oversampling:")
print(train_df['label_str'].value_counts())

ros = RandomOverSampler(random_state=RANDOM_STATE)

X_train = train_df['text'].values.reshape(-1, 1)
y_train = train_df['label'].values
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Create oversampled DataFrame
train_df_oversampled = pd.DataFrame({ 'text': X_resampled.flatten(), 'label': y_resampled })
train_df_oversampled['label_str'] = train_df_oversampled['label'].map(id_to_label)

print(f"\nTrain distribution AFTER oversampling:")
print(train_df_oversampled['label_str'].value_counts())

print(f"\nâœ… Oversampling complete!")
print(f"   Before: {len(train_df):,}")
print(f"   After:  {len(train_df_oversampled):,}")
print(f"   Added:  {len(train_df_oversampled) - len(train_df):,} samples")

Train distribution BEFORE oversampling:
label_str
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

Train distribution AFTER oversampling:
label_str
sadness     5362
anger       5362
love        5362
surprise    5362
fear        5362
joy         5362
Name: count, dtype: int64

âœ… Oversampling complete!
   Before: 16,000
   After:  32,172
   Added:  16,172 samples


In [22]:
# Save oversampling statistics
oversampling_stats = {
    'before': {
        'total': len(train_df),
        'distribution': train_df['label_str'].value_counts().to_dict()
    },
    'after': {
        'total': len(train_df_oversampled),
        'distribution': train_df_oversampled['label_str'].value_counts().to_dict()
    },
    'increase': len(train_df_oversampled) - len(train_df),
    'increase_pct': round((len(train_df_oversampled) - len(train_df)) / len(train_df) * 100, 2)
}

with open('reports/oversampling_stats.json', 'w') as f:
    json.dump(oversampling_stats, f, indent=2)
    
print("Oversampling stats saved to reports/oversampling_stats.json")
print(oversampling_stats)

Oversampling stats saved to reports/oversampling_stats.json
{'before': {'total': 16000, 'distribution': {'joy': 5362, 'sadness': 4666, 'anger': 2159, 'fear': 1937, 'love': 1304, 'surprise': 572}}, 'after': {'total': 32172, 'distribution': {'sadness': 5362, 'anger': 5362, 'love': 5362, 'surprise': 5362, 'fear': 5362, 'joy': 5362}}, 'increase': 16172, 'increase_pct': 101.08}


## 6. TF-IDF Gating Mechanism

From the paper:
> "The TF-IDF based gating mechanism is introduced to refine attention distribution... computed after oversampling, filtering out words with low significance using a threshold of 3.5... minimum of four tokens per sample is preserved."

The paper uses TF-IDF for **two purposes**:
1. **Hard filtering** (preprocessing): removes low TF-IDF words before tokenization
2. **Soft gating** (model): modulates token embeddings based on TF-IDF scores

In [None]:
class TFIDFGating:
    def __init__(self, threshold=3.5, min_tokens=4, min_df=2, max_df=0.95):
        self.threshold = threshold
        self.min_tokens = min_tokens
        self.vectorizer = TfidfVectorizer(
            ngram_range=(1, 1),
            min_df=min_df,           # we are ignoring words appearing in < 2 documents
            max_df=max_df,           # we are ignoring as well  words appearing in > 95% of documents 
            token_pattern=r"(?u)\b\w+\b",
            lowercase=True,
            norm = None
        )
        self._is_fitted = False
    
    def fit(self, texts):
        """Fit on oversampled training corpus."""

        print(f"Fitting TF-IDF on {len(texts):,} samples...")
        self.vectorizer.fit(texts)
        self._is_fitted = True
        print(f"  Vocabulary size: {len(self.vectorizer.vocabulary_):,}")
        return self
    
    def get_word_scores(self, text):
        """Get TF-IDF scores for each word."""

        analyzer = self.vectorizer.build_analyzer()
        words = analyzer(text.lower())
        if not words:
            return [], np.array([])
        
        tfidf = self.vectorizer.transform([text.lower()])
        vocab = self.vectorizer.vocabulary_
        
        scores = []
        for word in words:
            idx = vocab.get(word)
            scores.append(float(tfidf[0, idx]) if idx is not None else 0.0)
        
        return words, np.array(scores, dtype=np.float32)
    
    def get_keep_mask(self, scores):
        """Apply threshold with minimum tokens guarantee."""

        if len(scores) == 0:
            return np.array([], dtype=bool)
        
        keep = scores >= self.threshold
        
        # Ensure minimum tokens
        if keep.sum() < self.min_tokens:
            if len(scores) >= self.min_tokens:
                top_k = np.argsort(-scores)[:self.min_tokens]
                keep = np.zeros_like(keep, dtype=bool)
                keep[top_k] = True
            else:
                keep = np.ones_like(keep, dtype=bool)
        
        return keep
    
    def filter_text(self, text):
        """Hard filtering: remove low TF-IDF words."""

        words, scores = self.get_word_scores(text)
        if len(words) == 0:
            return text
        
        keep = self.get_keep_mask(scores)
        filtered = [w for w, k in zip(words, keep) if k]
        return " ".join(filtered) if filtered else text

    def compute_gates(self, text, normalize=True):
        words, scores = self.get_word_scores(text)
        if len(words) == 0:
            return words, np.array([])

        keep = self.get_keep_mask(scores)

        gates = scores.copy()
        gates[~keep] = 0.0

        # Ensure certain words always have strong gates
        NEGATION_WHITELIST = {"not", "no", "never", "n't", "none", "nothing", "nowhere"}
        INTENSIFIERS = {"very", "so", "too", "really", "extremely", "quite"}

        for i, w in enumerate(words):
            if w in NEGATION_WHITELIST or w in INTENSIFIERS:
                gates[i] = max(gates[i], 1.0)  # force strong signal

        if normalize and gates.max() > 0:
            gates = gates / gates.max()

        return words, gates.astype(np.float32)
    
    def get_stats(self, texts):
        """Get filtering statistics."""
        
        orig_lens, filt_lens = [], []
        for text in texts:
            words, scores = self.get_word_scores(text)
            keep = self.get_keep_mask(scores)
            orig_lens.append(len(words))
            filt_lens.append(keep.sum())
        
        return {
            'avg_original': np.mean(orig_lens),
            'avg_filtered': np.mean(filt_lens),
            'avg_retention': np.mean([f/o if o > 0 else 1 for o, f in zip(orig_lens, filt_lens)])
        }

In [76]:
tfidf_gating = TFIDFGating( threshold=TFIDF_TRASHHOLD, min_tokens=MIN_TOKENS )
tfidf_gating.fit(train_df_oversampled['text'].tolist())

print(f"\nTF-IDF Configuration:")
print(f"  Threshold: {TFIDF_TRASHHOLD}")
print(f"  Min tokens: {MIN_TOKENS}")

Fitting TF-IDF on 32,172 samples...
  Vocabulary size: 10,081

TF-IDF Configuration:
  Threshold: 3.5
  Min tokens: 4


In [77]:
print("\nFiltering Statistics:")
print("=" * 40)

for name, df in [('Train (OS)', train_df_oversampled), ('Val', val_df), ('Test', test_df)]:
    stats = tfidf_gating.get_stats(df['text'].tolist())
    print(f"{name}:")
    print(f"  Avg words before: {stats['avg_original']:.1f}")
    print(f"  Avg words after:  {stats['avg_filtered']:.1f}")
    print(f"  Retention rate:   {stats['avg_retention']*100:.1f}%")


Filtering Statistics:
Train (OS):
  Avg words before: 19.6
  Avg words after:  12.8
  Retention rate:   62.8%
Val:
  Avg words before: 19.2
  Avg words after:  11.9
  Retention rate:   60.2%
Test:
  Avg words before: 19.5
  Avg words after:  12.1
  Retention rate:   60.0%


In [95]:
print("\nFiltering Examples:")
print("=" * 70)

sample_texts = train_df_oversampled['text'].head(5).tolist()

for text in sample_texts:
    filtered = tfidf_gating.filter_text(text)
    words, gates = tfidf_gating.compute_gates(text)
    
    print(f"\nOriginal ({len(text.split())} words):")
    print(f"  {text[:80]}..." if len(text) > 80 else f"  {text}")
    print(f"Filtered ({len(filtered.split())} words):")
    print(f"  {filtered[:80]}..." if len(filtered) > 80 else f"  {filtered}")
    
    print(f"Gates: ", end="")
    for w, g in list(zip(words, gates))[:8]:
        print(f"{w}:{g:.2f}", end=" ")
    if len(words) > 8:
        print("...")
    else:
        print()


Filtering Examples:

Original (5 words):
  i did not feel humiliated
Filtered (4 words):
  did not feel humiliated
Gates: i:0.00 did:0.60 not:0.37 feel:0.20 humiliated:1.00 

Original (21 words):
  i can go from feeling so hopeless to so damned hopeful just from being around so...
Filtered (16 words):
  can go from so hopeless so damned hopeful just from being around someone who car...
Gates: i:0.00 can:0.44 go:0.54 from:0.96 feeling:0.00 so:0.67 hopeless:0.79 to:0.00 ...

Original (11 words):
  i am grabbing a minute to post i feel greedy wrong
Filtered (5 words):
  grabbing minute post greedy wrong
Gates: i:0.00 am:0.00 grabbing:1.00 a:0.00 minute:0.88 to:0.00 post:0.69 i:0.00 ...

Original (18 words):
  i am ever feeling nostalgic about the fireplace i will know that it is still on ...
Filtered (9 words):
  ever nostalgic the fireplace will know still the property
Gates: i:0.00 am:0.00 ever:0.57 feeling:0.00 nostalgic:0.65 about:0.00 the:0.43 fireplace:1.00 ...

Original (4 words):

In [96]:
USE_HARD_FILTERING = False  # Set to False to skip hard filtering

if USE_HARD_FILTERING:
    print("Applying hard filtering...")
    
    train_df_oversampled['text_filtered'] = train_df_oversampled['text'].apply(tfidf_gating.filter_text)
    val_df['text_filtered'] = val_df['text'].apply(tfidf_gating.filter_text)
    test_df['text_filtered'] = test_df['text'].apply(tfidf_gating.filter_text)
    
    orig_len = train_df_oversampled['text'].apply(lambda x: len(x.split())).mean()
    filt_len = train_df_oversampled['text_filtered'].apply(lambda x: len(x.split())).mean()
    
    print(f"\nâœ… Hard filtering applied!")
    print(f"   Avg words: {orig_len:.1f} -> {filt_len:.1f} ({filt_len/orig_len*100:.1f}% retained)")
else:
    print("Skipping hard filtering (will use soft gating only)")
    train_df_oversampled['text_filtered'] = train_df_oversampled['text']
    val_df['text_filtered'] = val_df['text']
    test_df['text_filtered'] = test_df['text']

Skipping hard filtering (will use soft gating only)


### Dataset with Soft Gating Support


In [97]:
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)
print(f"Tokenizer: {MODEL_NAME}")

Tokenizer: roberta-base


In [None]:
class EmotionDatasetWithGates(Dataset):
    
    def __init__(self, texts, labels, tokenizer, tfidf_gating, max_length=128, compute_gates=True):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.tfidf_gating = tfidf_gating
        self.max_length = max_length
        self.compute_gates = compute_gates
    
    def __len__(self): 
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        # Tokenize with offset mapping for gate alignment
        encoding = self.tokenizer( text, truncation=True, max_length=self.max_length, padding=False, return_offsets_mapping=True, return_tensors=None )

        item = { 'input_ids': encoding['input_ids'], 'attention_mask': encoding['attention_mask'], 'labels': label }
        if self.compute_gates:
            # Compute word-level gates
            words, word_gates = self.tfidf_gating.compute_gates(text, normalize=True)
            
            # Map word gates to token gates
            offsets = encoding['offset_mapping']
            token_gates = self._map_word_gates_to_tokens(text, words, word_gates, offsets)
            item['gates'] = token_gates
        
        return item
    
    def _map_word_gates_to_tokens(self, text, words, word_gates, offsets):
        """Map word-level gates to subtoken-level gates."""

        seq_len = len(offsets)
        token_gates = [0.0] * seq_len  
        
        if len(words) == 0:
            return token_gates
        
        # Build word spans
        word_spans = []
        pos = 0
        text_lower = text.lower()
        for word in words:
            start = text_lower.find(word, pos)
            if start == -1:
                word_spans.append(None)
                continue
            end = start + len(word)
            word_spans.append((start, end))
            pos = end
        
        # Map tokens to words
        for token_idx, (char_start, char_end) in enumerate(offsets):
            if char_start == char_end:  
                token_gates[token_idx] = 1.0
                continue
            
            # Find overlapping word
            for word_idx, (ws, we) in enumerate(word_spans):
                if not (char_end <= ws or char_start >= we):  # Overlap
                    token_gates[token_idx] = float(word_gates[word_idx])
                    break
        
        return token_gates

In [82]:
class DataCollatorWithGates:
    """Collator that handles padding for both tokens and gates."""
    
    def __init__(self, tokenizer, include_gates=True):
        self.tokenizer = tokenizer
        self.include_gates = include_gates
        self.pad_id = tokenizer.pad_token_id
    
    def __call__(self, features):
        max_len = max(len(f['input_ids']) for f in features)
        
        batch = {'input_ids': [], 'attention_mask': [], 'labels': []}
        if self.include_gates and 'gates' in features[0]:
            batch['gates'] = []
        
        for f in features:
            pad_len = max_len - len(f['input_ids'])
            
            batch['input_ids'].append(f['input_ids'] + [self.pad_id] * pad_len)
            batch['attention_mask'].append(f['attention_mask'] + [0] * pad_len)
            batch['labels'].append(f['labels'])
            
            if 'gates' in batch:
                batch['gates'].append(f['gates'] + [0.0] * pad_len)
        
        batch['input_ids'] = torch.tensor(batch['input_ids'], dtype=torch.long)
        batch['attention_mask'] = torch.tensor(batch['attention_mask'], dtype=torch.long)
        batch['labels'] = torch.tensor(batch['labels'], dtype=torch.long)
        
        if 'gates' in batch:
            batch['gates'] = torch.tensor(batch['gates'], dtype=torch.float32)
        
        return batch

In [99]:
# Choose which text to use
USE_FILTERED_TEXT = False  # True = use hard-filtered text, False = use original
USE_SOFT_GATING = True    # True = compute gates for embedding modulation

text_col = 'text_filtered' if USE_FILTERED_TEXT else 'text'

train_dataset = EmotionDatasetWithGates(
    texts=train_df_oversampled[text_col].tolist(),
    labels=train_df_oversampled['label'].tolist(),
    tokenizer=tokenizer,
    tfidf_gating=tfidf_gating,
    max_length=MAX_LENGTH,
    compute_gates=USE_SOFT_GATING
)

val_dataset = EmotionDatasetWithGates(
    texts=val_df[text_col].tolist(),
    labels=val_df['label'].tolist(),
    tokenizer=tokenizer,
    tfidf_gating=tfidf_gating,
    max_length=MAX_LENGTH,
    compute_gates=USE_SOFT_GATING
)

test_dataset = EmotionDatasetWithGates(
    texts=test_df[text_col].tolist(),
    labels=test_df['label'].tolist(),
    tokenizer=tokenizer,
    tfidf_gating=tfidf_gating,
    max_length=MAX_LENGTH,
    compute_gates=USE_SOFT_GATING
)

print(f"âœ… Datasets created:")
print(f"   Train: {len(train_dataset):,}")
print(f"   Val:   {len(val_dataset):,}")
print(f"   Test:  {len(test_dataset):,}")
print(f"\n   Using: {text_col}")
print(f"   Soft gating: {USE_SOFT_GATING}")

âœ… Datasets created:
   Train: 32,172
   Val:   2,000
   Test:  2,000

   Using: text
   Soft gating: True


In [100]:
collator = DataCollatorWithGates(tokenizer, include_gates=USE_SOFT_GATING)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collator)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collator)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collator)

print(f"âœ… DataLoaders created (batch_size={BATCH_SIZE})")

âœ… DataLoaders created (batch_size=16)


In [101]:
batch = next(iter(train_loader))

print("Sample batch:")
print(f"  input_ids:      {batch['input_ids'].shape}")
print(f"  attention_mask: {batch['attention_mask'].shape}")
print(f"  labels:         {batch['labels'].shape}")

if 'gates' in batch:
    print(f"  gates:          {batch['gates'].shape}")
    print(f"  gates range:    [{batch['gates'].min():.3f}, {batch['gates'].max():.3f}]")
    
    print("\nToken-Gate alignment (sample 0):")
    tokens = tokenizer.convert_ids_to_tokens(batch['input_ids'][0].tolist())
    gates = batch['gates'][0].tolist()
    for t, g in list(zip(tokens, gates))[:15]:
        print(f"  {t:15s} -> {g:.3f}")

Sample batch:
  input_ids:      torch.Size([16, 52])
  attention_mask: torch.Size([16, 52])
  labels:         torch.Size([16])
  gates:          torch.Size([16, 52])
  gates range:    [0.000, 1.000]

Token-Gate alignment (sample 0):
  <s>             -> 1.000
  i               -> 0.000
  Ä feel           -> 0.000
  Ä terrible       -> 1.000
  Ä for            -> 0.000
  Ä him            -> 0.655
  Ä but            -> 0.000
  Ä oh             -> 0.998
  Ä my             -> 0.000
  Ä god            -> 0.839
  </s>            -> 1.000
  <pad>           -> 0.000
  <pad>           -> 0.000
  <pad>           -> 0.000
  <pad>           -> 0.000


In [None]:
import joblib

train_df_oversampled.to_csv('data/processed/gate/train_oversampled.csv', index=False)
train_df.to_csv('data/processed/gate/train_original.csv', index=False)
val_df.to_csv('data/processed/gate/val.csv', index=False)
test_df.to_csv('data/processed/gate/test.csv', index=False)

tfidf_state = {
    'vectorizer': tfidf_gating.vectorizer,
    'threshold': tfidf_gating.threshold,
    'min_tokens': tfidf_gating.min_tokens
}
joblib.dump(tfidf_state, 'data/processed/gate/tfidf_gating.joblib')

mappings = {
    'label_to_id': label_to_id,
    'id_to_label': {str(k): v for k, v in id_to_label.items()},
    'num_labels': NUM_LABELS,
}
with open('data/processed/gate/label_mappings.json', 'w') as f:
    json.dump(mappings, f, indent=2)

config = {
    'model_name': MODEL_NAME,
    'max_length': MAX_LENGTH,
    'batch_size': BATCH_SIZE,
    'tfidf_threshold': TFIDF_TRASHHOLD,
    'tfidf_min_tokens': MIN_TOKENS,
    'use_hard_filtering': USE_HARD_FILTERING,
    'use_soft_gating': USE_SOFT_GATING,
    'split_sizes': {
        'train_original': len(train_df),
        'train_oversampled': len(train_df_oversampled),
        'val': len(val_df),
        'test': len(test_df)
    }
}
with open('data/processed/gate/config.json', 'w') as f:
    json.dump(config, f, indent=2)

print("âœ… All data saved!")
print("\nFiles:")
print("  - data/processed/gate/train_oversampled.csv")
print("  - data/processed/gate/tfidf_gating.joblib")
print("  - data/processed/gate/label_mappings.json")
print("  - data/processed/gate/config.json")

âœ… All data saved!

Files:
  - data/processed/gate/train_oversampled.csv
  - data/processed/gate/tfidf_gating.joblib
  - data/processed/gate/label_mappings.json
  - data/processed/gate/config.json


## Tokenization & Dataset

In [30]:
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)

print(f"Tokenizer loaded: {MODEL_NAME}")
print(f"  Vocab size: {tokenizer.vocab_size:,}")
print(f"  Max length: {MAX_LENGTH}")
print(f"  Padding token: {tokenizer.pad_token}")
print(f"  Special tokens: {tokenizer.special_tokens_map}")

Tokenizer loaded: roberta-base
  Vocab size: 50,265
  Max length: 128
  Padding token: <pad>
  Special tokens: {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}


In [31]:
print("\nTest tokenizare:")
print("=" * 60)

test_texts = [
    "I am so happy today!",
    "This makes me really angry and frustrated.",
    "I'm kinda scared about what's gonna happen..."
]

for text in test_texts:
    tokens = tokenizer.tokenize(text)
    encoding = tokenizer(text, truncation=True, max_length=MAX_LENGTH)
    
    print(f"Text: {text}")
    print(f"Tokens ({len(tokens)}): {tokens}")
    print(f"Input IDs: {encoding['input_ids'][:15]}...")
    print()


Test tokenizare:
Text: I am so happy today!
Tokens (6): ['I', 'Ä am', 'Ä so', 'Ä happy', 'Ä today', '!']
Input IDs: [0, 100, 524, 98, 1372, 452, 328, 2]...

Text: This makes me really angry and frustrated.
Tokens (8): ['This', 'Ä makes', 'Ä me', 'Ä really', 'Ä angry', 'Ä and', 'Ä frustrated', '.']
Input IDs: [0, 713, 817, 162, 269, 5800, 8, 8164, 4, 2]...

Text: I'm kinda scared about what's gonna happen...
Tokens (10): ['I', "'m", 'Ä kinda', 'Ä scared', 'Ä about', 'Ä what', "'s", 'Ä gonna', 'Ä happen', '...']
Input IDs: [0, 100, 437, 24282, 8265, 59, 99, 18, 6908, 1369, 734, 2]...



In [34]:
class EmotionDataset(Dataset):
    """
    PyTorch Dataset pentru emotion classification.
    Tokenizarea se face lazy (la __getitem__) pentru eficienÈ›Äƒ.
    """
    def __init__(self, dataframe, tokenizer, max_length=128):
        self.texts = dataframe['text'].tolist()
        self.labels = dataframe['label'].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        # Tokenizare - NU facem padding aici (se face Ã®n DataCollator)
        encoding = self.tokenizer( text, truncation=True, max_length=self.max_length, padding = False,
            return_tensors=None  # ReturneazÄƒ liste, nu tensori
        )
        
        return { 'input_ids': encoding['input_ids'], 'attention_mask': encoding['attention_mask'], 'labels': label }

In [35]:
train_dataset = EmotionDataset(train_df_oversampled, tokenizer, MAX_LENGTH)
val_dataset = EmotionDataset(val_df, tokenizer, MAX_LENGTH)
test_dataset = EmotionDataset(test_df, tokenizer, MAX_LENGTH)

print(f"\nâœ… Datasets created:")
print(f"   Train (oversampled): {len(train_dataset):,}")
print(f"   Val:                 {len(val_dataset):,}")
print(f"   Test:                {len(test_dataset):,}")


âœ… Datasets created:
   Train (oversampled): 32,172
   Val:                 2,000
   Test:                2,000


In [37]:
sample = train_dataset[0]
print("\nSample din train_dataset:")
print(f"  input_ids ({len(sample['input_ids'])} tokens): {sample['input_ids'][:10]}...")
print(f"  attention_mask: {sample['attention_mask'][:10]}...")
print(f"  label: {sample['labels']} ({id_to_label[sample['labels']]})")
print(f"\n  Decoded: {tokenizer.decode(sample['input_ids'])}")


Sample din train_dataset:
  input_ids (7 tokens): [0, 118, 222, 45, 619, 32386, 2]...
  attention_mask: [1, 1, 1, 1, 1, 1, 1]...
  label: 4 (sadness)

  Decoded: <s>i did not feel humiliated</s>


In [40]:
data_collator = DataCollatorWithPadding( tokenizer=tokenizer, padding=True, return_tensors='pt' )

print("âœ… DataCollatorWithPadding configured!")
print("   - Padding: dynamic (to max length in batch)")
print("   - Return: PyTorch tensors")

âœ… DataCollatorWithPadding configured!
   - Padding: dynamic (to max length in batch)
   - Return: PyTorch tensors


In [43]:
BATCH_SIZE = 32  

train_loader = DataLoader( train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=data_collator, num_workers=0, pin_memory=True if torch.cuda.is_available() else False )
val_loader = DataLoader( val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=data_collator, num_workers=0, pin_memory=True if torch.cuda.is_available() else False )
test_loader = DataLoader( test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=data_collator, num_workers=0, pin_memory=True if torch.cuda.is_available() else False )

print(f"\nâœ… DataLoaders created (batch_size={BATCH_SIZE}):")
print(f"   Train: {len(train_loader):,} batches")
print(f"   Val:   {len(val_loader):,} batches")
print(f"   Test:  {len(test_loader):,} batches")


âœ… DataLoaders created (batch_size=32):
   Train: 1,006 batches
   Val:   63 batches
   Test:  63 batches


In [44]:
print("\nTest batch from train_loader:")
print("=" * 50)

batch = next(iter(train_loader))
print(f"input_ids shape:      {batch['input_ids'].shape}")
print(f"attention_mask shape: {batch['attention_mask'].shape}")
print(f"labels shape:         {batch['labels'].shape}")
print(f"\nDecoded example:")
print(f"  {tokenizer.decode(batch['input_ids'][0], skip_special_tokens=True)[:100]}...")
print(f"  Label: {batch['labels'][0].item()} ({id_to_label[batch['labels'][0].item()]})")



Test batch from train_loader:
input_ids shape:      torch.Size([32, 54])
attention_mask shape: torch.Size([32, 54])
labels shape:         torch.Size([32])

Decoded example:
  i am starting to dislike the feeling of not caring about what is going to happen tomorrow...
  Label: 3 (love)


## Save Processed Data

In [45]:
train_df_oversampled.to_csv('data/processed/train_oversampled.csv', index=False)
train_df.to_csv('data/processed/train_original.csv', index=False)
val_df.to_csv('data/processed/val.csv', index=False)
test_df.to_csv('data/processed/test.csv', index=False)


mappings = {
    'label_to_id': label_to_id,
    'id_to_label': {str(k): v for k, v in id_to_label.items()},
    'num_labels': NUM_LABELS,
    'label_list': list(label_to_id.keys())
}

with open('data/processed/label_mappings.json', 'w') as f:
    json.dump(mappings, f, indent=2)

config = {
    'model_name': MODEL_NAME,
    'max_length': MAX_LENGTH,
    'batch_size': BATCH_SIZE,
    'random_state': RANDOM_STATE,
    'num_labels': NUM_LABELS,
    'slang_aware': True,
    'split_sizes': {
        'train_original': len(train_df),
        'train_oversampled': len(train_df_oversampled),
        'val': len(val_df),
        'test': len(test_df)
    }
}

with open('data/processed/config.json', 'w') as f:
    json.dump(config, f, indent=2)
    
print("âœ… All data saved!")
print("\nFiles saved:")
print("  - data/processed/train_oversampled.csv")
print("  - data/processed/train_original.csv")
print("  - data/processed/val.csv")
print("  - data/processed/test.csv")
print("  - data/processed/label_mappings.json")
print("  - data/processed/config.json")
print("  - reports/oversampling_stats.json")
print("  - reports/class_distribution.png")

âœ… All data saved!

Files saved:
  - data/processed/train_oversampled.csv
  - data/processed/train_original.csv
  - data/processed/val.csv
  - data/processed/test.csv
  - data/processed/label_mappings.json
  - data/processed/config.json
  - reports/oversampling_stats.json
  - reports/class_distribution.png
