# Complete Multiclass Aggression Detection Pipeline
## Data Processing → Feature Engineering → Ready for ML Models

**Complete workflow:**
1. Load 3 datasets (HatExplain, Davidson, Berkeley)
2. Intelligent relabeling to 5 classes
3. Advanced text preprocessing
4. Quality validation
5. Combine & deduplicate
6. Stratified train/val/test split
7. Feature engineering (up to 5,023 features: ≤5,000 TF-IDF + 10 linguistic + 5 sentiment + 8 aggression-signal)
8. Feature scaling
9. Export ready for ML models

**Output:** Train/Val/Test sets saved as raw-text CSVs (for BERT, RoBERTa, etc.) and as scaled feature matrices (for SVM and other classical models).

## CELL 1: Setup & Imports

In [None]:
import pandas as pd
import numpy as np
import re
import json
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import Counter
import logging
import unicodedata
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Pipeline progress is reported through a timestamped module-level logger
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Fetch the NLTK resources up front (the sentiment lexicon and stopword list
# are used by FeatureEngineer; 'punkt' is downloaded alongside them)
for nltk_resource in ('vader_lexicon', 'stopwords', 'punkt'):
    nltk.download(nltk_resource, quiet=True)

# Single seed shared by sampling and the train/val/test splits
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

banner = "=" * 80
print(banner)
print("MULTICLASS AGGRESSION DETECTION - COMPLETE PIPELINE")
print(banner)
print("✓ All imports loaded successfully")

MULTICLASS AGGRESSION DETECTION - COMPLETE PIPELINE
✓ All imports loaded successfully


## CELL 2: Phase 1 - Data Loader Class

In [None]:
class DataLoader:
    """Load the three source corpora into DataFrames.

    Every loader returns a DataFrame with ``text`` and ``source`` columns plus
    the dataset-specific columns read by ``IntelligentLabeler``
    (``hate_label``/``target`` or ``hate_score``/``target``).  A loader that
    cannot read its data logs the error and returns an empty DataFrame, so
    callers should check ``len(df) > 0``.
    """

    # Top-level identity-group flags (``target_<name>`` columns) published
    # with the Berkeley Measuring Hate Speech corpus.
    _BERKELEY_TARGET_FLAGS = (
        'race', 'religion', 'origin', 'gender', 'sexuality', 'age', 'disability',
    )

    @staticmethod
    def _hatexplain_record(item):
        """Convert one HateXplain post into a flat record.

        The published ``dataset.json`` stores three annotations per post under
        ``annotators`` (each with a ``label`` of ``hatespeech`` / ``offensive``
        / ``normal`` and a ``target`` list).  The post is flagged
        (``hate_label`` = 1) when more annotators chose a non-``normal`` label
        than chose ``normal``.  Pre-processed files that already carry
        ``hate_speech_idx`` / ``targeted_group`` keep those values.
        """
        annotations = item.get('annotators') or []

        if 'hate_speech_idx' in item:
            hate_label = item['hate_speech_idx']
        else:
            votes = [str(a.get('label', 'normal')).lower() for a in annotations]
            flagged = sum(vote != 'normal' for vote in votes)
            hate_label = 1 if flagged > len(votes) - flagged else 0

        if 'targeted_group' in item:
            target = str(item['targeted_group'])
        else:
            groups = []
            for annotation in annotations:
                for group in annotation.get('target') or []:
                    # "None" is the dataset's placeholder for "no target"
                    if group and str(group).lower() != 'none' and group not in groups:
                        groups.append(group)
            target = ' '.join(str(g) for g in groups) if groups else 'unknown'

        return {
            'text': ' '.join(item.get('post_tokens', [])),
            'source': 'HatExplain',
            'hate_label': hate_label,
            'target': target,
        }

    @staticmethod
    def _berkeley_record(item):
        """Convert one Berkeley row (pandas row or plain mapping) into a record.

        ``hate_score`` prefers the continuous ``hate_speech_score`` column and
        falls back to ``hate_speech`` and then to the neutral 0.5.  ``target``
        prefers an explicit ``target_group`` value and otherwise lists the
        ``target_<name>`` flags that are set (e.g. ``"race gender"``), or
        ``"unknown"`` when none is.
        """
        score = item.get('hate_speech_score')
        if score is None:
            score = item.get('hate_speech', 0.5)

        target = item.get('target_group')
        if target is None:
            # ``in (True, 1)`` accepts bool/numpy-bool/0-1 flags and rejects NaN
            active = [
                name for name in DataLoader._BERKELEY_TARGET_FLAGS
                if item.get(f'target_{name}') in (True, 1)
            ]
            target = ' '.join(active) if active else 'unknown'

        return {
            'text': str(item.get('text', '')),
            'source': 'Berkeley',
            'hate_score': float(score),
            'target': str(target),
        }

    @staticmethod
    def load_hatexplain(path='/content/hatexplain.json'):
        """Load the HateXplain JSON file at *path*.

        Returns a DataFrame with ``text``, ``source``, ``hate_label`` (0 = not
        flagged) and ``target`` columns, or an empty DataFrame on any error.
        """
        logger.info("Loading HatExplain...")
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            df = pd.DataFrame(
                [DataLoader._hatexplain_record(item) for item in data.values()]
            )
            logger.info(f"  ✓ HatExplain: {len(df)} samples")
            return df
        except Exception as e:
            logger.error(f"  ✗ HatExplain error: {e}")
            return pd.DataFrame()

    @staticmethod
    def load_davidson(path='/content/labeled_data.csv'):
        """Load the Davidson CSV at *path*.

        Returns a DataFrame with ``text``, ``source`` and ``hate_label`` (the
        file's ``class`` column, defaulting to 2) or an empty DataFrame on any
        error.
        """
        logger.info("Loading Davidson...")
        try:
            raw = pd.read_csv(path, lineterminator='\n')
            records = [
                {
                    'text': str(row.get('tweet', '')),
                    'source': 'Davidson',
                    'hate_label': row.get('class', 2),
                }
                for _, row in raw.iterrows()
            ]

            df = pd.DataFrame(records)
            logger.info(f"  ✓ Davidson: {len(df)} samples")
            return df
        except Exception as e:
            logger.error(f"  ✗ Davidson error: {e}")
            return pd.DataFrame()

    @staticmethod
    def load_berkeley(path='/content/measuring-hate-speech.parquet'):
        """Load the Berkeley Measuring Hate Speech data.

        Tries, in order: the parquet file at *path*, a CSV with the same stem
        (or *path* itself when it is not a ``.parquet`` path), and finally the
        Hugging Face hub.  Returns a DataFrame with ``text``, ``source``,
        ``hate_score`` and ``target`` columns, or an empty DataFrame when all
        attempts fail.
        """
        logger.info("Loading Berkeley...")
        try:
            # Try multiple formats
            if path.endswith('.parquet'):
                try:
                    df = pd.read_parquet(path)
                    logger.info("  → Read as parquet")
                except Exception:
                    logger.info("  → Parquet failed, trying CSV...")
                    df = pd.read_csv(path.replace('.parquet', '.csv'))
            else:
                df = pd.read_csv(path)

            # Iterate only over the columns the record builder reads; the full
            # table is wide and per-row iteration over all of it is slow.
            wanted = ['text', 'hate_speech_score', 'hate_speech', 'target_group']
            wanted += [f'target_{name}' for name in DataLoader._BERKELEY_TARGET_FLAGS]
            present = [col for col in wanted if col in df.columns]

            df = pd.DataFrame(
                [DataLoader._berkeley_record(row) for _, row in df[present].iterrows()]
            )
            logger.info(f"  ✓ Berkeley: {len(df)} samples")
            return df

        except Exception as e:
            logger.error(f"  ✗ Berkeley file error: {e}")
            logger.info("  → Attempting to load from HuggingFace...")
            try:
                from datasets import load_dataset
                # The corpus is published under the D-Lab organisation namespace
                berkeley_hf = load_dataset("ucberkeley-dlab/measuring-hate-speech")

                df = pd.DataFrame(
                    [DataLoader._berkeley_record(item) for item in berkeley_hf['train']]
                )
                logger.info(f"  ✓ Berkeley (HuggingFace): {len(df)} samples")
                return df
            except Exception as e2:
                logger.error(f"  ✗ Berkeley HuggingFace also failed: {e2}")
                logger.warning("  Berkeley data will NOT be included")
                return pd.DataFrame()

print("✓ DataLoader class defined with FIXED Berkeley loader")


✓ DataLoader class defined with FIXED Berkeley loader


## CELL 3: Phase 2 - Intelligent Labeler Class

In [None]:
class IntelligentLabeler:
    """Map each source dataset's rows onto the pipeline's five classes.

    Classes: 0 = no aggression, 1 = violence, 2 = gender, 3 = religion,
    4 = ethnicity.  Rows may be pandas rows or plain dicts; only ``row[...]``
    and ``row.get(...)`` are used.

    Keywords are matched at the *start* of a word (``kill`` matches "kill"
    and "killing", ``muslim`` matches "muslims"), so they no longer fire in
    the middle of unrelated words ("comment" → ``men``, "shoes" → ``hoe``,
    "change" → ``hang``).  Prefix hits such as "many" → ``man`` are still
    possible.

    The regular expressions are compiled once in ``__init__`` from the keyword
    dictionaries; editing those dictionaries afterwards has no effect on an
    existing instance.
    """

    def __init__(self):
        # COMPREHENSIVE keyword lists
        self.violence_kw = {
            'explicit': ['kill', 'death', 'die', 'murder', 'massacre', 'execute', 'behead'],
            'threats': ['bomb', 'shoot', 'hang', 'lynch', 'beat', 'harm', 'hurt'],
            'attack': ['attack', 'assault', 'violence', 'brutalize', 'torture']
        }

        self.gender_kw = {
            'female': ['woman', 'women', 'girl', 'bitch', 'whore', 'slut', 'hoe', 'female'],
            'lgbtq': ['gay', 'lesbian', 'transgender', 'trans', 'fag', 'dyke', 'queer'],
            'male': ['man', 'men', 'guy', 'asshole', 'bastard', 'dick'],
        }

        self.religion_kw = {
            'religions': ['muslim', 'islam', 'jewish', 'christian', 'hindu', 'buddhist', 'atheist', 'catholic', 'jew'],
            'places': ['mosque', 'synagogue', 'church', 'temple', 'madrassa'],
            'terms': ['allah', 'god', 'prayer', 'prayer', 'halal', 'kosher'],
            'leaders': ['imam', 'rabbi', 'priest', 'pope', 'pastor', 'ayatollah']
        }

        self.ethnicity_kw = {
            'races': ['black', 'white', 'asian', 'hispanic', 'african', 'caucasian', 'arab'],
            'nationalities': ['mexican', 'indian', 'chinese', 'japanese', 'african', 'middle eastern'],
            'immigration': ['immigrant', 'refugee', 'foreigner', 'illegal', 'alien'],
            'ethnicities': ['latino', 'indigenous', 'native american', 'negro']
        }

        # (pattern, class) pairs in priority order for HatExplain / Davidson.
        # Only the sub-lists named here are consulted; 'terms', 'leaders' and
        # 'ethnicities' are kept on the instance but not used for labeling.
        self._priority = [
            (self._compile(self.religion_kw, ('religions', 'places')), 3),
            (self._compile(self.ethnicity_kw, ('races', 'nationalities', 'immigration')), 4),
            (self._compile(self.gender_kw, ('female', 'lgbtq', 'male')), 2),
            (self._compile(self.violence_kw, ('explicit', 'threats', 'attack')), 1),
        ]

        # Berkeley relies mainly on the target annotation, so it consults a
        # narrower keyword set per class.
        self._berkeley_religion = self._compile(self.religion_kw, ('religions',))
        self._berkeley_ethnicity = self._compile(self.ethnicity_kw, ('races',))
        self._berkeley_gender = self._compile(self.gender_kw, ('female',))

    @staticmethod
    def _compile(groups, names):
        """Compile one regex that matches any keyword of the named sub-lists at a word start."""
        keywords = sorted(
            {kw for name in names for kw in groups[name]},
            key=lambda kw: (-len(kw), kw),
        )
        return re.compile(r'\b(?:' + '|'.join(re.escape(kw) for kw in keywords) + ')')

    def _classify(self, text):
        """Return the first class (religion, ethnicity, gender, violence) whose keywords occur in *text*, else 0."""
        for pattern, label in self._priority:
            if pattern.search(text):
                return label
        return 0

    def label_hatexplain(self, row):
        """Label a HatExplain row.

        Rows with ``hate_label == 0`` are class 0; otherwise the lower-cased
        ``target`` and ``text`` are searched together in priority order.
        """
        if row['hate_label'] == 0:
            return 0  # Not hate

        target = str(row.get('target', '')).lower()
        text = str(row.get('text', '')).lower()
        return self._classify(target + " " + text)

    def label_davidson(self, row):
        """Label a Davidson row.

        Rows with ``hate_label == 2`` ("neither") are class 0; otherwise the
        lower-cased ``text`` is searched in priority order.
        """
        if row['hate_label'] == 2:  # Neither
            return 0

        return self._classify(str(row.get('text', '')).lower())

    def label_berkeley(self, row):
        """Label a Berkeley row.

        Rows whose ``hate_score`` is below 0.5 are class 0.  Otherwise the
        ``target`` annotation decides first (religion, then race/ethnicity,
        then gender/women), backed by keyword hits in target + text; anything
        left over is treated as violence (class 1).
        """
        if row.get('hate_score', 0.5) < 0.5:
            return 0

        target = str(row.get('target', '')).lower()
        text = str(row.get('text', '')).lower()
        combined = target + " " + text

        # Religion FIRST
        if 'religion' in target or self._berkeley_religion.search(combined):
            return 3

        # Ethnicity SECOND
        if 'race' in target or 'ethnicity' in target or self._berkeley_ethnicity.search(combined):
            return 4

        # Gender THIRD
        if 'gender' in target or 'women' in target or self._berkeley_gender.search(combined):
            return 2

        # Violence DEFAULT
        return 1


## CELL 4: Phase 3 - Text Cleaner Class

In [None]:
class TextCleaner:
    """Normalize raw social-media text before validation and feature extraction."""

    # Patterns are compiled once at class creation instead of on every call.
    # URLs must start at a word boundary so that e.g. "awwww" is not mistaken
    # for a "www..." address.
    _URL_RE = re.compile(r'\bhttp\S+|\bwww\.\S+')
    _MENTION_RE = re.compile(r'@\w+')
    _HASHTAG_RE = re.compile(r'#(\w+)')
    _WHITESPACE_RE = re.compile(r'\s+')
    # NOTE(review): the U+24C2-U+1F251 and U+10000-U+10FFFF ranges cover far
    # more than emoji (e.g. CJK text is removed as well) — confirm this is
    # acceptable for the corpora being cleaned.
    _EMOJI_RE = re.compile(
        "["
        u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"
        u"\u3030"
        "]+",
        flags=re.UNICODE
    )

    @staticmethod
    def clean(text):
        """Return a cleaned, lower-cased copy of *text*.

        Steps, in order: NFKC Unicode normalization, URL removal, ``@mention``
        removal, ``#hashtag`` → ``hashtag``, emoji/symbol removal, whitespace
        collapsing and lower-casing.  Non-string or empty input yields ``""``.
        """
        if not isinstance(text, str) or len(text) == 0:
            return ""

        # Normalize Unicode
        text = unicodedata.normalize('NFKC', text)

        # Remove URLs
        text = TextCleaner._URL_RE.sub('', text)

        # Remove mentions; keep the word of a hashtag but drop the '#'
        text = TextCleaner._MENTION_RE.sub('', text)
        text = TextCleaner._HASHTAG_RE.sub(r'\1', text)

        # Remove emojis
        text = TextCleaner._EMOJI_RE.sub('', text)

        # Normalize whitespace
        text = TextCleaner._WHITESPACE_RE.sub(' ', text).strip()

        return text.lower()

print("✓ TextCleaner class defined")

✓ TextCleaner class defined


## CELL 5: Phase 4 - Quality Validator

In [None]:
class QualityValidator:
    """Length-based sanity filter for cleaned text."""

    # Accepted character-length window (inclusive on both ends)
    MIN_LENGTH = 5
    MAX_LENGTH = 512

    @staticmethod
    def validate(text):
        """Return True when *text* is within the length window and not only whitespace."""
        size = len(text)
        in_window = QualityValidator.MIN_LENGTH <= size <= QualityValidator.MAX_LENGTH
        # strip() is only evaluated once the length check has passed
        return in_window and text.strip() != ""

print("✓ QualityValidator class defined")

✓ QualityValidator class defined


## CELL 6: Phase 7 - Feature Engineer Class

In [None]:
class FeatureEngineer:
    """Build the model-ready feature matrix from text.

    ``extract_all_features`` concatenates, in this column order:

    * TF-IDF uni/bi-grams - at most 5,000 columns
    * 10 linguistic statistics
    * 5 sentiment scores
    * 8 aggression keyword signals

    for a total of up to 5,023 columns.
    """

    # Keyword lists behind the aggression-signal features (substring counts)
    _VIOLENCE_TERMS = ('kill', 'death', 'bomb', 'shoot', 'harm', 'threat', 'attack')
    _GENDER_TERMS = ('woman', 'women', 'girl', 'gay', 'lesbian', 'transgender')
    _RELIGION_TERMS = ('muslim', 'islam', 'jewish', 'christian', 'hindu')
    _ETHNICITY_TERMS = ('black', 'white', 'asian', 'mexican', 'immigrant')

    def __init__(self):
        self.sia = SentimentIntensityAnalyzer()
        self.stop_words = set(stopwords.words('english'))
        self.tfidf = TfidfVectorizer(
            max_features=5000,
            ngram_range=(1, 2),
            max_df=0.95,
            min_df=2,
            sublinear_tf=True
        )
        self.scaler = StandardScaler()

    def extract_tfidf(self, texts, fit=True):
        """Return the sparse TF-IDF matrix for *texts* (at most 5,000 columns).

        With ``fit=True`` the vectorizer vocabulary is learned from *texts*;
        with ``fit=False`` the previously fitted vocabulary is reused.
        """
        logger.info("  Extracting TF-IDF features (up to 5,000)...")
        if fit:
            return self.tfidf.fit_transform(texts)
        return self.tfidf.transform(texts)

    @staticmethod
    def _linguistic_features(text, stop_words):
        """Compute the 10 linguistic statistics for one text."""
        tokens = text.split()
        n_tokens = max(len(tokens), 1)   # avoid division by zero on empty text
        n_chars = max(len(text), 1)
        return {
            'text_length': len(text),
            'word_count': len(tokens),
            'avg_word_length': np.mean([len(w) for w in tokens]) if tokens else 0,
            'unique_word_ratio': len(set(tokens)) / n_tokens,
            'exclamation_count': text.count('!'),
            'question_count': text.count('?'),
            # NOTE(review): TextCleaner.clean lower-cases text, so this is 0
            # for input that went through the cleaning phase.
            'uppercase_ratio': sum(1 for c in text if c.isupper()) / n_chars,
            'punctuation_density': sum(1 for c in text if not c.isalnum() and not c.isspace()) / n_chars,
            # number of positions where a character equals its successor
            'repeated_chars': sum(1 for a, b in zip(text, text[1:]) if a == b),
            'stopword_ratio': len([w for w in tokens if w in stop_words]) / n_tokens,
        }

    def extract_linguistic(self, texts):
        """Return a DataFrame with the 10 linguistic features, one row per text."""
        logger.info("  Extracting linguistic features (10)...")
        rows = [
            self._linguistic_features(text, self.stop_words)
            for text in tqdm(texts, desc="Linguistic", disable=True)
        ]
        return pd.DataFrame(rows)

    def extract_sentiment(self, texts):
        """Return a DataFrame with the 5 sentiment features, one row per text."""
        logger.info("  Extracting sentiment features (5)...")
        sentiments = []

        for text in tqdm(texts, desc="Sentiment", disable=True):
            scores = self.sia.polarity_scores(text)
            sentiments.append({
                'sentiment_compound': scores['compound'],
                'sentiment_positive': scores['pos'],
                'sentiment_negative': scores['neg'],
                'sentiment_neutral': scores['neu'],
                'sentiment_is_negative': 1 if scores['compound'] < 0 else 0,
            })

        return pd.DataFrame(sentiments)

    @staticmethod
    def _signal_features(text):
        """Compute the 8 aggression signals (4 keyword counts + 4 presence flags) for one text."""
        counts = {
            'violence': sum(text.count(w) for w in FeatureEngineer._VIOLENCE_TERMS),
            'gender': sum(text.count(w) for w in FeatureEngineer._GENDER_TERMS),
            'religion': sum(text.count(w) for w in FeatureEngineer._RELIGION_TERMS),
            'ethnicity': sum(text.count(w) for w in FeatureEngineer._ETHNICITY_TERMS),
        }
        features = {f'{name}_keywords': total for name, total in counts.items()}
        # a keyword is present exactly when its substring count is positive
        features.update({f'has_{name}': int(total > 0) for name, total in counts.items()})
        return features

    def extract_aggression_signals(self, texts):
        """Return a DataFrame with the 8 aggression-signal features, one row per text."""
        logger.info("  Extracting aggression signals (8)...")
        rows = [
            self._signal_features(text)
            for text in tqdm(texts, desc="Signals", disable=True)
        ]
        return pd.DataFrame(rows)

    def extract_all_features(self, texts, fit=True):
        """Return one dense array: TF-IDF, linguistic, sentiment and signal columns side by side.

        ``fit`` is forwarded to ``extract_tfidf``; pass ``True`` for the
        training split only.
        """
        logger.info("  EXTRACTING ALL FEATURES...")

        X_tfidf = self.extract_tfidf(texts, fit=fit)
        X_ling = self.extract_linguistic(texts)
        X_sent = self.extract_sentiment(texts)
        X_agg = self.extract_aggression_signals(texts)

        # The TF-IDF matrix is densified here so all groups can be stacked
        return np.hstack([
            X_tfidf.toarray(),
            X_ling.values,
            X_sent.values,
            X_agg.values
        ])

    def scale_features(self, X, fit=True):
        """Standardize *X*; fit the scaler when ``fit=True``, otherwise reuse the fitted one."""
        logger.info("  Scaling features...")
        if fit:
            return self.scaler.fit_transform(X)
        return self.scaler.transform(X)

print("✓ FeatureEngineer class defined")

✓ FeatureEngineer class defined


## CELL 7: Load Datasets (Phase 1)

In [None]:
banner = "=" * 80
logger.info("\n" + banner)
logger.info("PHASE 1: LOADING DATASETS")
logger.info(banner)

# One DataFrame per source corpus; a loader that fails returns an empty frame
loader = DataLoader()
df_hatexplain = loader.load_hatexplain()
df_davidson = loader.load_davidson()
df_berkeley = loader.load_berkeley()

## CELL 8: Relabel Datasets (Phase 2)

In [None]:
banner = "=" * 80
logger.info("\n" + banner)
logger.info("PHASE 2: RELABELING TO 5 CLASSES")
logger.info(banner)

labeler = IntelligentLabeler()

# (heading to print, frame to label in place, per-row labeling function)
labeling_plan = (
    ("HatExplain class distribution:", df_hatexplain, labeler.label_hatexplain),
    ("\nDavidson class distribution:", df_davidson, labeler.label_davidson),
    ("\nBerkeley class distribution:", df_berkeley, labeler.label_berkeley),
)

for heading, frame, label_row in labeling_plan:
    if len(frame) == 0:
        # Dataset failed to load; nothing to label
        continue
    frame['label'] = frame.apply(label_row, axis=1)
    print(heading)
    print(frame['label'].value_counts().sort_index())

HatExplain class distribution:
label
0    20148
Name: count, dtype: int64

Davidson class distribution:
label
0     7971
1      259
2    15744
3       84
4      725
Name: count, dtype: int64

Berkeley class distribution:
label
1    68595
2    24692
3    20527
4    21742
Name: count, dtype: int64


## CELL 9: Clean Text (Phase 3)

In [None]:
banner = "=" * 80
logger.info("\n" + banner)
logger.info("PHASE 3: TEXT CLEANING")
logger.info(banner)

cleaner = TextCleaner()

# Clean the text column of every dataset that actually loaded, in place
for source_name, frame in (
    ("HatExplain", df_hatexplain),
    ("Davidson", df_davidson),
    ("Berkeley", df_berkeley),
):
    if len(frame) > 0:
        print(f"Cleaning {source_name}...")
        frame['text'] = frame['text'].apply(cleaner.clean)

print("✓ Text cleaning complete")

Cleaning HatExplain...
Cleaning Davidson...
Cleaning Berkeley...
✓ Text cleaning complete


## CELL 10: Quality Validation (Phase 4)

In [None]:
banner = "=" * 80
logger.info("\n" + banner)
logger.info("PHASE 4: QUALITY VALIDATION")
logger.info(banner)

validator = QualityValidator()


def _keep_valid(frame, source_name):
    """Drop rows whose text fails validation and log how many were removed."""
    if len(frame) == 0:
        # Dataset was never loaded; pass it through untouched
        return frame
    before = len(frame)
    kept = frame[frame['text'].apply(validator.validate)].reset_index(drop=True)
    logger.info(f"{source_name}: {before} → {len(kept)} (removed {before-len(kept)})")
    return kept


df_hatexplain = _keep_valid(df_hatexplain, "HatExplain")
df_davidson = _keep_valid(df_davidson, "Davidson")
df_berkeley = _keep_valid(df_berkeley, "Berkeley")

## CELL 11: Combine & Deduplicate (Phase 5)

In [None]:
logger.info("\n" + "="*80)
logger.info("PHASE 5: COMBINE, DEDUPLICATE & SAMPLE")
logger.info("="*80)

# Keep only the datasets that loaded, reduced to the shared columns
dfs = [
    frame[['text', 'label', 'source']]
    for frame in (df_hatexplain, df_davidson, df_berkeley)
    if len(frame) > 0
]
if not dfs:
    # pd.concat([]) would fail with a message that hides the real cause
    raise ValueError("No datasets were loaded; cannot build the combined dataset")

combined = pd.concat(dfs, ignore_index=True)

# Remove duplicates (the first occurrence of each text is kept)
before_dedup = len(combined)
combined = combined.drop_duplicates(subset=['text']).reset_index(drop=True)
logger.info(f"After dedup: {len(combined)} (removed {before_dedup - len(combined)})")

# ===== SAMPLE DOWN TO MANAGEABLE SIZE =====
TARGET_SIZE = 20000  # Set your target

if len(combined) > TARGET_SIZE:
    logger.info(f"\nDataset too large ({len(combined)} > {TARGET_SIZE})")
    logger.info("Sampling down using stratified sampling...")

    # Stratified sample by class: each class keeps its share of TARGET_SIZE
    # (rounded down, at least one row).  The groups are sampled explicitly
    # rather than through groupby.apply, which in newer pandas no longer
    # passes the grouping column ('label') to the applied function.
    total_before = len(combined)
    sampled_parts = [
        group.sample(
            n=min(len(group), max(1, int(TARGET_SIZE * len(group) / total_before))),
            random_state=RANDOM_SEED,
        )
        for _, group in combined.groupby('label')
    ]
    combined = pd.concat(sampled_parts).reset_index(drop=True)

    logger.info(f"After sampling: {len(combined)} samples")

# Verify class distribution
logger.info("\nFinal class distribution:")
class_names = {0: 'No Aggression', 1: 'Violence', 2: 'Gender', 3: 'Religion', 4: 'Ethnicity'}
for cls in sorted(combined['label'].unique()):
    count = len(combined[combined['label'] == cls])
    pct = count / len(combined) * 100
    logger.info(f"  {class_names[cls]}: {count} ({pct:.1f}%)")


In [None]:
# CELL 11.5: Save Combined Dataset

banner = "=" * 80
logger.info("\n" + banner)
logger.info("SAVING COMBINED DATASET")
logger.info(banner)

# Persist the combined dataset as it stands after phase 5
combined.to_csv('combined_dataset_full.csv', index=False)
logger.info("✓ Saved: combined_dataset_full.csv")
logger.info(f"  Total samples: {len(combined)}")
logger.info("  Total features: 3 (text, label, source)")

# Summary statistics written next to the CSV
summary_names = {0: 'No Aggression', 1: 'Violence', 2: 'Gender', 3: 'Religion', 4: 'Ethnicity'}
source_counts = combined['source'].value_counts()
label_counts = combined['label'].value_counts()

combined_stats = {
    'total_samples': len(combined),
    'sources_breakdown': source_counts.to_dict(),
    # cast to plain ints so the values are JSON-serializable
    'class_distribution': {int(k): int(v) for k, v in label_counts.items()},
    'class_names': dict(summary_names),
}

with open('combined_stats.json', 'w') as f:
    json.dump(combined_stats, f, indent=2)
logger.info("✓ Saved: combined_stats.json")

# Display summary
print("\n" + "=" * 60)
print("COMBINED DATASET SUMMARY")
print("=" * 60)
print(f"\nTotal samples: {len(combined)}")
print("\nBy source:")
for source, count in source_counts.items():
    print(f"  {source}: {count}")
print("\nBy class:")
for cls in sorted(combined['label'].unique()):
    count = label_counts[cls]
    class_name = summary_names[cls]
    pct = count / len(combined) * 100
    print(f"  {class_name}: {count} ({pct:.1f}%)")
print("\nFirst 5 samples:")
print(combined.head(5))



COMBINED DATASET SUMMARY

Total samples: 19997

By source:
  Berkeley: 9312
  Davidson: 5858
  HatExplain: 4827

By class:
  No Aggression: 6695 (33.5%)
  Violence: 5392 (27.0%)
  Gender: 5679 (28.4%)
  Religion: 819 (4.1%)
  Ethnicity: 1412 (7.1%)

First 5 samples:
                                                text  label      source
0  since it has already been established that as ...      0  HatExplain
1  <user> makes no sense to rent homie it ’ actua...      0  HatExplain
2  fuck war fuck bullying fuck cancer fuck racism...      0  HatExplain
3  <user> you do not consider anything presented ...      0  HatExplain
4  on average blacks commit far more crimes but y...      0  HatExplain


## CELL 12: Stratified Split (Phase 6)

In [None]:
banner = "=" * 80
logger.info("\n" + banner)
logger.info("PHASE 6: STRATIFIED TRAIN/VAL/TEST SPLIT")
logger.info(banner)

X = combined['text'].values
y = combined['label'].values

# First cut: hold out 20% as the test set, stratified by label
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    random_state=RANDOM_SEED,
)

# Second cut: 12.5% of the remainder becomes validation
# (overall roughly 70% train / 10% val / 20% test)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.125,
    stratify=y_temp,
    random_state=RANDOM_SEED,
)

for split_name, split_texts in (("Train", X_train), ("Val", X_val), ("Test", X_test)):
    logger.info(f"{split_name}: {len(split_texts)} ({len(split_texts)/len(X)*100:.1f}%)")

## CELL 13: Feature Engineering (Phase 7)

In [None]:
logger.info("\n" + "="*80)
# The banner no longer states a fixed feature total: the real width depends on
# the fitted TF-IDF vocabulary and is logged below once it is known.
logger.info("PHASE 7: FEATURE ENGINEERING")
logger.info("="*80)

fe = FeatureEngineer()

# The TF-IDF vocabulary is fitted on the training split only (fit=True);
# validation and test reuse it (fit=False) so no information leaks from them.
logger.info("\nExtracting features from TRAINING set...")
X_train_feat = fe.extract_all_features(X_train, fit=True)
print(f"Train features shape: {X_train_feat.shape}")

logger.info("\nExtracting features from VALIDATION set...")
X_val_feat = fe.extract_all_features(X_val, fit=False)
print(f"Val features shape: {X_val_feat.shape}")

logger.info("\nExtracting features from TEST set...")
X_test_feat = fe.extract_all_features(X_test, fit=False)
print(f"Test features shape: {X_test_feat.shape}")

logger.info(f"Total features per sample: {X_train_feat.shape[1]:,}")

Train features shape: (13997, 5023)
Val features shape: (2000, 5023)
Test features shape: (4000, 5023)


## CELL 14: Scale Features (Phase 8)

In [None]:
banner = "=" * 80
logger.info("\n" + banner)
logger.info("PHASE 8: SCALING FEATURES")
logger.info(banner)

# The scaler is fitted on the training matrix and then reused unchanged
X_train_scaled = fe.scale_features(X_train_feat, fit=True)
X_val_scaled = fe.scale_features(X_val_feat, fit=False)
X_test_scaled = fe.scale_features(X_test_feat, fit=False)

for split_name, matrix in (
    ("train", X_train_scaled),
    ("val", X_val_scaled),
    ("test", X_test_scaled),
):
    print(f"Scaled {split_name} shape: {matrix.shape}")

Scaled train shape: (13997, 5023)
Scaled val shape: (2000, 5023)
Scaled test shape: (4000, 5023)


## CELL 15: Save All Data (Phase 9)

In [None]:
banner = "=" * 80
logger.info("\n" + banner)
logger.info("PHASE 9: SAVING PROCESSED DATA")
logger.info(banner)

# Raw text + label for each split (train.csv, val.csv, test.csv)
for split_name, split_texts, split_labels in (
    ("train", X_train, y_train),
    ("val", X_val, y_val),
    ("test", X_test, y_test),
):
    pd.DataFrame({'text': split_texts, 'label': split_labels}).to_csv(
        f'{split_name}.csv', index=False
    )
logger.info("✓ Saved: train.csv, val.csv, test.csv")

# Scaled feature matrices first, then the label vectors
for split_name, matrix in (
    ("train", X_train_scaled),
    ("val", X_val_scaled),
    ("test", X_test_scaled),
):
    np.save(f'X_{split_name}_features.npy', matrix)
for split_name, split_labels in (("train", y_train), ("val", y_val), ("test", y_test)):
    np.save(f'y_{split_name}.npy', split_labels)
logger.info("✓ Saved: X_train_features.npy, X_val_features.npy, X_test_features.npy")

# Run description stored alongside the arrays
metadata = dict(
    total_samples=len(combined),
    train_size=len(X_train),
    val_size=len(X_val),
    test_size=len(X_test),
    n_features=X_train_scaled.shape[1],
    classes=5,
    class_names=class_names,
    random_seed=RANDOM_SEED,
)

with open('metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)
logger.info("✓ Saved: metadata.json")

## CELL 16: Final Summary

In [None]:
logger.info("\n" + "="*80)
logger.info("✅ COMPLETE PIPELINE FINISHED!")
logger.info("="*80)
logger.info(f"\nOutput files:")
logger.info(f"  - train.csv, val.csv, test.csv (raw text + labels)")
logger.info(f"  - X_train_features.npy (shape: {X_train_scaled.shape})")
logger.info(f"  - X_val_features.npy (shape: {X_val_scaled.shape})")
logger.info(f"  - X_test_features.npy (shape: {X_test_scaled.shape})")
logger.info(f"  - y_train.npy, y_val.npy, y_test.npy")
logger.info(f"  - metadata.json")
logger.info(f"\n✓ Ready for model training!")

# Report the breakdown from the real matrix width instead of fixed numbers.
# FeatureEngineer emits 10 linguistic, 5 sentiment and 8 aggression-signal
# columns; every remaining column is a TF-IDF term.
n_linguistic, n_sentiment, n_signals = 10, 5, 8
n_total = X_train_scaled.shape[1]
n_tfidf = n_total - (n_linguistic + n_sentiment + n_signals)

logger.info(f"\nFeature breakdown:")
logger.info(f"  - TF-IDF: {n_tfidf:,} features")
logger.info(f"  - Linguistic: {n_linguistic} features")
logger.info(f"  - Sentiment: {n_sentiment} features")
logger.info(f"  - Aggression Signals: {n_signals} features")
logger.info(f"  - TOTAL: {n_total:,} features")