In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from collections import defaultdict, Counter

# === 1️⃣ Load data ===
df = pd.read_csv("/kaggle/input/twitter-and-reddit-sentimental-analysis-dataset/Reddit_Data.csv")
df = df[['clean_comment', 'category']].dropna()

print("Original dataset distribution:")
print(df['category'].value_counts())
print(f"\nClass proportions:")
print(df['category'].value_counts(normalize=True))

# === STRATEGY 1: BALANCED UNDERSAMPLING ===
print("\n" + "="*60)
print("STRATEGY 1: Balanced Undersampling")
print("="*60)

# Find the minority class size
min_class_size = df['category'].value_counts().min()
print(f"Minority class size: {min_class_size}")

# Sample equal amounts from each class
df_balanced = pd.concat([
    df[df['category'] == -1].sample(min_class_size, random_state=42),
    df[df['category'] == 0].sample(min_class_size, random_state=42),
    df[df['category'] == 1].sample(min_class_size, random_state=42)
]).sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle

print(f"Balanced dataset size: {len(df_balanced)}")
print("Balanced distribution:")
print(df_balanced['category'].value_counts())

# Split balanced data
train_df_balanced, test_df_balanced = train_test_split(
    df_balanced, test_size=0.3, random_state=42, stratify=df_balanced['category']
)

# === STRATEGY 2: STRATIFIED SPLIT (keeps original distribution) ===
print("\n" + "="*60)
print("STRATEGY 2: Stratified Split (Original Distribution)")
print("="*60)

train_df_stratified, test_df_stratified = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df['category']
)

print("Training set distribution:")
print(train_df_stratified['category'].value_counts(normalize=True))
print("\nTest set distribution:")
print(test_df_stratified['category'].value_counts(normalize=True))

# === STRATEGY 3: WEIGHTED BALANCED (Oversample minority, undersample majority) ===
print("\n" + "="*60)
print("STRATEGY 3: Hybrid Sampling (Target size between min and max)")
print("="*60)

class_counts = df['category'].value_counts()
target_size = int(np.mean(class_counts))  # Use average as target
print(f"Target size per class: {target_size}")

df_hybrid = pd.concat([
    df[df['category'] == -1].sample(
        min(target_size, class_counts[-1]), 
        replace=(target_size > class_counts[-1]), 
        random_state=42
    ),
    df[df['category'] == 0].sample(
        min(target_size, class_counts[0]), 
        replace=(target_size > class_counts[0]), 
        random_state=42
    ),
    df[df['category'] == 1].sample(
        min(target_size, class_counts[1]), 
        replace=(target_size > class_counts[1]), 
        random_state=42
    )
]).sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Hybrid dataset size: {len(df_hybrid)}")
print("Hybrid distribution:")
print(df_hybrid['category'].value_counts())

train_df_hybrid, test_df_hybrid = train_test_split(
    df_hybrid, test_size=0.3, random_state=42, stratify=df_hybrid['category']
)

# === Function to train and evaluate ===
def train_and_evaluate(train_df, test_df, strategy_name):
    print("\n" + "="*60)
    print(f"Training with: {strategy_name}")
    print("="*60)
    
    # Preprocessing
    negation_words = {"not", "no", "nor", "never", "n't", "don't", "dont", "cannot"}
    stop_words = set(stopwords.words("english")) - negation_words
    
    def clean_tweet(text):
        if not text:
            return []
        text = re.sub(r"http\S+|www\S+|https\S+", '', text)
        text = re.sub(r'[^A-Za-z\s]', '', text)
        text = text.lower().split()
        return [w for w in text if w and w not in stop_words]
    
    train_df["tokens"] = train_df["clean_comment"].astype(str).apply(clean_tweet)
    test_df["tokens"] = test_df["clean_comment"].astype(str).apply(clean_tweet)
    
    # Build word probabilities
    vocab_stats = defaultdict(lambda: [0, 0, 0])
    for _, row in train_df.iterrows():
        cat = int(row["category"])
        for w in row["tokens"]:
            if cat == -1:
                vocab_stats[w][0] += 1
            elif cat == 0:
                vocab_stats[w][1] += 1
            elif cat == 1:
                vocab_stats[w][2] += 1
    
    word_probs = {}
    for w, counts in vocab_stats.items():
        total = sum(counts)
        if total >= 2:  # Minimum 2 occurrences
            probs = np.array(counts) / total
            word_probs[w] = probs
    
    word_prob_df = pd.DataFrame(word_probs).T
    word_prob_df.columns = ['p(-1)', 'p(0)', 'p(1)']
    
    # Compute thresholds
    max_p = word_prob_df.max(axis=1)
    second_p = word_prob_df.apply(lambda x: sorted(x, reverse=True)[1], axis=1)
    ratio = max_p / (second_p + 1e-9)
    strong_thresh = np.percentile(ratio, 75)
    mild_thresh = np.percentile(ratio, 55)
    
    # Tag words
    def tag_word(row):
        probs = row.values
        labels = ['neg', 'neu', 'pos']
        max_idx = np.argmax(probs)
        sorted_probs = np.sort(probs)[::-1]
        ratio = sorted_probs[0] / (sorted_probs[1] + 1e-9)
        
        if ratio >= strong_thresh:
            return labels[max_idx]
        elif ratio >= mild_thresh:
            if labels[max_idx] == 'pos' and probs[1] > 0.15:
                return 'mild_pos'
            elif labels[max_idx] == 'neg' and probs[1] > 0.15:
                return 'mild_neg'
            else:
                return labels[max_idx]
        else:
            return 'contextual'
    
    word_prob_df["tag"] = word_prob_df.apply(tag_word, axis=1)
    word_tag_dict = word_prob_df["tag"].to_dict()
    
    # Tag tweets
    def tag_tweet(tokens):
        return [word_tag_dict[w] for w in tokens if w in word_tag_dict]
    
    train_df["tags"] = train_df["tokens"].apply(tag_tweet)
    test_df["tags"] = test_df["tokens"].apply(tag_tweet)
    
    # Compress tags
    def compress_tags(tags, max_tags=6):
        tags = [t for t in tags if t != "contextual"]
        if not tags:
            return "contextual_only"
        priority = {'pos': 3, 'neg': 3, 'mild_pos': 2, 'mild_neg': 2, 'neu': 1}
        tags_sorted = sorted(tags, key=lambda t: priority.get(t, 0), reverse=True)[:max_tags]
        tag_count = Counter(tags_sorted)
        return "_".join([f"{k}:{v}" for k, v in sorted(tag_count.items())])
    
    train_df["signature"] = train_df["tags"].apply(compress_tags)
    
    # Build probability table
    alpha = 1
    tag_class_counts = defaultdict(lambda: Counter())
    
    for _, row in train_df.iterrows():
        tag_class_counts[row["signature"]][int(row["category"])] += 1
    
    prob_table = {}
    for sig, class_counts in tag_class_counts.items():
        total = sum(class_counts.values())
        probs = {}
        for c in [-1, 0, 1]:
            probs[c] = (class_counts[c] + alpha) / (total + alpha * 3)
        prob_table[sig] = probs
    
    prob_df = pd.DataFrame(prob_table).T.fillna(1/3)
    prob_df.columns = ['p(-1)', 'p(0)', 'p(1)']
    
    # Drop rare signatures
    min_occ = 3  # Lower threshold for balanced data
    valid_sigs = [sig for sig, counts in tag_class_counts.items() if sum(counts.values()) >= min_occ]
    prob_df = prob_df.loc[valid_sigs]
    
    print(f"Vocabulary size: {len(word_tag_dict)}")
    print(f"Valid signatures: {len(valid_sigs)}")
    
    # Predict
    def predict_argmax(tags):
        sig = compress_tags(tags)
        if sig not in prob_df.index:
            return 0
        row = prob_df.loc[sig]
        max_col = row.idxmax()
        class_val = int(max_col.split('(')[1].split(')')[0])
        return class_val
    
    preds = [predict_argmax(tags) for tags in test_df["tags"]]
    test_df["pred"] = preds
    
    accuracy = np.mean(test_df["pred"] == test_df["category"])
    
    # Metrics
    from sklearn.metrics import classification_report, confusion_matrix
    
    print(f"\nACCURACY: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print("\nClassification Report:")
    print(classification_report(test_df["category"], test_df["pred"], 
                              target_names=['Negative', 'Neutral', 'Positive'],
                              zero_division=0))
    
    print("\nConfusion Matrix:")
    cm = confusion_matrix(test_df["category"], test_df["pred"])
    cm_df = pd.DataFrame(cm, 
                         index=['True Neg', 'True Neu', 'True Pos'],
                         columns=['Pred Neg', 'Pred Neu', 'Pred Pos'])
    print(cm_df)
    
    return accuracy

# === Run all strategies ===
results = {}

print("\n" + "#"*60)
print("COMPARING ALL SAMPLING STRATEGIES")
print("#"*60)

results['Balanced Undersampling'] = train_and_evaluate(
    train_df_balanced.copy(), test_df_balanced.copy(), 
    "Balanced Undersampling"
)

results['Stratified (Original)'] = train_and_evaluate(
    train_df_stratified.copy(), test_df_stratified.copy(), 
    "Stratified Split (Original Distribution)"
)

results['Hybrid Sampling'] = train_and_evaluate(
    train_df_hybrid.copy(), test_df_hybrid.copy(), 
    "Hybrid Sampling"
)

# === Final comparison ===
print("\n" + "="*60)
print("FINAL COMPARISON")
print("="*60)
for strategy, acc in results.items():
    print(f"{strategy:35s}: {acc:.4f} ({acc*100:.2f}%)")

best_strategy = max(results, key=results.get)
print(f"\n🏆 BEST STRATEGY: {best_strategy}")
print(f"   Accuracy: {results[best_strategy]:.4f} ({results[best_strategy]*100:.2f}%)")

Original dataset distribution:
category
 1    15830
 0    13042
-1     8277
Name: count, dtype: int64

Class proportions:
category
 1    0.426122
 0    0.351073
-1    0.222805
Name: proportion, dtype: float64

STRATEGY 1: Balanced Undersampling
Minority class size: 8277
Balanced dataset size: 24831
Balanced distribution:
category
-1    8277
 1    8277
 0    8277
Name: count, dtype: int64

STRATEGY 2: Stratified Split (Original Distribution)
Training set distribution:
category
 1    0.426127
 0    0.351061
-1    0.222812
Name: proportion, dtype: float64

Test set distribution:
category
 1    0.426110
 0    0.351099
-1    0.222790
Name: proportion, dtype: float64

STRATEGY 3: Hybrid Sampling (Target size between min and max)
Target size per class: 12383
Hybrid dataset size: 33043
Hybrid distribution:
category
 1    12383
 0    12383
-1     8277
Name: count, dtype: int64

############################################################
COMPARING ALL SAMPLING STRATEGIES
#######################

In [2]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from collections import defaultdict, Counter

# === 1️⃣ Load data ===
df = pd.read_csv("/kaggle/input/twitter-and-reddit-sentimental-analysis-dataset/Twitter_Data.csv")
df = df[['clean_text', 'category']].dropna()

print("="*60)
print("ORIGINAL DATASET")
print("="*60)
print(f"Total samples: {len(df)}")
print("\nClass distribution:")
print(df['category'].value_counts())
print(f"\nClass proportions:")
print(df['category'].value_counts(normalize=True))

# === STRATEGY 1: BALANCED UNDERSAMPLING ===
print("\n" + "="*60)
print("STRATEGY 1: Balanced Undersampling")
print("="*60)

# Find the minority class size
class_counts = df['category'].value_counts()
min_class_size = class_counts.min()
print(f"Original class counts:")
for cat in [-1, 0, 1]:
    print(f"  Class {cat}: {class_counts.get(cat, 0)} samples")
print(f"\nMinority class size: {min_class_size}")

# Sample equal amounts from each class
df_balanced = pd.concat([
    df[df['category'] == -1].sample(min_class_size, random_state=42),
    df[df['category'] == 0].sample(min_class_size, random_state=42),
    df[df['category'] == 1].sample(min_class_size, random_state=42)
]).sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle

print(f"\n✂️ Sampled {min_class_size} from each class")
print(f"📦 Total balanced dataset: {len(df_balanced)} samples ({len(df_balanced)/len(df)*100:.1f}% of original)")
print(f"🗑️ Removed: {len(df) - len(df_balanced)} samples ({(len(df)-len(df_balanced))/len(df)*100:.1f}%)")
print("\nBalanced distribution:")
print(df_balanced['category'].value_counts())

# Split: Use ALL balanced data for training, test on ORIGINAL distribution
# This is more realistic - train balanced, test on real-world imbalanced data
train_df_balanced = df_balanced.copy()

# Create a separate test set from ORIGINAL data (not used in training)
remaining_indices = df.index.difference(df_balanced.index)
test_df_balanced = df.loc[remaining_indices].sample(
    frac=0.3, random_state=42
)  # 30% of remaining data

print(f"\n📚 Training: {len(train_df_balanced):,} samples (balanced)")
print(f"🧪 Testing: {len(test_df_balanced):,} samples (from original distribution)")
print(f"   Test set is {len(test_df_balanced)/len(df)*100:.1f}% of original data")
print("\nTest set distribution (original imbalance):")
print(test_df_balanced['category'].value_counts(normalize=True))

# === STRATEGY 2: STRATIFIED SPLIT (keeps original distribution) ===
print("\n" + "="*60)
print("STRATEGY 2: Stratified Split (Original Distribution)")
print("="*60)

train_df_stratified, test_df_stratified = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df['category']
)

print(f"📦 Total dataset: {len(df)} samples (100% of original)")
print(f"   Training: {len(train_df_stratified)} samples (70%)")
print(f"   Testing: {len(test_df_stratified)} samples (30%)")
print("\nTraining set distribution:")
print(train_df_stratified['category'].value_counts())
print("\nTest set distribution:")
print(test_df_stratified['category'].value_counts())

# === STRATEGY 3: WEIGHTED BALANCED (Oversample minority, undersample majority) ===
print("\n" + "="*60)
print("STRATEGY 3: Hybrid Sampling (Target size between min and max)")
print("="*60)

class_counts = df['category'].value_counts()
target_size = int(np.mean(class_counts))  # Use average as target
print(f"Original class sizes:")
for cat in [-1, 0, 1]:
    count = class_counts.get(cat, 0)
    print(f"  Class {cat}: {count} samples")
print(f"\nTarget size per class: {target_size} (average of all classes)")

samples_per_class = {}
for cat in [-1, 0, 1]:
    original_count = class_counts.get(cat, 0)
    sampled_count = min(target_size, original_count)
    will_oversample = target_size > original_count
    samples_per_class[cat] = {
        'original': original_count,
        'sampled': sampled_count if not will_oversample else target_size,
        'oversampled': will_oversample
    }

print("\nSampling details:")
for cat in [-1, 0, 1]:
    info = samples_per_class[cat]
    status = "⬆️ OVERSAMPLING" if info['oversampled'] else "⬇️ UNDERSAMPLING"
    print(f"  Class {cat}: {info['original']:,} → {info['sampled']:,} {status}")

df_hybrid = pd.concat([
    df[df['category'] == -1].sample(
        min(target_size, class_counts[-1]), 
        replace=(target_size > class_counts[-1]), 
        random_state=42
    ),
    df[df['category'] == 0].sample(
        min(target_size, class_counts[0]), 
        replace=(target_size > class_counts[0]), 
        random_state=42
    ),
    df[df['category'] == 1].sample(
        min(target_size, class_counts[1]), 
        replace=(target_size > class_counts[1]), 
        random_state=42
    )
]).sample(frac=1, random_state=42).reset_index(drop=True)

print(f"\n📦 Total hybrid dataset: {len(df_hybrid)} samples ({len(df_hybrid)/len(df)*100:.1f}% of original)")
print("Hybrid distribution:")
print(df_hybrid['category'].value_counts())

# Split: Use ALL hybrid data for training, test on remaining original data
train_df_hybrid = df_hybrid.copy()

# Test on data not used in hybrid sampling
remaining_indices_hybrid = df.index.difference(df_hybrid.index)
if len(remaining_indices_hybrid) > 0:
    test_df_hybrid = df.loc[remaining_indices_hybrid].sample(
        min(len(remaining_indices_hybrid), int(len(df)*0.3)), 
        random_state=42
    )
else:
    # If hybrid used all data, split normally
    train_df_hybrid, test_df_hybrid = train_test_split(
        df_hybrid, test_size=0.3, random_state=42, stratify=df_hybrid['category']
    )

print(f"\n📚 Training: {len(train_df_hybrid):,} samples (hybrid balanced)")
print(f"🧪 Testing: {len(test_df_hybrid):,} samples")
print(f"   Test set is {len(test_df_hybrid)/len(df)*100:.1f}% of original data")

# === Function to train and evaluate ===
def train_and_evaluate(train_df, test_df, strategy_name):
    print("\n" + "="*60)
    print(f"Training with: {strategy_name}")
    print("="*60)
    
    # Preprocessing
    negation_words = {"not", "no", "nor", "never", "n't", "don't", "dont", "cannot"}
    stop_words = set(stopwords.words("english")) - negation_words
    
    def clean_tweet(text):
        if not text:
            return []
        text = re.sub(r"http\S+|www\S+|https\S+", '', text)
        text = re.sub(r'[^A-Za-z\s]', '', text)
        text = text.lower().split()
        return [w for w in text if w and w not in stop_words]
    
    train_df["tokens"] = train_df["clean_text"].astype(str).apply(clean_tweet)
    test_df["tokens"] = test_df["clean_text"].astype(str).apply(clean_tweet)
    
    # Build word probabilities
    vocab_stats = defaultdict(lambda: [0, 0, 0])
    for _, row in train_df.iterrows():
        cat = int(row["category"])
        for w in row["tokens"]:
            if cat == -1:
                vocab_stats[w][0] += 1
            elif cat == 0:
                vocab_stats[w][1] += 1
            elif cat == 1:
                vocab_stats[w][2] += 1
    
    word_probs = {}
    for w, counts in vocab_stats.items():
        total = sum(counts)
        if total >= 2:  # Minimum 2 occurrences
            probs = np.array(counts) / total
            word_probs[w] = probs
    
    word_prob_df = pd.DataFrame(word_probs).T
    word_prob_df.columns = ['p(-1)', 'p(0)', 'p(1)']
    
    # Compute thresholds
    max_p = word_prob_df.max(axis=1)
    second_p = word_prob_df.apply(lambda x: sorted(x, reverse=True)[1], axis=1)
    ratio = max_p / (second_p + 1e-9)
    strong_thresh = np.percentile(ratio, 75)
    mild_thresh = np.percentile(ratio, 55)
    
    # Tag words
    def tag_word(row):
        probs = row.values
        labels = ['neg', 'neu', 'pos']
        max_idx = np.argmax(probs)
        sorted_probs = np.sort(probs)[::-1]
        ratio = sorted_probs[0] / (sorted_probs[1] + 1e-9)
        
        if ratio >= strong_thresh:
            return labels[max_idx]
        elif ratio >= mild_thresh:
            if labels[max_idx] == 'pos' and probs[1] > 0.15:
                return 'mild_pos'
            elif labels[max_idx] == 'neg' and probs[1] > 0.15:
                return 'mild_neg'
            else:
                return labels[max_idx]
        else:
            return 'contextual'
    
    word_prob_df["tag"] = word_prob_df.apply(tag_word, axis=1)
    word_tag_dict = word_prob_df["tag"].to_dict()
    
    # Tag tweets
    def tag_tweet(tokens):
        return [word_tag_dict[w] for w in tokens if w in word_tag_dict]
    
    train_df["tags"] = train_df["tokens"].apply(tag_tweet)
    test_df["tags"] = test_df["tokens"].apply(tag_tweet)
    
    # Compress tags
    def compress_tags(tags, max_tags=6):
        tags = [t for t in tags if t != "contextual"]
        if not tags:
            return "contextual_only"
        priority = {'pos': 3, 'neg': 3, 'mild_pos': 2, 'mild_neg': 2, 'neu': 1}
        tags_sorted = sorted(tags, key=lambda t: priority.get(t, 0), reverse=True)[:max_tags]
        tag_count = Counter(tags_sorted)
        return "_".join([f"{k}:{v}" for k, v in sorted(tag_count.items())])
    
    train_df["signature"] = train_df["tags"].apply(compress_tags)
    
    # Build probability table
    alpha = 1
    tag_class_counts = defaultdict(lambda: Counter())
    
    for _, row in train_df.iterrows():
        tag_class_counts[row["signature"]][int(row["category"])] += 1
    
    prob_table = {}
    for sig, class_counts in tag_class_counts.items():
        total = sum(class_counts.values())
        probs = {}
        for c in [-1, 0, 1]:
            probs[c] = (class_counts[c] + alpha) / (total + alpha * 3)
        prob_table[sig] = probs
    
    prob_df = pd.DataFrame(prob_table).T.fillna(1/3)
    prob_df.columns = ['p(-1)', 'p(0)', 'p(1)']
    
    # Drop rare signatures
    min_occ = 3  # Lower threshold for balanced data
    valid_sigs = [sig for sig, counts in tag_class_counts.items() if sum(counts.values()) >= min_occ]
    prob_df = prob_df.loc[valid_sigs]
    
    print(f"Vocabulary size: {len(word_tag_dict)}")
    print(f"Valid signatures: {len(valid_sigs)}")
    
    # Predict
    def predict_argmax(tags):
        sig = compress_tags(tags)
        if sig not in prob_df.index:
            return 0
        row = prob_df.loc[sig]
        max_col = row.idxmax()
        class_val = int(max_col.split('(')[1].split(')')[0])
        return class_val
    
    preds = [predict_argmax(tags) for tags in test_df["tags"]]
    test_df["pred"] = preds
    
    accuracy = np.mean(test_df["pred"] == test_df["category"])
    
    # Metrics
    from sklearn.metrics import classification_report, confusion_matrix
    
    print(f"\nACCURACY: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print("\nClassification Report:")
    print(classification_report(test_df["category"], test_df["pred"], 
                              target_names=['Negative', 'Neutral', 'Positive'],
                              zero_division=0))
    
    print("\nConfusion Matrix:")
    cm = confusion_matrix(test_df["category"], test_df["pred"])
    cm_df = pd.DataFrame(cm, 
                         index=['True Neg', 'True Neu', 'True Pos'],
                         columns=['Pred Neg', 'Pred Neu', 'Pred Pos'])
    print(cm_df)
    
    return accuracy

# === Run all strategies ===
results = {}

print("\n" + "#"*60)
print("COMPARING ALL SAMPLING STRATEGIES")
print("#"*60)

results['Balanced Undersampling'] = train_and_evaluate(
    train_df_balanced.copy(), test_df_balanced.copy(), 
    "Balanced Undersampling"
)

results['Stratified (Original)'] = train_and_evaluate(
    train_df_stratified.copy(), test_df_stratified.copy(), 
    "Stratified Split (Original Distribution)"
)

results['Hybrid Sampling'] = train_and_evaluate(
    train_df_hybrid.copy(), test_df_hybrid.copy(), 
    "Hybrid Sampling"
)

# === Final comparison ===
print("\n" + "="*60)
print("FINAL COMPARISON")
print("="*60)

print("\nData Usage Summary:")
print("-" * 60)
print(f"Original dataset: {len(df):,} samples")
print(f"  Strategy 1 (Balanced):  {len(df_balanced):,} samples ({len(df_balanced)/len(df)*100:.1f}%)")
print(f"  Strategy 2 (Stratified): {len(df):,} samples (100.0%)")
print(f"  Strategy 3 (Hybrid):     {len(df_hybrid):,} samples ({len(df_hybrid)/len(df)*100:.1f}%)")

print("\nAccuracy Results:")
print("-" * 60)
for strategy, acc in results.items():
    print(f"{strategy:35s}: {acc:.4f} ({acc*100:.2f}%)")

best_strategy = max(results, key=results.get)
print(f"\n🏆 BEST STRATEGY: {best_strategy}")
print(f"   Accuracy: {results[best_strategy]:.4f} ({results[best_strategy]*100:.2f}%)")

ORIGINAL DATASET
Total samples: 162969

Class distribution:
category
 1.0    72249
 0.0    55211
-1.0    35509
Name: count, dtype: int64

Class proportions:
category
 1.0    0.443330
 0.0    0.338782
-1.0    0.217888
Name: proportion, dtype: float64

STRATEGY 1: Balanced Undersampling
Original class counts:
  Class -1: 35509 samples
  Class 0: 55211 samples
  Class 1: 72249 samples

Minority class size: 35509

✂️ Sampled 35509 from each class
📦 Total balanced dataset: 106527 samples (65.4% of original)
🗑️ Removed: 56442 samples (34.6%)

Balanced distribution:
category
 0.0    35509
-1.0    35509
 1.0    35509
Name: count, dtype: int64

📚 Training: 106,527 samples (balanced)
🧪 Testing: 16,933 samples (from original distribution)
   Test set is 10.4% of original data

Test set distribution (original imbalance):
category
 1.0    0.438729
 0.0    0.338511
-1.0    0.222760
Name: proportion, dtype: float64

STRATEGY 2: Stratified Split (Original Distribution)
📦 Total dataset: 162969 samples 

In [3]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from collections import defaultdict, Counter

# === 1️⃣ Load data ===
df = pd.read_csv("/kaggle/input/twitter-and-reddit-sentimental-analysis-dataset/Reddit_Data.csv")
df = df[['clean_comment', 'category']].dropna()

print("="*60)
print("ORIGINAL DATASET")
print("="*60)
print(f"Total samples: {len(df)}")
print("\nClass distribution:")
print(df['category'].value_counts())
print(f"\nClass proportions:")
print(df['category'].value_counts(normalize=True))

# === HOLDOUT TEST SET: 20% of original data (NEVER used in training) ===
print("\n" + "="*60)
print("CREATING HOLDOUT TEST SET")
print("="*60)

# Split: 80% for training/sampling, 20% for final testing
df_available, df_holdout_test = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['category']
)

print(f"📦 Available for training: {len(df_available):,} samples (80%)")
print(f"🔒 HOLDOUT test set: {len(df_holdout_test):,} samples (20%)")
print("\nHoldout test distribution:")
print(df_holdout_test['category'].value_counts())
print(df_holdout_test['category'].value_counts(normalize=True))

# Now use df_available instead of df for all sampling strategies
df = df_available  # Replace df with the 80% available data

# === STRATEGY 1: BALANCED UNDERSAMPLING ===
print("\n" + "="*60)
print("STRATEGY 1: Balanced Undersampling")
print("="*60)

# Find the minority class size
class_counts = df['category'].value_counts()
min_class_size = class_counts.min()
print(f"Original class counts:")
for cat in [-1, 0, 1]:
    print(f"  Class {cat}: {class_counts.get(cat, 0)} samples")
print(f"\nMinority class size: {min_class_size}")

# Sample equal amounts from each class
df_balanced = pd.concat([
    df[df['category'] == -1].sample(min_class_size, random_state=42),
    df[df['category'] == 0].sample(min_class_size, random_state=42),
    df[df['category'] == 1].sample(min_class_size, random_state=42)
]).sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle

print(f"\n✂️ Sampled {min_class_size} from each class")
print(f"📦 Total balanced dataset: {len(df_balanced)} samples ({len(df_balanced)/len(df)*100:.1f}% of original)")
print(f"🗑️ Removed: {len(df) - len(df_balanced)} samples ({(len(df)-len(df_balanced))/len(df)*100:.1f}%)")
print("\nBalanced distribution:")
print(df_balanced['category'].value_counts())

# Split: Use ALL balanced data for training, test on ORIGINAL distribution
# This is more realistic - train balanced, test on real-world imbalanced data
train_df_balanced = df_balanced.copy()

# Create a separate test set from ORIGINAL data (not used in training)
remaining_indices = df.index.difference(df_balanced.index)
test_df_balanced = df.loc[remaining_indices].sample(
    frac=0.3, random_state=42
)  # 30% of remaining data

print(f"\n📚 Training: {len(train_df_balanced):,} samples (balanced)")
print(f"🧪 Testing: {len(test_df_balanced):,} samples (from original distribution)")
print(f"   Test set is {len(test_df_balanced)/len(df)*100:.1f}% of original data")
print("\nTest set distribution (original imbalance):")
print(test_df_balanced['category'].value_counts(normalize=True))

# === STRATEGY 2: STRATIFIED SPLIT (keeps original distribution) ===
print("\n" + "="*60)
print("STRATEGY 2: Stratified Split (Original Distribution)")
print("="*60)

train_df_stratified, test_df_stratified = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df['category']
)

print(f"📦 Total dataset: {len(df)} samples (100% of original)")
print(f"   Training: {len(train_df_stratified)} samples (70%)")
print(f"   Testing: {len(test_df_stratified)} samples (30%)")
print("\nTraining set distribution:")
print(train_df_stratified['category'].value_counts())
print("\nTest set distribution:")
print(test_df_stratified['category'].value_counts())

# === STRATEGY 3: WEIGHTED BALANCED (Oversample minority, undersample majority) ===
print("\n" + "="*60)
print("STRATEGY 3: Hybrid Sampling (Target size between min and max)")
print("="*60)

class_counts = df['category'].value_counts()
target_size = int(np.mean(class_counts))  # Use average as target
print(f"Original class sizes:")
for cat in [-1, 0, 1]:
    count = class_counts.get(cat, 0)
    print(f"  Class {cat}: {count} samples")
print(f"\nTarget size per class: {target_size} (average of all classes)")

samples_per_class = {}
for cat in [-1, 0, 1]:
    original_count = class_counts.get(cat, 0)
    sampled_count = min(target_size, original_count)
    will_oversample = target_size > original_count
    samples_per_class[cat] = {
        'original': original_count,
        'sampled': sampled_count if not will_oversample else target_size,
        'oversampled': will_oversample
    }

print("\nSampling details:")
for cat in [-1, 0, 1]:
    info = samples_per_class[cat]
    status = "⬆️ OVERSAMPLING" if info['oversampled'] else "⬇️ UNDERSAMPLING"
    print(f"  Class {cat}: {info['original']:,} → {info['sampled']:,} {status}")

df_hybrid = pd.concat([
    df[df['category'] == -1].sample(
        min(target_size, class_counts[-1]), 
        replace=(target_size > class_counts[-1]), 
        random_state=42
    ),
    df[df['category'] == 0].sample(
        min(target_size, class_counts[0]), 
        replace=(target_size > class_counts[0]), 
        random_state=42
    ),
    df[df['category'] == 1].sample(
        min(target_size, class_counts[1]), 
        replace=(target_size > class_counts[1]), 
        random_state=42
    )
]).sample(frac=1, random_state=42).reset_index(drop=True)

print(f"\n📦 Total hybrid dataset: {len(df_hybrid)} samples ({len(df_hybrid)/len(df)*100:.1f}% of original)")
print("Hybrid distribution:")
print(df_hybrid['category'].value_counts())

# Split: Use ALL hybrid data for training, test on remaining original data
train_df_hybrid = df_hybrid.copy()

# Test on data not used in hybrid sampling
remaining_indices_hybrid = df.index.difference(df_hybrid.index)
if len(remaining_indices_hybrid) > 0:
    test_df_hybrid = df.loc[remaining_indices_hybrid].sample(
        min(len(remaining_indices_hybrid), int(len(df)*0.3)), 
        random_state=42
    )
else:
    # If hybrid used all data, split normally
    train_df_hybrid, test_df_hybrid = train_test_split(
        df_hybrid, test_size=0.3, random_state=42, stratify=df_hybrid['category']
    )

print(f"\n📚 Training: {len(train_df_hybrid):,} samples (hybrid balanced)")
print(f"🧪 Testing: {len(test_df_hybrid):,} samples")
print(f"   Test set is {len(test_df_hybrid)/len(df)*100:.1f}% of original data")

# === Function to train and evaluate ===
def train_and_evaluate(train_df, test_df, strategy_name):
    print("\n" + "="*60)
    print(f"Training with: {strategy_name}")
    print("="*60)
    
    # Preprocessing
    negation_words = {"not", "no", "nor", "never", "n't", "don't", "dont", "cannot"}
    stop_words = set(stopwords.words("english")) - negation_words
    
    def clean_tweet(text):
        if not text:
            return []
        text = re.sub(r"http\S+|www\S+|https\S+", '', text)
        text = re.sub(r'[^A-Za-z\s]', '', text)
        text = text.lower().split()
        return [w for w in text if w and w not in stop_words]
    
    train_df["tokens"] = train_df["clean_comment"].astype(str).apply(clean_tweet)
    test_df["tokens"] = test_df["clean_comment"].astype(str).apply(clean_tweet)
    
    # Build word probabilities
    vocab_stats = defaultdict(lambda: [0, 0, 0])
    for _, row in train_df.iterrows():
        cat = int(row["category"])
        for w in row["tokens"]:
            if cat == -1:
                vocab_stats[w][0] += 1
            elif cat == 0:
                vocab_stats[w][1] += 1
            elif cat == 1:
                vocab_stats[w][2] += 1
    
    word_probs = {}
    for w, counts in vocab_stats.items():
        total = sum(counts)
        if total >= 2:  # Minimum 2 occurrences
            probs = np.array(counts) / total
            word_probs[w] = probs
    
    word_prob_df = pd.DataFrame(word_probs).T
    word_prob_df.columns = ['p(-1)', 'p(0)', 'p(1)']
    
    # Compute thresholds
    max_p = word_prob_df.max(axis=1)
    second_p = word_prob_df.apply(lambda x: sorted(x, reverse=True)[1], axis=1)
    ratio = max_p / (second_p + 1e-9)
    strong_thresh = np.percentile(ratio, 75)
    mild_thresh = np.percentile(ratio, 55)
    
    # Tag words
    def tag_word(row):
        probs = row.values
        labels = ['neg', 'neu', 'pos']
        max_idx = np.argmax(probs)
        sorted_probs = np.sort(probs)[::-1]
        ratio = sorted_probs[0] / (sorted_probs[1] + 1e-9)
        
        if ratio >= strong_thresh:
            return labels[max_idx]
        elif ratio >= mild_thresh:
            if labels[max_idx] == 'pos' and probs[1] > 0.15:
                return 'mild_pos'
            elif labels[max_idx] == 'neg' and probs[1] > 0.15:
                return 'mild_neg'
            else:
                return labels[max_idx]
        else:
            return 'contextual'
    
    word_prob_df["tag"] = word_prob_df.apply(tag_word, axis=1)
    word_tag_dict = word_prob_df["tag"].to_dict()
    
    # Tag tweets
    def tag_tweet(tokens):
        return [word_tag_dict[w] for w in tokens if w in word_tag_dict]
    
    train_df["tags"] = train_df["tokens"].apply(tag_tweet)
    test_df["tags"] = test_df["tokens"].apply(tag_tweet)
    
    # Compress tags
    def compress_tags(tags, max_tags=6):
        tags = [t for t in tags if t != "contextual"]
        if not tags:
            return "contextual_only"
        priority = {'pos': 3, 'neg': 3, 'mild_pos': 2, 'mild_neg': 2, 'neu': 1}
        tags_sorted = sorted(tags, key=lambda t: priority.get(t, 0), reverse=True)[:max_tags]
        tag_count = Counter(tags_sorted)
        return "_".join([f"{k}:{v}" for k, v in sorted(tag_count.items())])
    
    train_df["signature"] = train_df["tags"].apply(compress_tags)
    
    # Build probability table
    alpha = 1
    tag_class_counts = defaultdict(lambda: Counter())
    
    for _, row in train_df.iterrows():
        tag_class_counts[row["signature"]][int(row["category"])] += 1
    
    prob_table = {}
    for sig, class_counts in tag_class_counts.items():
        total = sum(class_counts.values())
        probs = {}
        for c in [-1, 0, 1]:
            probs[c] = (class_counts[c] + alpha) / (total + alpha * 3)
        prob_table[sig] = probs
    
    prob_df = pd.DataFrame(prob_table).T.fillna(1/3)
    prob_df.columns = ['p(-1)', 'p(0)', 'p(1)']
    
    # Drop rare signatures
    min_occ = 3  # Lower threshold for balanced data
    valid_sigs = [sig for sig, counts in tag_class_counts.items() if sum(counts.values()) >= min_occ]
    prob_df = prob_df.loc[valid_sigs]
    
    print(f"Vocabulary size: {len(word_tag_dict)}")
    print(f"Valid signatures: {len(valid_sigs)}")
    
    # Predict
    def predict_argmax(tags):
        sig = compress_tags(tags)
        if sig not in prob_df.index:
            return 0
        row = prob_df.loc[sig]
        max_col = row.idxmax()
        class_val = int(max_col.split('(')[1].split(')')[0])
        return class_val
    
    preds = [predict_argmax(tags) for tags in test_df["tags"]]
    test_df["pred"] = preds
    
    accuracy = np.mean(test_df["pred"] == test_df["category"])
    
    # Metrics
    from sklearn.metrics import classification_report, confusion_matrix
    
    print(f"\nACCURACY: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print("\nClassification Report:")
    print(classification_report(test_df["category"], test_df["pred"], 
                              target_names=['Negative', 'Neutral', 'Positive'],
                              zero_division=0))
    
    print("\nConfusion Matrix:")
    cm = confusion_matrix(test_df["category"], test_df["pred"])
    cm_df = pd.DataFrame(cm, 
                         index=['True Neg', 'True Neu', 'True Pos'],
                         columns=['Pred Neg', 'Pred Neu', 'Pred Pos'])
    print(cm_df)
    
    return accuracy

# === Run all strategies ===
results = {}
results_holdout = {}

print("\n" + "#"*60)
print("COMPARING ALL SAMPLING STRATEGIES")
print("#"*60)

results['Balanced Undersampling'] = train_and_evaluate(
    train_df_balanced.copy(), test_df_balanced.copy(), 
    "Balanced Undersampling"
)

results['Stratified (Original)'] = train_and_evaluate(
    train_df_stratified.copy(), test_df_stratified.copy(), 
    "Stratified Split (Original Distribution)"
)

results['Hybrid Sampling'] = train_and_evaluate(
    train_df_hybrid.copy(), test_df_hybrid.copy(), 
    "Hybrid Sampling"
)

# === NOW TEST ON HOLDOUT SET ===
print("\n" + "#"*60)
print("TESTING ON HOLDOUT SET (20% UNSEEN DATA)")
print("#"*60)

print("\n🔒 Testing all 3 strategies on completely unseen 20% holdout data...")

# We need to save the trained models from best strategy
# Let's retrain and test on holdout for each strategy

def test_on_holdout(train_df, holdout_df, strategy_name):
    """Train on strategy data, test on holdout"""
    print("\n" + "="*60)
    print(f"HOLDOUT TEST: {strategy_name}")
    print("="*60)
    
    # Preprocessing
    negation_words = {"not", "no", "nor", "never", "n't", "don't", "dont", "cannot"}
    stop_words = set(stopwords.words("english")) - negation_words
    
    def clean_tweet(text):
        if not text:
            return []
        text = re.sub(r"http\S+|www\S+|https\S+", '', text)
        text = re.sub(r'[^A-Za-z\s]', '', text)
        text = text.lower().split()
        return [w for w in text if w and w not in stop_words]
    
    # Process training data
    train_df["tokens"] = train_df["clean_comment"].astype(str).apply(clean_tweet)
    # Process HOLDOUT data
    holdout_df["tokens"] = holdout_df["clean_comment"].astype(str).apply(clean_tweet)
    
    # Build word probabilities from TRAINING only
    vocab_stats = defaultdict(lambda: [0, 0, 0])
    for _, row in train_df.iterrows():
        cat = int(row["category"])
        for w in row["tokens"]:
            if cat == -1:
                vocab_stats[w][0] += 1
            elif cat == 0:
                vocab_stats[w][1] += 1
            elif cat == 1:
                vocab_stats[w][2] += 1
    
    word_probs = {}
    for w, counts in vocab_stats.items():
        total = sum(counts)
        if total >= 2:
            probs = np.array(counts) / total
            word_probs[w] = probs
    
    word_prob_df = pd.DataFrame(word_probs).T
    word_prob_df.columns = ['p(-1)', 'p(0)', 'p(1)']
    
    # Compute thresholds
    max_p = word_prob_df.max(axis=1)
    second_p = word_prob_df.apply(lambda x: sorted(x, reverse=True)[1], axis=1)
    ratio = max_p / (second_p + 1e-9)
    strong_thresh = np.percentile(ratio, 75)
    mild_thresh = np.percentile(ratio, 55)
    
    # Tag words
    def tag_word(row):
        probs = row.values
        labels = ['neg', 'neu', 'pos']
        max_idx = np.argmax(probs)
        sorted_probs = np.sort(probs)[::-1]
        ratio = sorted_probs[0] / (sorted_probs[1] + 1e-9)
        
        if ratio >= strong_thresh:
            return labels[max_idx]
        elif ratio >= mild_thresh:
            if labels[max_idx] == 'pos' and probs[1] > 0.15:
                return 'mild_pos'
            elif labels[max_idx] == 'neg' and probs[1] > 0.15:
                return 'mild_neg'
            else:
                return labels[max_idx]
        else:
            return 'contextual'
    
    word_prob_df["tag"] = word_prob_df.apply(tag_word, axis=1)
    word_tag_dict = word_prob_df["tag"].to_dict()
    
    # Tag tweets
    def tag_tweet(tokens):
        return [word_tag_dict[w] for w in tokens if w in word_tag_dict]
    
    train_df["tags"] = train_df["tokens"].apply(tag_tweet)
    holdout_df["tags"] = holdout_df["tokens"].apply(tag_tweet)
    
    # Compress tags
    def compress_tags(tags, max_tags=6):
        tags = [t for t in tags if t != "contextual"]
        if not tags:
            return "contextual_only"
        priority = {'pos': 3, 'neg': 3, 'mild_pos': 2, 'mild_neg': 2, 'neu': 1}
        tags_sorted = sorted(tags, key=lambda t: priority.get(t, 0), reverse=True)[:max_tags]
        tag_count = Counter(tags_sorted)
        return "_".join([f"{k}:{v}" for k, v in sorted(tag_count.items())])
    
    train_df["signature"] = train_df["tags"].apply(compress_tags)
    
    # Build probability table from TRAINING only
    alpha = 1
    tag_class_counts = defaultdict(lambda: Counter())
    
    for _, row in train_df.iterrows():
        tag_class_counts[row["signature"]][int(row["category"])] += 1
    
    prob_table = {}
    for sig, class_counts in tag_class_counts.items():
        total = sum(class_counts.values())
        probs = {}
        for c in [-1, 0, 1]:
            probs[c] = (class_counts[c] + alpha) / (total + alpha * 3)
        prob_table[sig] = probs
    
    prob_df = pd.DataFrame(prob_table).T.fillna(1/3)
    prob_df.columns = ['p(-1)', 'p(0)', 'p(1)']
    
    min_occ = 3
    valid_sigs = [sig for sig, counts in tag_class_counts.items() if sum(counts.values()) >= min_occ]
    prob_df = prob_df.loc[valid_sigs]
    
    # Predict on HOLDOUT
    def predict_argmax(tags):
        sig = compress_tags(tags)
        if sig not in prob_df.index:
            return 0
        row = prob_df.loc[sig]
        max_col = row.idxmax()
        class_val = int(max_col.split('(')[1].split(')')[0])
        return class_val
    
    preds = [predict_argmax(tags) for tags in holdout_df["tags"]]
    holdout_df["pred"] = preds
    
    accuracy = np.mean(holdout_df["pred"] == holdout_df["category"])
    
    # Metrics
    from sklearn.metrics import classification_report, confusion_matrix
    
    print(f"\n🔒 HOLDOUT ACCURACY: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print("\nClassification Report:")
    print(classification_report(holdout_df["category"], holdout_df["pred"], 
                              target_names=['Negative', 'Neutral', 'Positive'],
                              zero_division=0))
    
    print("\nConfusion Matrix:")
    cm = confusion_matrix(holdout_df["category"], holdout_df["pred"])
    cm_df = pd.DataFrame(cm, 
                         index=['True Neg', 'True Neu', 'True Pos'],
                         columns=['Pred Neg', 'Pred Neu', 'Pred Pos'])
    print(cm_df)
    
    return accuracy

# Test each strategy on holdout
results_holdout['Balanced Undersampling'] = test_on_holdout(
    train_df_balanced.copy(), df_holdout_test.copy(),
    "Balanced Undersampling"
)

results_holdout['Stratified (Original)'] = test_on_holdout(
    train_df_stratified.copy(), df_holdout_test.copy(),
    "Stratified Split (Original Distribution)"
)

results_holdout['Hybrid Sampling'] = test_on_holdout(
    train_df_hybrid.copy(), df_holdout_test.copy(),
    "Hybrid Sampling"
)

# === Final comparison ===
print("\n" + "="*60)
print("FINAL COMPARISON")
print("="*60)

print("\nData Usage Summary:")
print("-" * 60)
print(f"Original dataset: {len(df) + len(df_holdout_test):,} samples")
print(f"  Available for training: {len(df):,} samples (80%)")
print(f"  Holdout test set: {len(df_holdout_test):,} samples (20%)")
print(f"\n  Strategy 1 (Balanced):  {len(df_balanced):,} samples ({len(df_balanced)/len(df)*100:.1f}% of available)")
print(f"  Strategy 2 (Stratified): {len(df):,} samples (100.0% of available)")
print(f"  Strategy 3 (Hybrid):     {len(df_hybrid):,} samples ({len(df_hybrid)/len(df)*100:.1f}% of available)")

print("\n" + "="*60)
print("VALIDATION SET ACCURACY (from 80% data)")
print("="*60)
for strategy, acc in results.items():
    print(f"{strategy:35s}: {acc:.4f} ({acc*100:.2f}%)")

print("\n" + "="*60)
print("🔒 HOLDOUT TEST SET ACCURACY (20% unseen data)")
print("="*60)
for strategy, acc in results_holdout.items():
    print(f"{strategy:35s}: {acc:.4f} ({acc*100:.2f}%)")

print("\n" + "="*60)
print("COMPARISON: Validation vs Holdout")
print("="*60)
for strategy in results.keys():
    val_acc = results[strategy]
    holdout_acc = results_holdout[strategy]
    diff = holdout_acc - val_acc
    diff_pct = (diff / val_acc * 100) if val_acc > 0 else 0
    print(f"{strategy:35s}: {val_acc:.4f} → {holdout_acc:.4f} ({diff:+.4f}, {diff_pct:+.1f}%)")

best_strategy_val = max(results, key=results.get)
best_strategy_holdout = max(results_holdout, key=results_holdout.get)

print(f"\n🏆 BEST ON VALIDATION: {best_strategy_val}")
print(f"   Accuracy: {results[best_strategy_val]:.4f}")

print(f"\n🏆 BEST ON HOLDOUT: {best_strategy_holdout}")
print(f"   Accuracy: {results_holdout[best_strategy_holdout]:.4f}")

if best_strategy_val == best_strategy_holdout:
    print(f"\n✅ Same strategy wins on both! Model generalizes well.")
else:
    print(f"\n⚠️  Different winners - possible overfitting or variance.")

ORIGINAL DATASET
Total samples: 37149

Class distribution:
category
 1    15830
 0    13042
-1     8277
Name: count, dtype: int64

Class proportions:
category
 1    0.426122
 0    0.351073
-1    0.222805
Name: proportion, dtype: float64

CREATING HOLDOUT TEST SET
📦 Available for training: 29,719 samples (80%)
🔒 HOLDOUT test set: 7,430 samples (20%)

Holdout test distribution:
category
 1    3166
 0    2609
-1    1655
Name: count, dtype: int64
category
 1    0.426110
 0    0.351144
-1    0.222746
Name: proportion, dtype: float64

STRATEGY 1: Balanced Undersampling
Original class counts:
  Class -1: 6622 samples
  Class 0: 10433 samples
  Class 1: 12664 samples

Minority class size: 6622

✂️ Sampled 6622 from each class
📦 Total balanced dataset: 19866 samples (66.8% of original)
🗑️ Removed: 9853 samples (33.2%)

Balanced distribution:
category
 0    6622
 1    6622
-1    6622
Name: count, dtype: int64

📚 Training: 19,866 samples (balanced)
🧪 Testing: 4,149 samples (from original distribu

In [4]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from collections import defaultdict, Counter

# === 1️⃣ Load data ===
df = pd.read_csv("/kaggle/input/twitter-and-reddit-sentimental-analysis-dataset/Twitter_Data.csv")
df = df[['clean_text', 'category']].dropna()

print("="*60)
print("ORIGINAL DATASET")
print("="*60)
print(f"Total samples: {len(df)}")
print("\nClass distribution:")
print(df['category'].value_counts())
print(f"\nClass proportions:")
print(df['category'].value_counts(normalize=True))

# === HOLDOUT TEST SET: 20% of original data (NEVER used in training) ===
print("\n" + "="*60)
print("CREATING HOLDOUT TEST SET")
print("="*60)

# Split: 80% for training/sampling, 20% for final testing
df_available, df_holdout_test = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['category']
)

print(f"📦 Available for training: {len(df_available):,} samples (80%)")
print(f"🔒 HOLDOUT test set: {len(df_holdout_test):,} samples (20%)")
print("\nHoldout test distribution:")
print(df_holdout_test['category'].value_counts())
print(df_holdout_test['category'].value_counts(normalize=True))

# Now use df_available instead of df for all sampling strategies
df = df_available  # Replace df with the 80% available data

# === STRATEGY 1: BALANCED UNDERSAMPLING ===
print("\n" + "="*60)
print("STRATEGY 1: Balanced Undersampling")
print("="*60)

# Find the minority class size
class_counts = df['category'].value_counts()
min_class_size = class_counts.min()
print(f"Original class counts:")
for cat in [-1, 0, 1]:
    print(f"  Class {cat}: {class_counts.get(cat, 0)} samples")
print(f"\nMinority class size: {min_class_size}")

# Sample equal amounts from each class
df_balanced = pd.concat([
    df[df['category'] == -1].sample(min_class_size, random_state=42),
    df[df['category'] == 0].sample(min_class_size, random_state=42),
    df[df['category'] == 1].sample(min_class_size, random_state=42)
]).sample(frac=1, random_state=42).reset_index(drop=True)  # Shuffle

print(f"\n✂️ Sampled {min_class_size} from each class")
print(f"📦 Total balanced dataset: {len(df_balanced)} samples ({len(df_balanced)/len(df)*100:.1f}% of original)")
print(f"🗑️ Removed: {len(df) - len(df_balanced)} samples ({(len(df)-len(df_balanced))/len(df)*100:.1f}%)")
print("\nBalanced distribution:")
print(df_balanced['category'].value_counts())

# Split: Use ALL balanced data for training, test on ORIGINAL distribution
# This is more realistic - train balanced, test on real-world imbalanced data
train_df_balanced = df_balanced.copy()

# Create a separate test set from ORIGINAL data (not used in training)
remaining_indices = df.index.difference(df_balanced.index)
test_df_balanced = df.loc[remaining_indices].sample(
    frac=0.3, random_state=42
)  # 30% of remaining data

print(f"\n📚 Training: {len(train_df_balanced):,} samples (balanced)")
print(f"🧪 Testing: {len(test_df_balanced):,} samples (from original distribution)")
print(f"   Test set is {len(test_df_balanced)/len(df)*100:.1f}% of original data")
print("\nTest set distribution (original imbalance):")
print(test_df_balanced['category'].value_counts(normalize=True))

# === STRATEGY 2: STRATIFIED SPLIT (keeps original distribution) ===
print("\n" + "="*60)
print("STRATEGY 2: Stratified Split (Original Distribution)")
print("="*60)

train_df_stratified, test_df_stratified = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df['category']
)

print(f"📦 Total dataset: {len(df)} samples (100% of original)")
print(f"   Training: {len(train_df_stratified)} samples (70%)")
print(f"   Testing: {len(test_df_stratified)} samples (30%)")
print("\nTraining set distribution:")
print(train_df_stratified['category'].value_counts())
print("\nTest set distribution:")
print(test_df_stratified['category'].value_counts())

# === STRATEGY 3: WEIGHTED BALANCED (Oversample minority, undersample majority) ===
print("\n" + "="*60)
print("STRATEGY 3: Hybrid Sampling (Target size between min and max)")
print("="*60)

class_counts = df['category'].value_counts()
target_size = int(np.mean(class_counts))  # Use average as target
print(f"Original class sizes:")
for cat in [-1, 0, 1]:
    count = class_counts.get(cat, 0)
    print(f"  Class {cat}: {count} samples")
print(f"\nTarget size per class: {target_size} (average of all classes)")

samples_per_class = {}
for cat in [-1, 0, 1]:
    original_count = class_counts.get(cat, 0)
    sampled_count = min(target_size, original_count)
    will_oversample = target_size > original_count
    samples_per_class[cat] = {
        'original': original_count,
        'sampled': sampled_count if not will_oversample else target_size,
        'oversampled': will_oversample
    }

print("\nSampling details:")
for cat in [-1, 0, 1]:
    info = samples_per_class[cat]
    status = "⬆️ OVERSAMPLING" if info['oversampled'] else "⬇️ UNDERSAMPLING"
    print(f"  Class {cat}: {info['original']:,} → {info['sampled']:,} {status}")

df_hybrid = pd.concat([
    df[df['category'] == -1].sample(
        min(target_size, class_counts[-1]), 
        replace=(target_size > class_counts[-1]), 
        random_state=42
    ),
    df[df['category'] == 0].sample(
        min(target_size, class_counts[0]), 
        replace=(target_size > class_counts[0]), 
        random_state=42
    ),
    df[df['category'] == 1].sample(
        min(target_size, class_counts[1]), 
        replace=(target_size > class_counts[1]), 
        random_state=42
    )
]).sample(frac=1, random_state=42).reset_index(drop=True)

print(f"\n📦 Total hybrid dataset: {len(df_hybrid)} samples ({len(df_hybrid)/len(df)*100:.1f}% of original)")
print("Hybrid distribution:")
print(df_hybrid['category'].value_counts())

# Split: Use ALL hybrid data for training, test on remaining original data
train_df_hybrid = df_hybrid.copy()

# Test on data not used in hybrid sampling
remaining_indices_hybrid = df.index.difference(df_hybrid.index)
if len(remaining_indices_hybrid) > 0:
    test_df_hybrid = df.loc[remaining_indices_hybrid].sample(
        min(len(remaining_indices_hybrid), int(len(df)*0.3)), 
        random_state=42
    )
else:
    # If hybrid used all data, split normally
    train_df_hybrid, test_df_hybrid = train_test_split(
        df_hybrid, test_size=0.3, random_state=42, stratify=df_hybrid['category']
    )

print(f"\n📚 Training: {len(train_df_hybrid):,} samples (hybrid balanced)")
print(f"🧪 Testing: {len(test_df_hybrid):,} samples")
print(f"   Test set is {len(test_df_hybrid)/len(df)*100:.1f}% of original data")

# === Function to train and evaluate ===
def train_and_evaluate(train_df, test_df, strategy_name):
    print("\n" + "="*60)
    print(f"Training with: {strategy_name}")
    print("="*60)
    
    # Preprocessing
    negation_words = {"not", "no", "nor", "never", "n't", "don't", "dont", "cannot"}
    stop_words = set(stopwords.words("english")) - negation_words
    
    def clean_tweet(text):
        if not text:
            return []
        text = re.sub(r"http\S+|www\S+|https\S+", '', text)
        text = re.sub(r'[^A-Za-z\s]', '', text)
        text = text.lower().split()
        return [w for w in text if w and w not in stop_words]
    
    train_df["tokens"] = train_df["clean_text"].astype(str).apply(clean_tweet)
    test_df["tokens"] = test_df["clean_text"].astype(str).apply(clean_tweet)
    
    # Build word probabilities
    vocab_stats = defaultdict(lambda: [0, 0, 0])
    for _, row in train_df.iterrows():
        cat = int(row["category"])
        for w in row["tokens"]:
            if cat == -1:
                vocab_stats[w][0] += 1
            elif cat == 0:
                vocab_stats[w][1] += 1
            elif cat == 1:
                vocab_stats[w][2] += 1
    
    word_probs = {}
    for w, counts in vocab_stats.items():
        total = sum(counts)
        if total >= 2:  # Minimum 2 occurrences
            probs = np.array(counts) / total
            word_probs[w] = probs
    
    word_prob_df = pd.DataFrame(word_probs).T
    word_prob_df.columns = ['p(-1)', 'p(0)', 'p(1)']
    
    # Compute thresholds
    max_p = word_prob_df.max(axis=1)
    second_p = word_prob_df.apply(lambda x: sorted(x, reverse=True)[1], axis=1)
    ratio = max_p / (second_p + 1e-9)
    strong_thresh = np.percentile(ratio, 75)
    mild_thresh = np.percentile(ratio, 55)
    
    # Tag words
    def tag_word(row):
        probs = row.values
        labels = ['neg', 'neu', 'pos']
        max_idx = np.argmax(probs)
        sorted_probs = np.sort(probs)[::-1]
        ratio = sorted_probs[0] / (sorted_probs[1] + 1e-9)
        
        if ratio >= strong_thresh:
            return labels[max_idx]
        elif ratio >= mild_thresh:
            if labels[max_idx] == 'pos' and probs[1] > 0.15:
                return 'mild_pos'
            elif labels[max_idx] == 'neg' and probs[1] > 0.15:
                return 'mild_neg'
            else:
                return labels[max_idx]
        else:
            return 'contextual'
    
    word_prob_df["tag"] = word_prob_df.apply(tag_word, axis=1)
    word_tag_dict = word_prob_df["tag"].to_dict()
    
    # Tag tweets
    def tag_tweet(tokens):
        return [word_tag_dict[w] for w in tokens if w in word_tag_dict]
    
    train_df["tags"] = train_df["tokens"].apply(tag_tweet)
    test_df["tags"] = test_df["tokens"].apply(tag_tweet)
    
    # Compress tags
    def compress_tags(tags, max_tags=6):
        tags = [t for t in tags if t != "contextual"]
        if not tags:
            return "contextual_only"
        priority = {'pos': 3, 'neg': 3, 'mild_pos': 2, 'mild_neg': 2, 'neu': 1}
        tags_sorted = sorted(tags, key=lambda t: priority.get(t, 0), reverse=True)[:max_tags]
        tag_count = Counter(tags_sorted)
        return "_".join([f"{k}:{v}" for k, v in sorted(tag_count.items())])
    
    train_df["signature"] = train_df["tags"].apply(compress_tags)
    
    # Build probability table
    alpha = 1
    tag_class_counts = defaultdict(lambda: Counter())
    
    for _, row in train_df.iterrows():
        tag_class_counts[row["signature"]][int(row["category"])] += 1
    
    prob_table = {}
    for sig, class_counts in tag_class_counts.items():
        total = sum(class_counts.values())
        probs = {}
        for c in [-1, 0, 1]:
            probs[c] = (class_counts[c] + alpha) / (total + alpha * 3)
        prob_table[sig] = probs
    
    prob_df = pd.DataFrame(prob_table).T.fillna(1/3)
    prob_df.columns = ['p(-1)', 'p(0)', 'p(1)']
    
    # Drop rare signatures
    min_occ = 3  # Lower threshold for balanced data
    valid_sigs = [sig for sig, counts in tag_class_counts.items() if sum(counts.values()) >= min_occ]
    prob_df = prob_df.loc[valid_sigs]
    
    print(f"Vocabulary size: {len(word_tag_dict)}")
    print(f"Valid signatures: {len(valid_sigs)}")
    
    # Predict
    def predict_argmax(tags):
        sig = compress_tags(tags)
        if sig not in prob_df.index:
            return 0
        row = prob_df.loc[sig]
        max_col = row.idxmax()
        class_val = int(max_col.split('(')[1].split(')')[0])
        return class_val
    
    preds = [predict_argmax(tags) for tags in test_df["tags"]]
    test_df["pred"] = preds
    
    accuracy = np.mean(test_df["pred"] == test_df["category"])
    
    # Metrics
    from sklearn.metrics import classification_report, confusion_matrix
    
    print(f"\nACCURACY: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print("\nClassification Report:")
    print(classification_report(test_df["category"], test_df["pred"], 
                              target_names=['Negative', 'Neutral', 'Positive'],
                              zero_division=0))
    
    print("\nConfusion Matrix:")
    cm = confusion_matrix(test_df["category"], test_df["pred"])
    cm_df = pd.DataFrame(cm, 
                         index=['True Neg', 'True Neu', 'True Pos'],
                         columns=['Pred Neg', 'Pred Neu', 'Pred Pos'])
    print(cm_df)
    
    return accuracy

# === Run all strategies ===
results = {}
results_holdout = {}

print("\n" + "#"*60)
print("COMPARING ALL SAMPLING STRATEGIES")
print("#"*60)

results['Balanced Undersampling'] = train_and_evaluate(
    train_df_balanced.copy(), test_df_balanced.copy(), 
    "Balanced Undersampling"
)

results['Stratified (Original)'] = train_and_evaluate(
    train_df_stratified.copy(), test_df_stratified.copy(), 
    "Stratified Split (Original Distribution)"
)

results['Hybrid Sampling'] = train_and_evaluate(
    train_df_hybrid.copy(), test_df_hybrid.copy(), 
    "Hybrid Sampling"
)

# === NOW TEST ON HOLDOUT SET ===
print("\n" + "#"*60)
print("TESTING ON HOLDOUT SET (20% UNSEEN DATA)")
print("#"*60)

print("\n🔒 Testing all 3 strategies on completely unseen 20% holdout data...")

# We need to save the trained models from best strategy
# Let's retrain and test on holdout for each strategy

def test_on_holdout(train_df, holdout_df, strategy_name):
    """Train on strategy data, test on holdout"""
    print("\n" + "="*60)
    print(f"HOLDOUT TEST: {strategy_name}")
    print("="*60)
    
    # Preprocessing
    negation_words = {"not", "no", "nor", "never", "n't", "don't", "dont", "cannot"}
    stop_words = set(stopwords.words("english")) - negation_words
    
    def clean_tweet(text):
        if not text:
            return []
        text = re.sub(r"http\S+|www\S+|https\S+", '', text)
        text = re.sub(r'[^A-Za-z\s]', '', text)
        text = text.lower().split()
        return [w for w in text if w and w not in stop_words]
    
    # Process training data
    train_df["tokens"] = train_df["clean_text"].astype(str).apply(clean_tweet)
    # Process HOLDOUT data
    holdout_df["tokens"] = holdout_df["clean_text"].astype(str).apply(clean_tweet)
    
    # Build word probabilities from TRAINING only
    vocab_stats = defaultdict(lambda: [0, 0, 0])
    for _, row in train_df.iterrows():
        cat = int(row["category"])
        for w in row["tokens"]:
            if cat == -1:
                vocab_stats[w][0] += 1
            elif cat == 0:
                vocab_stats[w][1] += 1
            elif cat == 1:
                vocab_stats[w][2] += 1
    
    word_probs = {}
    for w, counts in vocab_stats.items():
        total = sum(counts)
        if total >= 2:
            probs = np.array(counts) / total
            word_probs[w] = probs
    
    word_prob_df = pd.DataFrame(word_probs).T
    word_prob_df.columns = ['p(-1)', 'p(0)', 'p(1)']
    
    # Compute thresholds
    max_p = word_prob_df.max(axis=1)
    second_p = word_prob_df.apply(lambda x: sorted(x, reverse=True)[1], axis=1)
    ratio = max_p / (second_p + 1e-9)
    strong_thresh = np.percentile(ratio, 75)
    mild_thresh = np.percentile(ratio, 55)
    
    # Tag words
    def tag_word(row):
        probs = row.values
        labels = ['neg', 'neu', 'pos']
        max_idx = np.argmax(probs)
        sorted_probs = np.sort(probs)[::-1]
        ratio = sorted_probs[0] / (sorted_probs[1] + 1e-9)
        
        if ratio >= strong_thresh:
            return labels[max_idx]
        elif ratio >= mild_thresh:
            if labels[max_idx] == 'pos' and probs[1] > 0.15:
                return 'mild_pos'
            elif labels[max_idx] == 'neg' and probs[1] > 0.15:
                return 'mild_neg'
            else:
                return labels[max_idx]
        else:
            return 'contextual'
    
    word_prob_df["tag"] = word_prob_df.apply(tag_word, axis=1)
    word_tag_dict = word_prob_df["tag"].to_dict()
    
    # Tag tweets
    def tag_tweet(tokens):
        return [word_tag_dict[w] for w in tokens if w in word_tag_dict]
    
    train_df["tags"] = train_df["tokens"].apply(tag_tweet)
    holdout_df["tags"] = holdout_df["tokens"].apply(tag_tweet)
    
    # Compress tags
    def compress_tags(tags, max_tags=6):
        tags = [t for t in tags if t != "contextual"]
        if not tags:
            return "contextual_only"
        priority = {'pos': 3, 'neg': 3, 'mild_pos': 2, 'mild_neg': 2, 'neu': 1}
        tags_sorted = sorted(tags, key=lambda t: priority.get(t, 0), reverse=True)[:max_tags]
        tag_count = Counter(tags_sorted)
        return "_".join([f"{k}:{v}" for k, v in sorted(tag_count.items())])
    
    train_df["signature"] = train_df["tags"].apply(compress_tags)
    
    # Build probability table from TRAINING only
    alpha = 1
    tag_class_counts = defaultdict(lambda: Counter())
    
    for _, row in train_df.iterrows():
        tag_class_counts[row["signature"]][int(row["category"])] += 1
    
    prob_table = {}
    for sig, class_counts in tag_class_counts.items():
        total = sum(class_counts.values())
        probs = {}
        for c in [-1, 0, 1]:
            probs[c] = (class_counts[c] + alpha) / (total + alpha * 3)
        prob_table[sig] = probs
    
    prob_df = pd.DataFrame(prob_table).T.fillna(1/3)
    prob_df.columns = ['p(-1)', 'p(0)', 'p(1)']
    
    min_occ = 3
    valid_sigs = [sig for sig, counts in tag_class_counts.items() if sum(counts.values()) >= min_occ]
    prob_df = prob_df.loc[valid_sigs]
    
    # Predict on HOLDOUT
    def predict_argmax(tags):
        sig = compress_tags(tags)
        if sig not in prob_df.index:
            return 0
        row = prob_df.loc[sig]
        max_col = row.idxmax()
        class_val = int(max_col.split('(')[1].split(')')[0])
        return class_val
    
    preds = [predict_argmax(tags) for tags in holdout_df["tags"]]
    holdout_df["pred"] = preds
    
    accuracy = np.mean(holdout_df["pred"] == holdout_df["category"])
    
    # Metrics
    from sklearn.metrics import classification_report, confusion_matrix
    
    print(f"\n🔒 HOLDOUT ACCURACY: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print("\nClassification Report:")
    print(classification_report(holdout_df["category"], holdout_df["pred"], 
                              target_names=['Negative', 'Neutral', 'Positive'],
                              zero_division=0))
    
    print("\nConfusion Matrix:")
    cm = confusion_matrix(holdout_df["category"], holdout_df["pred"])
    cm_df = pd.DataFrame(cm, 
                         index=['True Neg', 'True Neu', 'True Pos'],
                         columns=['Pred Neg', 'Pred Neu', 'Pred Pos'])
    print(cm_df)
    
    return accuracy

# Test each strategy on holdout
results_holdout['Balanced Undersampling'] = test_on_holdout(
    train_df_balanced.copy(), df_holdout_test.copy(),
    "Balanced Undersampling"
)

results_holdout['Stratified (Original)'] = test_on_holdout(
    train_df_stratified.copy(), df_holdout_test.copy(),
    "Stratified Split (Original Distribution)"
)

results_holdout['Hybrid Sampling'] = test_on_holdout(
    train_df_hybrid.copy(), df_holdout_test.copy(),
    "Hybrid Sampling"
)

# === Final comparison ===
print("\n" + "="*60)
print("FINAL COMPARISON")
print("="*60)

print("\nData Usage Summary:")
print("-" * 60)
print(f"Original dataset: {len(df) + len(df_holdout_test):,} samples")
print(f"  Available for training: {len(df):,} samples (80%)")
print(f"  Holdout test set: {len(df_holdout_test):,} samples (20%)")
print(f"\n  Strategy 1 (Balanced):  {len(df_balanced):,} samples ({len(df_balanced)/len(df)*100:.1f}% of available)")
print(f"  Strategy 2 (Stratified): {len(df):,} samples (100.0% of available)")
print(f"  Strategy 3 (Hybrid):     {len(df_hybrid):,} samples ({len(df_hybrid)/len(df)*100:.1f}% of available)")

print("\n" + "="*60)
print("VALIDATION SET ACCURACY (from 80% data)")
print("="*60)
for strategy, acc in results.items():
    print(f"{strategy:35s}: {acc:.4f} ({acc*100:.2f}%)")

print("\n" + "="*60)
print("🔒 HOLDOUT TEST SET ACCURACY (20% unseen data)")
print("="*60)
for strategy, acc in results_holdout.items():
    print(f"{strategy:35s}: {acc:.4f} ({acc*100:.2f}%)")

print("\n" + "="*60)
print("COMPARISON: Validation vs Holdout")
print("="*60)
for strategy in results.keys():
    val_acc = results[strategy]
    holdout_acc = results_holdout[strategy]
    diff = holdout_acc - val_acc
    diff_pct = (diff / val_acc * 100) if val_acc > 0 else 0
    print(f"{strategy:35s}: {val_acc:.4f} → {holdout_acc:.4f} ({diff:+.4f}, {diff_pct:+.1f}%)")

best_strategy_val = max(results, key=results.get)
best_strategy_holdout = max(results_holdout, key=results_holdout.get)

print(f"\n🏆 BEST ON VALIDATION: {best_strategy_val}")
print(f"   Accuracy: {results[best_strategy_val]:.4f}")

print(f"\n🏆 BEST ON HOLDOUT: {best_strategy_holdout}")
print(f"   Accuracy: {results_holdout[best_strategy_holdout]:.4f}")

if best_strategy_val == best_strategy_holdout:
    print(f"\n✅ Same strategy wins on both! Model generalizes well.")
else:
    print(f"\n⚠️  Different winners - possible overfitting or variance.")


ORIGINAL DATASET
Total samples: 162969

Class distribution:
category
 1.0    72249
 0.0    55211
-1.0    35509
Name: count, dtype: int64

Class proportions:
category
 1.0    0.443330
 0.0    0.338782
-1.0    0.217888
Name: proportion, dtype: float64

CREATING HOLDOUT TEST SET
📦 Available for training: 130,375 samples (80%)
🔒 HOLDOUT test set: 32,594 samples (20%)

Holdout test distribution:
category
 1.0    14450
 0.0    11042
-1.0     7102
Name: count, dtype: int64
category
 1.0    0.443333
 0.0    0.338774
-1.0    0.217893
Name: proportion, dtype: float64

STRATEGY 1: Balanced Undersampling
Original class counts:
  Class -1: 28407 samples
  Class 0: 44169 samples
  Class 1: 57799 samples

Minority class size: 28407

✂️ Sampled 28407 from each class
📦 Total balanced dataset: 85221 samples (65.4% of original)
🗑️ Removed: 45154 samples (34.6%)

Balanced distribution:
category
 0.0    28407
 1.0    28407
-1.0    28407
Name: count, dtype: int64

📚 Training: 85,221 samples (balanced)
🧪 Tes