# Model Comparison - Jigsaw Agile Community Rules Classification

This notebook trains five models on the same data, compares them on a held-out validation split, and selects the best-performing one for the Jigsaw Agile Community Rules Classification hackathon.

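## Models Compared:
1. **Logistic Regression** - TF-IDF (15,000 features, n-grams 1-2)
2. **Random Forest** - TF-IDF (20,000 features, n-grams 1-3)
3. **XGBoost** - TF-IDF (20,000 features, n-grams 1-3)
4. **SVM** - TF-IDF (20,000 features, n-grams 1-3)
5. **MLP (PyTorch)** - TF-IDF (20,000 features, n-grams 1-3)

**Note:** In this notebook all five models are trained on the same feature matrix, built by stacking three TF-IDF vectorizations (up to 15,000 features with n-grams 1-2, up to 20,000 with n-grams 1-3, and up to 10,000 with n-grams 2-4). The per-model settings listed above are not applied individually here.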

## Target: Achieve >92% validation accuracy


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
import xgboost as xgb
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Pin the global RNG state of PyTorch and NumPy so that repeated runs of the
# notebook draw the same random numbers.
torch.manual_seed(42)
np.random.seed(42)

print("Libraries imported successfully!", "Starting Model Comparison...", sep="\n")


In [None]:
# Load and prepare data (same preprocessing as individual models)
# Each competition CSV is read from the Kaggle input mount and its shape is
# reported right after loading.
train_df = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/train.csv')
print(f"Train dataset shape: {train_df.shape}")

test_df = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/test.csv')
print(f"Test dataset shape: {test_df.shape}")

# Data augmentation
def augment_training_data(df):
    """Expand the training frame with the example comments attached to each row.

    The returned frame has the columns ``body``, ``rule``, ``subreddit`` and
    ``rule_violation`` and contains, in this order:

    1. every original row of ``df``;
    2. every non-null ``positive_example_1`` / ``positive_example_2`` value,
       used as ``body`` with ``rule_violation`` set to 1;
    3. every non-null ``negative_example_1`` / ``negative_example_2`` value,
       used as ``body`` with ``rule_violation`` set to 0.

    Augmented rows inherit ``rule`` and ``subreddit`` from the row they came from.
    """
    positive_columns = ('positive_example_1', 'positive_example_2')
    negative_columns = ('negative_example_1', 'negative_example_2')

    originals, positives, negatives = [], [], []
    for _, record in df.iterrows():
        shared = {'rule': record['rule'], 'subreddit': record['subreddit']}
        originals.append({'body': record['body'], **shared, 'rule_violation': record['rule_violation']})

        # Missing examples (NaN/None) are skipped rather than turned into samples.
        for column in positive_columns:
            if pd.notna(record[column]):
                positives.append({'body': record[column], **shared, 'rule_violation': 1})
        for column in negative_columns:
            if pd.notna(record[column]):
                negatives.append({'body': record[column], **shared, 'rule_violation': 0})

    # Concatenate in the documented order: originals, then positives, then negatives.
    return pd.DataFrame(originals + positives + negatives)

# Feature engineering
def add_features(df):
    """Return a copy of ``df`` with simple surface statistics of ``body`` added.

    Every value of ``body`` is first converted with ``str()``; the input frame
    itself is not modified. Added columns, in order:

    - ``has_url``: 1 if the lower-cased text contains 'http' or 'www', else 0
    - ``body_length``: number of characters
    - ``word_count``: number of whitespace-separated tokens
    - ``avg_word_length``: mean token length (0 when there are no tokens)
    - ``exclamation_count`` / ``question_count``: occurrences of '!' / '?'
    - ``caps_ratio``: upper-case characters divided by length (0 for empty text)
    - ``digit_count``: number of digit characters
    """
    def _has_url(text):
        lowered = text.lower()
        return int('http' in lowered or 'www' in lowered)

    def _avg_word_length(text):
        tokens = text.split()
        if not tokens:
            return 0
        return np.mean([len(token) for token in tokens])

    def _caps_ratio(text):
        if not text:
            return 0
        return sum(map(str.isupper, text)) / len(text)

    # Insertion order of this mapping defines the order of the new columns.
    extractors = {
        'has_url': _has_url,
        'body_length': len,
        'word_count': lambda text: len(text.split()),
        'avg_word_length': _avg_word_length,
        'exclamation_count': lambda text: text.count('!'),
        'question_count': lambda text: text.count('?'),
        'caps_ratio': _caps_ratio,
        'digit_count': lambda text: sum(map(str.isdigit, text)),
    }

    enriched = df.copy()
    body_text = enriched['body'].apply(str)
    for column, extractor in extractors.items():
        enriched[column] = body_text.apply(extractor)
    return enriched

def combine_text(row):
    """Serialise one feature row into a single ' [SEP] '-delimited string.

    ``row`` must support ``row[key]`` for the text fields (``body``, ``rule``,
    ``subreddit``) and for every column produced by ``add_features``. The
    average word length is rendered with one decimal place and the caps ratio
    with two.
    """
    segments = [
        f"{row['body']}",
        f"Rule: {row['rule']}",
        f"Subreddit: {row['subreddit']}",
        f"URL: {row['has_url']}",
        f"Length: {row['body_length']}",
        f"Words: {row['word_count']}",
        f"AvgWordLen: {row['avg_word_length']:.1f}",
        f"Exclamations: {row['exclamation_count']}",
        f"Questions: {row['question_count']}",
        f"CapsRatio: {row['caps_ratio']:.2f}",
        f"Digits: {row['digit_count']}",
    ]
    return " [SEP] ".join(segments)

# Process data
# Training side: augment with the example comments, then derive the surface
# features and the combined model input text.
augmented_train = add_features(augment_training_data(train_df))
augmented_train['enhanced_text'] = augmented_train.apply(combine_text, axis=1)

# Test side: same features and text layout, but no augmentation.
test_df = add_features(test_df)
test_df['enhanced_text'] = test_df.apply(combine_text, axis=1)

class_counts = augmented_train['rule_violation'].value_counts().to_dict()
print(f"Augmented train data size: {len(augmented_train)}")
print(f"Class distribution: {class_counts}")


In [None]:
# Split data and create TF-IDF features
from scipy.sparse import hstack
from sklearn.model_selection import StratifiedGroupKFold  # requires scikit-learn >= 1.0

X = augmented_train['enhanced_text']
y = augmented_train['rule_violation']

# The augmentation step turns every row's example comments into extra samples,
# so the same text can occur more than once in `X`. A plain random split could
# place copies of one text on both sides and make the validation scores look
# better than they are. Grouping by exact text keeps all copies of a sample on
# the same side, while the stratified splitter still balances the classes.
# With 5 folds, the first fold gives roughly the original 80/20 split.
text_groups = pd.factorize(X)[0]
splitter = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)
train_idx, val_idx = next(splitter.split(X, y, groups=text_groups))
X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

# Enhanced TF-IDF with multiple configurations
tfidf_configs = [
    {'max_features': 15000, 'ngram_range': (1, 2), 'min_df': 2, 'max_df': 0.95},
    {'max_features': 20000, 'ngram_range': (1, 3), 'min_df': 3, 'max_df': 0.9},
    {'max_features': 10000, 'ngram_range': (2, 4), 'min_df': 2, 'max_df': 0.95}
]

X_train_features = []
X_val_features = []
X_test_features = []

for i, config in enumerate(tfidf_configs):
    print(f"Creating TF-IDF config {i+1}: {config}")
    # Each vectorizer is fitted on the training texts only; validation and
    # test texts are transformed with the vocabulary learned from them.
    tfidf = TfidfVectorizer(stop_words='english', **config)
    X_train_tfidf = tfidf.fit_transform(X_train)
    X_val_tfidf = tfidf.transform(X_val)
    X_test_tfidf = tfidf.transform(test_df['enhanced_text'])

    X_train_features.append(X_train_tfidf)
    X_val_features.append(X_val_tfidf)
    X_test_features.append(X_test_tfidf)

# Combine features
# Stack the per-config matrices column-wise. CSR is requested explicitly so
# downstream estimators receive a row-sliceable matrix instead of COO.
X_train_combined = hstack(X_train_features, format='csr')
X_val_combined = hstack(X_val_features, format='csr')
X_test_combined = hstack(X_test_features, format='csr')

print(f"Combined feature matrix shape: {X_train_combined.shape}")
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")


In [None]:
# Train all models and compare results
print("="*70)
print("TRAINING ALL MODELS FOR COMPARISON")
print("="*70)

# Registry of fitted models and their validation metrics, keyed by display name.
models = {}


def _fit_and_score(label, estimator, **fit_kwargs):
    """Fit ``estimator``, score it on the validation split and register it in ``models``.

    The estimator is fitted on ``X_train_combined`` / ``y_train`` (extra keyword
    arguments are forwarded to ``fit``). The positive-class probabilities on
    ``X_val_combined`` are used for ROC AUC and, thresholded at 0.5, for accuracy.

    Returns the tuple ``(probabilities, auc, accuracy)``.
    """
    estimator.fit(X_train_combined, y_train, **fit_kwargs)
    proba = estimator.predict_proba(X_val_combined)[:, 1]
    auc = roc_auc_score(y_val, proba)
    acc = accuracy_score(y_val, (proba > 0.5).astype(int))
    models[label] = {'model': estimator, 'auc': auc, 'accuracy': acc}
    print(f"   AUC: {auc:.4f}, Accuracy: {acc:.4f} ({acc*100:.2f}%)")
    return proba, auc, acc


# 1. Logistic Regression
print("\n1. Training Logistic Regression...")
lr_model = LogisticRegression(
    random_state=42,
    max_iter=2000,
    class_weight='balanced',
    C=0.1,
    solver='liblinear',
)
lr_pred, lr_auc, lr_acc = _fit_and_score('Logistic Regression', lr_model)

# 2. Random Forest
print("\n2. Training Random Forest...")
rf_model = RandomForestClassifier(
    n_estimators=1000,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1,
)
rf_pred, rf_auc, rf_acc = _fit_and_score('Random Forest', rf_model)

# 3. XGBoost
print("\n3. Training XGBoost...")
xgb_model = xgb.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='auc',
)
# The validation split is passed as eval_set, with per-round logging turned off.
xgb_pred, xgb_auc, xgb_acc = _fit_and_score(
    'XGBoost', xgb_model, eval_set=[(X_val_combined, y_val)], verbose=False
)

# 4. SVM
print("\n4. Training SVM...")
svm_model = SVC(
    kernel='linear',
    probability=True,
    random_state=42,
    class_weight='balanced',
    C=1.0,
)
svm_pred, svm_auc, svm_acc = _fit_and_score('SVM', svm_model)


In [None]:
# 5. MLP (PyTorch) - Simplified version
print("\n5. Training MLP (PyTorch)...")

class MLPClassifier(nn.Module):
    """Feed-forward binary classifier: input_dim -> 512 -> 256 -> 1.

    Both hidden layers are followed by ReLU and dropout. The single output
    unit goes through a sigmoid, so ``forward`` returns values in [0, 1]
    with shape ``(batch, 1)``.

    Args:
        input_dim: number of input features per sample.
        dropout: dropout probability used after each hidden layer.
    """

    def __init__(self, input_dim, dropout=0.3):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 1)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch, 1)`` probabilities."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = self.dropout(self.relu(layer(hidden)))
        return self.sigmoid(self.fc3(hidden))

# Convert to dense for PyTorch
# Cast to float32 *before* densifying. The tensors below are float32 anyway,
# and TfidfVectorizer produces float64 by default, so densifying first would
# allocate dense arrays twice as large as needed. The resulting values are the
# same either way.
X_train_dense = X_train_combined.astype(np.float32).toarray()
X_val_dense = X_val_combined.astype(np.float32).toarray()

# torch.from_numpy wraps the float32 arrays without making another copy.
X_train_tensor = torch.from_numpy(X_train_dense)
y_train_tensor = torch.FloatTensor(y_train.values).unsqueeze(1)  # shape (n, 1) to match the model output
X_val_tensor = torch.from_numpy(X_val_dense)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

mlp_model = MLPClassifier(X_train_dense.shape[1])
criterion = nn.BCELoss()  # the model already applies a sigmoid, so plain BCE is used
optimizer = optim.Adam(mlp_model.parameters(), lr=0.001)

# Training loop
mlp_model.train()
n_batches = max(len(train_loader), 1)  # guard the average against an empty loader
progress = tqdm(range(10), desc="Training MLP")
for epoch in progress:
    epoch_loss = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = mlp_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # Show the mean batch loss of the finished epoch on the progress bar so
    # the training progress is visible instead of being silently discarded.
    progress.set_postfix(loss=f"{epoch_loss / n_batches:.4f}")

# Evaluate MLP
mlp_model.eval()  # disables dropout for inference
with torch.no_grad():
    mlp_pred = mlp_model(X_val_tensor).numpy().flatten()

mlp_auc = roc_auc_score(y_val, mlp_pred)
mlp_acc = accuracy_score(y_val, (mlp_pred > 0.5).astype(int))
models['MLP'] = {'model': mlp_model, 'auc': mlp_auc, 'accuracy': mlp_acc}
print(f"   AUC: {mlp_auc:.4f}, Accuracy: {mlp_acc:.4f} ({mlp_acc*100:.2f}%)")


In [None]:
# Model Comparison Results
print("\n" + "="*70)
print("MODEL COMPARISON RESULTS")
print("="*70)

# Sort models by accuracy
# NOTE(review): ranking uses validation accuracy at a 0.5 threshold; if the
# competition is scored on AUC, ranking by 'auc' may be the better choice — confirm.
sorted_models = sorted(models.items(), key=lambda x: x[1]['accuracy'], reverse=True)

# Header and rows share the same column widths. The model column is 20 wide so
# the longest name ("Logistic Regression") fits, and the winner marker is
# appended at the end of the row so it cannot shift the numeric columns.
print(f"{'Rank':<4} {'Model':<20} {'AUC':<8} {'Accuracy':<10} {'Target (>92%)':<13}")
print("-" * 70)

for rank, (model_name, results) in enumerate(sorted_models, 1):
    auc = results['auc']
    acc = results['accuracy']
    target_met = "YES" if acc > 0.92 else "NO"
    status = "🏆 BEST" if rank == 1 else ""
    # rstrip() drops the padding left behind on rows without a marker.
    print(f"{rank:<4} {model_name:<20} {auc:<8.4f} {acc:<10.4f} {target_met:<13} {status}".rstrip())

# Find best model
best_model_name, best_entry = sorted_models[0]
best_model = best_entry['model']
best_auc = best_entry['auc']
best_accuracy = best_entry['accuracy']

print(f"\n🏆 BEST MODEL: {best_model_name}")
print(f"📊 Best AUC: {best_auc:.4f}")
print(f"📈 Best Accuracy: {best_accuracy:.4f} ({best_accuracy*100:.2f}%)")
print(f"🎯 Target Achieved (>92%): {'YES' if best_accuracy > 0.92 else 'NO'}")


In [None]:
# Generate final predictions with best model
banner = "=" * 70
print("\n" + banner)
print(f"GENERATING FINAL PREDICTIONS WITH BEST MODEL: {best_model_name}")
print(banner)

# Generate test predictions
# The scikit-learn style models expose predict_proba; the PyTorch MLP needs a
# dense float tensor and a no-grad forward pass instead.
if best_model_name != 'MLP':
    test_predictions = best_model.predict_proba(X_test_combined)[:, 1]
else:
    best_model.eval()
    with torch.no_grad():
        test_tensor = torch.FloatTensor(X_test_combined.toarray())
        test_predictions = best_model(test_tensor).numpy().flatten()

lowest = test_predictions.min()
highest = test_predictions.max()
print(f"Test predictions generated: {len(test_predictions)}")
print(f"Prediction range: [{lowest:.4f}, {highest:.4f}]")
print(f"Mean prediction: {test_predictions.mean():.4f}")

# Create final submission
# One probability per test row, keyed by the row_id taken from the test frame.
submission_df = pd.DataFrame({
    'row_id': test_df['row_id'],
    'rule_violation': test_predictions
})

# Save submission file
submission_path = '/kaggle/working/best_model_submission.csv'
submission_df.to_csv(submission_path, index=False)

print(f"\nFinal submission saved to: {submission_path}")
print(f"Submission shape: {submission_df.shape}")
print(f"\nFirst few predictions:")
print(submission_df.head())

# Final summary
target_flag = '✅ YES' if best_accuracy > 0.92 else '❌ NO'
summary_lines = [
    "\n" + banner,
    "FINAL COMPETITION SUMMARY",
    banner,
    f"Dataset: Jigsaw Agile Community Rules Classification",
    f"Models Tested: 5 (Logistic Regression, Random Forest, XGBoost, SVM, MLP)",
    f"Best Model: {best_model_name}",
    f"Best AUC: {best_auc:.4f}",
    f"Best Accuracy: {best_accuracy:.4f} ({best_accuracy*100:.2f}%)",
    f"Target Achieved (>92%): {target_flag}",
    f"Final Submission: {submission_path}",
    banner,
]
for line in summary_lines:
    print(line)
