# Model 5: MLP (PyTorch) - Jigsaw Agile Community Rules Classification

This notebook implements a **Multi-Layer Perceptron (MLP)** using PyTorch for the Jigsaw Agile Community Rules Classification hackathon.

## Model Details:
- **Algorithm**: Neural Network with 5 layers (1024→512→256→128→1)
- **Features**: 20,000 TF-IDF features with (1,3) n-grams
- **Hyperparameters**: batch_size=64, epochs=20, lr=0.001, dropout=0.3
- **Target**: Achieve >92% accuracy

## Features Used:
- Data augmentation with positive/negative examples
- Enhanced text preprocessing with [SEP] separators
- URL detection and text length features
- TF-IDF vectorization with batch normalization


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)
torch.manual_seed(42)

print("Libraries imported successfully!")
print("Model: MLP (PyTorch)")


In [None]:
# Load the datasets
train_df = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/train.csv')
test_df = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/test.csv')

print(f"Train dataset shape: {train_df.shape}")
print(f"Test dataset shape: {test_df.shape}")
print(f"Train columns: {train_df.columns.tolist()}")

# Display first few rows
print("\nFirst few rows of training data:")
print(train_df.head())


In [None]:
# Data augmentation: Add positive and negative examples
def augment_training_data(df):
    """Augment training data with positive/negative examples"""
    augmented_data = []
    
    # Add original data
    for _, row in df.iterrows():
        augmented_data.append({
            'body': row['body'],
            'rule': row['rule'],
            'subreddit': row['subreddit'],
            'rule_violation': row['rule_violation']
        })
    
    # Add positive examples (rule_violation=1)
    for _, row in df.iterrows():
        if pd.notna(row['positive_example_1']):
            augmented_data.append({
                'body': row['positive_example_1'],
                'rule': row['rule'],
                'subreddit': row['subreddit'],
                'rule_violation': 1
            })
        if pd.notna(row['positive_example_2']):
            augmented_data.append({
                'body': row['positive_example_2'],
                'rule': row['rule'],
                'subreddit': row['subreddit'],
                'rule_violation': 1
            })
    
    # Add negative examples (rule_violation=0)
    for _, row in df.iterrows():
        if pd.notna(row['negative_example_1']):
            augmented_data.append({
                'body': row['negative_example_1'],
                'rule': row['rule'],
                'subreddit': row['subreddit'],
                'rule_violation': 0
            })
        if pd.notna(row['negative_example_2']):
            augmented_data.append({
                'body': row['negative_example_2'],
                'rule': row['rule'],
                'subreddit': row['subreddit'],
                'rule_violation': 0
            })
    
    return pd.DataFrame(augmented_data)

# Augment the training data
augmented_train = augment_training_data(train_df)
print(f"Original train data size: {len(train_df)}")
print(f"Augmented train data size: {len(augmented_train)}")
print(f"Class distribution: {augmented_train['rule_violation'].value_counts().to_dict()}")


In [None]:
# Feature engineering: Add enhanced features
def add_features(df):
    """Add enhanced features to the dataframe"""
    df = df.copy()
    
    # Basic features
    df['has_url'] = df['body'].apply(lambda x: 1 if ('http' in str(x).lower() or 'www' in str(x).lower()) else 0)
    df['body_length'] = df['body'].apply(lambda x: len(str(x)))
    df['word_count'] = df['body'].apply(lambda x: len(str(x).split()))
    df['avg_word_length'] = df['body'].apply(lambda x: np.mean([len(word) for word in str(x).split()]) if len(str(x).split()) > 0 else 0)
    
    # Advanced text features
    df['exclamation_count'] = df['body'].apply(lambda x: str(x).count('!'))
    df['question_count'] = df['body'].apply(lambda x: str(x).count('?'))
    df['caps_ratio'] = df['body'].apply(lambda x: sum(1 for c in str(x) if c.isupper()) / len(str(x)) if len(str(x)) > 0 else 0)
    df['digit_count'] = df['body'].apply(lambda x: sum(1 for c in str(x) if c.isdigit()))
    
    return df

# Add features to augmented training data
augmented_train = add_features(augmented_train)

# Add features to test data
test_df = add_features(test_df)

print("Features added successfully!")
print(f"New columns: {augmented_train.columns.tolist()}")


In [None]:
# Combine text with [SEP] separators
def combine_text(row):
    """Combine text with enhanced features"""
    combined = f"{row['body']} [SEP] Rule: {row['rule']} [SEP] Subreddit: {row['subreddit']} [SEP] URL: {row['has_url']} [SEP] Length: {row['body_length']} [SEP] Words: {row['word_count']} [SEP] AvgWordLen: {row['avg_word_length']:.1f} [SEP] Exclamations: {row['exclamation_count']} [SEP] Questions: {row['question_count']} [SEP] CapsRatio: {row['caps_ratio']:.2f} [SEP] Digits: {row['digit_count']}"
    return combined

# Apply text combination
augmented_train['enhanced_text'] = augmented_train.apply(combine_text, axis=1)
test_df['enhanced_text'] = test_df.apply(combine_text, axis=1)

print("Text combination completed!")
print(f"Sample of enhanced text (first 200 chars):")
print(augmented_train['enhanced_text'].iloc[0][:200] + "...")


In [None]:
# Split data into train and validation sets
X = augmented_train['enhanced_text']
y = augmented_train['rule_violation']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Training class distribution: {y_train.value_counts().to_dict()}")
print(f"Validation class distribution: {y_val.value_counts().to_dict()}")


In [None]:
# TF-IDF Vectorization for MLP
print("Creating TF-IDF vectorizer for MLP...")
tfidf_vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 3),
    min_df=3,
    max_df=0.9,
    stop_words='english'
)

# Fit and transform training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(test_df['enhanced_text'])

print(f"TF-IDF matrix shape - Train: {X_train_tfidf.shape}, Val: {X_val_tfidf.shape}, Test: {X_test_tfidf.shape}")
print(f"Number of features: {X_train_tfidf.shape[1]}")


In [None]:
# Define Enhanced MLP Model
class EnhancedMLP(nn.Module):
    def __init__(self, input_dim, dropout=0.3):
        super(EnhancedMLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 256)
        self.fc4 = nn.Linear(256, 128)
        self.fc5 = nn.Linear(128, 1)
        
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.batch_norm1 = nn.BatchNorm1d(1024)
        self.batch_norm2 = nn.BatchNorm1d(512)
        self.batch_norm3 = nn.BatchNorm1d(256)
        self.batch_norm4 = nn.BatchNorm1d(128)
    
    def forward(self, x):
        x = self.relu(self.batch_norm1(self.fc1(x)))
        x = self.dropout(x)
        x = self.relu(self.batch_norm2(self.fc2(x)))
        x = self.dropout(x)
        x = self.relu(self.batch_norm3(self.fc3(x)))
        x = self.dropout(x)
        x = self.relu(self.batch_norm4(self.fc4(x)))
        x = self.dropout(x)
        x = self.fc5(x)
        return self.sigmoid(x)

print("Enhanced MLP model defined!")


In [None]:
# Train MLP Model
print("="*60)
print("TRAINING ENHANCED MLP MODEL")
print("="*60)

# Convert sparse matrices to dense for PyTorch
X_train_dense = X_train_tfidf.toarray()
X_val_dense = X_val_tfidf.toarray()

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train_dense)
y_train_tensor = torch.FloatTensor(y_train.values).unsqueeze(1)
X_val_tensor = torch.FloatTensor(X_val_dense)

# Create data loader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Initialize model
mlp_model = EnhancedMLP(X_train_dense.shape[1])
criterion = nn.BCELoss()
optimizer = optim.Adam(mlp_model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, factor=0.5)

# Training loop
mlp_model.train()
best_val_auc = 0
patience = 5
patience_counter = 0

for epoch in tqdm(range(20), desc="Training Enhanced MLP"):
    epoch_loss = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = mlp_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    
    # Validation
    mlp_model.eval()
    with torch.no_grad():
        val_pred = mlp_model(X_val_tensor).numpy().flatten()
        val_auc = roc_auc_score(y_val, val_pred)
        
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            patience_counter = 0
        else:
            patience_counter += 1
        
        scheduler.step(epoch_loss)
        
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break
    
    mlp_model.train()

print("Training completed!")


In [None]:
# Evaluate MLP Model
print("\nEvaluating MLP Model...")

# Make predictions on validation set
mlp_model.eval()
with torch.no_grad():
    y_val_pred_proba = mlp_model(X_val_tensor).numpy().flatten()
    y_val_pred_binary = (y_val_pred_proba > 0.5).astype(int)

# Calculate metrics
mlp_auc = roc_auc_score(y_val, y_val_pred_proba)
mlp_accuracy = accuracy_score(y_val, y_val_pred_binary)

print(f"\nEnhanced MLP Results:")
print(f"  AUC Score: {mlp_auc:.4f}")
print(f"  Accuracy: {mlp_accuracy:.4f} ({mlp_accuracy*100:.2f}%)")

# Classification report
print(f"\nClassification Report:")
print(classification_report(y_val, y_val_pred_binary))


In [None]:
# Generate test predictions
print("Generating test predictions...")

# Convert test data to tensor
X_test_dense = X_test_tfidf.toarray()
X_test_tensor = torch.FloatTensor(X_test_dense)

# Make predictions
mlp_model.eval()
with torch.no_grad():
    test_predictions = mlp_model(X_test_tensor).numpy().flatten()

print(f"Test predictions generated: {len(test_predictions)}")
print(f"Prediction range: [{min(test_predictions):.4f}, {max(test_predictions):.4f}]")
print(f"Mean prediction: {np.mean(test_predictions):.4f}")

# Create submission file
submission_df = pd.DataFrame({
    'row_id': test_df['row_id'],
    'rule_violation': test_predictions
})

# Save submission file
submission_path = '/kaggle/working/mlp_submission.csv'
submission_df.to_csv(submission_path, index=False)

print(f"\nSubmission file saved to: {submission_path}")
print(f"Submission shape: {submission_df.shape}")
print(f"\nFirst few predictions:")
print(submission_df.head())


In [None]:
# Final Summary
print("="*70)
print("ENHANCED MLP MODEL SUMMARY")
print("="*70)
print(f"Model: Enhanced MLP (PyTorch)")
print(f"Architecture: 5 layers (1024→512→256→128→1) with BatchNorm & Dropout")
print(f"Features: TF-IDF (20,000 features, n-grams 1-3)")
print(f"Hyperparameters: batch_size=64, epochs=20, lr=0.001, dropout=0.3")
print(f"Validation AUC: {mlp_auc:.4f}")
print(f"Validation Accuracy: {mlp_accuracy:.4f} ({mlp_accuracy*100:.2f}%)")
print(f"Target Achieved (>92%): {'YES' if mlp_accuracy > 0.92 else 'NO'}")
print(f"Submission File: {submission_path}")
print("="*70)
