# Model 1: Logistic Regression - Jigsaw Agile Community Rules Classification

This notebook implements a **Logistic Regression** model for the Jigsaw Agile Community Rules Classification hackathon.

## Model Details:
- **Algorithm**: Logistic Regression with TF-IDF features
- **Features**: 15,000 TF-IDF features with (1,2) n-grams
- **Hyperparameters**: C=0.1, solver='liblinear', class_weight='balanced'
- **Target**: Achieve >92% accuracy

## Features Used:
- Data augmentation with positive/negative examples
- Enhanced text preprocessing with [SEP] separators
- URL detection and text length features
- TF-IDF vectorization with multiple configurations


In [1]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
np.random.seed(42)

print("Libraries imported successfully!")
print("Model: Logistic Regression")


Libraries imported successfully!
Model: Logistic Regression


In [2]:
# Load the datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

print(f"Train dataset shape: {train_df.shape}")
print(f"Test dataset shape: {test_df.shape}")
print(f"Train columns: {train_df.columns.tolist()}")

# Display first few rows
print("\nFirst few rows of training data:")
print(train_df.head())


Train dataset shape: (2029, 9)
Test dataset shape: (10, 8)
Train columns: ['row_id', 'body', 'rule', 'subreddit', 'positive_example_1', 'positive_example_2', 'negative_example_1', 'negative_example_2', 'rule_violation']

First few rows of training data:
   row_id                                               body  \
0       0  Banks don't want you to know this! Click here ...   
1       1  SD Stream [ ENG Link 1] (http://www.sportsstre...   
2       2  Lol. Try appealing the ban and say you won't d...   
3       3  she will come your home open her legs with  an...   
4       4  code free tyrande --->>> [Imgur](http://i.imgu...   

                                                rule      subreddit  \
0  No Advertising: Spam, referral links, unsolici...     Futurology   
1  No Advertising: Spam, referral links, unsolici...  soccerstreams   
2  No legal advice: Do not offer or request legal...   pcmasterrace   
3  No Advertising: Spam, referral links, unsolici...            sex   
4  No 

In [3]:
# Data augmentation: Add positive and negative examples
def augment_training_data(df):
    """Augment training data with positive/negative examples"""
    augmented_data = []
    
    # Add original data
    for _, row in df.iterrows():
        augmented_data.append({
            'body': row['body'],
            'rule': row['rule'],
            'subreddit': row['subreddit'],
            'rule_violation': row['rule_violation']
        })
    
    # Add positive examples (rule_violation=1)
    for _, row in df.iterrows():
        if pd.notna(row['positive_example_1']):
            augmented_data.append({
                'body': row['positive_example_1'],
                'rule': row['rule'],
                'subreddit': row['subreddit'],
                'rule_violation': 1
            })
        if pd.notna(row['positive_example_2']):
            augmented_data.append({
                'body': row['positive_example_2'],
                'rule': row['rule'],
                'subreddit': row['subreddit'],
                'rule_violation': 1
            })
    
    # Add negative examples (rule_violation=0)
    for _, row in df.iterrows():
        if pd.notna(row['negative_example_1']):
            augmented_data.append({
                'body': row['negative_example_1'],
                'rule': row['rule'],
                'subreddit': row['subreddit'],
                'rule_violation': 0
            })
        if pd.notna(row['negative_example_2']):
            augmented_data.append({
                'body': row['negative_example_2'],
                'rule': row['rule'],
                'subreddit': row['subreddit'],
                'rule_violation': 0
            })
    
    return pd.DataFrame(augmented_data)

# Augment the training data
augmented_train = augment_training_data(train_df)
print(f"Original train data size: {len(train_df)}")
print(f"Augmented train data size: {len(augmented_train)}")
print(f"Class distribution: {augmented_train['rule_violation'].value_counts().to_dict()}")


Original train data size: 2029
Augmented train data size: 10145
Class distribution: {1: 5089, 0: 5056}


In [4]:
# Feature engineering: Add has_url and body_length features
def add_features(df):
    """Add enhanced features to the dataframe"""
    df = df.copy()
    
    # Basic features
    df['has_url'] = df['body'].apply(lambda x: 1 if ('http' in str(x).lower() or 'www' in str(x).lower()) else 0)
    df['body_length'] = df['body'].apply(lambda x: len(str(x)))
    df['word_count'] = df['body'].apply(lambda x: len(str(x).split()))
    df['avg_word_length'] = df['body'].apply(lambda x: np.mean([len(word) for word in str(x).split()]) if len(str(x).split()) > 0 else 0)
    
    # Advanced text features
    df['exclamation_count'] = df['body'].apply(lambda x: str(x).count('!'))
    df['question_count'] = df['body'].apply(lambda x: str(x).count('?'))
    df['caps_ratio'] = df['body'].apply(lambda x: sum(1 for c in str(x) if c.isupper()) / len(str(x)) if len(str(x)) > 0 else 0)
    df['digit_count'] = df['body'].apply(lambda x: sum(1 for c in str(x) if c.isdigit()))
    
    return df

# Add features to augmented training data
augmented_train = add_features(augmented_train)

# Add features to test data
test_df = add_features(test_df)

print("Features added successfully!")
print(f"New columns: {augmented_train.columns.tolist()}")


Features added successfully!
New columns: ['body', 'rule', 'subreddit', 'rule_violation', 'has_url', 'body_length', 'word_count', 'avg_word_length', 'exclamation_count', 'question_count', 'caps_ratio', 'digit_count']


In [5]:
# Combine text with [SEP] separators
def combine_text(row):
    """Combine text with enhanced features"""
    combined = f"{row['body']} [SEP] Rule: {row['rule']} [SEP] Subreddit: {row['subreddit']} [SEP] URL: {row['has_url']} [SEP] Length: {row['body_length']} [SEP] Words: {row['word_count']} [SEP] AvgWordLen: {row['avg_word_length']:.1f} [SEP] Exclamations: {row['exclamation_count']} [SEP] Questions: {row['question_count']} [SEP] CapsRatio: {row['caps_ratio']:.2f} [SEP] Digits: {row['digit_count']}"
    return combined

# Apply text combination
augmented_train['enhanced_text'] = augmented_train.apply(combine_text, axis=1)
test_df['enhanced_text'] = test_df.apply(combine_text, axis=1)

print("Text combination completed!")
print(f"Sample of enhanced text (first 200 chars):")
print(augmented_train['enhanced_text'].iloc[0][:200] + "...")


Text combination completed!
Sample of enhanced text (first 200 chars):
Banks don't want you to know this! Click here to know more! [SEP] Rule: No Advertising: Spam, referral links, unsolicited advertising, and promotional content are not allowed. [SEP] Subreddit: Futurol...


In [6]:
# Split data into train and validation sets
X = augmented_train['enhanced_text']
y = augmented_train['rule_violation']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Training class distribution: {y_train.value_counts().to_dict()}")
print(f"Validation class distribution: {y_val.value_counts().to_dict()}")


Training set size: 8116
Validation set size: 2029
Training class distribution: {1: 4071, 0: 4045}
Validation class distribution: {1: 1018, 0: 1011}


In [7]:
# TF-IDF Vectorization for Logistic Regression
print("Creating TF-IDF vectorizer for Logistic Regression...")
tfidf_vectorizer = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    stop_words='english'
)

# Fit and transform training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(test_df['enhanced_text'])

print(f"TF-IDF matrix shape - Train: {X_train_tfidf.shape}, Val: {X_val_tfidf.shape}, Test: {X_test_tfidf.shape}")
print(f"Number of features: {X_train_tfidf.shape[1]}")


Creating TF-IDF vectorizer for Logistic Regression...
TF-IDF matrix shape - Train: (8116, 15000), Val: (2029, 15000), Test: (10, 15000)
Number of features: 15000


In [8]:
# Train Logistic Regression Model
print("="*60)
print("TRAINING LOGISTIC REGRESSION MODEL")
print("="*60)

# Initialize and train the model
lr_model = LogisticRegression(
    random_state=42,
    max_iter=2000,
    class_weight='balanced',
    C=0.1,
    solver='liblinear'
)

print("Training Logistic Regression...")
lr_model.fit(X_train_tfidf, y_train)

# Make predictions on validation set
y_val_pred_proba = lr_model.predict_proba(X_val_tfidf)[:, 1]
y_val_pred_binary = lr_model.predict(X_val_tfidf)

# Calculate metrics
lr_auc = roc_auc_score(y_val, y_val_pred_proba)
lr_accuracy = accuracy_score(y_val, y_val_pred_binary)

print(f"\nLogistic Regression Results:")
print(f"  AUC Score: {lr_auc:.4f}")
print(f"  Accuracy: {lr_accuracy:.4f} ({lr_accuracy*100:.2f}%)")

# Classification report
print(f"\nClassification Report:")
print(classification_report(y_val, y_val_pred_binary))


TRAINING LOGISTIC REGRESSION MODEL
Training Logistic Regression...

Logistic Regression Results:
  AUC Score: 0.9688
  Accuracy: 0.9202 (92.02%)

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.91      0.92      1011
           1       0.92      0.93      0.92      1018

    accuracy                           0.92      2029
   macro avg       0.92      0.92      0.92      2029
weighted avg       0.92      0.92      0.92      2029



In [10]:
# Generate test predictions
print("Generating test predictions...")
test_predictions = lr_model.predict_proba(X_test_tfidf)[:, 1]

print(f"Test predictions generated: {len(test_predictions)}")
print(f"Prediction range: [{min(test_predictions):.4f}, {max(test_predictions):.4f}]")
print(f"Mean prediction: {np.mean(test_predictions):.4f}")

# Create submission file
submission_df = pd.DataFrame({
    'row_id': test_df['row_id'],
    'rule_violation': test_predictions
})

# Save submission file
submission_path = 'logistic_regression_submission.csv'
submission_df.to_csv(submission_path, index=False)

print(f"\nSubmission file saved to: {submission_path}")
print(f"Submission shape: {submission_df.shape}")
print(f"\nFirst few predictions:")
print(submission_df.head())


Generating test predictions...
Test predictions generated: 10
Prediction range: [0.2707, 0.6807]
Mean prediction: 0.4866

Submission file saved to: logistic_regression_submission.csv
Submission shape: (10, 2)

First few predictions:
   row_id  rule_violation
0    2029        0.449083
1    2030        0.551495
2    2031        0.563740
3    2032        0.540546
4    2033        0.680694


In [11]:
# Final Summary
print("="*70)
print("LOGISTIC REGRESSION MODEL SUMMARY")
print("="*70)
print(f"Model: Logistic Regression")
print(f"Features: TF-IDF (15,000 features, n-grams 1-2)")
print(f"Hyperparameters: C=0.1, solver='liblinear', class_weight='balanced'")
print(f"Validation AUC: {lr_auc:.4f}")
print(f"Validation Accuracy: {lr_accuracy:.4f} ({lr_accuracy*100:.2f}%)")
print(f"Target Achieved (>92%): {'YES' if lr_accuracy > 0.92 else 'NO'}")
print(f"Submission File: {submission_path}")
print("="*70)


LOGISTIC REGRESSION MODEL SUMMARY
Model: Logistic Regression
Features: TF-IDF (15,000 features, n-grams 1-2)
Hyperparameters: C=0.1, solver='liblinear', class_weight='balanced'
Validation AUC: 0.9688
Validation Accuracy: 0.9202 (92.02%)
Target Achieved (>92%): YES
Submission File: logistic_regression_submission.csv
