In [1]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')
from google.colab import drive
from scipy import sparse
import re

# Mount Google Drive at /content/gdrive so the files below are reachable.
# NOTE: this is a module-level side effect and only works inside Google Colab.
drive.mount('/content/gdrive')

# Define file paths (all under the mounted Drive folder)
# train_path / test_path are read with pd.read_csv in main();
# output_path is where main() writes the submission CSV.
train_path = '/content/gdrive/MyDrive/ECSE_551/Train.csv'
test_path = '/content/gdrive/MyDrive/ECSE_551/Test.csv'
output_path = '/content/gdrive/MyDrive/ECSE_551/submission.csv'

class EnhancedBernoulliNB:
    """Bernoulli naive Bayes with class-weighted priors and adaptive smoothing.

    Every feature is treated as a presence/absence indicator: any value
    greater than zero counts as "present", so count-valued matrices are
    binarized instead of producing probabilities outside ``[0, 1]``.

    Parameters
    ----------
    alpha : float, default=1.0
        Base smoothing strength. The smoothing actually applied to a class is
        ``alpha / sqrt(n_samples_in_class)``.
    class_weights : dict or None, default=None
        Mapping ``class label -> multiplier`` applied to the class counts when
        estimating priors. Labels missing from the mapping get weight 1.0.
        ``None`` selects ``DEFAULT_CLASS_WEIGHTS``.

    Attributes
    ----------
    classes_ : ndarray
        Sorted unique labels seen in ``fit``.
    class_priors_ : ndarray of shape (n_classes,)
        Weighted, smoothed class priors, normalized to sum to 1.
    feature_probabilities_ : ndarray of shape (n_classes, n_features)
        Estimated P(feature present | class), kept strictly inside (0, 1).
    """

    # Default prior multipliers; labels not listed here get weight 1.0.
    DEFAULT_CLASS_WEIGHTS = {
        'Brussels': 1.2,
        'London': 1.1,
        'Montreal': 1.2,
        'Toronto': 1.0,
    }

    # Feature probabilities are clipped to [eps, 1 - eps] so both log(p) and
    # log(1 - p) stay finite, even when alpha == 0.
    _PROB_EPS = 1e-10

    def __init__(self, alpha=1.0, class_weights=None):
        self.alpha = alpha
        self.classes_ = None
        self.class_priors_ = None
        self.feature_probabilities_ = None
        weights = self.DEFAULT_CLASS_WEIGHTS if class_weights is None else class_weights
        # Copy so that instances never share (or mutate) the caller's mapping.
        self.class_weights = dict(weights)

    @staticmethod
    def _as_binary_array(X):
        """Return X as a dense float array of 0/1 presence indicators."""
        if sparse.issparse(X):
            X = X.toarray()
        return (np.asarray(X) > 0).astype(np.float64)

    def fit(self, X, y):
        """Estimate class priors and per-class feature probabilities.

        Parameters
        ----------
        X : array-like or scipy sparse matrix of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If X and y contain a different number of samples.
        """
        X = self._as_binary_array(X)
        # np.asarray makes the boolean masks below positional even when y is
        # a pandas Series.
        y = np.asarray(y)
        if y.shape[0] != X.shape[0]:
            raise ValueError(
                f"X has {X.shape[0]} samples but y has {y.shape[0]} labels"
            )

        self.classes_, class_counts = np.unique(y, return_counts=True)
        n_features = X.shape[1]

        # Class-weighted, smoothed priors. Normalizing makes them a proper
        # distribution; it does not change the posterior, because every class
        # is divided by the same constant.
        weights = np.array([self.class_weights.get(c, 1.0) for c in self.classes_])
        smoothed = class_counts * weights + self.alpha
        self.class_priors_ = smoothed / smoothed.sum()

        self.feature_probabilities_ = np.zeros((len(self.classes_), n_features))
        for i, c in enumerate(self.classes_):
            feature_counts = X[y == c].sum(axis=0)
            # Adaptive smoothing: larger classes receive less smoothing.
            smooth_factor = self.alpha * (1.0 / max(class_counts[i], 1)) ** 0.5
            self.feature_probabilities_[i] = (
                (feature_counts + smooth_factor)
                / (class_counts[i] + 2 * smooth_factor)
            )

        # Guard against exact 0 / 1 probabilities (possible when alpha == 0).
        np.clip(
            self.feature_probabilities_,
            self._PROB_EPS,
            1.0 - self._PROB_EPS,
            out=self.feature_probabilities_,
        )
        return self

    def predict_proba(self, X):
        """Return posterior class probabilities, shape (n_samples, n_classes).

        Columns follow the order of ``classes_``.
        """
        X = self._as_binary_array(X)

        log_present = np.log(self.feature_probabilities_)
        log_absent = np.log1p(-self.feature_probabilities_)

        # sum_j [x_j log p_j + (1 - x_j) log(1 - p_j)]
        #   = x . (log p - log(1 - p)) + sum_j log(1 - p_j)
        # which evaluates all classes with a single matrix product.
        log_probs = (
            X @ (log_present - log_absent).T
            + log_absent.sum(axis=1)
            + np.log(self.class_priors_)
        )

        # Numerical stability: shift each row before exponentiating.
        log_probs -= log_probs.max(axis=1, keepdims=True)
        probs = np.exp(log_probs)
        probs /= probs.sum(axis=1, keepdims=True)
        return probs

    def predict(self, X):
        """Return the most probable class label for each row of X."""
        return self.classes_[np.argmax(self.predict_proba(X), axis=1)]

def clean_text(text):
    """Normalize a raw post body into lowercase, whitespace-collapsed text.

    Non-string input (e.g. NaN from a CSV) yields an empty string. Otherwise
    the text is lowercased and passed through an ordered list of regex
    substitutions, then runs of whitespace are collapsed to single spaces.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement, flags), applied strictly in this order.
    substitutions = (
        # URLs and markdown-style [label](target) links
        (r'http\S+|www\S+|\[.*?\]\(.*?\)', ' ', 0),
        # Reddit subreddit / user references
        (r'/r/\w+|/u/\w+', ' ', 0),
        # A few common HTML entities
        (r'&amp;|&lt;|&gt;', ' ', 0),
        # Bot signatures: drop everything from the marker to the end
        (r'i am a bot.*|please contact the moderators.*',
         '', re.IGNORECASE | re.DOTALL),
        # Anything that is not a word char, whitespace or . , ! ? -
        (r'[^\w\s.,!?-]', ' ', 0),
    )

    cleaned = text.lower()
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    # Collapse whitespace runs and trim both ends.
    collapsed = ' '.join(cleaned.split())
    return collapsed.strip()

def create_city_features(text):
    """Count city-specific keywords that occur in *text* as whole words.

    Returns a list of four integers, one per city, in the order
    toronto, montreal, london, brussels. Each integer is the number of
    distinct keywords for that city found in the text.

    Keywords are matched against whole word tokens rather than raw
    substrings, so short keywords such as 'eu', 'uk', 'stm' or 'tube' no
    longer fire inside unrelated words ("museum", "ukulele", "christmas",
    "youtube"). Non-string input yields all zeros.
    """
    # City-specific keywords (dict order defines the output order)
    city_patterns = {
        'toronto': ['toronto', 'gta', 'ttc', 'ontario', 'dundas', 'yonge'],
        'montreal': ['montreal', 'quebec', 'stm', 'plateau', 'metro'],
        'london': ['london', 'tube', 'underground', 'oyster', 'thames', 'uk'],
        'brussels': ['brussels', 'belgium', 'belgian', 'eu', 'european']
    }

    if not isinstance(text, str):
        return [0] * len(city_patterns)

    # Tokenize once; set membership gives exact whole-word matching.
    tokens = set(re.findall(r'\w+', text.lower()))

    return [
        sum(1 for word in city_words if word in tokens)
        for city_words in city_patterns.values()
    ]

def perform_cross_validation(X, y, model, k=5):
    """Run stratified k-fold cross-validation and return per-fold accuracies.

    For each fold the model is re-fitted on the training split and evaluated
    on the validation split; the fold accuracy and a classification report
    are printed.

    Parameters
    ----------
    X : feature matrix supporting row selection with an integer index array
        (NumPy array or SciPy sparse matrix).
    y : array-like of labels, one per row of X.
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``. The same
        object is re-fitted on every fold.
    k : int, default=5
        Number of folds.

    Returns
    -------
    list of float
        Validation accuracy of each fold, in fold order.
    """
    # Fold indices from StratifiedKFold are positions. Indexing a pandas
    # Series with an integer array is label-based, which raises or silently
    # picks the wrong rows when the index is not 0..n-1, so work on a plain
    # array instead.
    y = np.asarray(y)

    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    scores = []

    for fold, (train_index, val_index) in enumerate(skf.split(X, y)):
        X_train_fold = X[train_index]
        X_val_fold = X[val_index]
        y_train_fold = y[train_index]
        y_val_fold = y[val_index]

        model.fit(X_train_fold, y_train_fold)
        y_pred = model.predict(X_val_fold)

        accuracy = accuracy_score(y_val_fold, y_pred)
        scores.append(accuracy)

        print(f"Fold {fold + 1} Accuracy: {accuracy:.4f}")
        print("\nClassification Report:")
        print(classification_report(y_val_fold, y_pred))

    return scores

def main():
    """Run the whole pipeline end to end.

    Steps: read the train/test CSVs, clean the text column, build binary
    uni/bi-gram features, cross-validate the custom Bernoulli naive Bayes and
    a random forest, refit whichever has the higher mean CV accuracy on all
    training data, and write the test-set predictions to ``output_path``.
    """
    # --- Load -------------------------------------------------------------
    print("Reading data...")
    train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

    print(f"Training data shape: {train_df.shape}")
    print(f"Test data shape: {test_df.shape}")

    # --- Clean ------------------------------------------------------------
    # Training text is taken from the first column by position; test text is
    # taken from the 'body' column by name.
    print("\nCleaning texts...")
    train_texts = list(map(clean_text, train_df.iloc[:, 0]))
    test_texts = list(map(clean_text, test_df['body']))

    # --- Vectorize --------------------------------------------------------
    print("\nExtracting features...")

    vectorizer_options = {
        'max_features': 3000,
        'binary': True,
        'ngram_range': (1, 2),
        'min_df': 2,
        'stop_words': 'english',
    }
    count_vec = CountVectorizer(**vectorizer_options)

    # Vocabulary is learned from the training texts only.
    X_train_binary = count_vec.fit_transform(train_texts)
    X_test_binary = count_vec.transform(test_texts)

    # Labels live in the second column of the training frame.
    y_train = train_df.iloc[:, 1]

    # --- Models -----------------------------------------------------------
    print("\nInitializing models...")
    bernoulli_nb = EnhancedBernoulliNB(alpha=0.1)
    forest_options = {
        'n_estimators': 200,
        'max_depth': 20,
        'class_weight': 'balanced',
        'n_jobs': -1,
        'random_state': 42,
    }
    rf = RandomForestClassifier(**forest_options)

    # --- Cross-validation ---------------------------------------------------
    print("\nPerforming cross-validation for BernoulliNB...")
    nb_scores = perform_cross_validation(X_train_binary, y_train, bernoulli_nb)
    nb_mean = np.mean(nb_scores)
    print(f"\nBernoulliNB Mean CV Score: {nb_mean:.4f} (±{np.std(nb_scores):.4f})")

    print("\nPerforming cross-validation for RandomForest...")
    rf_scores = perform_cross_validation(X_train_binary, y_train, rf)
    rf_mean = np.mean(rf_scores)
    print(f"\nRandomForest Mean CV Score: {rf_mean:.4f} (±{np.std(rf_scores):.4f})")

    # --- Final fit ----------------------------------------------------------
    # Ties go to the random forest (strict '>' comparison).
    print("\nTraining final model...")
    if nb_mean > rf_mean:
        best_model = bernoulli_nb
    else:
        best_model = rf
    best_model.fit(X_train_binary, y_train)

    # --- Predict and save -----------------------------------------------------
    print("Generating predictions...")
    predictions = best_model.predict(X_test_binary)

    submission_columns = {
        'id': test_df['id'],
        'subreddit': predictions
    }
    submission = pd.DataFrame(submission_columns)

    submission.to_csv(output_path, index=False)
    print(f"\nSubmission saved to {output_path}")

if __name__ == "__main__":
    main()

Mounted at /content/gdrive
Reading data...
Training data shape: (1399, 2)
Test data shape: (600, 2)

Cleaning texts...

Extracting features...

Initializing models...

Performing cross-validation for BernoulliNB...
Fold 1 Accuracy: 0.6321

Classification Report:
              precision    recall  f1-score   support

    Brussels       0.55      0.77      0.64        70
      London       0.62      0.57      0.60        70
    Montreal       0.97      0.40      0.57        70
     Toronto       0.62      0.79      0.69        70

    accuracy                           0.63       280
   macro avg       0.69      0.63      0.62       280
weighted avg       0.69      0.63      0.62       280

Fold 2 Accuracy: 0.6571

Classification Report:
              precision    recall  f1-score   support

    Brussels       0.55      0.74      0.63        70
      London       0.64      0.70      0.67        70
    Montreal       0.97      0.49      0.65        70
     Toronto       0.67      0.70    