In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer 
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fetch every NLTK resource this notebook relies on, in the same order as
# before; nltk.download reports on its own whether a package is up to date.
for _nltk_resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

# Single shared lemmatizer instance, used by preprocess_text.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/richardph911/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/richardph911/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/richardph911/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/richardph911/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Raw data
label : 1 for negative and 2 for positive

title : review heading

text : review body

In [2]:
def load_data(train_path, test_path, max_train_rows=50000, random_state=42):
    """Load the train/test review CSVs and convert labels to 0/1.

    Both files are read without a header row and given the column names
    ``label``, ``title`` and ``text``. Raw labels are remapped as
    1 -> 0 (negative) and 2 -> 1 (positive).

    Parameters
    ----------
    train_path, test_path : str or path-like
        Locations of the training and test CSV files.
    max_train_rows : int or None, default 50000
        If the training set has more rows than this, a random sample of this
        size is kept. ``None`` disables sampling. The test set is never sampled.
    random_state : int, default 42
        Seed used for the training-set sample.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        On success. ``(None, None)`` is returned (after printing a message)
        if a file is missing, a label other than 1 or 2 is found, or any other
        error occurs while loading.
    """
    columns = ["label", "title", "text"]
    # 1 → 0 (negative), 2 → 1 (positive)
    label_map = {1: 0, 2: 1}

    try:
        train_df = pd.read_csv(train_path, header=None, names=columns)
        test_df = pd.read_csv(test_path, header=None, names=columns)

        if max_train_rows is not None and len(train_df) > max_train_rows:
            train_df = train_df.sample(max_train_rows, random_state=random_state)

        for split_name, frame in (("train", train_df), ("test", test_df)):
            mapped = frame['label'].map(label_map)
            # Series.map yields NaN for any value missing from label_map; fail
            # here with a clear message instead of passing NaN labels downstream.
            unmapped = mapped.isna()
            if unmapped.any():
                examples = frame.loc[unmapped, 'label'].unique()[:5].tolist()
                raise ValueError(
                    f"{split_name} set contains labels other than 1 or 2, e.g. {examples}"
                )
            frame['label'] = mapped

        print("Datasets loaded successfully!")
        return train_df, test_df

    except FileNotFoundError:
        print("File not found. Please check the file path.")
        return None, None
    except Exception as e:
        print("Error loading data:", str(e))
        return None, None


In [3]:
# Compiled once: strips special characters, URLs and e-mail addresses in one pass.
_CLEANUP_PATTERN = re.compile(r'[^a-zA-Z0-9\s]|https?://\S+|www\.\S+|\S+@\S+')

# Filled on first use so the stop-word list is not rebuilt for every review.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English stop-word set, building it only on the first call."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text):
    """Clean one review and return it as a space-joined string of lemmas.

    Steps: strip special characters / URLs / e-mail addresses, lower-case,
    split on whitespace, drop English stop words, lemmatize what remains.

    Parameters
    ----------
    text : Any
        The raw review. ``None`` and NaN floats are treated as an empty
        review (so a missing value does not become the token "nan");
        any other non-string value is converted with ``str``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    if not isinstance(text, str):
        # `text != text` is only true for NaN.
        is_missing = text is None or (isinstance(text, float) and text != text)
        text = "" if is_missing else str(text)

    cleaned = _CLEANUP_PATTERN.sub('', text)
    stop_words = _english_stopwords()
    tokens = [
        lemmatizer.lemmatize(token)
        for token in cleaned.lower().split()
        if token not in stop_words
    ]
    return ' '.join(tokens)

In [4]:
def vectorize_text(train_text, test_text):
    """Turn the train and test documents into hashed feature matrices.

    One HashingVectorizer is configured (2**14 features, unigrams and bigrams,
    L2 normalisation, English stop words, alternate_sign turned off) and
    applied to both collections. It is used for memory efficiency and needs
    no fitting step, so both inputs are simply transformed.

    Parameters
    ----------
    train_text, test_text : iterable of str
        The documents of each split.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` as produced by ``HashingVectorizer.transform``.
    """
    hasher_options = {
        'n_features': 2**14,
        'alternate_sign': False,
        'ngram_range': (1, 2),
        'norm': 'l2',
        'stop_words': 'english',
    }
    hasher = HashingVectorizer(**hasher_options)

    # Stateless transform: train first, then test, with the same hasher.
    train_matrix = hasher.transform(train_text)
    test_matrix = hasher.transform(test_text)
    return train_matrix, test_matrix

In [5]:
def _combine_title_and_text(df):
    """Return one string per row: the review title, a space, then the review body.

    Missing values are replaced with '' *before* the string cast; casting
    first would turn NaN into the literal word 'nan', which fillna can no
    longer remove.
    """
    return df['title'].fillna('').astype(str) + ' ' + df['text'].fillna('').astype(str)


def _build_models(seed):
    """Return fresh, unfitted classifiers keyed by display name.

    The seed is passed to every estimator configured here that accepts a
    ``random_state``, so that re-running a given seed reproduces its results.
    """
    return {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Naive Bayes": MultinomialNB(),
        "SVM": SVC(kernel="linear", probability=True, random_state=seed),
        "KNN": KNeighborsClassifier(n_neighbors=5),
        "Decision Tree": DecisionTreeClassifier(random_state=seed),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=seed)
    }


if __name__ == "__main__":
    import os  # local import: only this script section needs it

    # Defaults to the original dataset location; set REVIEWS_DATA_DIR to point
    # the notebook at a different folder containing train.csv and test.csv.
    data_dir = os.environ.get('REVIEWS_DATA_DIR', '/Users/richardph911/Downloads/archive')
    train_path = os.path.join(data_dir, 'train.csv')
    test_path = os.path.join(data_dir, 'test.csv')
    train_df, test_df = load_data(train_path, test_path)

    # load_data returns (None, None) on failure, in which case nothing else runs.
    if train_df is not None and test_df is not None:
        # Combine title + text, then preprocess; the result replaces 'text'.
        train_df['text'] = _combine_title_and_text(train_df).apply(preprocess_text)
        test_df['text'] = _combine_title_and_text(test_df).apply(preprocess_text)

        # Vectorize
        X_train_full, X_test = vectorize_text(train_df['text'], test_df['text'])
        y_train_full = train_df['label'].values
        y_test = test_df['label'].values

        # Run 5 epochs with different seeds
        seeds = [42, 123, 456, 789, 101]
        model_names = list(_build_models(seeds[0]).keys())
        seed_results = {}  # seed -> mean validation accuracy across models
        all_accuracies = {name: [] for name in model_names}  # model -> test accuracy per seed

        for seed in seeds:
            print(f"\nRunning with seed: {seed}")
            # Split data with current seed
            X_train, X_val, y_train, y_val = train_test_split(
                X_train_full, y_train_full, test_size=0.2, random_state=seed, stratify=y_train_full
            )

            val_accuracies = []

            # Train and evaluate models
            for name, model in _build_models(seed).items():
                print(f"\nTraining {name}...")
                model.fit(X_train, y_train)

                # Validation performance: the only number used to pick a seed,
                # so the test set does not influence that choice.
                val_accuracy = accuracy_score(y_val, model.predict(X_val))
                val_accuracies.append(val_accuracy)
                print(f"{name} Validation Accuracy: {val_accuracy:.4f}")

                # Test performance (reported only)
                accuracy = accuracy_score(y_test, model.predict(X_test))
                all_accuracies[name].append(accuracy)
                print(f"{name} Test Accuracy: {accuracy:.4f}")

            # Store average accuracy for this seed. This must stay inside the
            # seed loop; outside it only the last seed would be recorded.
            seed_results[seed] = np.mean(val_accuracies)

        best_seed = max(seed_results, key=seed_results.get)
        print(f"\nBest Seed: {best_seed} with average validation accuracy: {seed_results[best_seed]:.4f}")

        # Plot accuracies for the best seed (re-run with best seed for visualization)
        X_train, X_val, y_train, y_val = train_test_split(
            X_train_full, y_train_full, test_size=0.2, random_state=best_seed, stratify=y_train_full
        )
        models = _build_models(best_seed)
        accuracies = []
        for name, model in models.items():
            model.fit(X_train, y_train)
            y_test_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_test_pred)
            accuracies.append(accuracy)
            print(f"\n{name} Test Performance (Best Seed {best_seed}):")
            print(classification_report(y_test, y_test_pred))

            if hasattr(model, "predict_proba"):
                # Column 1 is the probability of label 1 (positive).
                y_test_proba = model.predict_proba(X_test)[:, 1]
                print(f"{name} Test Confidence Scores (first 5 samples):")
                print(y_test_proba[:5])
            else:
                print(f"{name} does not support confidence scores.")

        # Plot
        plt.figure(figsize=(10, 6))
        ax = sns.barplot(x=model_names, y=accuracies, palette='viridis')
        plt.xlabel('Models', fontsize=12)
        plt.ylabel('Accuracy', fontsize=12)
        plt.title(f'Comparison of Different Models (Seed: {best_seed})', fontsize=14)
        plt.ylim(0, 1)
        plt.xticks(rotation=45, ha='right', fontsize=10)

        # Write each bar's accuracy just above it.
        for p in ax.patches:
            ax.annotate(f"{p.get_height():.2f}",
                        (p.get_x() + p.get_width() / 2., p.get_height()),
                        ha='center', va='center', fontsize=10, color='black',
                        xytext=(0, 6), textcoords='offset points')

        plt.tight_layout()
        plt.show()

        # Print all seed results
        print("\nAll Seed Results (Average Validation Accuracy Across Models):")
        for seed, avg_acc in seed_results.items():
            print(f"Seed {seed}: {avg_acc:.4f}")

Datasets loaded successfully!
