# Imports

In [18]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from langdetect import detect, DetectorFactory

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

import os
import time
import json
import matplotlib.pyplot as plt
from sklearn.metrics import (
    classification_report, accuracy_score, roc_curve, auc, roc_auc_score
)
from sklearn.preprocessing import label_binarize

In [None]:
# Download once
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\alex7\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alex7\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\alex7\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

# Load Data

In [3]:
df = pd.read_csv("./data/reviews.csv")
df.dropna(subset=['Review', 'Label'], inplace=True)
df['Review'] = df['Review'].astype(str)

# Preprocess Text

In [None]:
def clean_text(text):
    # clean out tags urls and only alphabet + no multiple spaces
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r"@\w+|#\w+", '', text)
    text = re.sub(r"[^a-zA-Z\s]", ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove stop words and lemmatize
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return ' '.join(tokens)

In [5]:
df['processed'] = df['Review'].apply(clean_text)
df.head()

Unnamed: 0,Id,Review,Label,processed
0,0,good and interesting,5,good interesting
1,1,"This class is very helpful to me. Currently, I...",5,class helpful currently still learning class m...
2,2,like!Prof and TAs are helpful and the discussi...,5,like prof ta helpful discussion among student ...
3,3,Easy to follow and includes a lot basic and im...,5,easy follow includes lot basic important techn...
4,4,Really nice teacher!I could got the point eazl...,4,really nice teacher could got point eazliy v


In [None]:
DetectorFactory.seed = 0

def is_english(text):
    """Return True if detected language is English, False otherwise."""
    try:
        return detect(text) == 'en'
    except:
        # langdetect can fail on very short or weird text
        return False

In [7]:
df['is_english'] = df['Review'].apply(is_english)
df.head()

Unnamed: 0,Id,Review,Label,processed,is_english
0,0,good and interesting,5,good interesting,True
1,1,"This class is very helpful to me. Currently, I...",5,class helpful currently still learning class m...,True
2,2,like!Prof and TAs are helpful and the discussi...,5,like prof ta helpful discussion among student ...,True
3,3,Easy to follow and includes a lot basic and im...,5,easy follow includes lot basic important techn...,True
4,4,Really nice teacher!I could got the point eazl...,4,really nice teacher could got point eazliy v,True


In [8]:
print(df['is_english'].value_counts())

is_english
True    107018
Name: count, dtype: int64


In [9]:
def add_sentiment_column(df):
    sentiment_map = {
        1: 'negative',
        2: 'negative',
        3: 'neutral',
        4: 'positive',
        5: 'positive'
    }
    df['Sentiment'] = df['Label'].map(sentiment_map)
    return df


In [10]:
df = add_sentiment_column(df)
df.head()

Unnamed: 0,Id,Review,Label,processed,is_english,Sentiment
0,0,good and interesting,5,good interesting,True,positive
1,1,"This class is very helpful to me. Currently, I...",5,class helpful currently still learning class m...,True,positive
2,2,like!Prof and TAs are helpful and the discussi...,5,like prof ta helpful discussion among student ...,True,positive
3,3,Easy to follow and includes a lot basic and im...,5,easy follow includes lot basic important techn...,True,positive
4,4,Really nice teacher!I could got the point eazl...,4,really nice teacher could got point eazliy v,True,positive


In [15]:
le = LabelEncoder()
df['Sentiment_encoded'] = le.fit_transform(df['Sentiment'])

tfidf = TfidfVectorizer(
    max_features=20000,  
    ngram_range=(1,2),  
    stop_words='english'
)

X = tfidf.fit_transform(df['processed'])
y = df['Sentiment_encoded']

# Train Test Split and Model Building

In [17]:
df['Sentiment'].value_counts()

Sentiment
positive    97227
neutral      5071
negative     4720
Name: count, dtype: int64

In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [19]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Linear SVM": LinearSVC()
}


In [None]:
# OpenAI (2025) ChatGPT (GPT-5) create metrics for common ML workloads, 7 October. Available at: https://chat.openai.com/
# (Accessed: 7 October 2025).
def evaluate_and_log_model(name, model, X_train, X_test, y_train, y_test, target_names):
    print(f"\n🚀 Training {name}...")
    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time

    # --- predictions ---
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # --- accuracies ---
    train_acc = accuracy_score(y_train, y_pred_train)
    test_acc = accuracy_score(y_test, y_pred_test)

    print(f"✅ {name} trained in {train_time:.2f} sec | Train Acc={train_acc:.4f} | Test Acc={test_acc:.4f}")

    # --- classification report ---
    report = classification_report(y_test, y_pred_test, target_names=target_names, output_dict=True)

    # --- create directory ---
    folder = f'./results/{name.replace(" ", "_")}'
    os.makedirs(folder, exist_ok=True)

    # --- save report to JSON ---
    with open(f'{folder}/accuracy_report.json', 'w') as f:
        json.dump({
            'model': name,
            'train_accuracy': train_acc,
            'test_accuracy': test_acc,
            'train_time_sec': train_time,
            'classification_report': report
        }, f, indent=4)

    # --- ROC-AUC (for models with decision_function or predict_proba) ---
    try:
        if hasattr(model, "predict_proba"):
            y_prob = model.predict_proba(X_test)
        else:
            y_prob = model.decision_function(X_test)

        # Binarize labels for multi-class ROC
        y_bin = label_binarize(y_test, classes=np.unique(y_test))
        n_classes = y_bin.shape[1]

        fpr, tpr, roc_auc = {}, {}, {}
        for i in range(n_classes):
            fpr[i], tpr[i], _ = roc_curve(y_bin[:, i], y_prob[:, i])
            roc_auc[i] = auc(fpr[i], tpr[i])

        plt.figure(figsize=(6, 5))
        for i in range(n_classes):
            plt.plot(fpr[i], tpr[i], lw=2,
                     label=f'Class {target_names[i]} (AUC={roc_auc[i]:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve - {name}')
        plt.legend()
        plt.tight_layout()
        plt.savefig(f'{folder}/roc_curve.png', dpi=120)
        plt.close()
    except Exception as e:
        print(f"⚠️ Could not compute ROC for {name}: {e}")

    # --- Train vs Test Accuracy comparison ---
    plt.figure(figsize=(4, 5))
    plt.bar(['Train', 'Test'], [train_acc, test_acc], color=['#4e79a7', '#f28e2b'])
    plt.title(f'{name} Accuracy Comparison')
    plt.ylabel('Accuracy')
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.savefig(f'{folder}/train_vs_test_accuracy.png', dpi=120)
    plt.close()

    print(f"📂 Logs saved under: {folder}\n")

    return {
        'name': name,
        'train_acc': train_acc,
        'test_acc': test_acc,
        'train_time': train_time
    }

In [22]:
results = []

for name, model in models.items():
    result = evaluate_and_log_model(name, model, X_train, X_test, y_train, y_test, target_names=['negative','neutral','positive'])
    results.append(result)


🚀 Training Logistic Regression...
✅ Logistic Regression trained in 1.12 sec | Train Acc=0.9369 | Test Acc=0.9221
📂 Logs saved under: ./results/Logistic_Regression


🚀 Training Naive Bayes...
✅ Naive Bayes trained in 0.02 sec | Train Acc=0.9175 | Test Acc=0.9142
📂 Logs saved under: ./results/Naive_Bayes


🚀 Training Linear SVM...
✅ Linear SVM trained in 2.06 sec | Train Acc=0.9617 | Test Acc=0.9199
📂 Logs saved under: ./results/Linear_SVM

