In [2]:
#  Install dependencies
%pip install -q scikit-learn pandas

import os
import re
import zipfile
import urllib.request
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# === Step 1: Download and unzip dataset ===
url = "http://www.cs.cornell.edu/people/pabo/movie-review-data/mix20_rand700_tokens_cleaned.zip"
zip_path = "mix20_rand700_tokens_cleaned.zip"
data_folder = "mix20_rand700_tokens_cleaned"

if not os.path.exists(zip_path):
    print(" Downloading dataset...")
    urllib.request.urlretrieve(url, zip_path)

if not os.path.exists(data_folder):
    print(" Unzipping dataset...")
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(data_folder)

# === Step 2: Load reviews and assign labels ===
texts, labels = [], []

# Define the paths to the positive and negative directories within 'tokens'
pos_dir = os.path.join(data_folder, 'tokens', 'pos')
neg_dir = os.path.join(data_folder, 'tokens', 'neg')

# Load positive reviews
if os.path.exists(pos_dir):
    for fname in os.listdir(pos_dir):
        if fname.endswith('.txt'): # Assuming the review files are .txt
            with open(os.path.join(pos_dir, fname), 'r', encoding='utf-8', errors='ignore') as f:
                texts.append(f.read())
                labels.append('pos')
else:
    print(f"Warning: Positive reviews directory not found at {pos_dir}")

# Load negative reviews
if os.path.exists(neg_dir):
    for fname in os.listdir(neg_dir):
        if fname.endswith('.txt'): # Assuming the review files are .txt
            with open(os.path.join(neg_dir, fname), 'r', encoding='utf-8', errors='ignore') as f:
                texts.append(f.read())
                labels.append('neg')
else:
     print(f"Warning: Negative reviews directory not found at {neg_dir}")

print(f" Loaded {len(texts)} reviews.")


# === Step 3: Train/test split ===
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

# === Step 4: Define models ===
models = {
    "Naive Bayes": Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', MultinomialNB()),
    ]),
    "Logistic Regression (MaxEnt)": Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', LogisticRegression(max_iter=1000)),
    ]),
    "Support Vector Machine": Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ])
}

# === Step 5: Train and evaluate ===
for name, model in models.items():
    print(f"\n=====  {name} =====")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=["neg", "pos"]))

 Loaded 1400 reviews.

=====  Naive Bayes =====
              precision    recall  f1-score   support

         neg       0.75      0.86      0.80       140
         pos       0.83      0.72      0.77       140

    accuracy                           0.79       280
   macro avg       0.79      0.79      0.79       280
weighted avg       0.79      0.79      0.79       280


=====  Logistic Regression (MaxEnt) =====
              precision    recall  f1-score   support

         neg       0.78      0.78      0.78       140
         pos       0.78      0.78      0.78       140

    accuracy                           0.78       280
   macro avg       0.78      0.78      0.78       280
weighted avg       0.78      0.78      0.78       280


=====  Support Vector Machine =====
              precision    recall  f1-score   support

         neg       0.81      0.77      0.79       140
         pos       0.78      0.82      0.80       140

    accuracy                           0.80       280
