In [8]:
# ============================
# Sentiment Analysis on CSV (IMDB format)
# - Columns expected: 'review', 'sentiment' (values: 'positive'/'negative')
# - Prints accuracy (train/test split)
# - Saves predictions for ALL rows (CSV + Excel)
# ============================

import os, re, time, warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# ----------------------------
# 1) Load CSV
# ----------------------------
# Change this path if needed
CSV_PATH = "IMDB Dataset.csv"   # e.g., "/content/IMDB Dataset.csv"
df = pd.read_csv(CSV_PATH)
print("✅ CSV loaded:", df.shape)

# Sanity check for expected columns
if not {'review', 'sentiment'}.issubset(df.columns):
    raise ValueError("CSV must have columns: 'review' and 'sentiment'")

# ----------------------------
# 2) Minimal cleaning
# ----------------------------
def basic_clean(s: str) -> str:
    s = str(s)
    s = re.sub(r"<.*?>", " ", s)                 # remove HTML
    s = re.sub(r"http\S+|www\.\S+", " ", s)      # remove URLs
    s = re.sub(r"[^A-Za-z0-9\s']", " ", s)       # keep alnum + apostrophes
    s = re.sub(r"\s+", " ", s).strip()
    return s

df['review_clean'] = df['review'].astype(str).apply(basic_clean)

# ----------------------------
# 3) Map labels to 0/1
# ----------------------------
label_map = {'negative': 0, 'positive': 1}
df['label'] = df['sentiment'].astype(str).str.lower().map(label_map)
if df['label'].isna().any():
    raise ValueError("Labels must be 'positive' or 'negative'.")

print("Class counts:", df['label'].value_counts().to_dict())

# ----------------------------
# 4) Train/Test split
# ----------------------------
X_train_text, X_val_text, y_train, y_val = train_test_split(
    df['review_clean'].values,
    df['label'].values,
    test_size=0.2,
    random_state=42,
    stratify=df['label'].values
)

# ----------------------------
# 5) Vectorize (TF-IDF)
# ----------------------------
vectorizer = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1,2),
    min_df=2,
    max_df=0.95
)
X_train = vectorizer.fit_transform(X_train_text)
X_val   = vectorizer.transform(X_val_text)

# ----------------------------
# 6) Train Logistic Regression
# ----------------------------
# 'saga' is fast for large sparse data and supports predict_proba
lr = LogisticRegression(solver='saga', max_iter=2000, n_jobs=-1)
t0 = time.time()
lr.fit(X_train, y_train)
print(f"⏱ Train time: {time.time()-t0:.2f}s")

# ----------------------------
# 7) Evaluate
# ----------------------------
y_pred = lr.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print(f"🎯 Validation Accuracy (LR): {acc:.4f}")

print("📊 Classification Report (LR):")
print(classification_report(y_val, y_pred, target_names=['negative','positive']))

# ----------------------------
# 8) Final predictions for ALL rows (optional but useful)
#    Fit on ALL data, then predict for the whole CSV
# ----------------------------
X_all = vectorizer.fit_transform(df['review_clean'].values)
lr.fit(X_all, df['label'].values)

pred_all = lr.predict(X_all)
df['PredLabel'] = pd.Series(pred_all).map({0:'negative', 1:'positive'})

# Confidence (may not be perfect calibration; still useful)
try:
    prob_all = lr.predict_proba(X_all).max(axis=1)
    df['Confidence'] = prob_all
except Exception:
    pass

# ----------------------------
# 9) Save outputs
# ----------------------------
out_cols = ['review', 'PredLabel']
if 'Confidence' in df.columns:
    out_cols.append('Confidence')

csv_out = "Sentiment_Predictions.csv"
xlsx_out = "Sentiment_Predictions.xlsx"

df[out_cols].to_csv(csv_out, index=False, encoding='utf-8-sig')
try:
    df[out_cols].to_excel(xlsx_out, index=False)
    print(f"💾 Saved:\n - {csv_out}\n - {xlsx_out}")
except Exception as e:
    print(f"💾 Saved:\n - {csv_out}\n - (Excel skipped: {e})")

# (Colab) auto-download
try:
    from google.colab import files
    files.download(csv_out)
except Exception:
    pass


✅ CSV loaded: (50000, 2)
Class counts: {1: 25000, 0: 25000}
⏱ Train time: 4.68s
🎯 Validation Accuracy (LR): 0.9052
📊 Classification Report (LR):
              precision    recall  f1-score   support

    negative       0.91      0.90      0.90      5000
    positive       0.90      0.91      0.91      5000

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000

💾 Saved:
 - Sentiment_Predictions.csv
 - (Excel skipped: First things first: I'm not a conservative. And even though I would never refer to myself as a liberal or a Democrat, I was opposed to the war in Iraq from day one. I think it's safe to say John Cusack and I would probably see eye-to-eye on politics, in fact, I'm sure we'd become drinking buddies if we ever got to talking about how great Adam Curtis' BBC docs are. My point is this: don't discredit this review by thinking I'm not a part of the choir Cusack is preachin

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>