<a href="https://colab.research.google.com/github/AkshatTalwar/cs175-teamx-sentiment/blob/main/notebooks/week1_baseline_imdb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install scikit-learn pandas numpy matplotlib

In [17]:
import urllib.request
import tarfile
from pathlib import Path

In [18]:
data_folder = Path("data")
data_folder.mkdir(exist_ok=True)

dataset_url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset_file = data_folder / "aclImdb_v1.tar.gz"

In [19]:
urllib.request.urlretrieve(dataset_url, dataset_file)
print("Downloaded:", dataset_file)

Downloaded: data/aclImdb_v1.tar.gz


In [20]:
with tarfile.open(dataset_file, "r:gz") as tar:
    tar.extractall(path=data_folder)

print("Dataset extracted")

  tar.extractall(path=data_folder)


Dataset extracted


In [21]:
imdb_folder = data_folder / "aclImdb"

print("Dataset folder exists:", imdb_folder.exists())
print("Train folder:", (imdb_folder / "train").exists())
print("Test folder:", (imdb_folder / "test").exists())


Dataset folder exists: True
Train folder: True
Test folder: True


In [22]:
#data test loading test
sample_file = imdb_folder / "train" / "pos"
files = list(sample_file.glob("*.txt"))
text = files[0].read_text(encoding="utf-8")
print(text[:500])

I enjoyed this movie. Haven't seen Andy Griffith in ages and felt he fit this role perfectly. I've associated him with comedy but am pleased to see that he's versatile.<br /><br />I wasn't troubled that Dotty's "anxiety disorder" may not have been verbatim from a psychiatric textbook. There are zillions of whatever-phobias and neuroses, and these can take on a broad variety of quantitative and qualitative forms. She is clearly a sensitive with extra-sensory powers as was understood by the local 


In [23]:
texts = []
labels = []

#positive reviews
pos_folder = imdb_folder / "train" / "pos"
for file in pos_folder.glob("*.txt"):
    texts.append(file.read_text(encoding="utf-8"))
    labels.append(1)

#negative reviews
neg_folder = imdb_folder / "train" / "neg"
for file in neg_folder.glob("*.txt"):
    texts.append(file.read_text(encoding="utf-8"))
    labels.append(0)

print("Total reviews loaded:", len(texts))
print("Positive reviews:", labels.count(1))
print("Negative reviews:", labels.count(0))

Total reviews loaded: 25000
Positive reviews: 12500
Negative reviews: 12500


In [24]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    texts, labels,
    test_size=0.2,
    random_state=42
)

print("Train size:", len(X_train))
print("Val size:", len(X_val))


Train size: 20000
Val size: 5000


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words="english", max_features=50000)

X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec = vectorizer.transform(X_val)

print("Vector shape (train):", X_train_vec.shape)
print("Vector shape (val):", X_val_vec.shape)


Vector shape (train): (20000, 50000)
Vector shape (val): (5000, 50000)


In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

pred = model.predict(X_val_vec)

acc = accuracy_score(y_val, pred)
f1 = f1_score(y_val, pred)

print("Validation Accuracy:", acc)
print("Validation F1:", f1)


Validation Accuracy: 0.879
Validation F1: 0.8809758016919143


In [27]:
test_texts = []
test_labels = []

pos_folder_test = imdb_folder / "test" / "pos"
for file in pos_folder_test.glob("*.txt"):
    test_texts.append(file.read_text(encoding="utf-8"))
    test_labels.append(1)

neg_folder_test = imdb_folder / "test" / "neg"
for file in neg_folder_test.glob("*.txt"):
    test_texts.append(file.read_text(encoding="utf-8"))
    test_labels.append(0)

print("Test reviews loaded:", len(test_texts))
print("Test positives:", test_labels.count(1))
print("Test negatives:", test_labels.count(0))

Test reviews loaded: 25000
Test positives: 12500
Test negatives: 12500


In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

vectorizer2 = TfidfVectorizer(stop_words="english", max_features=50000)

X_train_full = vectorizer2.fit_transform(texts)
X_test_full = vectorizer2.transform(test_texts)

model2 = LogisticRegression(max_iter=1000)
model2.fit(X_train_full, labels)

test_pred = model2.predict(X_test_full)

test_acc = accuracy_score(test_labels, test_pred)
test_f1 = f1_score(test_labels, test_pred)

print("TEST Accuracy:", test_acc)
print("TEST F1:", test_f1)


TEST Accuracy: 0.879
TEST F1: 0.8790822240876204


In [29]:
!pip -q install gradio


In [32]:
import numpy as np
import random

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score


# 1) Train baseline model (TF-IDF + Logistic Regression)
vectorizer = TfidfVectorizer(stop_words="english", max_features=50000)

X_train_vec = vectorizer.fit_transform(texts)

baseline_model = LogisticRegression(max_iter=1000)
baseline_model.fit(X_train_vec, labels)


# 2) Test on official test set
X_test_vec = vectorizer.transform(test_texts)

test_probs = baseline_model.predict_proba(X_test_vec)[:, 1]   # P(positive)
test_pred = (test_probs >= 0.5).astype(int)

test_acc = accuracy_score(test_labels, test_pred)
test_f1 = f1_score(test_labels, test_pred)

print("Baseline TEST accuracy:", test_acc)
print("Baseline TEST F1:", test_f1)


# 3) Predict one review (used by the UI)
def predict_one_review(text, threshold=0.8):
    x = vectorizer.transform([text])
    p_pos = float(baseline_model.predict_proba(x)[0, 1])

    pred = 1 if p_pos >= 0.5 else 0
    label = "POSITIVE" if pred == 1 else "NEGATIVE"

    confidence = p_pos if pred == 1 else (1 - p_pos)

    if confidence < threshold:
        label = "UNCERTAIN"

    return {
        "prediction": label,
        "p_positive": round(p_pos, 4),
        "confidence": round(confidence, 4),
        "threshold": threshold
    }


# 4) Correct ECE (calibration score)
def compute_ece_correct(probs_pos, true_labels, bins=10):
    probs_pos = np.array(probs_pos)
    true_labels = np.array(true_labels)

    pred = (probs_pos >= 0.5).astype(int)
    conf = np.where(pred == 1, probs_pos, 1 - probs_pos)
    correct = (pred == true_labels).astype(int)

    edges = np.linspace(0, 1, bins + 1)
    ece = 0.0

    for i in range(bins):
        lo, hi = edges[i], edges[i+1]
        mask = (conf >= lo) & (conf < hi) if i < bins-1 else (conf >= lo) & (conf <= hi)

        if mask.sum() == 0:
            continue

        avg_conf = conf[mask].mean()
        avg_acc = correct[mask].mean()
        ece += (mask.mean()) * abs(avg_acc - avg_conf)

    return float(ece)

baseline_ece = compute_ece_correct(test_probs, test_labels, bins=10)
print("Baseline ECE (correct):", baseline_ece)


# 5) Get some wrong examples (used by the UI)
def get_error_examples(k=10):
    wrong = [i for i in range(len(test_labels)) if test_pred[i] != test_labels[i]]
    if len(wrong) == 0:
        return []

    chosen = random.sample(wrong, min(k, len(wrong)))
    examples = []

    for idx in chosen:
        examples.append({
            "true_label": "POSITIVE" if test_labels[idx] == 1 else "NEGATIVE",
            "predicted_label": "POSITIVE" if test_pred[idx] == 1 else "NEGATIVE",
            "p_positive": round(float(test_probs[idx]), 4),
            "text": test_texts[idx][:600]
        })

    return examples


Baseline TEST accuracy: 0.879
Baseline TEST F1: 0.8790822240876204
Baseline ECE (correct): 0.09539715753787606


In [33]:
import gradio as gr

def ui_predict(text, threshold, model_name):
    if model_name == "Baseline (TF-IDF + Logistic Regression)":
        return predict_one_review(text, threshold=threshold)
    else:
        return {"prediction": "Coming soon", "p_positive": None, "confidence": None, "threshold": threshold}

def ui_show_calibration(model_name):
    if model_name == "Baseline (TF-IDF + Logistic Regression)":
        return {
            "model": model_name,
            "ece": round(float(baseline_ece), 4),
            "test_accuracy": round(float(test_acc), 4),
            "test_f1": round(float(test_f1), 4)
        }
    else:
        return {"model": model_name, "ece": None, "note": "Coming soon"}

def ui_show_errors(model_name, k):
    if model_name == "Baseline (TF-IDF + Logistic Regression)":
        return get_error_examples(k=int(k))
    else:
        return [{"note": "Coming soon"}]

with gr.Blocks(title="CS175 Sentiment + Confidence Tool") as demo:
    gr.Markdown("# CS175 Sentiment + Confidence Tool")
    gr.Markdown("Paste a review, get prediction + confidence, and explore calibration + common mistakes.")

    model_name = gr.Dropdown(
        ["Baseline (TF-IDF + Logistic Regression)", "DistilBERT (coming soon)"],
        value="Baseline (TF-IDF + Logistic Regression)",
        label="Model"
    )

    with gr.Tab("Predict"):
        text_in = gr.Textbox(lines=8, label="Review text")
        threshold = gr.Slider(0.50, 0.95, value=0.80, step=0.01, label="High-confidence threshold")
        btn = gr.Button("Predict")
        pred_out = gr.JSON(label="Output")
        btn.click(ui_predict, inputs=[text_in, threshold, model_name], outputs=pred_out)

    with gr.Tab("Calibration"):
        cal_btn = gr.Button("Show calibration stats")
        cal_out = gr.JSON(label="Calibration + Test Results")
        cal_btn.click(ui_show_calibration, inputs=model_name, outputs=cal_out)

    with gr.Tab("Error examples"):
        k = gr.Slider(5, 30, value=10, step=1, label="How many wrong examples to show")
        err_btn = gr.Button("Show misclassified examples")
        err_out = gr.JSON(label="Misclassified Examples")
        err_btn.click(ui_show_errors, inputs=[model_name, k], outputs=err_out)

demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://0181618aface991a6c.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


