In [None]:
import pandas as pd
import re, string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("IMDB Dataset.csv")  # expected columns: "review", "sentiment"

# Map textual sentiment labels to 1/0.
# The check is "not numeric" rather than `dtype == object` because pandas may
# infer a dedicated string dtype for text columns, in which case an
# object-dtype comparison is False and the mapping would be skipped silently.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    # Normalise case/whitespace so variants such as "Positive " still map.
    mapped_sentiment = (
        df['sentiment'].str.strip().str.lower().map({'positive': 1, 'negative': 0})
    )
    # Series.map() yields NaN for any value missing from the dict; fail loudly
    # here instead of letting NaN labels reach the train/test split.
    unmapped = mapped_sentiment.isna()
    if unmapped.any():
        bad_labels = df.loc[unmapped, 'sentiment'].unique().tolist()
        raise ValueError(f"Unexpected sentiment labels: {bad_labels!r}")
    df['sentiment'] = mapped_sentiment.astype(int)

# Clean the review text
# Patterns and the translation table are built once at import time instead of
# on every call (clean_text is applied to every row of the dataset).
_URL_RE = re.compile(r"http\S+")
_BR_TAG_RE = re.compile(r"<br\s*/?>")
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Normalise a raw review string for vectorization.

    Steps, in order: lowercase, replace HTML line-break tags with a space,
    drop anything starting with ``http`` up to the next whitespace, and
    delete all ASCII punctuation (``string.punctuation``).

    Args:
        text: The raw review. Non-string values (e.g. NaN from an empty CSV
            cell) are treated as an empty review.

    Returns:
        The cleaned, lowercased string. Whitespace is not collapsed.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # Without this, "<br />" loses its punctuation below and becomes the junk
    # token "br", fused onto the preceding word ("movie.<br />" -> "moviebr").
    text = _BR_TAG_RE.sub(" ", text)
    text = _URL_RE.sub("", text)
    return text.translate(_PUNCT_TABLE)

df['cleaned_review'] = df['review'].apply(clean_text)
y = df['sentiment']

# Split the cleaned text BEFORE vectorizing. Fitting TF-IDF on the full corpus
# lets the test reviews influence the vocabulary and IDF weights (data
# leakage), which makes the held-out score optimistic. The split parameters
# (test_size, random_state, stratify) are unchanged.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_review'],
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# TF-IDF vectorization: learn vocabulary/IDF from the training text only,
# then apply that fitted transform to the test text.
vectorizer = TfidfVectorizer(max_features=10_000)  # controls feature size
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# `X` is retained for any later code that expects the full feature matrix.
# It reuses the train-fitted vectorizer, so it introduces no leakage; remove
# this line if nothing else references `X`.
X = vectorizer.transform(df['cleaned_review'])

# Fit a logistic-regression classifier on the training features
# (max_iter is set to 1000 explicitly).
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out split: overall accuracy first, then the per-class report
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

report = classification_report(
    y_test,
    y_pred,
    target_names=["Negative", "Positive"],
)
print("\nClassification Report:\n", report)


Accuracy: 0.8938

Classification Report:
               precision    recall  f1-score   support

    Negative       0.90      0.89      0.89      7500
    Positive       0.89      0.90      0.89      7500

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000

