In [None]:
# 1. Import the tools
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 2. Load Data (Directly from a raw GitHub URL)
print("⏳ Downloading dataset... (this may take a moment)")
# We use the 'raw' link so pandas can read it like a normal CSV file
url = "https://raw.githubusercontent.com/bestvater/misc/master/IMDB%20Dataset.csv"
data = pd.read_csv(url)

# 3. Clean Data
# Drop every row that has a missing value in any column.
data = data.dropna()
# The column names in this specific file are 'review' and 'sentiment'.
# Rename them BY NAME rather than by position: a positional assignment would
# silently swap text and labels if the file's column order ever changed.
data = data.rename(columns={'review': 'Review', 'sentiment': 'Sentiment'})
missing_columns = {'Review', 'Sentiment'} - set(data.columns)
if missing_columns:
    # Fail loudly with a readable message instead of training on the wrong data.
    raise ValueError(
        f"Dataset is missing expected column(s) {sorted(missing_columns)}; "
        f"found columns: {list(data.columns)}"
    )

# 4. Split Data
X = data['Review']
y = data['Sentiment']
# 80% for training, 20% for testing; the fixed random_state makes the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Vectorize (Translate words to numbers)
# Keep only the 5000 most frequent terms (after removing English stop words)
# so the feature matrix stays small and training stays fast.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
# Fit the vocabulary on the training split only, then reuse it for the test
# split so no information from the test reviews leaks into the features.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 6. Train the Brain (Logistic Regression)
print("🧠 Training the model...")
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# 7. Check the Grade
predictions = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, predictions)

print(f"🎉 Success! Model Accuracy: {accuracy * 100:.2f}%")

# --- TEST IT YOURSELF ---
print("\n--- 🍿 MOVIE REVIEW JUDGE ---")
# The vectorizer expects an iterable of documents, hence the one-element list.
my_review = ["The movie was too long and the plot made no sense."]
my_vec = vectorizer.transform(my_review)
result = model.predict(my_vec)

print(f"Review: '{my_review[0]}'")
# The predicted label is a string from the 'Sentiment' column; upper-case it
# for display.
print(f"Verdict: {result[0].upper()}")

⏳ Downloading dataset... (this may take a moment)
🧠 Training the model...
🎉 Success! Model Accuracy: 88.89%

--- 🍿 MOVIE REVIEW JUDGE ---
Review: 'The movie was too long and the plot made no sense.'
Verdict: NEGATIVE
