In [None]:
# ✅ Step 1: Import Libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

print("✅ Libraries loaded!\n")

✅ Libraries loaded!



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from google.colab import files
uploaded = files.upload()  # Select 'IMDB Dataset.csv'

Saving IMDB Dataset.csv to IMDB Dataset.csv


In [None]:

df = pd.read_csv('IMDB Dataset.csv')

df = df.sample(5000, random_state=42).reset_index(drop=True)

print(f"✅ Dataset loaded with {len(df)} reviews")

✅ Dataset loaded with 5000 reviews


In [None]:


# Load real dataset
df = pd.read_csv('IMDB Dataset.csv')  # Kaggle downloaded data

# Sample 1000 rows
df = df.sample(1000, random_state=42).reset_index(drop=True)

print("✅ Real dataset loaded!")
print(df[['review', 'sentiment']].head())

✅ Real dataset loaded!
                                              review sentiment
0  I really liked this Summerslam due to the look...  positive
1  Not many television shows appeal to quite as m...  positive
2  The film quickly gets to a major chase scene w...  negative
3  Jane Austen would definitely approve of this o...  positive
4  Expectations were somewhat high for me when I ...  negative


In [None]:

custom_stop_words = set(stopwords.words('english')) - {'not', 'but', 'very', 'no', 'nor'}

def preprocess_text(text):
    text = text.lower()
    # removingspecial characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Extra spaces hatado
    text = ' '.join(text.split())
    # Custom stopwords used
    words = [w for w in text.split() if w not in custom_stop_words]
    return ' '.join(words)

df['cleaned_review'] = df['review'].apply(preprocess_text)

In [None]:
vectorizer = CountVectorizer(max_features=2000)
X = vectorizer.fit_transform(df['cleaned_review'])
y = (df['sentiment'] == 'positive').astype(int)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate
train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)
print(f" Training Accuracy: {train_acc:.2f}")
print(f" Testing Accuracy: {test_acc:.2f} (Expected: 80%+)")

 Training Accuracy: 1.00
 Testing Accuracy: 0.82 (Expected: 80%+)


In [None]:

vectorizer = CountVectorizer(max_features=3000, ngram_range=(1, 2), min_df=2)
X = vectorizer.fit_transform(df['cleaned_review'])
y = (df['sentiment'] == 'positive').astype(int)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Train_Test_Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

Training samples: 800
Testing samples: 200


In [None]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Accuracy check
train_acc = model.score(X_train, y_train)
test_acc = model.score(X_test, y_test)
print(f"🎯 Training Accuracy: {train_acc:.2f}")
print(f"✅ Testing Accuracy: {test_acc:.2f}")

🎯 Training Accuracy: 1.00
✅ Testing Accuracy: 0.83


In [None]:
def predict_sentiment(review):
    cleaned = preprocess_text(review)
    vector = vectorizer.transform([cleaned])
    pred = model.predict(vector)[0]
    proba = model.predict_proba(vector)[0]
    sentiment = "😊 Positive" if pred == 1 else "😠 Negative"
    confidence = max(proba) * 100
    return sentiment, confidence

In [None]:
test_reviews = [
    "This movie is amazing!",
    "Worst film ever.",
    "I loved it so much",
    "Terrible acting and boring plot",
    "Fantastic movie, loved it",
    "Very bad movie, waste of time",
    "Outstanding performance",
    "Dull and lifeless"
]

print("\n🔍 Final Predictions (Improved Model):")
for rev in test_reviews:
    sentiment, conf = predict_sentiment(rev)
    print(f"Review: '{rev}' → {sentiment} (Confidence: {conf:.1f}%)")


🔍 Final Predictions (Improved Model):
Review: 'This movie is amazing!' → 😊 Positive (Confidence: 83.5%)
Review: 'Worst film ever.' → 😠 Negative (Confidence: 61.8%)
Review: 'I loved it so much' → 😊 Positive (Confidence: 68.0%)
Review: 'Terrible acting and boring plot' → 😠 Negative (Confidence: 94.5%)
Review: 'Fantastic movie, loved it' → 😊 Positive (Confidence: 86.8%)
Review: 'Very bad movie, waste of time' → 😠 Negative (Confidence: 98.6%)
Review: 'Outstanding performance' → 😊 Positive (Confidence: 75.2%)
Review: 'Dull and lifeless' → 😠 Negative (Confidence: 66.8%)


In [None]:
# ✅ Step 6: User Input Loop
print("\n" + "="*60)
print("🎬 REAL-TIME MOVIE REVIEW SENTIMENT ANALYZER")
print("Enter your review (or type 'exit' to quit):")
print("="*60)

while True:
    user_review = input("\n📝 Your Review: ").strip()

    if user_review.lower() in ['quit', 'exit', 'stop']:
        print("👋 Thank you for using the analyzer! Stay awesome!")
        break

    if len(user_review) < 2:
        print("⚠️  Please enter a valid review.")
        continue

    sentiment, conf = predict_sentiment(user_review)
    print(f"🔍 Result: {sentiment}")
    print(f"📊 Confidence: {conf:.1f}%")


🎬 REAL-TIME MOVIE REVIEW SENTIMENT ANALYZER
Enter your review (or type 'exit' to quit):

📝 Your Review: Very bad movie
🔍 Result: 😠 Negative
📊 Confidence: 72.0%

📝 Your Review: nice movie yaar
🔍 Result: 😊 Positive
📊 Confidence: 69.8%

📝 Your Review: superb movie
🔍 Result: 😊 Positive
📊 Confidence: 77.8%

📝 Your Review: lol
🔍 Result: 😊 Positive
📊 Confidence: 55.7%

📝 Your Review: bad
🔍 Result: 😠 Negative
📊 Confidence: 76.1%

📝 Your Review: Interesting 
🔍 Result: 😠 Negative
📊 Confidence: 51.1%

📝 Your Review: nice
🔍 Result: 😊 Positive
📊 Confidence: 69.5%

📝 Your Review: positive
🔍 Result: 😠 Negative
📊 Confidence: 53.2%

📝 Your Review: bad
🔍 Result: 😠 Negative
📊 Confidence: 76.1%

📝 Your Review: goog
🔍 Result: 😊 Positive
📊 Confidence: 55.7%

📝 Your Review: good
🔍 Result: 😊 Positive
📊 Confidence: 59.4%

📝 Your Review: higly recommend
🔍 Result: 😊 Positive
📊 Confidence: 73.0%

📝 Your Review: whats the fuck
🔍 Result: 😠 Negative
📊 Confidence: 63.7%

📝 Your Review: what the fuck
🔍 Result: 😊 Posi