In [21]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

In [23]:
# Make sure nltk resources are downloaded
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asad\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Asad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Asad\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [25]:
# Load the IMDB dataset (you can replace with your own path)
df = pd.read_csv('IMDB_Dataset_Sample.csv')  # The file should have 'review' and 'sentiment' columns

In [27]:
# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'<.*?>', '', text)        # Remove HTML tags
    text = re.sub(r'[^a-z\s]', '', text)     # Remove special characters and digits
    tokens = text.split()
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [29]:
# Apply preprocessing
df['cleaned_review'] = df['review'].apply(preprocess_text)

In [30]:
# Encode sentiment labels
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [31]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_review']).toarray()
y = df['sentiment'].values

In [32]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
# Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model
report = classification_report(y_test, y_pred, target_names=['Negative', 'Positive'])
print("Classification Report:\n")
print(report)

Classification Report:

              precision    recall  f1-score   support

    Negative       0.85      0.80      0.83       203
    Positive       0.81      0.85      0.83       197

    accuracy                           0.83       400
   macro avg       0.83      0.83      0.83       400
weighted avg       0.83      0.83      0.83       400



In [34]:
# Train using Naive Bayes
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluate model performance
y_pred = model.predict(X_test)
print("Model Evaluation:\n")
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

Model Evaluation:

              precision    recall  f1-score   support

    Negative       0.82      0.84      0.83       203
    Positive       0.83      0.81      0.82       197

    accuracy                           0.83       400
   macro avg       0.83      0.83      0.83       400
weighted avg       0.83      0.83      0.83       400



In [35]:
# User input for prediction
user_input = input("\nEnter a movie review: ")
processed_input = preprocess_text(user_input)
input_vector = vectorizer.transform([processed_input]).toarray()
prediction = model.predict(input_vector)[0]

if prediction == 1:
    print("\nPredicted Sentiment: Positive 😊")
else:
    print("\nPredicted Sentiment: Negative 😞")


Enter a movie review:  The movie was awful i dont like it.



Predicted Sentiment: Negative 😞
