In [9]:
import pandas as pd
import re
import pickle
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv("IMDB Dataset.csv")  

# Text Preprocessing Function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"\W", " ", text)  # Remove special characters
    text = re.sub(r"\d", "", text)   # Remove digits
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

df["review"] = df["review"].apply(preprocess_text)

# Convert Labels to Binary (1: Positive, 0: Negative)
df["sentiment"] = df["sentiment"].map({"positive": 1, "negative": 0})

# Train-Test Split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(df["review"], df["sentiment"], test_size=0.2, random_state=42)

# Feature Extraction using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train Logistic Regression Model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Save Model and Vectorizer
with open("sentiment_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

with open("tfidf_vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# Model Evaluation
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8951
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4961
           1       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [8]:
# Load the Saved Model and Vectorizer
with open("sentiment_model.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

with open("tfidf_vectorizer.pkl", "rb") as vectorizer_file:
    loaded_vectorizer = pickle.load(vectorizer_file)

# Function to Predict Sentiment for New Reviews
def predict_sentiment(new_reviews):
    processed_reviews = [preprocess_text(review) for review in new_reviews]
    transformed_reviews = loaded_vectorizer.transform(processed_reviews)
    predictions = loaded_model.predict(transformed_reviews)
    return ["Positive" if pred == 1 else "Negative" for pred in predictions]


# Take user input for a review
test_review = input("Enter your review: ")

# Convert input to a list (since the function expects multiple reviews)
test_reviews = [test_review]

predicted_sentiments = predict_sentiment(test_reviews)

# Print Results
for review, sentiment in zip(test_reviews, predicted_sentiments):
    print(f"Review: {review}\nPredicted Sentiment: {sentiment}\n")


Enter your review:  The movie is great.


Review: The movie is great.
Predicted Sentiment: Positive

