In [2]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords

In [3]:
# Make sure the NLTK 'stopwords' corpus is available locally; the text
# preprocessing step reads the English stop-word list from it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Load the movie table. The path is relative, so the CSV must sit in the
# notebook's working directory. Later cells read the 'Description' and
# 'Movie Rating' columns from this frame.
df = pd.read_csv('Top_1000_IMDb_movies_New_version.csv')

In [12]:
# Build the labelled text dataset: one record per movie that has both a
# description and a numeric rating.
RATING_THRESHOLD = 6.5  # rating >= threshold -> sentiment 1, otherwise 0

# Fail fast with a clear message when the expected columns are absent,
# instead of silently skipping every row and producing an empty dataset.
missing_columns = [col for col in ('Description', 'Movie Rating') if col not in df.columns]
if missing_columns:
    raise KeyError(f"Input CSV is missing required column(s): {missing_columns}")

# Ratings that cannot be parsed as numbers become NaN so they can be filtered.
ratings = pd.to_numeric(df['Movie Rating'], errors='coerce')

# Drop rows with a missing description or rating. Without this filter a
# missing description would become the literal text "nan" and a NaN rating
# would be labelled 0, because NaN >= threshold evaluates to False.
usable_rows = df['Description'].notna() & ratings.notna()

# `data` stays a plain list of {'text', 'sentiment'} dicts because a later
# cell appends further records to it and rebuilds `df_processed` from it.
data = [
    {'text': str(description), 'sentiment': 1 if rating >= RATING_THRESHOLD else 0}
    for description, rating in zip(df.loc[usable_rows, 'Description'], ratings[usable_rows])
]

# Explicit columns keep 'text' and 'sentiment' present even when no row qualifies.
df_processed = pd.DataFrame(data, columns=['text', 'sentiment'])

In [13]:
# Report how many samples carry each sentiment label.
label_counts = df_processed['sentiment'].value_counts()
print("Class distribution:")
print(label_counts)

Class distribution:
sentiment
1    1000
Name: count, dtype: int64


In [14]:
# When every sample has the same label, fabricate a small negative class so
# that a binary classifier can be fitted at all.
# NOTE(review): each synthetic negative is an existing description with the
# words "bad terrible awful" prepended, so the two classes can be told apart
# by those three tokens alone. Scores on this data say little about real
# sentiment.
if df_processed['sentiment'].nunique(dropna=False) == 1:
    print("Warning: Only one class found. Adding synthetic negative samples...")
    # Use at most the first 50 descriptions as seeds for the negative samples.
    seed_texts = df_processed['text'].iloc[:50]
    data.extend(
        {'text': "bad terrible awful " + seed_text, 'sentiment': 0}
        for seed_text in seed_texts
    )

# Rebuild the frame from the (possibly extended) record list and show the mix.
df_processed = pd.DataFrame(data)
print("Updated class distribution:")
print(df_processed['sentiment'].value_counts())

Updated class distribution:
sentiment
1    1000
0      50
Name: count, dtype: int64


In [15]:
_ENGLISH_STOP_WORDS = None  # lazily filled cache of NLTK's English stop words


def preprocess_text(text, stop_words=None):
    """Normalise raw text into a space-separated string of content words.

    Steps: lowercase, drop every character that is not an ASCII letter or
    whitespace, collapse whitespace runs, then remove stop words.

    Args:
        text: The input string.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, NLTK's English stop-word list is used. It is loaded from
            the corpus once and cached, rather than rebuilt on every call.

    Returns:
        The remaining words joined by single spaces ('' if none remain).
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            # Reading the corpus is comparatively expensive and this function
            # is applied row by row, so the set is built a single time.
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS

    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)   # strip digits, punctuation, symbols
    text = re.sub(r'\s+', ' ', text).strip()  # collapse whitespace runs
    return ' '.join(word for word in text.split() if word not in stop_words)

# Store the cleaned version of every description in a new column; the raw
# 'text' column is kept as-is.
df_processed['processed_text'] = df_processed['text'].apply(preprocess_text)

In [16]:
# Turn the cleaned text into bag-of-words count features, with the
# vocabulary capped at 1000 terms via max_features.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so the held-out descriptions also shape the vocabulary.
vectorizer = CountVectorizer(max_features=1000)
X = vectorizer.fit_transform(df_processed['processed_text'])
y = df_processed['sentiment']

In [17]:
# Hold out 20% of the samples for evaluation, with a fixed seed so the split
# is reproducible.
# The labels are heavily imbalanced, so the split is stratified on `y` to give
# both partitions the same class proportions. Stratification needs at least
# two samples in every class; otherwise fall back to a plain random split.
class_counts = y.value_counts()
can_stratify = class_counts.min() >= 2
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y if can_stratify else None
)

In [18]:
# Fit a logistic-regression classifier on the training split.
# random_state fixes the seed; max_iter=1000 sets the solver's iteration cap.
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)

In [19]:
# Score the classifier on the held-out split.
# NOTE(review): accuracy alone is a weak signal here. The class counts shown
# earlier are 1000 positive vs 50 negative, and every negative sample carries
# the injected "bad terrible awful" prefix, so a perfect score mostly reflects
# how the data was constructed.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

Model Accuracy: 1.0000


In [20]:
def predict_sentiment(text):
    """Classify one piece of raw text as "Positive" or "Negative".

    The text is cleaned with `preprocess_text`, encoded with the fitted
    `vectorizer`, and scored by the trained `model`. A predicted label of 1
    maps to "Positive"; any other label maps to "Negative".
    """
    features = vectorizer.transform([preprocess_text(text)])
    label = model.predict(features)[0]
    if label == 1:
        return "Positive"
    return "Negative"


In [22]:
# Spot-check the classifier on a couple of hand-written sentences.
test_texts = [
    "A wonderful story about friendship and redemption",
    "Amazing cinematography and brilliant performances",
]

print("\nTest Predictions:")
for text in test_texts:
    sentiment = predict_sentiment(text)
    # One print per sample: text line, sentiment line, then a blank line.
    print(f"Text: {text}\nSentiment: {sentiment}\n")


Test Predictions:
Text: A wonderful story about friendship and redemption
Sentiment: Positive

Text: Amazing cinematography and brilliant performances
Sentiment: Positive

