In [1]:
!pip install nltk scikit-learn pandas numpy

Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp312-cp312-win_amd64.whl.metadata (41 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Downloading regex-2024.11.6-cp312-cp312-win_amd64.whl (273 kB)
Installing collected packages: regex, nltk
Successfully installed nltk-3.9.1 regex-2024.11.6


In [2]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anees\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anees\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anees\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
from nltk.corpus import movie_reviews
import random

# Load IMDB reviews and shuffle them
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)


In [4]:
print("Total reviews:", len(documents))
print("Example review words:\n", documents[0][0][:20])  # First 20 words of a review
print("Sentiment label:", documents[0][1])              # 'pos' or 'neg'


Total reviews: 2000
Example review words:
 ['for', 'those', 'who', 'associate', 'italian', 'cinema', 'with', 'fellini', 'and', '"', 'high', 'art', ',', '"', 'the', 'son', "'", 's', 'room', 'is']
Sentiment label: pos


In [5]:
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_words(words):
    cleaned = []
    for word in words:
        word = word.lower()
        if word not in stop_words and word not in string.punctuation:
            cleaned.append(lemmatizer.lemmatize(word))
    return ' '.join(cleaned)

# Clean all reviews
texts = [clean_words(words) for words, label in documents]
labels = [1 if label == 'pos' else 0 for words, label in documents]


In [6]:
print("Sample cleaned review:\n", texts[0][:300])
print("Label (1=Positive, 0=Negative):", labels[0])


Sample cleaned review:
 associate italian cinema fellini high art son room inventive subtle alternative written directed starring nanni moretti take u slow complicated path bereavement slow best description film first take time establishing habit appears normal happy family father mother work still find time support son da
Label (1=Positive, 0=Negative): 1


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=3000)  # You can change 3000 if needed
X = vectorizer.fit_transform(texts)
y = labels


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)


In [9]:
from sklearn.metrics import classification_report, accuracy_score

# Predict sentiments for test data
y_pred = model.predict(X_test)

# Print evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=["Negative", "Positive"]))


Accuracy: 0.795

Classification Report:
               precision    recall  f1-score   support

    Negative       0.83      0.76      0.80       210
    Positive       0.76      0.83      0.79       190

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.79       400
weighted avg       0.80      0.80      0.80       400



In [10]:
def predict_sentiment(text):
    cleaned = clean_words(text.split())
    vectorized = vectorizer.transform([cleaned])
    prediction = model.predict(vectorized)[0]
    return "Positive 😊" if prediction == 1 else "Negative 😞"

# Example usage:
print(predict_sentiment("This movie was absolutely fantastic and touching!"))
print(predict_sentiment("I hated everything about this film."))


Positive 😊
Positive 😊


In [11]:
print(predict_sentiment("The plot was boring and the acting was bad."))
print(predict_sentiment("I really enjoyed the storyline and the characters."))


Negative 😞
Positive 😊
