In [2]:
import nltk
from nltk.corpus import movie_reviews

# Download the data (you only need to do this once)
# A new window might pop up, just select 'all' and click 'download'
nltk.download('movie_reviews') 

# Load the reviews into a list
# Each item will be a (list_of_words, category)
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle them so we don't have all negatives then all positives
import random
random.shuffle(documents)

print(f"Loaded {len(documents)} reviews.")
print("--- Example of one review ---")
print(documents[0])

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Artharva\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Loaded 2000 reviews.
--- Example of one review ---
(['first', 'impressions', ':', 'critically', ',', 'a', 'close', '-', 'to', '-', 'awful', 'film', ',', 'but', 'money', '-', 'wise', ',', 'it', 'has', 'been', 'doing', '(', 'and', 'will', 'continue', 'to', 'do', ')', 'great', '.', 'a', 'sometimes', '-', 'funny', 'film', 'that', 'sags', 'and', 'lags', 'and', 'oftentimes', 'gets', 'boring', '.', 'an', 'orginal', 'plot', 'that', 'grows', 'old', 'real', 'fast', '.', 'one', 'of', 'the', 'only', '90', 'minute', 'films', 'that', 'i', "'", 've', 'gotten', 'bored', 'through', '.', 'men', 'in', 'black', 'has', 'defied', 'the', 'odds', '.', 'when', 'i', 'first', 'saw', 'that', 'the', 'flick', 'was', '89', 'minutes', 'long', ',', 'i', 'thought', 'maybe', 'that', 'this', 'was', 'a', 'poor', 'attempt', 'at', 'an', 'independence', 'day', 'type', 'film', 'that', 'just', 'ran', 'out', 'of', 'gas', '.', 'however', ',', 'i', 'now', 'realize', 'that', 'not', 'only', 'did', 'men', 'in', 'black', 'run', 'out'

In [3]:
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the necessary NLTK packages (this also might not pop up)
nltk.download('stopwords')
nltk.download('wordnet')

# Get our lists of stop words and our lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# This list will hold our cleaned documents
cleaned_documents = []

for words, category in documents:
    cleaned_words = []
    for word in words:
        # 1. Convert to lowercase
        lower_word = word.lower()
        
        # 2. Check if it's not a stop word and not punctuation
        if lower_word not in stop_words and lower_word not in string.punctuation:
            # 3. Lemmatize the word (e.g., 'running' -> 'run')
            cleaned_words.append(lemmatizer.lemmatize(lower_word))
    
    # Join the words back into a single string (sentence)
    cleaned_documents.append((" ".join(cleaned_words), category))

print("Example of cleaned review:")
print(cleaned_documents[0])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Artharva\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Artharva\AppData\Roaming\nltk_data...


Example of cleaned review:
('first impression critically close awful film money wise continue great sometimes funny film sag lag oftentimes get boring orginal plot grows old real fast one 90 minute film gotten bored men black defied odds first saw flick 89 minute long thought maybe poor attempt independence day type film ran gas however realize men black run gas film 90 minute manages show original idea summer audience embraced becomes old 25 minute movie tommy lee jones smith play two government agent responsible keeping order alien society ridiculous plot begin alien bug played weirdly vincent onofrio great full metal jacket land earth retrieve galaxy somewhere orion belt anyway basic plot revolves around jones smith stop bug getting galaxy higher power blow earth premise ridulous like film love original plot one original one director barry sonnenfeld something film ruined plot made film drag also put unncessary element found romance film whenever saw onofrio bug stomp eat people fil

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Separate our cleaned text from its labels (positive/negative)
X = [doc[0] for doc in cleaned_documents] # The text
y = [doc[1] for doc in cleaned_documents] # The labels (pos/neg)

# 1. Create the TF-IDF Vectorizer
# max_features=5000 means it will only use the 5000 most common words
vectorizer = TfidfVectorizer(max_features=5000)

# 2. Fit and Transform
# .fit() learns the vocabulary from all our text (X)
# .transform() converts all the text into a big matrix of numbers
X_features = vectorizer.fit_transform(X)

print("--- Feature Extraction Complete ---")
print(f"Our feature matrix shape: {X_features.shape}")
print("This means: (Number of Reviews, Number of Words)")

--- Feature Extraction Complete ---
Our feature matrix shape: (2000, 5000)
This means: (Number of Reviews, Number of Words)


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# Split data: 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)

# 1. Create the SVM model
# LinearSVC is a type of Support Vector Machine
model = LinearSVC()

# 2. Train the model on the training data
print("Training model...")
model.fit(X_train, y_train)
print("Training complete!")

# 3. Make predictions on the unseen test data
y_pred = model.predict(X_test)

# 4. Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy * 100:.2f}%")

# Get a detailed report (Precision, Recall, F1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Training model...
Training complete!

Accuracy: 87.50%

Classification Report:
              precision    recall  f1-score   support

         neg       0.89      0.85      0.87       200
         pos       0.86      0.90      0.88       200

    accuracy                           0.88       400
   macro avg       0.88      0.88      0.87       400
weighted avg       0.88      0.88      0.87       400



In [6]:
def predict_sentiment(review_text):
    # 1. Clean the new text using the same steps as before
    cleaned_words = []
    # We need to split the text into words to process it
    words = review_text.split() 
    
    for word in words:
        lower_word = word.lower()
        if lower_word not in stop_words and lower_word not in string.punctuation:
            cleaned_words.append(lemmatizer.lemmatize(lower_word))
    
    cleaned_review = " ".join(cleaned_words)
    
    # 2. Convert the cleaned text to a TF-IDF vector
    # IMPORTANT: We use .transform() ONLY. 
    # We do not use .fit_transform() here because we want to use 
    # the same 5000-word vocabulary the model was trained on.
    review_vector = vectorizer.transform([cleaned_review])
    
    # 3. Make the prediction
    prediction = model.predict(review_vector)
    
    return prediction[0]

In [7]:
# --- Let's try it! ---

my_review_1 = "This movie was absolutely brilliant! Loved the acting."
print(f"Review: '{my_review_1}'")
print(f"Prediction: {predict_sentiment(my_review_1)}")

print("-" * 20)

my_review_2 = "It was a total waste of time. The plot was boring and predictable."
print(f"Review: '{my_review_2}'")
print(f"Prediction: {predict_sentiment(my_review_2)}")

print("-" * 20)

# --- Try your own! ---
my_review_3 = "The actors did a great job, but the story was just okay."
print(f"Review: '{my_review_3}'")
print(f"Prediction: {predict_sentiment(my_review_3)}")

Review: 'This movie was absolutely brilliant! Loved the acting.'
Prediction: pos
--------------------
Review: 'It was a total waste of time. The plot was boring and predictable.'
Prediction: neg
--------------------
Review: 'The actors did a great job, but the story was just okay.'
Prediction: pos
