#Vanilla Sentiment Analyzer
a. Use the movie_reviews corpus (from nltk) for data.

b. You may use Tfidf, Count, or any other vectorizer (Word2Vec, Transformers, etc.)
for creating sentence embeddings.

c. Train a classical Classifier (Naive Bayes or SVM from sklearn) for sentiment
classification using the above features.

In [6]:
import nltk
from nltk.corpus import movie_reviews
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

nltk.download('movie_reviews')

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

import random
random.shuffle(documents)
all_words = [word.lower() for word in movie_reviews.words()]


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [7]:
sample_words = all_words[:20]  # Print the first 20 words in the dataset
print("Sample words from the dataset:")
print(sample_words)


Sample words from the dataset:
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an']


In [8]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_features = tfidf_vectorizer.fit_transform([' '.join(doc) for doc, _ in documents])

X = tfidf_features.toarray()
y = [category for _, category in documents]

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train a Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Make predictions on the validation set
y_val_pred = clf.predict(X_val)

# Calculate validation accuracy
validation_accuracy = accuracy_score(y_val, y_val_pred)

# Train the final model on the entire training set (including validation data)
clf.fit(X_temp, y_temp)

# Make predictions on the test set
y_test_pred = clf.predict(X_test)

# Calculate test accuracy
test_accuracy = accuracy_score(y_test, y_test_pred)

# Generate the classification report as a DataFrame
report_dict = classification_report(y_test, y_test_pred, output_dict=True)
report_df = pd.DataFrame(report_dict).transpose()

# Print the validation and test accuracy, along with the classification report
print(f'Validation Accuracy: {validation_accuracy:.2f}')
print(f'Test Accuracy: {test_accuracy:.2f}')
report_df


Validation Accuracy: 0.83
Test Accuracy: 0.96


Unnamed: 0,precision,recall,f1-score,support
neg,0.932692,1.0,0.965174,97.0
pos,1.0,0.932039,0.964824,103.0
accuracy,0.965,0.965,0.965,0.965
macro avg,0.966346,0.966019,0.964999,200.0
weighted avg,0.967356,0.965,0.964994,200.0
