In [2]:
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Fetch the NLTK 'movie_reviews' corpus package (a no-op download if it is
# already present in the local nltk_data directory)
nltk.download('movie_reviews')

# Corpus reader for the package fetched above
from nltk.corpus import movie_reviews

# One file ID per review; the text before the first '/' in an ID is used
# below as the review's sentiment category
file_ids = movie_reviews.fileids()

# Raw review text, in the same order as file_ids
texts = [movie_reviews.raw(fid) for fid in file_ids]

# Binary targets aligned with texts: 1 for IDs under 'pos', 0 for all others
labels = [int(fid.partition('/')[0] == 'pos') for fid in file_ids]

# Hold out 20% of the reviews for evaluation; the fixed random_state makes
# the shuffle (and therefore the reported scores) reproducible
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Bag-of-words counts with English stop words removed. The vocabulary is
# learned from the training reviews only and then applied unchanged to the
# held-out reviews, so nothing from the test set influences the features.
vectorizer = CountVectorizer(stop_words='english')
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

# Fit a Multinomial Naive Bayes model on the training counts
# (fit() returns the fitted estimator itself, so it can be bound directly)
clf = MultinomialNB().fit(X_train_counts, y_train)

# Predicted 0/1 labels for the held-out reviews
y_pred = clf.predict(X_test_counts)

# Score the predictions against the true labels. target_names are listed in
# ascending label order, so 'Negative' describes label 0 and 'Positive' label 1.
report = classification_report(
    y_test,
    y_pred,
    target_names=['Negative', 'Positive'],
)
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", report)


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


Accuracy: 0.81
Classification Report:
               precision    recall  f1-score   support

    Negative       0.78      0.85      0.81       199
    Positive       0.84      0.76      0.80       201

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.80       400
weighted avg       0.81      0.81      0.80       400

