In [3]:
import nltk
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Fetch the movie_reviews corpus through NLTK's downloader
nltk.download('movie_reviews')

# Look up the corpus file ids for each sentiment category ('pos' first, then 'neg')
positive_reviews, negative_reviews = (
    movie_reviews.fileids(tag) for tag in ('pos', 'neg')
)

# Read the raw review texts for a set of file ids and label them
def extract_features_and_labels(fileids, category):
    """Return the raw review texts for *fileids* plus a matching label list.

    Args:
        fileids: Sized collection of ``movie_reviews`` file ids to read.
        category: Label value repeated once per file id.

    Returns:
        A ``(reviews, labels)`` tuple: ``reviews`` holds whatever
        ``movie_reviews.raw`` returns for each id, in input order, and
        ``labels`` is ``category`` repeated ``len(fileids)`` times.
    """
    texts = []
    for fid in fileids:
        texts.append(movie_reviews.raw(fid))
    tags = [category for _ in range(len(fileids))]
    return texts, tags

# Swap the file-id lists for the raw review texts, tagging each review with
# its sentiment label
positive_reviews, positive_labels = extract_features_and_labels(
    positive_reviews, 'positive'
)
negative_reviews, negative_labels = extract_features_and_labels(
    negative_reviews, 'negative'
)

# Build one corpus: positive reviews first, then negative ones; labels stay
# index-aligned with reviews
reviews = [*positive_reviews, *negative_reviews]
labels = [*positive_labels, *negative_labels]

# Turn the texts into a token-count matrix
# NOTE(review): the vocabulary is learned from all reviews before the split
# below, so words that occur only in test reviews still shape the feature space.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(reviews)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
)

# Fit a support vector classifier with scikit-learn's default settings
model = SVC().fit(X_train, y_train)

# Predict the held-out reviews and report the fraction labelled correctly
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


Test Accuracy: 0.74
