In [3]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

In [4]:
# Fetch every NLTK resource this notebook reads, not just the corpus:
# word_tokenize needs the Punkt tokenizer models and stopwords.words needs the
# stop-word lists. Without them a fresh environment raises LookupError later on.
# 'punkt_tab' is the resource name newer NLTK releases look up; on releases
# that do not know it, nltk.download reports the miss and returns False.
for resource in ('movie_reviews', 'punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [5]:
def load_imdb_dataset():
    """Read the NLTK ``movie_reviews`` corpus into labelled raw-text examples.

    Returns:
        list[tuple[str, str]]: ``(review_text, label)`` pairs. All
        ``'positive'`` reviews (corpus category ``'pos'``) come first,
        followed by all ``'negative'`` reviews (category ``'neg'``).
    """
    corpus = nltk.corpus.movie_reviews

    dataset = []
    # (corpus category, label used in the dataset), in output order.
    for category, label in (('pos', 'positive'), ('neg', 'negative')):
        for fileid in corpus.fileids(category):
            dataset.append((corpus.raw(fileid), label))
    return dataset

In [6]:
# Stop-word sets already loaded from NLTK, keyed by language name.
_STOP_WORDS_CACHE = {}


def _stop_words_for(language):
    """Return the stop-word set for ``language``, loading it from NLTK only once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
    """Tokenize a review and keep only the informative word tokens.

    The text is lower-cased and split with ``word_tokenize``; tokens that are
    not purely alphabetic (punctuation, numbers, mixed tokens) or that are
    English stop words are dropped.

    Args:
        text (str): Raw review text.

    Returns:
        list[str]: The remaining lower-cased tokens, in their original order
        (duplicates are kept).
    """
    # The stop-word list does not depend on the input, so it is loaded once
    # and reused instead of being rebuilt for every review.
    stop_words = _stop_words_for('english')
    return [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]

In [7]:
def extract_features(dataset):
    """Turn labelled reviews into parallel lists of token lists and labels.

    Args:
        dataset: Iterable of ``(review_text, sentiment)`` pairs.

    Returns:
        tuple[list, list]: ``(features, labels)`` where ``features[i]`` is the
        result of ``preprocess_text`` on the i-th review and ``labels[i]`` is
        that review's sentiment.
    """
    # Single pass over the input so one-shot iterables are handled as well.
    processed = [(preprocess_text(text), label) for text, label in dataset]

    token_lists = [tokens for tokens, _ in processed]
    sentiments = [label for _, label in processed]
    return token_lists, sentiments

In [8]:
def train_classifier(features, labels):
    """Fit a multinomial Naive Bayes model on binary word-presence vectors.

    Args:
        features: Sequence of token lists, one per document.
        labels: Sequence of class labels aligned with ``features``.

    Returns:
        tuple: ``(classifier, vocabulary, X_test, y_test)`` where
        ``classifier`` is the fitted ``MultinomialNB``, ``vocabulary`` is the
        list of words defining the feature columns, and ``X_test`` /
        ``y_test`` are the held-out 20% split (``random_state=42``).
    """
    # NOTE: the vocabulary is collected from every document, i.e. before the
    # train/test split, so it also contains words seen only in test documents.
    # Sorting gives the same column order on every run; plain set iteration
    # order for strings can change between interpreter sessions.
    vocabulary = sorted({token for tokens in features for token in tokens})
    column_of = {token: column for column, token in enumerate(vocabulary)}

    # Mark only the words each document contains instead of testing every
    # vocabulary word for membership in the document's token list.
    feature_vectors = np.zeros((len(features), len(vocabulary)), dtype=int)
    for row, tokens in enumerate(features):
        for token in tokens:
            feature_vectors[row, column_of[token]] = 1

    labels = np.array(labels)

    X_train, X_test, y_train, y_test = train_test_split(feature_vectors, labels, test_size=0.2, random_state=42)

    classifier = MultinomialNB()
    classifier.fit(X_train, y_train)

    return classifier, vocabulary, X_test, y_test

In [9]:
def predict_sentiment(classifier, vocabulary, new_reviews):
    """Predict a label for each new review with an already fitted classifier.

    Args:
        classifier: Fitted model exposing ``predict``.
        vocabulary: Words defining the feature columns, in the order used
            when the classifier was trained.
        new_reviews: Iterable of raw review strings. A single string is
            treated as one review rather than iterated character by character.

    Returns:
        The output of ``classifier.predict`` (one prediction per review), or
        an empty array when no reviews are given.
    """
    if isinstance(new_reviews, str):
        new_reviews = [new_reviews]

    # One set per review: membership tests against the vocabulary are O(1),
    # and the loop variable no longer shadows the collection being iterated.
    token_sets = [set(preprocess_text(review)) for review in new_reviews]
    if not token_sets:
        # Nothing to classify; an empty 1-D feature array would be rejected
        # by the classifier, so return an empty result instead.
        return np.array([])

    feature_vectors = np.array(
        [[1 if term in present else 0 for term in vocabulary] for present in token_sets]
    )

    return classifier.predict(feature_vectors)

In [10]:
# Build the full list of (review_text, label) pairs from the corpus.
dataset = load_imdb_dataset()

In [11]:
# Shuffle with a fixed seed so the document order, and therefore the
# train/test split and the reported accuracy, is the same on every
# Restart & Run All. A dedicated Random instance leaves the global
# random state untouched.
random.Random(42).shuffle(dataset)

In [12]:
# Tokenize every review; `features` holds the token lists and `labels` the
# matching sentiment strings, in the same order as `dataset`.
features, labels = extract_features(dataset)

In [13]:
# Fit the model; the held-out split is returned alongside it so the next cell
# can evaluate on data the classifier was not fitted on.
classifier, vocabulary, X_test, y_test = train_classifier(features, labels)

In [14]:
# Evaluate on the held-out split: fraction of test reviews labelled correctly.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.845


In [15]:
# Smoke-test the trained model on a hand-written review; the reviews are
# passed as a list, one prediction is returned per entry.
new_reviews = ["I loved the movie! It was amazing."]
predictions = predict_sentiment(classifier, vocabulary, new_reviews)
print("Predictions:", predictions)

Predictions: ['positive']
