In [1]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Resources shared by every preprocess() call. They are created lazily on the first call
# so that merely defining the function does not touch the NLTK data files.
_preprocess_resources = {}


def _get_preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _preprocess_resources:
        # Build both objects before storing either, so a failure leaves the cache empty
        # and the next call simply retries.
        stop_words = set(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _preprocess_resources['stop_words'] = stop_words
        _preprocess_resources['lemmatizer'] = lemmatizer
    return _preprocess_resources['stop_words'], _preprocess_resources['lemmatizer']


def preprocess(text):
    """Normalise a piece of text into a space-separated string of lemmas.

    The text is lower-cased and tokenised with ``nltk.word_tokenize``; tokens that are
    not purely alphabetic or that appear in NLTK's English stop-word list are dropped,
    and the remaining tokens are lemmatised with ``WordNetLemmatizer``.

    Args:
        text: The raw text to clean (a ``str``).

    Returns:
        The surviving lemmas joined by single spaces; an empty string if no token
        survives the filtering.
    """
    # Reuse the stop-word set and lemmatizer instead of rebuilding them on every call;
    # this function is invoked once per document in the corpus.
    stop_words, lemmatizer = _get_preprocess_resources()
    tokens = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]
    return ' '.join(tokens)

# Pair each review's token list with its category label, walking the corpus one
# category at a time.
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

# Glue every token list back into a single string, clean it with preprocess(), and keep
# the label next to the cleaned text.
preprocessed_doc = [
    (preprocess(' '.join(words)), category_label)
    for words, category_label in documents
]

In [17]:
# Convert the cleaned reviews into a bag-of-words count matrix plus a parallel label list.
# NOTE(review): the vocabulary is learned from every review, including those that land in
# the test split below; fit the vectorizer on the training texts only if a stricter
# evaluation is wanted.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform([text for text, _ in preprocessed_doc])
y = [label for _, label in preprocessed_doc]

# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train on the training split only. Fitting on the full (X, y) beforehand is unnecessary:
# fit() re-estimates the model from the data it is given, so such a fit would be replaced
# by this one while suggesting that the model had seen the test reviews.
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Score the held-out reviews.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy,"\n")

# Read a free-text review from the user and split it into sentences; each sentence is
# handled as a separate review further down.
new_reviews = input("Your review: ")
reviews = sent_tokenize(new_reviews)

Accuracy: 0.8075 

Your review: The movie was amazing!


In [18]:
# Clean each user-supplied sentence with the same preprocess() used for the corpus text.
preprocessed_new_reviews = list(map(preprocess, reviews))

In [19]:
if preprocessed_new_reviews:
    # Vectorise with the vocabulary already learned by `vectorizer`, then classify.
    X_new = vectorizer.transform(preprocessed_new_reviews)
    y_new_pred = classifier.predict(X_new)
else:
    # Blank input yields no sentences, and predict() rejects an input with zero samples,
    # so skip classification instead of failing with a ValueError.
    y_new_pred = []
    print("No review text entered; nothing to classify.")

# Show each original sentence next to its predicted class.
for review, sentiment in zip(reviews, y_new_pred):
    print("Review: ", review)
    print("Class: ", sentiment)

Review:  The movie was amazing!
Class:  pos
