In [None]:
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

# Fetch every NLTK resource this notebook uses so it runs on a fresh kernel.
# Each call is a no-op (apart from a log line) when the package is already present.
nltk.download('movie_reviews')  # labelled review corpus read by load_imdb_dataset
nltk.download('punkt')          # tokenizer models used by word_tokenize
nltk.download('punkt_tab')      # tokenizer tables that newer NLTK releases look up instead of 'punkt'
nltk.download('stopwords')      # English stop-word list used by preprocess_text

def load_imdb_dataset():
    """Load the NLTK ``movie_reviews`` corpus as labelled raw texts.

    Returns:
        list[tuple[str, str]]: ``(raw_review_text, label)`` pairs. Every file
        in the corpus's ``'pos'`` category comes first, labelled
        ``'positive'``; the ``'neg'`` files follow, labelled ``'negative'``.
    """
    corpus = nltk.corpus.movie_reviews
    labelled_reviews = []
    # Walk the categories in a fixed order so positives precede negatives.
    for category, label in (('pos', 'positive'), ('neg', 'negative')):
        for fileid in corpus.fileids(category):
            labelled_reviews.append((corpus.raw(fileid), label))
    return labelled_reviews

# Stop-word sets keyed by language name; filled lazily by _stop_words_for.
_STOP_WORDS_CACHE = {}


def _stop_words_for(language):
    """Return the NLTK stop-word set for ``language``, loading it only once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
    """Tokenize ``text`` into lowercase alphabetic tokens without stop words.

    Args:
        text (str): Raw text to tokenize.

    Returns:
        list[str]: Tokens produced by ``word_tokenize`` on the lowercased
        text, keeping only those that are purely alphabetic
        (``str.isalpha``) and not in NLTK's English stop-word list.
    """
    # The stop-word list is identical on every call, so it is cached rather
    # than re-read and rebuilt into a set for each document.
    stop_words = _stop_words_for('english')
    return [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words
    ]

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

def create_feature_vectors(dataset, max_features=3000):
    """Turn ``(text, label)`` pairs into bag-of-words count features.

    Args:
        dataset: Iterable of ``(text, label)`` pairs.
        max_features (int): Upper bound on the vocabulary size passed to
            ``CountVectorizer``. Defaults to 3000.

    Returns:
        tuple: ``(feature_vectors, labels, vectorizer)`` where
        ``feature_vectors`` is the result of ``vectorizer.fit_transform`` on
        the texts, ``labels`` is a tuple of the labels in input order, and
        ``vectorizer`` is the fitted ``CountVectorizer``.

    Raises:
        ValueError: If ``dataset`` is empty.
    """
    dataset = list(dataset)
    if not dataset:
        # zip(*[]) yields nothing, which would otherwise surface as an opaque
        # "not enough values to unpack" ValueError.
        raise ValueError("dataset must contain at least one (text, label) pair")
    texts, labels = zip(*dataset)
    # NOTE(review): the vocabulary is fit on every text in `dataset`; if a
    # test split is carved out afterwards, vocabulary selection has already
    # seen the test documents.
    vectorizer = CountVectorizer(
        tokenizer=preprocess_text,
        # A custom tokenizer makes token_pattern unused; passing None says so
        # explicitly and avoids scikit-learn's "will not be used" warning.
        token_pattern=None,
        max_features=max_features,
    )
    feature_vectors = vectorizer.fit_transform(texts)
    return feature_vectors, labels, vectorizer

def train_and_evaluate_model(X, y, vectorizer=None):
    """Train a random-forest classifier and score it on a held-out split.

    Args:
        X: Feature matrix with one row per sample.
        y: Labels aligned with the rows of ``X``.
        vectorizer: Optional. The fitted vectorizer that produced ``X``; it is
            not used for training and is returned unchanged so callers can
            keep it next to the model. When omitted, the function falls back
            to a module-level variable named ``vectorizer`` if one exists
            (the previous implementation read that global implicitly),
            otherwise ``None`` is returned in its place.

    Returns:
        tuple: ``(clf, vectorizer, accuracy)`` where ``clf`` is the fitted
        ``RandomForestClassifier`` and ``accuracy`` is the accuracy on the
        20% test split.
    """
    if vectorizer is None:
        # Backward compatibility for existing two-argument calls: look the
        # name up explicitly instead of depending on an undeclared global,
        # which raised NameError when no such variable had been defined.
        vectorizer = globals().get('vectorizer')

    # Fixed seeds keep both the split and the forest reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return clf, vectorizer, accuracy

def predict_sentiment(text, clf, vectorizer):
    """Predict the label of a single piece of text.

    Args:
        text: The text to classify.
        clf: Object exposing ``predict(features)``.
        vectorizer: Object exposing ``transform(list_of_texts)``.

    Returns:
        The first (and only) element of the prediction made for ``text``.
    """
    # The vectorizer works on a batch, so wrap the text in a one-item list.
    single_doc = [text]
    predictions = clf.predict(vectorizer.transform(single_doc))
    return predictions[0]

# Build the labelled corpus and convert it into bag-of-words count features.
dataset = load_imdb_dataset()
X, y, vectorizer = create_feature_vectors(dataset)
# Train on an 80/20 split and report accuracy on the held-out 20%. The
# vectorizer that comes back is the same fitted object created on the line above.
clf, vectorizer, accuracy = train_and_evaluate_model(X, y)
print(f"Accuracy: {accuracy:.2f}")

# Smoke test: classify one short, hand-written review with the trained model.
test_string = "Bad movie"
predicted_sentiment = predict_sentiment(test_string, clf, vectorizer)
print(f"Sentiment: {predicted_sentiment}")




Accuracy: 0.81
Sentiment: negative
