In [None]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer




In [None]:
# Tokenizer models used by word_tokenize. Newer NLTK releases load the
# 'punkt_tab' resource instead of the pickled 'punkt' models, so both are
# fetched to keep a fresh-kernel "Run All" working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Fetch the labelled review corpus and the stop-word lists used below.
for resource_name in ('movie_reviews', 'stopwords'):
    nltk.download(resource_name)

def load_imdb_dataset():
    """Load the NLTK ``movie_reviews`` corpus as labelled raw texts.

    Returns:
        list[tuple[str, str]]: ``(review_text, label)`` pairs. Every review
        in the ``'pos'`` category comes first with label ``'positive'``,
        followed by every review in the ``'neg'`` category with label
        ``'negative'``.
    """
    corpus = nltk.corpus.movie_reviews
    labelled_reviews = []
    # Order matters: positives first, then negatives.
    for category, label in (('pos', 'positive'), ('neg', 'negative')):
        for fileid in corpus.fileids(category):
            labelled_reviews.append((corpus.raw(fileid), label))
    return labelled_reviews

# Stop-word sets keyed by language, filled lazily on first use.
_STOP_WORDS_CACHE = {}


def _stop_words(language='english'):
    """Return the NLTK stop-word set for ``language``, loading it only once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
    """Tokenize ``text`` into lowercase alphabetic, non-stop-word tokens.

    The text is lowercased and split with ``word_tokenize``; tokens that are
    not purely alphabetic (numbers, punctuation) or that appear in the
    English stop-word list are dropped.

    Args:
        text (str): Raw document text.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    # The stop-word list is cached because this function runs once per
    # document when used as a vectorizer tokenizer; rebuilding the set on
    # every call is wasted work.
    stop_words = _stop_words('english')
    tokens = word_tokenize(text.lower())
    return [token for token in tokens if token.isalpha() and token not in stop_words]

def train_decision_tree(X_train, y_train, random_state=42):
    """Fit a bag-of-words vectorizer and a decision tree on the training texts.

    Args:
        X_train: Iterable of raw document strings.
        y_train: Labels aligned with ``X_train``.
        random_state: Seed forwarded to ``DecisionTreeClassifier`` so that
            repeated runs build the same tree. Pass ``None`` to get the
            estimator's unseeded behavior.

    Returns:
        tuple: ``(clf, vectorizer)`` — the fitted classifier and the fitted
        ``CountVectorizer``, which must be reused to transform any text
        passed to ``clf`` later.
    """
    # token_pattern=None: with a custom tokenizer the default regex pattern is
    # unused, and leaving it set makes scikit-learn emit a warning on fit.
    vectorizer = CountVectorizer(tokenizer=preprocess_text, token_pattern=None)
    X_train_vectorized = vectorizer.fit_transform(X_train)
    clf = DecisionTreeClassifier(random_state=random_state)
    clf.fit(X_train_vectorized, y_train)
    return clf, vectorizer

def test_decision_tree(clf, vectorizer, X_test, y_test):
    X_test_vectorized = vectorizer.transform(X_test)
    y_pred = clf.predict(X_test_vectorized)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy

# One seed for both the shuffle and the split, so the train/test partition
# is the same on every Restart & Run All.
SEED = 42

dataset = load_imdb_dataset()
# Use a dedicated seeded RNG: the global `random` state is unseeded, which
# would otherwise reorder the data differently on every run and defeat the
# fixed random_state passed to train_test_split.
random.Random(SEED).shuffle(dataset)
reviews, labels = zip(*dataset)
# stratify keeps the positive/negative ratio identical in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    reviews, labels, test_size=0.2, random_state=SEED, stratify=labels
)

clf, vectorizer = train_decision_tree(X_train, y_train)
accuracy = test_decision_tree(clf, vectorizer, X_test, y_test)
print(f"Decision Tree Classifier Accuracy: {accuracy:.2f}")

# Sanity check on a single hand-written review; it must go through the same
# fitted vectorizer that produced the training features.
test_string = "I absolutely loved this movie! The acting was brilliant, and the story kept me on the edge of my seat."
test_string_vectorized = vectorizer.transform([test_string])
prediction = clf.predict(test_string_vectorized)
print(f"Test String Prediction: {prediction[0]}")

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Decision Tree Classifier Accuracy: 0.61
Test String Prediction: positive
