# Step 1: Setup - Install necessary libraries (if not installed)


In [5]:
import nltk

*Purpose: Imports the Natural Language Toolkit (NLTK) library for natural language processing tasks*


In [6]:
import pandas as pd

*Purpose: Imports the pandas library for data manipulation and analysis with DataFrames*

In [7]:
from sklearn.feature_extraction.text import CountVectorizer


*Purpose: Imports CountVectorizer to transform text data into numerical feature vectors*

In [8]:
from sklearn.model_selection import train_test_split


*Purpose: Imports function to divide data into separate training and testing sets*

In [9]:
from sklearn.naive_bayes import MultinomialNB


*Purpose: Imports the Multinomial Naive Bayes classifier suitable for text classification*

In [10]:
from sklearn.metrics import accuracy_score, classification_report


 *Purpose: Imports metrics to evaluate classification model performance*


In [11]:
from nltk.corpus import movie_reviews


 *Purpose: Imports the pre-labeled movie reviews dataset from NLTK*


 # Data Preparation


In [12]:
nltk.download("movie_reviews")

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

*Purpose: Downloads the movie reviews dataset to local NLTK data directory*

In [13]:
documents = [
    (" ".join(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

*Purpose: Creates a list of (review_text, sentiment) tuples by joining words from each review and pairing with its category*


In [14]:
df = pd.DataFrame(documents, columns=["review", "sentiment"])


 *Purpose: Converts the list of document tuples into a structured DataFrame with two columns*


# Step 3: Model Training


In [15]:
vectorizer = CountVectorizer(max_features=2000)


*Purpose: Initializes a vectorizer that will convert text to numerical features, limiting to the 2000 most frequent words*

In [16]:
X = vectorizer.fit_transform(df["review"])


*Purpose: Transforms the review texts into a sparse matrix of word count features*

In [17]:
y = df["sentiment"]


*Purpose: Extracts the sentiment labels as the target variable for classification*

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

*Purpose: Splits data into 80% training and 20% testing sets with fixed random seed for reproducibility*

In [19]:
model = MultinomialNB()


*Purpose: Creates an instance of the Multinomial Naive Bayes classifier*

In [20]:
model.fit(X_train, y_train)


*Purpose: Trains the classifier on the training data to learn patterns between words and sentiments*

In [21]:
y_pred = model.predict(X_test)


*Purpose: Uses the trained model to predict sentiment labels for the test set*

In [22]:
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")


Accuracy: 0.8


*Purpose: Calculates and displays the overall accuracy of the model*

In [23]:
print(f"Classification Report:\n{classification_report(y_test, y_pred)}")


Classification Report:
              precision    recall  f1-score   support

         neg       0.80      0.80      0.80       199
         pos       0.80      0.80      0.80       201

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.80      0.80      0.80       400



*Purpose: Generates and prints detailed metrics including precision, recall and F1-score*

#Step 4: Prediction

In [24]:
def predict_sentiment(text):
    text_vector = vectorizer.transform([text])
    prediction = model.predict(text_vector)
    return prediction[0]

*Purpose: Defines a function that takes new text input, vectorizes it, and returns the predicted sentiment*

In [25]:
a = input()


the best movie i have ever seen


*Purpose: Prompts user to enter text for sentiment analysis*

In [26]:
print(predict_sentiment(a))


pos


*Purpose: Displays the predicted sentiment for the user's input text*