In [1]:
import nltk
nltk.download('movie_reviews')

from nltk.corpus import movie_reviews
import pandas as pd

docs = [(list(movie_reviews.words(fileid)), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)]

df = pd.DataFrame(docs, columns=["Review", "Sentiment"])
df["Review"] = df["Review"].apply(lambda x: " ".join(x))  # Convert list to text
print(df.head())

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\angel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


                                              Review Sentiment
0  plot : two teen couples go to a church party ,...       neg
1  the happy bastard ' s quick movie review damn ...       neg
2  it is movies like these that make a jaded movi...       neg
3  " quest for camelot " is warner bros . ' first...       neg
4  synopsis : a mentally unstable man undergoing ...       neg


Data Preprocessing

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    words = word_tokenize(text.lower())  # Lowercase & tokenize
    words = [word for word in words if word.isalpha()]  # Remove punctuation
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatization
    return " ".join(words)

df["Cleaned_Review"] = df["Review"].apply(preprocess)
print(df.head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\angel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\angel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\angel\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                              Review Sentiment  \
0  plot : two teen couples go to a church party ,...       neg   
1  the happy bastard ' s quick movie review damn ...       neg   
2  it is movies like these that make a jaded movi...       neg   
3  " quest for camelot " is warner bros . ' first...       neg   
4  synopsis : a mentally unstable man undergoing ...       neg   

                                      Cleaned_Review  
0  plot two teen couple go church party drink dri...  
1  happy bastard quick movie review damn bug got ...  
2  movie like make jaded movie viewer thankful in...  
3  quest camelot warner bros first feature length...  
4  synopsis mentally unstable man undergoing psyc...  


Train a Sentiment Analysis Model

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(df["Cleaned_Review"], df["Sentiment"], test_size=0.2, random_state=42)

model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8125


Hugging Face’s BERT for better accuracy

In [4]:
from transformers import pipeline

sentiment_pipeline = pipeline("sentiment-analysis")
print(sentiment_pipeline("I love this product! It's amazing."))

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9998866319656372}]


Exp with Bert with nltk movie rev data

In [5]:
print(sentiment_pipeline("Capernaum 2018 is a heartbreaking yet powerful tale of survival, following a young boy who sues his parents for bringing him into a world of suffering. Nadine Labaki masterfully captures the raw struggles of poverty, neglect, and resilience with stunning realism. Zain Al Rafeeas performance is deeply moving, making this a mustwatch for those who appreciate socially impactful cinema."))

[{'label': 'POSITIVE', 'score': 0.9998142123222351}]
