In [None]:
# Fetch the movie_reviews corpus with NLTK's downloader, then import its reader
import random

import nltk
nltk.download('movie_reviews')
from nltk.corpus import movie_reviews

# Build one (token_list, category) pair per corpus file, category by category
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle with a dedicated, seeded generator so the document order is the same
# on every fresh run. The later train/test split picks rows by position, so an
# unseeded shuffle here would change which reviews land in each split (and the
# reported metrics) from run to run.
random.Random(42).shuffle(documents)


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Make sure the NLTK 'stopwords' resource is available before reading it
import nltk
nltk.download('stopwords')

# Keep the English stop words in a set for fast membership checks
english_stop_word_list = stopwords.words('english')
stop_words = set(english_stop_word_list)

def preprocess(text, stop_word_set=None):
    """Lowercase a token list and drop punctuation and stop-word tokens.

    Args:
        text: Iterable of string tokens (one already-tokenized review).
        stop_word_set: Optional collection of lowercase words to remove.
            When omitted, the module-level ``stop_words`` set is used.

    Returns:
        list[str]: The lowercased tokens that survive filtering, in their
        original order.
    """
    if stop_word_set is None:
        stop_word_set = stop_words
    punctuation_chars = set(string.punctuation)

    cleaned = []
    for word in text:
        word = word.lower()
        # Drop tokens made up entirely of punctuation characters. A plain
        # `word in string.punctuation` is a substring test against one long
        # string, so multi-character tokens such as "--" or "..." would
        # slip through it.
        if all(ch in punctuation_chars for ch in word):
            continue
        if word in stop_word_set:
            continue
        cleaned.append(word)
    return cleaned

# Run every review's tokens through preprocess(); the label is carried over unchanged
documents = [
    (preprocess(tokens), label)
    for tokens, label in documents
]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from sklearn.model_selection import train_test_split

# Unzip the (tokens, label) pairs into two parallel tuples
reviews, labels = zip(*documents)

# Hold out 20% of the reviews for testing; the fixed random_state pins the split
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    labels,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Join each review's tokens back into one string for the vectorizer.
# X_train / X_test are reassigned under the same names, so entries that are
# already strings are left as they are: re-running this cell would otherwise
# join a string character by character ("g o o d ...") and corrupt the input.
X_train = [review if isinstance(review, str) else ' '.join(review) for review in X_train]
X_test = [review if isinstance(review, str) else ' '.join(review) for review in X_test]

# Fit TF-IDF on the training text only, then reuse the fitted vectorizer on the
# test text so both matrices share the same vocabulary and weights.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library-default settings, fitted on the TF-IDF training matrix
model = LogisticRegression()
model.fit(X=X_train_tfidf, y=y_train)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predict labels for the held-out TF-IDF features
y_pred = model.predict(X_test_tfidf)

# Overall accuracy, plus precision / recall / F1 with 'pos' as the positive class
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, pos_label='pos')
    for scorer in (precision_score, recall_score, f1_score)
)

# One metric per line
print(
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1 Score: {f1}",
    sep="\n",
)


Accuracy: 0.83
Precision: 0.8324873096446701
Recall: 0.8241206030150754
F1 Score: 0.8282828282828283


Analyze Results

😭Limitations:

1. The model relies heavily on word occurrence and may miss contextual nuances.
2. Bag-of-words and TF-IDF ignore word order and meaning in context.
3. Model performance can be improved using word embeddings like Word2Vec or contextual models like BERT.

😃Extensions:

1. Implement advanced techniques like LSTM or BERT for sentiment analysis.
2. Use cross-validation for more robust model evaluation.
3. Handle imbalanced datasets by adjusting the decision threshold or using techniques like SMOTE.