In [1]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [4]:
import os
import pandas as pd

# Locate the review_polarity corpus. The base directory can be overridden with
# the REVIEW_POLARITY_DIR environment variable so the notebook is not tied to a
# single machine; when the variable is unset the original location is used.
DATA_DIR = os.environ.get(
    'REVIEW_POLARITY_DIR',
    r'C:\Users\EmmanuelWilliams\Downloads\review_polarity\txt_sentoken',
)
pos_reviews_path = os.path.join(DATA_DIR, 'pos')
neg_reviews_path = os.path.join(DATA_DIR, 'neg')


def load_reviews(directory):
    """Read every file in ``directory`` and return the file contents.

    Parameters
    ----------
    directory : str
        Folder holding one UTF-8 encoded text file per review.

    Returns
    -------
    list of str
        The contents of each file, ordered by file name.

    Raises
    ------
    FileNotFoundError
        If ``directory`` does not exist (raised by ``os.listdir``).
    UnicodeDecodeError
        If a file is not valid UTF-8.
    """
    reviews = []
    # os.listdir returns names in arbitrary order; sorting fixes the row order
    # of the DataFrame, which the seeded train/test split downstream depends on.
    for filename in sorted(os.listdir(directory)):
        path = os.path.join(directory, filename)
        if not os.path.isfile(path):
            continue  # skip sub-directories, which cannot be opened as text
        with open(path, 'r', encoding='utf-8') as file:
            reviews.append(file.read())
    return reviews


# Load both classes with the same helper
pos_reviews = load_reviews(pos_reviews_path)
neg_reviews = load_reviews(neg_reviews_path)

# Create a DataFrame: positive reviews first, then negative ones
df = pd.DataFrame({
    'review': pos_reviews + neg_reviews,
    'sentiment': [1] * len(pos_reviews) + [0] * len(neg_reviews)  # 1 for positive, 0 for negative
})

# Display the first few rows of the dataset
df.head()

Unnamed: 0,review,sentiment
0,films adapted from comic books have had plenty...,1
1,every now and then a movie comes along from a ...,1
2,you've got mail works alot better than it dese...,1
3,""" jaws "" is a rare film that grabs your atten...",1
4,moviemaking is a lot like being the general ma...,1


In [5]:
from sklearn.model_selection import train_test_split

# Separate the review text (model input) from the sentiment labels (target)
X, y = df['review'], df['sentiment']

# Hold out a quarter of the reviews for evaluation; the fixed seed makes the
# partition repeatable for a given row order
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Report how many reviews ended up in each partition
print(f'Training set size: {len(X_train)}')
print(f'Testing set size: {len(X_test)}')

Training set size: 1500
Testing set size: 500


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Convert text data into numerical data using CountVectorizer
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [7]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training token counts.
# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X_train_vec, y_train)
model

In [8]:
from sklearn.metrics import accuracy_score

# Score the held-out reviews with the trained classifier
y_pred = model.predict(X_test_vec)

# Compare the predicted labels with the true test labels and report as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 80.00%


In [9]:
# Classify a hand-written review with the trained model
your_review = ["The food was terrible and the service was even worse!"]

# Encode with the already-fitted vectorizer, then predict
your_review_vec = vectorizer.transform(your_review)
prediction = model.predict(your_review_vec)

# Label 1 is reported as positive; any other label is reported as negative
if prediction[0] == 1:
    print("Prediction:", "Positive")
else:
    print("Prediction:", "Negative")

Prediction: Negative


In [10]:
from sklearn.model_selection import GridSearchCV

# Candidate values for alpha, the smoothing parameter for Naive Bayes
param_grid = {'alpha': [0.5, 1.0, 1.5, 2.0]}

# Evaluate every candidate with 5-fold cross-validation on the training data
grid_search = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=param_grid,
    cv=5,
)
grid_search.fit(X_train_vec, y_train)

# Report the winning setting and its cross-validation score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

Best parameters: {'alpha': 1.5}
Best cross-validation score: 0.8166666666666668


In [11]:
# Refit Naive Bayes on the training data with the alpha chosen by the grid search
best_alpha = grid_search.best_params_['alpha']
best_model = MultinomialNB(alpha=best_alpha).fit(X_train_vec, y_train)

# Score the tuned model on the same held-out test set as before
y_pred_best = best_model.predict(X_test_vec)
accuracy_best = accuracy_score(y_true=y_test, y_pred=y_pred_best)
print(f'Accuracy with best hyperparameters: {accuracy_best * 100:.2f}%')

Accuracy with best hyperparameters: 80.60%
