In [1]:
import glob

In [2]:
reviews_dir = './data_ml_2020/movies_reviews'

In [10]:
negative_reviews = []
positive_reviews = []

for file in glob.glob(reviews_dir + "/neg/*.txt"):
    with open(file, "r") as f:
        negative_reviews.append(f.read().replace("\n", ""))
        
for file in glob.glob(reviews_dir + "/pos/*.txt"):
    with open(file, "r") as f:
        positive_reviews.append(f.read().replace("\n", ""))

Use Glob to get the file paths and add them to an array.

Then match them with 0 or 1. 0 if they are negative, 1 if they are positive.

In [69]:
import numpy as np

X = np.concatenate((negative_reviews, positive_reviews))
Y = np.concatenate((np.zeros((len(negative_reviews))), np.ones((len(positive_reviews)))))

In [108]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y,test_size=0.4,random_state=20)

CountVectorizer transforms the dataset into a vector matrix (counts each word). Then, the TfidfTransformer reduces the impact of common words such as "The". This is the transformation step where we get data ready for modeling.

In [109]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)

tf_trans = TfidfTransformer()
X_train_tf = tf_trans.fit_transform(X_train_cv)

The docs suggest using either MultinomialNB or ComplementNB. I decided to use both to test which one is better.
As you can see from the results below, CNB has a slight edge over the MNB, so if I was to only use one for a dataset that required more training I'd definitely use CNB.

In [110]:
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.metrics import accuracy_score

mnb_model = MultinomialNB().fit(X_train_tf, Y_train)
cnb_model = ComplementNB().fit(X_train_tf, Y_train)

In [73]:
print(f"Score for MNB: {mnb_model.score(X_train_tf, Y_train)}")
print(f"Score for CNB: {cnb_model.score(X_train_tf, Y_train)}")

print(f"Accuracy Score for MNB: {accuracy_score(Y_train, mnb_model.predict(X_train_tf))}")
print(f"Accuracy Score for CNB: {accuracy_score(Y_train, cnb_model.predict(X_train_tf))}")

Score for MNB: 0.9638403990024937
Score for CNB: 0.970074812967581
Accuracy Score for MNB: 0.9638403990024937
Accuracy Score for CNB: 0.970074812967581


In [111]:
X_test_cv = cv.transform(X_test)
X_test_tf = tf_trans.transform(X_test_cv)

print(f"Score for MNB: {mnb_model.score(X_test_tf, Y_test)}")
print(f"Score for CNB: {cnb_model.score(X_test_tf, Y_test)}")

print(f"Accuracy for MNB: {accuracy_score(Y_test, mnb_model.predict(X_test_tf))}")
print(f"Accuracy for CNB: {accuracy_score(Y_test, cnb_model.predict(X_test_tf))}")

Score for MNB: 0.6807980049875312
Score for CNB: 0.729426433915212
Accuracy for MNB: 0.6807980049875312
Accuracy for CNB: 0.729426433915212


In [112]:
import os

predict_reviews = []
predict_data = []
for file in glob.glob(reviews_dir + "/review/*.txt"):
    with open(file, "r", encoding="utf8") as f:
        predict_reviews.append(f.read().replace("\n", ""))
    filename = os.path.splitext(os.path.basename(file))[0]
    title = " ".join([word.capitalize() for word in filename.split(" ")[0].replace("_", " ").split(" ")])
    rating = filename.split(" ")[1].replace("p", ".").replace("o", "/")
    predict_data.append({"title": title, "rating": rating})

In [113]:
X_predict_cv = cv.transform(predict_reviews)
X_predict_tf = tf_trans.transform(X_predict_cv)

prediction = mnb_model.predict(X_predict_tf)

def predict(i: int) -> str:
    if i == 0:
        return "Negative"
    if i == 1:
        return "Positive"

i = 0
for p in prediction:
    print(f"{predict_data[i]['title']} | {predict_data[i]['rating']} | {predict(p)}")
    i += 1

Dick Johnson Is Dead | (9/10) | Positive
Face To Face | (3/5) | Positive
Godmothered | (1/5) | Positive
Sweet Bean | (5/5) | Positive
The Truth | (3.5/5) | Positive


In [114]:
prediction = cnb_model.predict(X_predict_tf)

i = 0
for p in prediction:
    print(f"{predict_data[i]['title']} | {predict_data[i]['rating']} | {predict(p)}")
    i += 1

Dick Johnson Is Dead | (9/10) | Positive
Face To Face | (3/5) | Positive
Godmothered | (1/5) | Positive
Sweet Bean | (5/5) | Positive
The Truth | (3.5/5) | Positive


All the reviews were above average except for Godmothered, but when I looked at the review, although it was very cruel about the movie a majority of the review was dedicated to praise for the star Jillian Bell. So even though the review was overall negative, I can see why the model decided to assign it a 'positive' rating - because it was (in a way)...

Maybe with more training it would properly predict that dataset.