In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from joblib import dump

import re
import string
import numpy as np
import contractions
from num2words import num2words
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from tqdm import tqdm

# Naive Bayes

In [2]:
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

The same clean_text preprocessing function as used in the data preprocessing notebook. This is so we can type our own reviews and see what we get.

In [3]:
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Expand contractions
    text = contractions.fix(text)
    
    # Remove HTML tags
    text = re.sub('<[^<]+?>', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    
    # Replace special characters with their ASCII equivalent
    text = text.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Replace numbers with their written form
    words = []
    for word in text.split():
        if word.isdigit():
            words.append(num2words(int(word)))
        else:
            words.append(word)
    text = ' '.join(words)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stem the words
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Join the tokens back into a string
    preprocessed_text = " ".join(tokens)
    
    return text

In [4]:
df = pd.read_csv('../Data/preprocessed_data.csv')

In [5]:
df.isnull().mean()

overall              0.0
reviewText           0.0
asin                 0.0
preprocessed_text    0.0
dtype: float64

In [6]:
df.dropna(inplace=True)

Here we use a count vectorizer as Naive Bayes will determine the importance of words depending on the class by itself.

In [7]:
X = df['preprocessed_text']
y = df['overall']

In [8]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
clf = MultinomialNB()
clf.fit(X_train, y_train)

In [11]:
y_pred = clf.predict(X_test)

The model gets a 62% accuracy which is pretty good. We will analyse this further in the results notebook.

In [12]:
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 0.6240362938016644


In [13]:
def predict_rating(review):
    preprocessed_text = clean_text(review)
    X = vectorizer.transform([preprocessed_text])
    y_pred = clf.predict(X)
    return y_pred[0]

In [14]:
predict_rating("Good product, works well. However, broke after only 1 use. I'm happy")

2

In [15]:
tqdm.pandas()
df['bayes_rating'] = df['preprocessed_text'].progress_apply(predict_rating)
df.head()

100%|█████████████████████████████████████████████████████████████████████████| 179642/179642 [07:42<00:00, 388.52it/s]


Unnamed: 0,overall,reviewText,asin,preprocessed_text,bayes_rating
0,5,I don't spend a lot on my flags because they r...,9539723809,spend lot flag realli get beat lesser qualiti ...,5
1,5,A very dear friend of mine is slowly losing he...,B00000JSZH,dear friend mine slowli lose sight pen make po...,5
2,5,This is absolutely exquisite! It's made of car...,B00000JSZH,absolut exquisit made cardboard like descript ...,5
3,4,"This is really nice to use, however, just not ...",B00000JSZH,realli nice use howev color saddl shimmer give...,4
4,5,This Angel is beautiful. I as so glad I chose ...,B00000JSZH,angel beauti glad chose one even beauti look o...,5


In [16]:
df.describe()

Unnamed: 0,overall,bayes_rating
count,179642.0,179642.0
mean,4.088621,4.35794
std,1.307608,1.224699
min,1.0,1.0
25%,4.0,4.0
50%,5.0,5.0
75%,5.0,5.0
max,5.0,5.0


In [17]:
df.to_csv('../Data/naive_bayes.csv', index=False)
dump(clf, '../Models/bayes_clf.joblib')
dump(vectorizer, '../Models/bayes_vectorizer.joblib')

['../Models/bayes_vectorizer.joblib']