# Data Science Incubator Project 5

## Imports and settings

In [7]:
# Python imports
import random as rn
import pickle

# Numerical imports
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

# NLP packages
import re
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models import Word2Vec

# Plotting imports
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Plotting settings: start from seaborn's defaults, then override the colour
# palette and the axes style. These calls change global plotting state, so
# they are kept in this order.
sns.set()
sns.set_palette("colorblind")
sns.set_style("ticks")

# Seed randomness.
# Python's `random` module and NumPy keep separate global generators; pandas
# sampling and scikit-learn splitting draw from NumPy's, so both must be
# seeded for a Restart-and-Run-All to reproduce the same numbers.
rn.seed(0)
np.random.seed(0)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load data - and split (optional)

In [2]:
def split_data(sample_rate: float = 1e-3, random_state=0) -> None:
    """Write a random subsample of the raw train/test CSVs to disk.

    Reads ``./data/test.csv`` and ``./data/train.csv`` (no header row; the
    columns are named ``rating``, ``title`` and ``review``), draws a fraction
    of the rows from each and writes the result to
    ``./data/{split}_sample.csv`` without the index column.

    Args:
        sample_rate: Fraction of rows to keep from each file.
        random_state: Seed forwarded to ``DataFrame.sample`` so the same rows
            are drawn on every run. Pass ``None`` to fall back to NumPy's
            global random state.
    """
    column_names = ["rating", "title", "review"]
    for split in ["test", "train"]:
        data = pd.read_csv(f"./data/{split}.csv", header=None, names=column_names)
        # Seeded sampling keeps the persisted sample files reproducible.
        data = data.sample(frac=sample_rate, random_state=random_state)
        data.to_csv(f"./data/{split}_sample.csv", index=False)

# Uncomment to regenerate the sample files from the full dataset.
# split_data(sample_rate=1e-1)  # 10% of data

# Load each sample, discard rows with any missing field, and renumber the
# index from zero so later positional lookups line up with the row count.
train = pd.read_csv("./data/train_sample.csv").dropna().reset_index(drop=True)
test = pd.read_csv("./data/test_sample.csv").dropna().reset_index(drop=True)

display(train, test)

Unnamed: 0,rating,title,review
0,5,breville,"I got it right before Xmas, so the shipment wa..."
1,2,Poor Quality. Great idea.,Length:: 0:15 MinsThe batch (10 in all) of toy...
2,2,Tempering the positives...,I wanted to write my own review to temper the ...
3,2,Bad Movie,"When I saw this movie I thought, ""How could an..."
4,2,marble,There are no instructions on how or different ...
...,...,...,...
299989,3,This album has nothing you would want to see.,If you want good music check out the previous ...
299990,4,"Great performance, easy to set up and use.",What's Hot:Cute design with great performance....
299991,1,"Can you say ""cheesy""?",I was hoping that with the glowing reviews tha...
299992,4,A welcome upgrade to Operation Flashpoint,"The Good : Beautiful scenery and graphics, hug..."


Unnamed: 0,rating,title,review
0,2,So What?,"What can I say about this book? Well, it's pro..."
1,5,Rock'n'Roll Shangri-La,These guys came out of nowhere with spine-ting...
2,1,Need to Be Able to Label Them Repeatedly,My little guy started taking 6 ounces at a tim...
3,4,Not too bad,I have owned these scissors for several months...
4,1,Guilty of making trash,"this is horrible, her latest single beat of my..."
...,...,...,...
64995,1,An unhappy customer,We bought this journal for our daughter's 8th ...
64996,1,Ignore this book,I have read numerous submarine novels by autho...
64997,3,Ba-Humbug,"Oh geez, Badu is soooooo annoyingly pretentiou..."
64998,1,Vocals?,Flaw and FBM would be OK if they had a lead si...


## Preprocess data

In [3]:
def preprocess_data():
    """Add a stemmed, stop-word-free ``corpus`` column to ``train`` and ``test``.

    For every row, the title and the review are each stripped of non-letter
    characters, lower-cased, split on whitespace, filtered against NLTK's
    English stop words and Porter-stemmed. The remaining title tokens followed
    by the review tokens are joined with single spaces.

    Side effects: both module-level frames gain (or overwrite) a ``corpus``
    column and are written to ``./data/train_sample_processed.csv`` and
    ``./data/test_sample_processed.csv`` without the index column.
    """
    stemmer = PorterStemmer()
    # Build the stop-word set once; rebuilding it for every token dominated
    # the runtime of the original loop.
    english_stopwords = frozenset(stopwords.words("english"))
    non_letters = re.compile(r"[^a-zA-Z]")

    def _tokenize(text):
        """Return the stemmed, non-stop-word tokens of ``text``."""
        words = non_letters.sub(" ", text).lower().split()
        return [stemmer.stem(word) for word in words if word not in english_stopwords]

    def _build_corpus(frame):
        """Return one space-joined token string per row of ``frame``."""
        return [
            " ".join(_tokenize(title) + _tokenize(review))
            for title, review in zip(frame.title, frame.review)
        ]

    train["corpus"] = _build_corpus(train)
    test["corpus"] = _build_corpus(test)

    train.to_csv("./data/train_sample_processed.csv", index=False)
    test.to_csv("./data/test_sample_processed.csv", index=False)

In [4]:
# Build the "corpus" column and persist both frames to disk.
preprocess_data()

# Reload from the processed files. A row whose tokens were all removed has an
# empty corpus string, which is written as an empty CSV field and read back as
# NaN; restore it to "" so the vectorizer receives a string for every row.
train = pd.read_csv("./data/train_sample_processed.csv").fillna({"corpus": ""})
test = pd.read_csv("./data/test_sample_processed.csv").fillna({"corpus": ""})

## Partition data into X and Y

In [5]:
# Bag-of-words counts over the preprocessed text; the star rating is the label.
# NOTE(review): the vocabulary is fitted on every row before the split, and the
# separate `test` frame is not used here — confirm this is intended.
count_vectorizer = CountVectorizer(stop_words="english")
x = count_vectorizer.fit_transform(train.corpus)
y = train.rating

# Fix the shuffle seed so the held-out rows, and every metric computed on
# them, are the same on each run.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

## Train classifier

In [8]:
# Fit each Naive Bayes variant on the same split and print its per-class
# precision/recall report under a heading.
candidates = (
    ("BernoulliNB Model", BernoulliNB()),
    ("MultinomialNB Model", MultinomialNB()),
)

for heading, model in candidates:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    print(heading)
    print(classification_report(y_test, y_pred))
    print()

BernoulliNB Model
              precision    recall  f1-score   support

           1       0.52      0.65      0.58     14950
           2       0.40      0.32      0.36     14953
           3       0.40      0.33      0.36     15120
           4       0.42      0.31      0.36     15109
           5       0.49      0.67      0.57     14867

    accuracy                           0.46     74999
   macro avg       0.44      0.46      0.44     74999
weighted avg       0.44      0.46      0.44     74999


MultinomialNB Model
              precision    recall  f1-score   support

           1       0.53      0.62      0.57     14950
           2       0.39      0.37      0.38     14953
           3       0.39      0.35      0.37     15120
           4       0.41      0.35      0.37     15109
           5       0.53      0.60      0.57     14867

    accuracy                           0.46     74999
   macro avg       0.45      0.46      0.45     74999
weighted avg       0.45      0.46     

In [9]:
# Number of rows per star rating in the test sample.
test["rating"].value_counts()

5    13139
3    13040
1    12989
4    12965
2    12867
Name: rating, dtype: int64