In [26]:
import pandas as pd
import numpy as np
import nltk
import string 
import re

# Data splitting
from sklearn.model_selection import train_test_split
# Preprocessing/analysis
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
nltk.download("stopwords")
nltk.download("wordnet")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ellio\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ellio\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Importing Data

In [2]:
#Reading data
df = pd.read_csv("where-the-crawdads-sing-book.csv")
df.head()

Unnamed: 0,product,title,rating,text_review
0,Where the Crawdads Sing,I stopped reading,1.0,I managed to get halfway through this book. Th...
1,Where the Crawdads Sing,Stick With It,4.0,Gradually abandoned by her family and shunned ...
2,Where the Crawdads Sing,Boring about a girl walking through mud.,1.0,The blurb of this book suggest an interesting ...
3,Where the Crawdads Sing,The most amazing book,5.0,I was so looking forward to this book and I ha...
4,Where the Crawdads Sing,"Croak, Croak",3.0,When a female friend I trust recommended this ...


In [3]:
df.shape

(3462, 4)

In [4]:
#Checking how many reviews of each there are
df.rating.value_counts()

5.0    2980
4.0     270
3.0      92
1.0      66
2.0      54
Name: rating, dtype: int64

In [5]:
#Checking to see if there is any null data
df[["rating", "text_review"]].isnull().any()

rating         False
text_review    False
dtype: bool

In [6]:
#Checking all reviews match product and dropping col
(df["product"] != "Where the Crawdads Sing 	").value_counts()


True    3462
Name: product, dtype: int64

In [7]:
df.drop(["product"], axis=1, inplace=True)
df.head()

Unnamed: 0,title,rating,text_review
0,I stopped reading,1.0,I managed to get halfway through this book. Th...
1,Stick With It,4.0,Gradually abandoned by her family and shunned ...
2,Boring about a girl walking through mud.,1.0,The blurb of this book suggest an interesting ...
3,The most amazing book,5.0,I was so looking forward to this book and I ha...
4,"Croak, Croak",3.0,When a female friend I trust recommended this ...


# Pre-Processing

- Lowercasing
- Remove special characters
- Remove stop words
- Stem words

In [8]:
#Make column lower
df["lower"] = df["text_review"].str.lower()

In [9]:
df.head()

Unnamed: 0,title,rating,text_review,lower
0,I stopped reading,1.0,I managed to get halfway through this book. Th...,i managed to get halfway through this book. th...
1,Stick With It,4.0,Gradually abandoned by her family and shunned ...,gradually abandoned by her family and shunned ...
2,Boring about a girl walking through mud.,1.0,The blurb of this book suggest an interesting ...,the blurb of this book suggest an interesting ...
3,The most amazing book,5.0,I was so looking forward to this book and I ha...,i was so looking forward to this book and i ha...
4,"Croak, Croak",3.0,When a female friend I trust recommended this ...,when a female friend i trust recommended this ...


In [22]:
#Make variable and check stopwords
stop_words = stopwords.words("english")


In [13]:
#Remove punc
df["no_punc"] = df["lower"].str.replace("[^\w\s]", "")

In [23]:
#Removing numbers
df["remove_numbers"] = df["no_punc"].str.replace("\d+", "")

In [28]:
#Tokenise
df["tokenise"] = [re.split("\W+", word) for word in df["remove_numbers"]]

In [29]:
df.head()

Unnamed: 0,title,rating,text_review,lower,no_punc,remove_numbers,tokenise
0,I stopped reading,1.0,I managed to get halfway through this book. Th...,i managed to get halfway through this book. th...,i managed to get halfway through this book tho...,i managed to get halfway through this book tho...,"[i, managed, to, get, halfway, through, this, ..."
1,Stick With It,4.0,Gradually abandoned by her family and shunned ...,gradually abandoned by her family and shunned ...,gradually abandoned by her family and shunned ...,gradually abandoned by her family and shunned ...,"[gradually, abandoned, by, her, family, and, s..."
2,Boring about a girl walking through mud.,1.0,The blurb of this book suggest an interesting ...,the blurb of this book suggest an interesting ...,the blurb of this book suggest an interesting ...,the blurb of this book suggest an interesting ...,"[the, blurb, of, this, book, suggest, an, inte..."
3,The most amazing book,5.0,I was so looking forward to this book and I ha...,i was so looking forward to this book and i ha...,i was so looking forward to this book and i ha...,i was so looking forward to this book and i ha...,"[i, was, so, looking, forward, to, this, book,..."
4,"Croak, Croak",3.0,When a female friend I trust recommended this ...,when a female friend i trust recommended this ...,when a female friend i trust recommended this ...,when a female friend i trust recommended this ...,"[when, a, female, friend, i, trust, recommende..."


In [30]:
#Dropping extra columns that won't be needed
df.drop(["lower", "no_punc", "remove_numbers",], axis=1, inplace=True)
df.head()

Unnamed: 0,title,rating,text_review,tokenise
0,I stopped reading,1.0,I managed to get halfway through this book. Th...,"[i, managed, to, get, halfway, through, this, ..."
1,Stick With It,4.0,Gradually abandoned by her family and shunned ...,"[gradually, abandoned, by, her, family, and, s..."
2,Boring about a girl walking through mud.,1.0,The blurb of this book suggest an interesting ...,"[the, blurb, of, this, book, suggest, an, inte..."
3,The most amazing book,5.0,I was so looking forward to this book and I ha...,"[i, was, so, looking, forward, to, this, book,..."
4,"Croak, Croak",3.0,When a female friend I trust recommended this ...,"[when, a, female, friend, i, trust, recommende..."


In [31]:
#Function to remove stopwords from tokenised column
def remove_stopwords(tokenised_text):
    cleaned_text = [word for word in tokenised_text if word not in stop_words]
    return cleaned_text

In [32]:
df["no_stopwords"] = df["tokenise"].apply(lambda x: remove_stopwords(x))

In [33]:
#Checking to see stopwords have been removed
df["no_stopwords"].head()

0    [managed, get, halfway, book, thought, somewha...
1    [gradually, abandoned, family, shunned, locals...
2    [blurb, book, suggest, interesting, crime, thr...
3    [looking, forward, book, say, didnt, disappoin...
4    [female, friend, trust, recommended, book, dow...
Name: no_stopwords, dtype: object

In [34]:
df.head()

Unnamed: 0,title,rating,text_review,tokenise,no_stopwords
0,I stopped reading,1.0,I managed to get halfway through this book. Th...,"[i, managed, to, get, halfway, through, this, ...","[managed, get, halfway, book, thought, somewha..."
1,Stick With It,4.0,Gradually abandoned by her family and shunned ...,"[gradually, abandoned, by, her, family, and, s...","[gradually, abandoned, family, shunned, locals..."
2,Boring about a girl walking through mud.,1.0,The blurb of this book suggest an interesting ...,"[the, blurb, of, this, book, suggest, an, inte...","[blurb, book, suggest, interesting, crime, thr..."
3,The most amazing book,5.0,I was so looking forward to this book and I ha...,"[i, was, so, looking, forward, to, this, book,...","[looking, forward, book, say, didnt, disappoin..."
4,"Croak, Croak",3.0,When a female friend I trust recommended this ...,"[when, a, female, friend, i, trust, recommende...","[female, friend, trust, recommended, book, dow..."


In [35]:
#Dropping tokenised column 
df.drop(["tokenise"], axis=1, inplace=True)

In [36]:
#Stemming the words
ps = PorterStemmer()

In [37]:
def stem(text):
    stem_text = [ps.stem(word) for word in text]
    return stem_text

In [38]:
df["stem_review"] = df["no_stopwords"].apply(lambda x: stem(x))

In [39]:
df["stem_review"].head()

0    [manag, get, halfway, book, thought, somewhat,...
1    [gradual, abandon, famili, shun, local, barkle...
2    [blurb, book, suggest, interest, crime, thrill...
3    [look, forward, book, say, didnt, disappoint, ...
4    [femal, friend, trust, recommend, book, downlo...
Name: stem_review, dtype: object

In [40]:
df.head()

Unnamed: 0,title,rating,text_review,no_stopwords,stem_review
0,I stopped reading,1.0,I managed to get halfway through this book. Th...,"[managed, get, halfway, book, thought, somewha...","[manag, get, halfway, book, thought, somewhat,..."
1,Stick With It,4.0,Gradually abandoned by her family and shunned ...,"[gradually, abandoned, family, shunned, locals...","[gradual, abandon, famili, shun, local, barkle..."
2,Boring about a girl walking through mud.,1.0,The blurb of this book suggest an interesting ...,"[blurb, book, suggest, interesting, crime, thr...","[blurb, book, suggest, interest, crime, thrill..."
3,The most amazing book,5.0,I was so looking forward to this book and I ha...,"[looking, forward, book, say, didnt, disappoin...","[look, forward, book, say, didnt, disappoint, ..."
4,"Croak, Croak",3.0,When a female friend I trust recommended this ...,"[female, friend, trust, recommended, book, dow...","[femal, friend, trust, recommend, book, downlo..."


In [41]:
#Dropping all columns apart from rating and stemmed reviews
df.drop(["title",], axis=1, inplace=True)

In [42]:
df.head()

Unnamed: 0,rating,text_review,no_stopwords,stem_review
0,1.0,I managed to get halfway through this book. Th...,"[managed, get, halfway, book, thought, somewha...","[manag, get, halfway, book, thought, somewhat,..."
1,4.0,Gradually abandoned by her family and shunned ...,"[gradually, abandoned, family, shunned, locals...","[gradual, abandon, famili, shun, local, barkle..."
2,1.0,The blurb of this book suggest an interesting ...,"[blurb, book, suggest, interesting, crime, thr...","[blurb, book, suggest, interest, crime, thrill..."
3,5.0,I was so looking forward to this book and I ha...,"[looking, forward, book, say, didnt, disappoin...","[look, forward, book, say, didnt, disappoint, ..."
4,3.0,When a female friend I trust recommended this ...,"[female, friend, trust, recommended, book, dow...","[femal, friend, trust, recommend, book, downlo..."


# Splitting Dataset and Modelling

I will be useing tfidf vectoriser before splitting the dataset.
Then I will be using Logistic Regression at first. Then, if needed explore some other models.

In [43]:
#Making list into string so I can vectorise
df["test"] = df["stem_review"].apply(" ".join)

In [44]:
#Using tfidf vect
tfidf_vect = TfidfVectorizer(max_features=5000)

In [45]:
#Splitting X and Y data
X = df["test"]
y = df["rating"]

In [46]:
#Tfidf vect variable
X = tfidf_vect.fit_transform(X)

In [47]:
#Checking data type
X

<3462x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 64016 stored elements in Compressed Sparse Row format>

In [48]:
#Splitting dataset into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [49]:
X_train.shape

(2769, 5000)

In [50]:
X_test.shape

(693, 5000)

# Logistic Regression

In [51]:
#Implementing Logistic Regression Model
log_model = LogisticRegression()

In [52]:
log_model = log_model.fit(X=X_train, y=y_train)

In [53]:
y_pred = log_model.predict(X_train)

In [54]:
series_y_pred = pd.Series(y_pred)

In [55]:
series_y_pred.value_counts()

5.0    2743
4.0      23
3.0       3
dtype: int64

Model is far off from actual values and is mostly predicting 5 star ratings.

In [56]:
y_train.value_counts()

5.0    2387
4.0     218
3.0      75
1.0      48
2.0      41
Name: rating, dtype: int64

In [57]:
log_model.intercept_

array([-0.71510199, -1.20493741, -0.76688573,  0.1711766 ,  2.51574852])

In [58]:
log_model.coef_

array([[-0.00351034, -0.00259647, -0.00457189, ..., -0.01143033,
        -0.00146059, -0.00146059],
       [-0.00302122, -0.00294864, -0.00430001, ..., -0.00896073,
        -0.00164358, -0.00164358],
       [-0.00562806, -0.00638401, -0.00724745, ..., -0.01552109,
        -0.00303596, -0.00303596],
       [-0.01598927, -0.01108417, -0.01630425, ..., -0.06025408,
        -0.02115917, -0.02115917],
       [ 0.02814889,  0.0230133 ,  0.0324236 , ...,  0.09616622,
         0.0272993 ,  0.0272993 ]])

In [59]:
#Checking overall rating of y test data
y_train.mean()

4.7533405561574575

In [60]:
#Comparing it to model
series_y_pred.mean()

4.989526905019863

# Bayes Model

In [67]:
#Had to use toarray because X_train was a sparse matrix
classifier = GaussianNB()
classifier.fit(X_train.toarray(), y_train)

GaussianNB()

In [72]:
y_pred_NB = classifier.predict(X_train.toarray())

In [73]:
series_y_pred_NB = pd.Series(y_pred_NB)

In [74]:
series_y_pred_NB.value_counts()

5.0    1334
4.0     667
3.0     311
1.0     282
2.0     175
dtype: int64