# Review Crystal Ball - Preprocessing Data

#### STEPS:
+ Text cleaning and preparation
  +      1.1. Special character cleaning
  +     1.2. Upcase/downcase
  +      1.3. Punctuation signs
  +      1.4. Possessive pronouns
  +      1.5. Stemming and Lemmatization
  +      1.6. Stop words

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

#importing libraries
from pandas import read_excel
import pandas as pd
import numpy as np
import datetime 
import re as re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize,sent_tokenize # tokenizing


# Code for hiding seaborn warnings
import warnings
warnings.filterwarnings("ignore")

## Read dataset

In [2]:
#read dataset
df = pd.read_csv("TestFileTemplate.csv")
df.head(2)

Unnamed: 0,ProductID,reviewText,reviewTime,reviewerID,summary
0,B00GIMVMME,Tracey H. Kitts does not disappoint! I started...,18-Dec-46,A3MW6NW5TO3RR5,"Another Bite, Please?"
1,B00JVAPRVW,Holy cliffhanger! Emma is definitely strugglin...,23-Apr-46,A33AULI1H6R342,Oh Emma!


In [19]:
#creating a new column called "Content" and storing values of "Review_Content" in it
df['Content'] = df['reviewText']

In [20]:
# 1.5. Stemming and Lemmatization

def clean_text(headline):
    le = WordNetLemmatizer()
    word_tokens = word_tokenize(headline)
    tokens=[le.lemmatize(w) for w in word_tokens if w not in new_stop_words and len(w)>3]
    cleaned_text = " ".join(tokens)
    return cleaned_text
  

In [21]:
# 1.1 Special character cleaning

df['Content_Parsed_1'] = df['Content'].str.replace("\r", " ")
df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("\n", " ")
df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("\n\n", " ")
df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("\n\n'", "")
df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("\xa0", "")
df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("    ", "")
df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace('"', '')
df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("--", "")
df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("*", "")
df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("-", "")
df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("%", "")
#df['Content_Parsed_1'] = df['Content_Parsed_1'].str.replace("$", "")

#define regex pattern for preprocessing
pat1 = r'@[A-Za-z0-9_]+'
pat2 = r'https?://[^ ]+'
combined_pat = r'|'.join((pat1,pat2))
www_pat = r'www.[^ ]+'

df['Content_Parsed_1'] =  [re.sub(combined_pat, '',str(x)) for x in df['Content_Parsed_1']]
df['Content_Parsed_1'] =  [re.sub(www_pat, '',str(x)) for x in df['Content_Parsed_1']]



# 1.2 Lowercasing the text

df['Content_Parsed_2'] = df['Content_Parsed_1'].str.lower()


#1.3. Punctuation signs

punctuation_signs = list('''!()-[]{};:'"\,<>/?@#$%^&*_~''')
df['Content_Parsed_3'] = df['Content_Parsed_2']

for punct_sign in punctuation_signs:
    df['Content_Parsed_3'] = df['Content_Parsed_3'].str.replace(punct_sign, '')
    df['Content_Parsed_3'] = df['Content_Parsed_3'].str.replace(".", ' ')
    

    
#1.4. Possessive pronouns

df['Content_Parsed_4'] = df['Content_Parsed_3'].str.replace("'s", "")


# STOPWORD LIST
# Loading the stop words in english
stop_words = set(stopwords.words('english'))

exclude_words = set(("very","not","no","nor"))
new_stop_words = list(stop_words.difference(exclude_words))
newStopWords = ['i','as','like','from','yep','me','so','were','or','this','here','do','has','was','got','still','this',
                'that','here','to','it','about','ll','while','let','the','do','s','and','can','go','t','is','into','and',
               'any','are','at','been','let','like','have','you','into','do','ll','say','go','let','theyll','thats','would',
               'unless','know','said','le']
new_stop_words.extend(newStopWords)



# 1.5. Lemmitization

df['Content_Parsed_5'] = df['Content_Parsed_4']
df['Content_Parsed_5'] = df['Content_Parsed_5'].apply(clean_text)



# 1.6. Stop words

df['Content_Parsed_6'] = df['Content_Parsed_5']

for stop_word in new_stop_words:
    regex_stopword = r"\b" + stop_word + r"\b"
    df['Content_Parsed_6'] = df['Content_Parsed_6'].str.replace(regex_stopword, '')



#remove numbers from data
df['Content_Parsed_6'] = df['Content_Parsed_6'].str.replace('\d+', '')

df['Review_Content_Parsed'] = df['Content_Parsed_6']

In [4]:
#newStopWords = ['i','as','like','from','yep','me','so','were','or','this','here','do','has','was','got','still','this',
#                'that','here','to','it','about','ll','while','let','the','do','s','and','can','go','t','is','into','and',
#               'any','are','at','been','let','like','have','you','into','do','ll','say']
#new_stop_words.extend(newStopWords)

#print(new_stop_words)
#for stop_word in new_stop_words:
#    regex_stopword = stop_word#r"\b" + stop_word + r"\b"
#    print(regex_stopword)

In [23]:
df.head(2)

Unnamed: 0,ProductID,Rating,reviewText,reviewTime,reviewerID,summary,std_rating,Content,Content_Parsed_1,Content_Parsed_2,Content_Parsed_3,Content_Parsed_4,Content_Parsed_5,Content_Parsed_6,Review_Content_Parsed
0,B00GIMVMME,5,Tracey H. Kitts does not disappoint! I started...,18-Dec-2046,A3MW6NW5TO3RR5,"Another Bite, Please?",1.0,Tracey H. Kitts does not disappoint! I started...,Tracey H. Kitts does not disappoint! I started...,tracey h. kitts does not disappoint! i started...,tracey h kitts does not disappoint i started ...,tracey h kitts does not disappoint i started ...,tracey kitts disappoint started menage novel s...,tracey kitts disappoint started menage novel s...,tracey kitts disappoint started menage novel s...
1,B00JVAPRVW,5,Holy cliffhanger! Emma is definitely strugglin...,23-Apr-2046,A33AULI1H6R342,Oh Emma!,1.0,Holy cliffhanger! Emma is definitely strugglin...,Holy cliffhanger! Emma is definitely strugglin...,holy cliffhanger! emma is definitely strugglin...,holy cliffhanger emma is definitely struggling...,holy cliffhanger emma is definitely struggling...,holy cliffhanger emma definitely struggling fe...,holy cliffhanger emma definitely struggling fe...,holy cliffhanger emma definitely struggling fe...


In [24]:
df.columns

Index(['ProductID', 'Rating', 'reviewText', 'reviewTime', 'reviewerID',
       'summary', 'std_rating', 'Content', 'Content_Parsed_1',
       'Content_Parsed_2', 'Content_Parsed_3', 'Content_Parsed_4',
       'Content_Parsed_5', 'Content_Parsed_6', 'Review_Content_Parsed'],
      dtype='object')

In [25]:
#columns to retain
list_columns = ['ProductID', 'Rating', 'reviewText', 'reviewTime', 'reviewerID',
       'summary', 'std_rating','Review_Content_Parsed']
df = df[list_columns]

print(df.shape)
df.head(2)

(947524, 8)


Unnamed: 0,ProductID,Rating,reviewText,reviewTime,reviewerID,summary,std_rating,Review_Content_Parsed
0,B00GIMVMME,5,Tracey H. Kitts does not disappoint! I started...,18-Dec-2046,A3MW6NW5TO3RR5,"Another Bite, Please?",1.0,tracey kitts disappoint started menage novel s...
1,B00JVAPRVW,5,Holy cliffhanger! Emma is definitely strugglin...,23-Apr-2046,A33AULI1H6R342,Oh Emma!,1.0,holy cliffhanger emma definitely struggling fe...


In [26]:
#save output
df.to_csv('input_CrystalBall.csv',index=False)

In [27]:
with pd.option_context('display.max_colwidth', None):
    display(df.tail(10))

Unnamed: 0,ProductID,Rating,reviewText,reviewTime,reviewerID,summary,std_rating,Review_Content_Parsed
947514,B009ZOBERU,4,Mr. Weil certain has a knack for creating a good story but his writing abilities are pitiful. I felt compelled to read the three books because the story is that good but I found myself skipping whole paragraphs. The story is very entertaining If you can get through the high school quality writing.,22-Sep-2045,A18NN9PTR1L90B,"Good story line, poorly written",0.75,weil certain knack creating good story writing ability pitiful felt compelled read three book story good found skipping whole paragraph story very entertaining high school quality writing
947515,B006T3P1TM,1,"This book was awful. First off if I ever heard my Pastor's wife talk the way Sam does in this book I would leave my church. The way Beth, and Jason talk to each is dirty, and not christ like at all. Plus there were plenty of other dirty words in this book. This author needs his mouth washed out with soap, and a good kick in the back side if he thinks God was glorified with this book. One minute quotes so called Bible verses, then the next cussing, and dirty talk between to unmarried adults. Crazy. I will not be reading anything else from this author, and would be embarresed to recommend it.",26-Aug-2045,A8DEIUNJYJG2H,This is a sorry excuse for Christian writting.,0.0,book awful first ever heard pastor wife talk book leave church beth jason talk dirty christ plus plenty dirty word book author need mouth washed soap good kick back side think glorified book minute quote called bible verse next cussing dirty talk unmarried adult crazy reading anything else author embarresed recommend
947516,B0077CUYPK,4,"I enjoyed this series just as much as many other books I have read. Donna Grant is a new favourite of mine, keeps my mind on what to read more & more & more, Being home bound with a foot injury is not fun. So I whole heartedly thank the author for such a great way to spend inside on the sunny days. :)",22-Aug-2046,A1KTM0NZDJT2BS,JRobb,0.75,enjoyed series much many book read donna grant favourite mine keep mind read home bound foot injury whole heartedly thank author great spend inside sunny day
947517,B005SHXTNU,5,"A pair of brilliant dark gems that showcase the author's incredible imagination and finely honed writing skills. From a subterranean society that would have given Arthur Machen chills, to a nightmare that hunts it's creator in the waking world, these novels will linger with you long into the night.",20-Nov-2045,A2GQBZDGRMZBIX,A Double Dose of Darkness,1.0,pair brilliant dark gem showcase author incredible imagination finely honed writing skill subterranean society given arthur machen chill nightmare hunt creator waking world novel linger long night
947518,B00B6LRUOG,3,"I enjoyed this book a great deal but had to give it a less than stellar rating due to multiple editing errors that were distracting and kept me from fully enjoying it. Sew vs sow, vile vs vial etc are the ones I remember off the top of my head. With a better job editing this book would definitely become a 4 star read no questions asked.",26-Oct-2046,AB4MC1XC0T5XX,Great read but editing needed work,0.5,enjoyed book great deal give stellar rating multiple editing error distracting kept fully enjoying vile vial one remember head better editing book definitely become star read question asked
947519,B00A15SPPQ,2,I got lost so many times in this book which in my opinion never did get to the point about any manifestation methods at all. All that is inside of this book is paragraph after paragraph that didn't tie together at all with the last one.,23-May-2048,A25DA2MY42IEKI,How To Ask The Universe to MAnifest Your Dreams?,0.25,lost many time book opinion never point manifestation method inside book paragraph paragraph didnt together last
947520,B00DVQSLR4,5,I wish gerith could of saved himself from Scott so he could find true love with sam. I loved it.,25-Feb-2045,A2EM0FIVZ62K28,good book,1.0,wish gerith could saved scott could find true love loved
947521,B004SHES4U,2,"Similes oh my, Simile me to death. i had to stop and try to figure out what was what. The whole story tripped and stumbled.",21-Jun-2045,A2MZ8TXGPG607T,Hard to read,0.25,simile simile death stop figure whole story tripped stumbled
947522,B003RWS5ZA,5,"Just Like That is Book #2 in the Bradfords series and I have to say I loved Just Like That even better than book #1 Just Right!In Just Like That we learn more about Sam, the middle sibling. Sam is the ultimate carefree, commitment phobic laid back casual guy, flocks of women are at his feet and he loves women: he loves giving them pleasure and enjoying them, but that is all. He has some serious rules: he doesn't sleep twice with the same woman, never sleeps with her after sex and he doesn't do any other leisure activity like watching a movie together or just talk. Having read about all three siblings I have to say Sam is the one who took their father's death the hardest, and he is the one who couldn't really cope with it or move on. He is a deeply hurt, scarred guy who is terrified of losing again someone he cares about and that is why he doesn't want to let anyone near and grow to love them. It is his self-preservation mechanism.He is a lady's man through and through and that's why Danika's sisters have chosen him to help out Danika by giving her her very first orgasm! Yes, that is how these two meet: Danika is at a bar with her two sisters, discussing her problematic sex life (or lack of) and Sam and his friend Ben overhear the whole shocking conversation from the neighbouring table, since Danika's sisters don't really care about keeping it down. Of course Sam is intrigued by the challenge and that is even before he catches his first glimpse of Danika, after that:""She swayed closer. His eyes focused on her lips for a moment and he felt a suden need to kiss her. Not just a desire to kiss her, but a demanding, do-it-or-die need.""What makes Erin Nicholas' novels stand out are the amount of humour infused in the story: usually love scenes and seductions are written in a sensual, steamy way, but in Erin Nicholas' stories you could very well laugh out loud at a line in the middle of a hot and steamy scene and that gives the whole relationship (and story) a different aspect, which I most certainly like.Just a quick example (it was so hard to narrow down all the quotes)""Now let's say you were going to seduce a guy. You knew you were going to see him later at a party. What would you wear?""""My black bra and panties, I guess,"" she finally said.""Did you buy those because they are sexy?""""No. I bought them for my aunt's funeral.""Sam coughed and looked over at her again. ""Excuse me?""""Oral sex is like knitting?"" Sam repeated. ""This I've got to hear.""But what the reader discovers through the story is that Sam is not just the perfect lover but he is a generous and thoughtful guy: he plays Santa's elf to an 80 year old lady (sneaking into her house once a week to repair anything that is broken or needs to be fixed), his heart is made of gold, he just doesn't like to display it. I was very moved reading about what an amazing and great guy he is and how insecure and afraid he is. Losing his father at 15 and having his mother leave him at 5 caused permanent damage. My heart went out to him and I was glad to see Danika was there offering him support and love, even if Sam didn't know that was what he needed.And again what one has come to know in Just Right (Book #1 in the series), being a romance novel does not necessarily mean that it is all fluff and candy. In Just Like That there are serious conversations about heartache and grief, and learning about Sam's and Danika's pain in losing their parents gave more substance to the novel.Of course besides the emotional depth, the chemistry between Sam and Danika was once again phenomenal (though maybe due to their emotional tangles or because of Danika's personality, compared to Book #1 I felt it was tamer here, but still pretty steamy).Ater finishing Book #1 I was eager to revisit the supporting characters and was so glad to see them again! Dooley, Mac and Kevin are a riot and I love all the teasing and bantering between the guys. My favourite scene hands down:(Sam on establishing some rules regarding Danika and what his friends can and cannot do:)""Rule number one, no talking about her when I'm not around.""""I thought rule number one was no ogling.""""Fine."" Sam gritted his teeth. ""No ogling. Then no talking about her.""""What if we say nice things?"" Dooley asked.""No."" Sam glared at the guy who had once pretended to be his probation officer to get him out of a blind date. ""Rule number three, no watching her leave the room.""""Isn't that kind of like ogling?"" Mac asked smirking.Sam turned his scowl on the guy who had once driven him sixteen hours straight so he could enter a poker tournament. ""Next, no thinking about her after she is out of sight.""""How will we know she's out of sight if we can't watch her leave?""Sam gave up.You'll laugh, you'll cry and you'll definitely pant while reading Just Like That, it is a keeper, and if you haven't bought this series yet (why? what are you waiting for?!) Now is the time!!",31-Mar-2043,A1JVLHI56QJF5V,"Fantastic series, it gets better and better!",1.0,book bradford series loved even better book rightin learn middle sibling ultimate carefree commitment phobic laid back casual flock woman foot love woman love giving pleasure enjoying serious rule doesnt sleep twice woman never sleep doesnt leisure activity watching movie together talk read three sibling took father death hardest couldnt really cope move deeply hurt scarred terrified losing someone care doesnt want anyone near grow love selfpreservation mechanism lady danikas sister chosen help danika giving very first orgasm meet danika sister discussing problematic life lack friend overhear whole shocking conversation neighbouring table since danikas sister dont really care keeping course intrigued challenge even catch first glimpse danika thatshe swayed closer eye focused lip moment felt suden need kiss desire kiss demanding doitordie need make erin nicholas novel stand amount humour infused story usually love scene seduction written sensual steamy erin nicholas story could very well laugh loud line middle steamy scene give whole relationship story different aspect certainly quick example hard narrow quotesnow going seduce knew going later party wearmy black panty guess finally sexyno bought aunt funeral coughed looked excuse meoral knitting repeated hear reader discovers story perfect lover generous thoughtful play santa year lady sneaking house week repair anything broken need fixed heart made gold doesnt display very moved reading amazing great insecure afraid losing father mother leave caused permanent damage heart went glad danika offering support love even didnt needed come right book series romance novel necessarily mean fluff candy serious conversation heartache grief learning sam danikas pain losing parent gave substance novel course besides emotional depth chemistry danika phenomenal though maybe emotional tangle danikas personality compared book felt tamer pretty steamy ater finishing book eager revisit supporting character glad dooley kevin riot love teasing bantering guy favourite scene hand downsam establishing rule regarding danika friend dorule number talking around thought rule number ogling fine gritted teeth ogling talking nice thing dooley asked glared pretended probation officer blind date rule number three watching leave room isnt kind ogling asked smirking turned scowl driven sixteen hour straight could enter poker tournament next thinking sight shes sight cant watch leavesam gave youll laugh youll youll definitely pant reading keeper havent bought series waiting time
947523,B008OZEWTS,4,"It was an easy read, I spent a rainy day reading it and was unable to put it down until I finished",27-Jul-2047,A2SM5OOG686TNW,Good story,0.75,easy read spent rainy reading unable finished


## Tokenize and apply models

In [21]:
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping

import matplotlib.pyplot as plt

In [4]:
#read dataset .csv
df = pd.read_csv('CrystalBall_customerReviews_preprocessed.csv',sep=',',index_col=None)

#subset columns:2 and rows:1000
subset_df = df[[ 'std_rating','Review_Content_Parsed']].head(1000)
print(subset_df.shape)
subset_df.head(2)

(1000, 2)


Unnamed: 0,std_rating,Review_Content_Parsed
0,1.0,tracey kitts disappoint started menage novel s...
1,1.0,holy cliffhanger emma definitely struggling fe...


In [5]:
#find max length od review
subset_df.Review_Content_Parsed.str.len().max()

4878

In [6]:
X = subset_df.Review_Content_Parsed
y = subset_df.std_rating

In [7]:
#train-validation split

X_train, X_val, y_train, y_val = train_test_split(X,y, test_size = 0.3)

print("rows in train:",X_train.shape[0])
print("rows in validation:",X_val.shape[0])

rows in train: 700
rows in validation: 300
