In [45]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
import sqlite3
import re
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import string
import swifter

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_pickle('../data/merged_df.pkl')
df.columns

Index(['date', 'reviewID', 'reviewerID', 'reviewContent', 'review_rating',
       'review_usefulCount', 'review_coolCount', 'review_funnyCount',
       'flagged', 'restaurantID', 'name_x', 'location', 'yelpJoinDate',
       'friendCount', 'reviewCount', 'firstCount', 'usefulCount', 'coolCount',
       'funnyCount', 'complimentCount', 'tipCount', 'fanCount', 'name_y',
       'rating'],
      dtype='object')

In [3]:
df.head(3)

Unnamed: 0,date,reviewID,reviewerID,reviewContent,review_rating,review_usefulCount,review_coolCount,review_funnyCount,flagged,restaurantID,...,reviewCount,firstCount,usefulCount,coolCount,funnyCount,complimentCount,tipCount,fanCount,name_y,rating
0,2012-09-25,xvdJntJlo01tHu83-vXiRw,bNYesZ944s6IJVowOnB0iA,Good choice by our Chicago friends for our fin...,4,0,0,0,NR,KU_Ze0TpR2HgKG2OpTh2NA,...,48,5,41,5,5,2,0,1,Slurping Turtle,3.5
1,2012-06-23,Z4oAUd6mIOhxxDsle3trPA,BSh3h1J4mdSmEsb8FFdf0Q,"I'm not a Takashi fan, so I was a bit hesitant...",4,0,0,0,YR,KU_Ze0TpR2HgKG2OpTh2NA,...,116,2,93,16,19,10,0,2,Slurping Turtle,3.5
2,2012-10-10,71c7BDude0l5tNjtxRZzMg,XVvbDeYn5Dk-MteNHwjC7Q,I should have read the the 3 stars and below r...,2,0,0,0,NR,KU_Ze0TpR2HgKG2OpTh2NA,...,14,1,5,1,3,0,4,1,Slurping Turtle,3.5


In [4]:
from sklearn.model_selection import train_test_split

In [9]:
from sklearn import preprocessing
# 0 is NR, 1 is YR

le = preprocessing.LabelEncoder()
df['flagged'] = le.fit_transform(df['flagged'])
df.head()

Unnamed: 0,date,reviewID,reviewerID,reviewContent,review_rating,review_usefulCount,review_coolCount,review_funnyCount,flagged,restaurantID,...,reviewCount,firstCount,usefulCount,coolCount,funnyCount,complimentCount,tipCount,fanCount,name_y,rating
0,2012-09-25,xvdJntJlo01tHu83-vXiRw,bNYesZ944s6IJVowOnB0iA,Good choice by our Chicago friends for our fin...,4,0,0,0,0,KU_Ze0TpR2HgKG2OpTh2NA,...,48,5,41,5,5,2,0,1,Slurping Turtle,3.5
1,2012-06-23,Z4oAUd6mIOhxxDsle3trPA,BSh3h1J4mdSmEsb8FFdf0Q,"I'm not a Takashi fan, so I was a bit hesitant...",4,0,0,0,1,KU_Ze0TpR2HgKG2OpTh2NA,...,116,2,93,16,19,10,0,2,Slurping Turtle,3.5
2,2012-10-10,71c7BDude0l5tNjtxRZzMg,XVvbDeYn5Dk-MteNHwjC7Q,I should have read the the 3 stars and below r...,2,0,0,0,0,KU_Ze0TpR2HgKG2OpTh2NA,...,14,1,5,1,3,0,4,1,Slurping Turtle,3.5
3,2011-12-15,Vrzm2xmm2aBqBRqs3yK9Zw,om5ZiponkpRqUNa3pVPiRg,It must be called slurping turtle because thei...,3,8,3,4,1,KU_Ze0TpR2HgKG2OpTh2NA,...,2063,347,12660,9617,6682,6948,605,503,Slurping Turtle,3.5
4,2012-09-07,coKi6jCbVd4qSUpoE5_0iA,LZt80LbsfQ9kOz96H7c1bA,I'm surprised this place is getting so many ne...,4,2,2,1,0,KU_Ze0TpR2HgKG2OpTh2NA,...,132,0,154,84,22,24,3,7,Slurping Turtle,3.5


In [11]:
X = df['reviewContent']
y = df['flagged']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1234)
print(X_train.shape)
print(X_test.shape)

(471561,)
(202098,)


In [18]:
def remove_punctuation(text):
    no_punct = "".join([c for c in text if c not in string.punctuation])
    return no_punct
df['cleaned_text'] = df['reviewContent'].apply(lambda x: remove_punctuation(x))
df['cleaned_text']

0         Good choice by our Chicago friends for our fin...
1         Im not a Takashi fan so I was a bit hesitant a...
2         I should have read the the 3 stars and below r...
3         It must be called slurping turtle because thei...
4         Im surprised this place is getting so many neg...
                                ...                        
673654    Michael James at Amp is awesome I moved to Nor...
673655    This is by far the best and most consistent re...
673656    If you want a great cajuncreole meal in Hampto...
673657    UPDATE  Finally heard back from them A worker ...
673658    COME JOIN IN THE CELEBRATION CALL 312 226 0340...
Name: cleaned_text, Length: 673659, dtype: object

In [19]:
# \r and \n
df['cleaned_text'] = df['cleaned_text'].str.replace("\r", " ")
df['cleaned_text'] = df['cleaned_text'].str.replace("\n", " ")
df['cleaned_text'] = df['cleaned_text'].str.replace("    ", " ")

In [20]:
# " when quoting text
df['cleaned_text'] = df['cleaned_text'].str.replace('"', '')

In [21]:
# Lowercasing the text
df['cleaned_text'] = df['cleaned_text'].str.lower()

In [22]:
punctuation_signs = list("?:!.,;")
df['cleaned_text'] = df['cleaned_text']

for punct_sign in punctuation_signs:
    df['cleaned_text'] = df['cleaned_text'].str.replace(punct_sign, '')

In [23]:
df['cleaned_text'] = df['cleaned_text'].str.replace("'s", "")

In [24]:
df['cleaned_text']

0         good choice by our chicago friends for our fin...
1         im not a takashi fan so i was a bit hesitant a...
2         i should have read the the 3 stars and below r...
3         it must be called slurping turtle because thei...
4         im surprised this place is getting so many neg...
                                ...                        
673654    michael james at amp is awesome i moved to nor...
673655    this is by far the best and most consistent re...
673656    if you want a great cajuncreole meal in hampto...
673657    update  finally heard back from them a worker ...
673658    come join in the celebration call 312 226 0340...
Name: cleaned_text, Length: 673659, dtype: object

In [28]:
wordnet_lemmatizer = WordNetLemmatizer()
nrows = len(df)
lemmatized_text_list = []

for row in tqdm(range(0, nrows)):
    
    # Create an empty list containing lemmatized words
    lemmatized_list = []
    
    # Save the text and its words into an object
    text = df.loc[row]['cleaned_text']
    text_words = text.split(" ")

    # Iterate through every word to lemmatize
    for word in text_words:
        lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        
    # Join the list
    lemmatized_text = " ".join(lemmatized_list)
    
    # Append to the list containing the texts
    lemmatized_text_list.append(lemmatized_text)

df['cleaned_text_lemma'] = lemmatized_text_list

100%|█████████████████████████████████████████████████████████████████████████| 673659/673659 [15:27<00:00, 726.65it/s]


In [37]:
df['cleaned_text_strip'] = df['cleaned_text_lemma'].apply(lambda x: " ".join(x.split()))

In [34]:
# Downloading the stop words list
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\BD\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [41]:
# Loading the stop words in english
stop_words = list(stopwords.words('english'))

In [46]:
df['cleaned_text_stopwords'] = df['cleaned_text_strip'].swifter.apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

  from pandas import Panel


HBox(children=(IntProgress(value=0, description='Pandas Apply', max=673659, style=ProgressStyle(description_wi…




In [47]:
df['cleaned_text_stopwords']

0         good choice chicago friends final meal head ai...
1         im takashi fan bite hesitant go well im glad g...
2         read 3 star review spent 180 include 18 tip 3 ...
3         must call slurp turtle service slow turtle act...
4         im surprise place get many negativemediocre re...
                                ...                        
673654    michael jam amp awesome move norfolk 2 years a...
673655    far best consistent restaurant norfolk less do...
673656    want great cajuncreole meal hampton roads big ...
673657    update finally hear back worker leave vm cell ...
673658    come join celebration call 312 226 0340 ticket...
Name: cleaned_text_stopwords, Length: 673659, dtype: object

In [66]:
df.to_pickle('../data/cleaned_text.pkl')

In [48]:
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_text_stopwords'], 
                                                    df['flagged'], 
                                                    test_size=0.15, 
                                                    random_state=8)

In [50]:
# Parameter election
ngram_range = (1,2)
min_df = 10
max_df = 1.
max_features = 300

In [58]:
tfidf = TfidfVectorizer(encoding='utf-8',
                        max_df=max_df,
                        min_df=min_df,
                        max_features=max_features,
                        norm='l2',
                        sublinear_tf=True)
                        
features_train = tfidf.fit_transform(X_train).toarray()
labels_train = y_train
print(features_train.shape)

features_test = tfidf.transform(X_test).toarray()
labels_test = y_test
print(features_test.shape)

(572610, 300)
(101049, 300)


In [60]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(features_train, labels_train)
import numpy as np
predicted = clf.predict(features_test)
np.mean(predicted == labels_test)

0.6046274579659373

In [62]:
from sklearn.pipeline import Pipeline

In [65]:
from sklearn.linear_model import SGDClassifier

clf_sgd = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42).fit(features_train, labels_train)

predicted = clf_sgd.predict(features_test)
np.mean(predicted == labels_test)

0.6200655127710318