## Yelp Reviews Sentiment Analysis and Topic Modeling

In [1]:
# Input: CSV file with the raw Yelp reviews (relative path).
DF_PATH = '../data/raw/yelp.csv'
# Output: pickle file the preprocessed DataFrame is written to at the end of the notebook.
DF_SAVE_PATH = '../data/processed/yelp_processed.pkl'

### Import libraries

In [2]:
import pickle
import re
import string
from pathlib import Path

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\20114\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\20114\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\20114\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\20114\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### To do:
1. Clean the text: remove punctuation, remove stopwords, and tokenize.
2. Apply lemmatization to normalize text.
3. Save the preprocessed data into a pkl file.

# ------------------------------------------------------------------------------

### Read dataset

In [3]:
# Load the raw reviews from DF_PATH and display (rows, columns) as a quick sanity check.
df = pd.read_csv(DF_PATH)
df.shape

(10000, 10)

### Clean the text: remove punctuation, remove stopwords, and tokenize

#### Remove punctuation

In [4]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [5]:
# Pick one review as a worked example and lowercase it.
# A fixed random_state keeps the sampled review (and every output derived from it
# in the cells below) identical across Restart & Run All.
txt = df.sample(1, random_state=42).iloc[0]['text'].lower()
len(txt)

489

In [6]:
# Delete every character listed in string.punctuation from the sample review,
# then show the new length to see how many characters were dropped.
txt = txt.translate(str.maketrans('', '', string.punctuation))
len(txt)

474

#### Remove stopwords

In [7]:
# Load the English stopword list.
# Read it through `nltk.corpus.stopwords` rather than the imported name `stopwords`:
# this assignment rebinds `stopwords` to a plain list, so `stopwords.words(...)`
# would raise AttributeError the second time this cell is executed.
stopwords = nltk.corpus.stopwords.words('english')

In [8]:
len(stopwords)

179

In [9]:
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [10]:
txt

'perfect patio for a saturday\n\nthis place walks a fine line between being too snooty and being just plain awesome  ive had a couple of experiences here and overall the food is really tasty  im a fan of the gnocchi even though its a little different than traditional gnocchi\nthe location is great as long as there isnt a douche bag convention going on and as long as the bartenders and servers keep their positive attitudes i think this place will be around for a while thanks'

In [11]:
# Tokenize the punctuation-free review and keep only the tokens that are not
# English stopwords. Note that `txt` is a list of tokens from here on.
txt = list(filter(lambda w: w not in stopwords, word_tokenize(txt)))
len(txt)

44

In [12]:
print(txt)

['perfect', 'patio', 'saturday', 'place', 'walks', 'fine', 'line', 'snooty', 'plain', 'awesome', 'ive', 'couple', 'experiences', 'overall', 'food', 'really', 'tasty', 'im', 'fan', 'gnocchi', 'even', 'though', 'little', 'different', 'traditional', 'gnocchi', 'location', 'great', 'long', 'isnt', 'douche', 'bag', 'convention', 'going', 'long', 'bartenders', 'servers', 'keep', 'positive', 'attitudes', 'think', 'place', 'around', 'thanks']


### Apply lemmatization to normalize text.

In [13]:
# Reduce each token to its lemma: treat it as a verb first, then lemmatize the
# result as a noun. The number of tokens is unchanged by this step.
lemmat = WordNetLemmatizer()
txt = [lemmat.lemmatize(lemmat.lemmatize(w, pos='v'), pos='n') for w in txt]
len(txt)

44

In [14]:
print(txt)

['perfect', 'patio', 'saturday', 'place', 'walk', 'fine', 'line', 'snooty', 'plain', 'awesome', 'ive', 'couple', 'experience', 'overall', 'food', 'really', 'tasty', 'im', 'fan', 'gnocchi', 'even', 'though', 'little', 'different', 'traditional', 'gnocchi', 'location', 'great', 'long', 'isnt', 'douche', 'bag', 'convention', 'go', 'long', 'bartender', 'server', 'keep', 'positive', 'attitude', 'think', 'place', 'around', 'thank']


#### Put them all in one function

In [15]:
def clean_text(txt):
    """Turn a raw review into a space-separated string of lemmatized tokens.

    Pipeline: newlines -> spaces, delete ASCII punctuation, tokenize, lowercase,
    drop English stopwords, lemmatize each token as a verb and then as a noun.

    Parameters
    ----------
    txt : str
        Raw review text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    lemmat = WordNetLemmatizer()
    # Replace line breaks with a space, not an empty string: deleting them glues
    # the last word of one line to the first word of the next
    # ("saturday\n\nthis" -> "saturdaythis").
    txt = txt.replace('\n', ' ')
    # NOTE(review): punctuation is deleted, so words separated only by
    # punctuation (e.g. "visit...had") are still fused into one token.
    txt = txt.translate(str.maketrans('', '', string.punctuation))
    # Set lookup instead of scanning the module-level `stopwords` list per token.
    stop_words = set(stopwords)
    tokens = [w.lower() for w in word_tokenize(txt)]
    tokens = [w for w in tokens if w not in stop_words]
    # Verb pass first, then noun pass on the result (e.g. "walks" -> "walk").
    lemmas = [lemmat.lemmatize(lemmat.lemmatize(w, pos='v'), pos='n') for w in tokens]
    return ' '.join(lemmas)

In [16]:
# Run clean_text on every review; the raw 'text' column is overwritten with the cleaned strings.
df['text'] = df['text'].map(clean_text)

In [17]:
df['text']

0       wife take birthday breakfast excellent weather...
1       idea people give bad review place go show plea...
2       love gyro plate rice good also dig candy selec...
3       rosie dakota love chaparral dog park convenien...
4       general manager scott petello good egg go deta...
                              ...                        
9995    first visithad lunch today use groupon order b...
9996    call house deliciousnessi could go item item b...
9997    recently visit olive ivy business last week 3 ...
9998    nephew move scottsdale recently bunch friend b...
9999    45 location 45 star average think arizona real...
Name: text, Length: 10000, dtype: object

### Save the data into pkl file

In [18]:
# Make sure the output folder exists: `to_pickle` does not create missing
# parent directories and would fail after all the preprocessing has run.
Path(DF_SAVE_PATH).parent.mkdir(parents=True, exist_ok=True)
df.to_pickle(DF_SAVE_PATH)

> **Conclusion:**
> * We applied a thorough preprocessing pipeline to the review text.
> * Removed punctuation.
> * Removed stopwords, so that only words that carry meaning remain.
> * Applied lemmatization to normalize the text by reducing each word to its lemma.