## Yelp Reviews Sentiment Analysis and Topic Modeling

In [1]:
# Input: path of the pickled DataFrame that is loaded in the "Read dataset" cell.
DF_PATH = '../data/processed/yelp_processed.pkl'
# Output: path where the engineered feature table is pickled in the last cell.
DF_SAVE_PATH = '../data/processed/yelp_data_engineered.pkl'

### Import libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import string
import pickle
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

### To do:
1. Drop columns (business_id, date, review_id, type, user_id).
2. Convert the `stars` column into our target (1 and 2: not happy, 3: neutral, 4 and 5: happy).
3. Extract features using Count Vectorizer then TF-IDF (to get features that consider the importance of words relative to their occurrence in the entire corpus).
4. Save the engineered data to a pkl file.

# ------------------------------------------------------------------------------

### Read dataset

In [3]:
# Load the pickled DataFrame from DF_PATH.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df = pd.read_pickle(DF_PATH)
# Show (rows, columns) as a quick sanity check of what was loaded.
df.shape

(10000, 10)

### Drop columns (business_id, date, review_id, type, user_id).

In [4]:
# Columns that are removed before feature extraction.
# Note: this cell rebinds `df`, so running it a second time raises KeyError
# because the listed columns are already gone.
cols_to_drop = ['business_id', 'date', 'review_id', 'user_id', 'type']
df = df.drop(columns=cols_to_drop)

In [5]:
# Print the remaining columns with their non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   stars   10000 non-null  int64 
 1   text    10000 non-null  object
 2   cool    10000 non-null  int64 
 3   useful  10000 non-null  int64 
 4   funny   10000 non-null  int64 
dtypes: int64(4), object(1)
memory usage: 390.8+ KB


### Convert the `stars` column into our target (1 and 2: not happy, 3: neutral, 4 and 5: happy).

In [6]:
def star_to_sentiment(star):
    """Map a star rating to a sentiment class label.

    Parameters
    ----------
    star : number
        Star rating of a single review.

    Returns
    -------
    int
        0 (negative) if the rating is below 3, 1 (neutral) if it equals 3,
        and 2 (positive) otherwise.
    """
    # Handle the single neutral value first, then split the rest around it.
    if star == 3:
        return 1
    return 0 if star < 3 else 2

In [7]:
# Collapse the star rating into the sentiment target:
# 0 = negative, 1 = neutral, 2 = positive
#
# Guard: this cell overwrites 'stars' in place, so running it a second time would
# feed the already-encoded 0/1/2 labels back through star_to_sentiment and turn
# every label into 0 without any error. Fail loudly instead of silently corrupting
# the target (the check trips as soon as an encoded 0 is present in the column).
assert df['stars'].between(1, 5).all(), (
    "'stars' is not in the raw 1-5 range (already converted?); "
    "re-run the notebook from the 'Read dataset' cell."
)
df['stars'] = df['stars'].apply(star_to_sentiment)

### Extract features using TF-IDF (to get features that consider the importance of words relative to their occurrence in the entire corpus).

In [8]:
# Peek at the 'text' column, which is the input to the TF-IDF vectorizer below.
df['text']

0       wife take birthday breakfast excellent weather...
1       idea people give bad review place go show plea...
2       love gyro plate rice good also dig candy selec...
3       rosie dakota love chaparral dog park convenien...
4       general manager scott petello good egg go deta...
                              ...                        
9995    first visithad lunch today use groupon order b...
9996    call house deliciousnessi could go item item b...
9997    recently visit olive ivy business last week 3 ...
9998    nephew move scottsdale recently bunch friend b...
9999    45 location 45 star average think arizona real...
Name: text, Length: 10000, dtype: object

In [9]:
# TF-IDF over unigrams and bigrams. Terms that occur in more than 85% of the
# reviews are ignored, and the vocabulary is capped at 1000 features.
# NOTE(review): the vectorizer is fitted on the full dataset; if a train/test split
# happens downstream, consider fitting on the training rows only to avoid leakage.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_df=0.85,
    max_features=1000,
)
tfidf_matrix = tfidf.fit_transform(df['text'])

# Densify the sparse matrix into a DataFrame that shares df's index, with one
# column per vocabulary term, so it can be joined back onto df row by row.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    index=df.index,
    columns=tfidf.get_feature_names_out(),
)
tfidf_df.head()

Unnamed: 0,10,100,12,15,20,25,30,40,50,able,...,yelp,yes,yet,yogurt,youll,young,youre,youve,yum,yummy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.256543,0.0,0.0


In [10]:
# Assemble the final table: the target and metadata columns (stars, cool, useful, funny)
# followed by the TF-IDF features, aligned on the shared index.
# The raw 'text' column is removed from df *before* concatenating: dropping 'text' from
# the combined frame would also delete a TF-IDF feature whose token happens to be "text",
# because DataFrame.drop removes every column carrying that label.
# NOTE(review): other tokens (e.g. "cool", "funny") may still duplicate metadata column
# names in result_df; check result_df.columns.is_unique if columns are selected by name.
result_df = pd.concat([df.drop(columns='text'), tfidf_df], axis=1)

### Save the engineered data to a pkl file.

In [11]:
# Persist the engineered table (result_df) as a pickle at DF_SAVE_PATH.
result_df.to_pickle(DF_SAVE_PATH)