In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import ngrams
from itertools import chain
from nltk import FreqDist
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [5]:
review_data = pd.read_csv("review_chinsese_philadelphia.csv")

# removing the duplicates
review_data.drop_duplicates(subset=['review_id','date','user_id'],inplace=True) 
review_data.set_index('Unnamed: 0')

Unnamed: 0_level_0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,L0E_Ld__jIATsQ1vA7ZGZw,Cvi8nZTwDVaQkqmmgQwQMw,D4DZztR7vkbiED6txGUYFg,4,0,0,1,Looks like there is a second Red Kings restaur...,2013-06-24 13:15:21
2,Vou3XAgjYJBuCO_-h1OQaA,M-00xuAOYLeaGEhPmmKjfg,eaDZlSuVS0EY67Ke6pRP6Q,5,0,0,0,Absolutely wonderful food & service! Go there ...,2011-04-23 00:52:05
3,hozoa1DsTCjpz4UC8YGsNw,ndgHYecgAWEqtjnjE7LtfA,MMRRS6YhVRx_iN5-JhMRYg,4,0,0,0,The new Han Dynasty took over the spot where t...,2012-10-08 15:29:44
4,tfucaiy6lG5UbvbmTD4STg,wABHp4fGUC-0acRKhmQO2A,MMRRS6YhVRx_iN5-JhMRYg,5,0,0,0,"Awesome food, great place to split a bunch of ...",2016-03-07 00:22:02
5,bu0BUkenT8c8nnyKUaF0-A,qmpRRF4HAKuGJEujoK-IdQ,eaDZlSuVS0EY67Ke6pRP6Q,5,0,0,0,"I've been coming to this place for years now, ...",2016-02-24 14:34:27
...,...,...,...,...,...,...,...,...,...
56943,fb_xKUoJb-A2wWTSmFndfg,pGmbV55YLG54t8r1W4F3xg,-1B9pP_CrRBJYPICE5WbRA,5,1,0,1,Spice 28 is a great happy hour option in the h...,2021-11-08 15:15:45
56944,4yFZtkdc7SPueWozB6zmwQ,ttg9bK1FuUDZQNIleuR-KQ,ytynqOUb3hjKeJfRj5Tshw,5,0,1,0,"If it wasn't for my friend dragging me out, I ...",2019-12-21 18:46:06
56945,d_POWSXOV2S7QSO6SOfCNg,I2owOyPHlcXYfhXr8hvodw,-1B9pP_CrRBJYPICE5WbRA,3,0,0,1,"So, myself and my person decided to try someth...",2015-10-29 00:58:39
56946,UKSW7aMzc32KQC_eNwRazg,iTA7VqmHkEX3sUtJLpxHdw,qDEcJ48kXdWxQGZefgG94w,5,2,0,0,Incredible and amazing. We always come here wh...,2019-07-01 06:38:29


In [6]:
# quantify the review column to sentiment value
sid = SentimentIntensityAnalyzer()
def sentiquantify(sen):
    score = sid.polarity_scores(sen)
    score = score['compound']
    return score


In [7]:
review_data['sentiment_score'] = review_data['text'].apply(sentiquantify)

In [9]:
review_data['sentiment_cag'] = review_data['sentiment_score'].apply(lambda x: 1 if x>0 else 0)
review_data

Unnamed: 0.1,Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,sentiment_score,sentiment_cag
0,1,L0E_Ld__jIATsQ1vA7ZGZw,Cvi8nZTwDVaQkqmmgQwQMw,D4DZztR7vkbiED6txGUYFg,4,0,0,1,Looks like there is a second Red Kings restaur...,2013-06-24 13:15:21,0.9829,1
1,2,Vou3XAgjYJBuCO_-h1OQaA,M-00xuAOYLeaGEhPmmKjfg,eaDZlSuVS0EY67Ke6pRP6Q,5,0,0,0,Absolutely wonderful food & service! Go there ...,2011-04-23 00:52:05,0.9431,1
2,3,hozoa1DsTCjpz4UC8YGsNw,ndgHYecgAWEqtjnjE7LtfA,MMRRS6YhVRx_iN5-JhMRYg,4,0,0,0,The new Han Dynasty took over the spot where t...,2012-10-08 15:29:44,0.9482,1
3,4,tfucaiy6lG5UbvbmTD4STg,wABHp4fGUC-0acRKhmQO2A,MMRRS6YhVRx_iN5-JhMRYg,5,0,0,0,"Awesome food, great place to split a bunch of ...",2016-03-07 00:22:02,0.9562,1
4,5,bu0BUkenT8c8nnyKUaF0-A,qmpRRF4HAKuGJEujoK-IdQ,eaDZlSuVS0EY67Ke6pRP6Q,5,0,0,0,"I've been coming to this place for years now, ...",2016-02-24 14:34:27,0.8481,1
...,...,...,...,...,...,...,...,...,...,...,...,...
56942,56943,fb_xKUoJb-A2wWTSmFndfg,pGmbV55YLG54t8r1W4F3xg,-1B9pP_CrRBJYPICE5WbRA,5,1,0,1,Spice 28 is a great happy hour option in the h...,2021-11-08 15:15:45,0.8504,1
56943,56944,4yFZtkdc7SPueWozB6zmwQ,ttg9bK1FuUDZQNIleuR-KQ,ytynqOUb3hjKeJfRj5Tshw,5,0,1,0,"If it wasn't for my friend dragging me out, I ...",2019-12-21 18:46:06,-0.5145,0
56944,56945,d_POWSXOV2S7QSO6SOfCNg,I2owOyPHlcXYfhXr8hvodw,-1B9pP_CrRBJYPICE5WbRA,3,0,0,1,"So, myself and my person decided to try someth...",2015-10-29 00:58:39,0.9207,1
56945,56946,UKSW7aMzc32KQC_eNwRazg,iTA7VqmHkEX3sUtJLpxHdw,qDEcJ48kXdWxQGZefgG94w,5,2,0,0,Incredible and amazing. We always come here wh...,2019-07-01 06:38:29,0.8453,1


In [65]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# remove stopwords
def remove_stopwords(data):
    review = data.apply(lambda x: ' '.join([y for y in x.split() if len(y)>2]))
    review_new = review.apply(lambda x: ' '.join([y for y in x.split() if y not in stop_words]))
    review_new = review_new.apply(str.lower)
    return review_new

In [49]:
review_final =  remove_stopwords(review_data['text'])

In [66]:
X_review = review_final
y = review_data['sentiment_cag'] 

In [73]:
tfidf = TfidfVectorizer(ngram_range=(1,2),stop_words = 'english')
X_tfidf = tfidf.fit_transform(X_review)

In [74]:
word = tfidf.get_feature_names_out()

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, random_state=0)
print('# train records: {}'.format(X_train.shape[0]))
print('# test records: {}'.format(X_test.shape[0]))

# train records: 42710
# test records: 14237


In [78]:
def text_reg(model,coef_show=1):
    ml = model.fit(X_train, y_train)
    acc = ml.score(X_test, y_test)
    print ('Model Accuracy: {}'.format(acc))
    
    if coef_show == 1: 
        coef = ml.coef_.tolist()[0]
        coeff_df = pd.DataFrame({'Word' : word, 'Coefficient' : coef})
        coeff_df = coeff_df.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
        print('')
        print('-Top 100 positive-')
        print(coeff_df.head(100).to_string(index=False))
        print('')
        print('-Top 100 negative-')        
        print(coeff_df.tail(100).to_string(index=False))
        
        
text_reg(LogisticRegression())

Model Accuracy: 0.9044040177003583

-Top 100 positive-
            Word  Coefficient
           great    11.158386
       delicious    10.233432
            best     8.885194
            good     8.353774
            love     8.060474
      definitely     7.316404
         amazing     7.189464
          pretty     6.039344
            nice     6.005357
         awesome     5.874374
        favorite     5.555843
       excellent     5.149309
        friendly     5.032307
          better     4.836471
           super     4.552166
           fresh     4.404423
             fun     3.917017
         perfect     3.888724
           loved     3.805532
            like     3.746052
       fantastic     3.617448
           happy     3.564359
         friends     3.563625
       wonderful     3.368355
         enjoyed     3.329253
           sweet     3.282642
           yummy     3.262934
          philly     3.171085
            free     3.126036
          friend     3.080684
            spo