## Init

In [4]:
import pandas as pd
import numpy as np
import os
import re
import nltk
from sklearn import feature_extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
df_review_original = pd.read_csv('/content/drive/My Drive/STAT628/data/mexican_review.csv')
df_review = pd.read_csv('/content/drive/My Drive/STAT628/data/mexican_review_p.csv')
df_business = pd.read_csv('/content/drive/My Drive/STAT628/data/mexican_info.csv')
df = pd.merge(df_review, df_business[['business_id', 'city']], on = 'business_id', how = 'left')
stars = df.stars.values.tolist()
target = []
for i in stars:
    if i in [1.0, 2.0]: # 
        target.append(1.0)
    elif i in [4.0, 5.0]:
        target.append(3.0)
    else:
        target.append(2.0)
df['target'] = target
df.head()

Unnamed: 0,index,review_id,user_id,business_id,stars,date,text,useful,funny,cool,city,target
0,0,6BnQwlxRn7ZuWdzninM9sQ,JSrP-dUmLlwZiI7Dp3PQ2A,cHdJXLlKNWixBXpDwEGb_A,3.0,2015-04-01 16:30:00,love chinese food love mexican food go wrong c...,1,7,1,Phoenix,2.0
1,1,S337tATeouQJdoPYyir29w,2mxBNBeFrgDszqGS5tdEHA,d_L-rfS1vT3JMzgCUGtiow,5.0,2016-07-25 03:57:19,pick meat planet chef make mexican style dish ...,0,0,0,Las Vegas,3.0
2,2,j3vP8537KHvoXNHQIr3haA,Z_HE_KKT7N-WddPTzUQC7A,jScBTQtdAt-8RshaiBEHgw,5.0,2018-05-28 20:56:05,party 3 order fish tacos pork belly banh mi co...,1,0,1,Henderson,3.0
3,3,CvJy7CdHJqsZNq22fUF7hA,DAssyKNnYBenf0s1uP5iRw,dInxwF8kXVdfLEGTIBRrSw,2.0,2017-04-30 04:13:18,employees busy chat one employee head drive th...,0,1,0,Chandler,1.0
4,4,a6N51Ov3NEchmqsquNGtMA,qpYllTutvfoKvT5OEl7gGQ,HkbNItNrnXlNo59M0YyoMg,5.0,2016-09-30 19:51:43,come loco mill since open love come especially...,0,0,0,Tempe,3.0


## Functions

In [0]:
def get_importance(df, city, vocab_size = 5000, max_train_sample = 50000):
    df_new = df[df['city']==city].copy()

    np.random.RandomState(0)
    if df_new.shape[0] > max_train_sample: # if review size if too larget, randomly choose 'max_train_sample' reviews
        rand = np.random.choice(df_new.shape[0], size = max_train_sample, replace = False)
        text = df_new.text.values[rand].tolist()
        y = df_new.target.values[rand].tolist()
    else:
        text = df_new.text.values.tolist()
        y = df_new.target.values.tolist()

    vectorizer = CountVectorizer(max_features = vocab_size)
    word_vector = vectorizer.fit_transform(text).toarray()
    word_list = vectorizer.get_feature_names()

    del vectorizer
    del text

    clf = RandomForestClassifier(random_state=0, n_estimators=20, max_depth=100)
    clf.fit(word_vector, y)
    del word_vector
    del y

    word_importance = clf.feature_importances_.tolist()
    importance_dict = dict(zip(word_list, word_importance))

    del clf
    del word_list
    del word_importance

    return df_new, importance_dict

def get_important_review(df, importance_dict, max_length = 30):
    # words to ignore in negative reviews
    block_words = ['good', 'great', 'like', 'fuck', 'recommend', 'pretty', 
                   'excite', 'excellent', 'nice', 'decent', 'happy', 'love',
                   'prefer', 'enjoy', 'order', 'food', 'amaze', 'delicious',
                   'return', 'go', 'back']
    
    # edit importance_dict to lower the influence of positive words in negative reviews
    for b in block_words: 
        try:
            importance_dict[b] = 0.
        except:
            pass
    
    scores = []
    df = df.loc[df['target']==1.0] # only consider negative reviews
    text = df.text.values.tolist()
    useful = df.useful.tolist()
    funny = df.funny.tolist()
    cool = df.cool.tolist()
    for i in range(len(text)):
        score = 0
        text_list = text[i].split(' ')
        i_list = []
        for j in text_list:
            try:
                i_list.append(importance_dict[j])
            except:
                pass
        i_list.sort(reverse = True)
        score = sum(i_list[:max_length])
        social_score = useful[i] + funny[i] + cool[i]
        score += np.log(social_score+1)*0.01 # consider 'useful', 'funny', 'cool' numbers
        scores.append(score)
    
    df_new = df.copy()
    df_new['scores'] = scores
    df_new = df_new.sort_values('scores', ascending = False)

    del useful
    del funny
    del cool

    return df_new

def text_processing(text):
    
    text = text.lower()
    text = re.sub(r'[^a-zA-z0-9]', ' ', text)
    text = word_tokenize(text)
    text = [w for w in text if w not in stopwords.words('english')]
    text = [WordNetLemmatizer().lemmatize(w, pos = 'v') for w in text]
    text = ' '.join(text)
    
    return text

def word_filter(word_list, text):
    go_back = 0 # ignore sentences like 'I will never go back.'
    for g in ['worst', 'never', 'go', 'back', 'return']:
        if g in word_list:
            go_back += 1
    if go_back >=2:
        return True
    return False

def extract_sentence(df_business, df_city_scores, df_review_original, city, importance_dict, n_review = 3, max_length = 10):
    # max_length means how much high score words to consider in each sentence
    # n_review means how many top reviews to consider when extract sentences
    df_city_business = df_business[df_business['city']==city].copy()
    df_city_business = df_city_business[['business_id', 'name']]
    business = df_city_business.business_id.tolist()
    important_sentence = []
    important_indexes = []

    for bus in business:
        df_temp = df_city_scores[df_city_scores['business_id']==bus].copy()
        indexes = list(df_temp['index'])[:n_review]
        important_indexes.append(str(indexes))
        if indexes == []: # no negative review
            important_sentence.append('Currently your restaurant does not have 1 or 2 star reviews. Keep great!')
        else:
            texts = df_review_original.loc[df_review_original['index'].isin(indexes)].text.tolist()
            sentence_list = []
            for text in texts:
                text_list = re.split('\.|\n|\r|\!|\?', text)
                processed_list =  []
                scores = []
                for sentence in text_list:
                    processed_list.append(text_processing(sentence))
                for processed in processed_list:
                    word_list = processed.split(' ')
                    if word_filter(word_list, text):
                        score = 0
                    else:
                        i_list = []
                        for word in word_list:
                            try:
                                i_list.append(importance_dict[word])
                            except:
                                pass
                        i_list.sort(reverse = True)
                        score = sum(i_list[:max_length])
                    scores.append(score)
                sentence = text_list[scores.index(max(scores))]
                sentence = sentence.replace('\n', ' ')
                sentence = sentence.replace('\r', ' ')
                sentence = sentence.lstrip()
                sentence = sentence.rstrip()
                sentence = sentence+'.'
                sentence_list.append(sentence)
            important_sentence.append('|'.join(sentence_list))
    
    df_city_business['important_sentence'] = important_sentence
    df_city_business['indexes'] = important_indexes

    return df_city_business

def generate_csv(df, city):
    df_city, importance_dict = get_importance(df, city)
    df_city_scores = get_important_review(df_city, importance_dict)
    df_city_business = extract_sentence(df_business, df_city_scores, df_review_original, city, importance_dict)

    df_city_business.to_csv('/content/drive/My Drive/STAT628/data/ImportantSentence/sentence_'+str(city)+'.csv', index = False)

    return

## Extract by City

### Tunning

In [0]:
def rf_tune(vocab_size=5000, depth=100):
    city = 'Phoenix'
    df_new = df[df['city']==city].copy()

    np.random.RandomState(0)
    rand = np.random.choice(df_new.shape[0], size = 50000, replace=False)
    text = df_new.text.values[rand].tolist()
    y = df_new.target.values[rand].tolist()

    vectorizer = CountVectorizer(max_features = vocab_size)
    word_vector = vectorizer.fit_transform(text[:len(text)]).toarray()
    word_list = vectorizer.get_feature_names()

    del vectorizer

    p = 40000
    clf = RandomForestClassifier(random_state=0, n_estimators=20, max_depth = depth)
    clf.fit(word_vector[:p], y[:p])

    print('vocab-'+str(vocab_size)+' depth-'+str(depth))
    print('Train Accuracy: %3f' % (np.sum(clf.predict(word_vector[:p]) == y[:p])/len(y[:p])))
    print('Test Accuracy: %3f' % (np.sum(clf.predict(word_vector[p:]) == y[p:])/len(y[p:])))
    print()
    
    return

In [24]:
vocab_size_list = [1000, 5000, 10000]
depth_list = [50, 75, 100, None]

for vocab_size in vocab_size_list:
    for depth in depth_list:
        rf_tune(vocab_size, depth)

vocab-1000 depth-50
Train Accuracy: 0.955425
Test Accuracy: 0.800800

vocab-1000 depth-75
Train Accuracy: 0.982900
Test Accuracy: 0.801700

vocab-1000 depth-100
Train Accuracy: 0.992025
Test Accuracy: 0.799800

vocab-1000 depth-None
Train Accuracy: 0.998550
Test Accuracy: 0.804600

vocab-5000 depth-50
Train Accuracy: 0.926925
Test Accuracy: 0.795400

vocab-5000 depth-75
Train Accuracy: 0.966500
Test Accuracy: 0.802400

vocab-5000 depth-100
Train Accuracy: 0.985100
Test Accuracy: 0.811300

vocab-5000 depth-None
Train Accuracy: 0.998050
Test Accuracy: 0.805800

vocab-10000 depth-50
Train Accuracy: 0.911975
Test Accuracy: 0.774300

vocab-10000 depth-75
Train Accuracy: 0.958550
Test Accuracy: 0.789100

vocab-10000 depth-100
Train Accuracy: 0.978750
Test Accuracy: 0.796600

vocab-10000 depth-None
Train Accuracy: 0.998700
Test Accuracy: 0.800600



In [27]:
rf_tune(vocab_size = 20000, depth = 100)

vocab-20000 depth-100
Train Accuracy: 0.972625
Test Accuracy: 0.784500



In [0]:
city = 'Phoenix'

df_city, importance_dict = get_importance(df, city)
df_city_scores = get_important_review(df_city, importance_dict)

In [0]:
df_city_business = extract_sentence(df_business, df_city_scores, df_review_original, city, importance_dict)

In [32]:
df_city_business.head(10)

Unnamed: 0,business_id,name,important_sentence,indexes
0,1Dfx3zM-rW4n-31KeC8sJg,Taco Bell,After little miss attitude brought my card bac...,"[20431, 27350, 25702]"
1,Rs8Wi4OEjeOX7LVlzsXDOA,Sushi Mocorito,"You walk in no one greets you, you sit down li...","[36718, 5288, 27403]"
2,c7JoAt6a3Ufkpn3TrGU23A,Santanas Mexican Food,I came out with a fish taco & chimichanga plat...,"[38375, 39500]"
3,UdL8Z06DaNj6qhen0fEH9A,Mariscos El Dorado Sin,If there is one thing I dislike is paying high...,"[11085, 7702, 27340]"
4,3-aEgS7X2jrbxA7sA1nARw,La Flor De Calabaza,""" She didn't even apologize or anything then w...","[27880, 17530, 13432]"
5,rwscnQMpddjkVNaJQhNuHw,Birrieria Obregon,The food was cold and portions were small for ...,"[23024, 35599, 7633]"
6,9vub2LM7Djy8P-LPumcLXA,Tacos Chiwas,"The food tastes ""okay"", not amazing, not bad, ...","[38499, 3623, 21328]"
7,giCq1MmW-_S2tvNOAHvJcQ,Tarbell's The Tavern,It gave me time to look at the decor (nice) an...,"[2980, 614, 8671]"
8,v7UIeEvwNd3fleFE6icm3A,Tacos Culichi,Currently your restaurant does not have 1 or 2...,[]
9,yb7ZtgMWSZboG_sSUx9g2A,Fuego Taco Shop,The burrito was horrible though and the wipe o...,"[18146, 25504, 23957]"


In [45]:
sen = df_city_business.important_sentence.tolist()
indexes = df_city_business.indexes.tolist()

for i in range(100):
    print(df_city_business.business_id.values[i])
    print(df_city_business.indexes.tolist()[i])
    if sen[i] == 'Currently your restaurant does not have 1 or 2 star reviews. Keep great!':
        sen_list = [sen[i]]
    else:
        sen_list = sen_list = df_city_business.important_sentence.tolist()[i].split('|')
    index = indexes[i]
    for j in sen_list:
       print(j)
    print(index)
    print()

1Dfx3zM-rW4n-31KeC8sJg
[20431, 27350, 25702]
After little miss attitude brought my card back, I drove away and noticed the nice gay boy only gave me 3 fire sauces but I don't blame him since attitude magee seemed to rattle him a bit, so I backed up and waved down a new boy at the window and asked very nicely if i could please get more fire sauce and the nice mexican boy gave me such a big handful of fire sauce that was so unnecessary and ridiculous and gave me the most unpleasant look ever.
In addition, while I was waiting for the equally  burnt quesadillas, one of the employees (female with dirty blonde hair) came out to ask "Who was it.
Then like 10 years ago it was like "Yo Quiro Taco Bell" with that little annoying fucking Chihuahua which was OK Taco Bell was kind of hip and tasted craptastic, remember according to the movie Demolition Man with Sly all restaurants will eventually turn into Taco Bells.
[20431, 27350, 25702]

Rs8Wi4OEjeOX7LVlzsXDOA
[36718, 5288, 27403]
You walk in no

In [52]:
i = 78871
df_review_original[df_review_original['index']==i].text.tolist()[0]

'Went on a Wednesday night around 6:45.\n\nDecor is typical strip mall Mexican restaurant- Dos Equis signs and mariachi music. Service pace was good but the waitress spoke very little English. There were at least a dozen bottles of hot sauce on the table.\n\nThe waitress brought over chips and salsa plus bowls of a vegetable soup with seafood broth. Salsa was fresh and the soup was straightforward but interesting with the seafood broth.\n\nWe ordered fried calamari, which was a bit over cooked and bland even with the included sauce.\n\nMy companion got marlin tacos and enjoyed them. I got a mixed seafood cocktail which came with tostadas and crackers. The cocktail was very bland, with minimal seasoning or flavor.\n\nPricing was a couple bucks per item more than expected but not out of line for a seafood restaurant.\n\nGood Mexican food should always be fresh and flavorful. Overall the combination of bland food, above average pricing, and passable decor means I would not recommend.'

### Csv Generating

In [0]:
cities = ['Las Vegas', 'Phoenix', 'Scottsdale', 'Toronto', 'Charlotte', 'Mesa', 'Henderson', 'Tempe', 'Pittsburgh', 'Chandler']
for city in cities:
    generate_csv(df, city)

In [0]:
df_phoenix = pd.read_csv('/content/drive/My Drive/STAT628/data/ImportantSentence/sentence_Phoenix.csv')
df_ttt = df_business[df_business['city']=='Phoenix'].sort_values('review_count', ascending = False)

In [33]:
bids = df_ttt.business_id.values.tolist()[:30] # see 30 restaruants with most reviews in Phoenix
for bid in bids:
    print(bid)
    sen_list = df_phoenix[df_phoenix['business_id']==bid].important_sentence.tolist()[0].split('|')
    for j in sen_list:
        print(j)
    print()

frCxZS7lPhEnQRJ3UY6m7A
As much as I loved the salsa bar and the ambiance, I definitely feel like there's some other place out there that has better service and better salsa.
I asked for a lime (since I'm Mexican and I can't really eat my tacos without lime) and I never got it :( so I stole my boyfriend's lime from the margarita he ordered (without a drink menu by the way cause he was never given one) we finished our food, we payed and left.
I asked the server what vegetables came with the vegetable chilaquiles but there seemed to be a language barrier; he didn't really understand what I was asking him, only saying yes there are vegetables and when I asked again, that it was a popular choice.

cHdJXLlKNWixBXpDwEGb_A
I ordered a jade red burrito, they gave me something with slimy pork inside, and I asked for a veggie quesadilla, they gave me a quesadilla with gross fatty chicken and a horrible sauce.
I feel really bad about giving this place a bad review, especially since they were so ni

In [40]:
bids = ['frCxZS7lPhEnQRJ3UY6m7A', '3C5Z9homtzkWHouH2BHXYQ', 'cHdJXLlKNWixBXpDwEGb_A', 
        'of4V8nfW7GwJ03tLDdrOlA', 'Wc9UpJhOcdSj7olZkz7SJA', '_WtxQbDK7B-ExGdeG-2j6Q']
for bid in bids:
    print(df_business[df_business['business_id']==bid].name.tolist()[0])
    print(df_phoenix[df_phoenix['business_id']==bid].indexes.tolist()[0])
    sen_list = df_phoenix[df_phoenix['business_id']==bid].important_sentence.tolist()[0].split('|')
    for j in sen_list:
        print(j)
    print()

La Santisima
[261607, 282304, 282236]
As much as I loved the salsa bar and the ambiance, I definitely feel like there's some other place out there that has better service and better salsa.
I asked for a lime (since I'm Mexican and I can't really eat my tacos without lime) and I never got it :( so I stole my boyfriend's lime from the margarita he ordered (without a drink menu by the way cause he was never given one) we finished our food, we payed and left.
I asked the server what vegetables came with the vegetable chilaquiles but there seemed to be a language barrier; he didn't really understand what I was asking him, only saying yes there are vegetables and when I asked again, that it was a popular choice.

Taco Guild
[214960, 220842, 227485]
Eventually I got chips and then the folks next to us got food and we hadn't even been asked if we would like to place an order.
I asked for Coors Light because it was on the drink menu as coming in a bottle, which I prefer, server said they didn't