## Init

In [1]:
import pandas as pd
import numpy as np
import os
import re
import nltk
from sklearn import feature_extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('universal_tagset')

from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.ph

In [2]:
df_review_original = pd.read_csv('/content/drive/My Drive/STAT628/data/mexican_review.csv')
df_review = pd.read_csv('/content/drive/My Drive/STAT628/data/mexican_review_p.csv')
df_business = pd.read_csv('/content/drive/My Drive/STAT628/data/mexican_info.csv')
df = pd.merge(df_review, df_business[['business_id', 'city']], on = 'business_id', how = 'left')
df.head()

Unnamed: 0,index,review_id,user_id,business_id,stars,date,text,useful,funny,cool,city
0,0,6BnQwlxRn7ZuWdzninM9sQ,JSrP-dUmLlwZiI7Dp3PQ2A,cHdJXLlKNWixBXpDwEGb_A,3.0,2015-04-01 16:30:00,love chinese food love mexican food go wrong c...,1,7,1,Phoenix
1,1,S337tATeouQJdoPYyir29w,2mxBNBeFrgDszqGS5tdEHA,d_L-rfS1vT3JMzgCUGtiow,5.0,2016-07-25 03:57:19,pick meat planet chef make mexican style dish ...,0,0,0,Las Vegas
2,2,j3vP8537KHvoXNHQIr3haA,Z_HE_KKT7N-WddPTzUQC7A,jScBTQtdAt-8RshaiBEHgw,5.0,2018-05-28 20:56:05,party 3 order fish tacos pork belly banh mi co...,1,0,1,Henderson
3,3,CvJy7CdHJqsZNq22fUF7hA,DAssyKNnYBenf0s1uP5iRw,dInxwF8kXVdfLEGTIBRrSw,2.0,2017-04-30 04:13:18,employees busy chat one employee head drive th...,0,1,0,Chandler
4,4,a6N51Ov3NEchmqsquNGtMA,qpYllTutvfoKvT5OEl7gGQ,HkbNItNrnXlNo59M0YyoMg,5.0,2016-09-30 19:51:43,come loco mill since open love come especially...,0,0,0,Tempe


##### To get more RAM

In [0]:
df = df[df['city']=='Las Vegas']
text = df.text.values.tolist()
vectorizer = CountVectorizer(max_features = 20000)
vectorizer.fit(text)
word_vector = vectorizer.transform(text).toarray()
word_list = vectorizer.get_feature_names()
word_vector.shape

(109846, 20000)

## Functions

In [0]:
def get_importance(df, city, vocab_size = 10000, max_train_sample = 50000):
    df_new = df[df['city']==city].copy()

    np.random.RandomState(0)
    if df_new.shape[0] > max_train_sample:
        rand = np.random.choice(df_new.shape[0], size = max_train_sample, replace = False)
        text = df_new.text.values[rand].tolist()
        y = df_new.stars.values[rand].tolist()
    else:
        text = df_new.text.values.tolist()
        y = df_new.stars.values.tolist()

    vectorizer = CountVectorizer(max_features = vocab_size)
    word_vector = vectorizer.fit_transform(text).toarray()
    word_list = vectorizer.get_feature_names()

    del vectorizer
    del text

    clf = RandomForestClassifier(random_state=0, n_estimators=20, max_depth=80)
    clf.fit(word_vector, y)
    del word_vector
    del y

    word_importance = clf.feature_importances_.tolist()
    importance_dict = dict(zip(word_list, word_importance))

    del clf
    del word_list
    del word_importance

    return df_new, importance_dict

def get_important_review(df, importance_dict, max_length = 30):
    scores = []
    text = df.text.values.tolist()
    useful = df.useful.tolist()
    funny = df.funny.tolist()
    cool = df.cool.tolist()
    for i in range(len(text)):
        score = 0
        text_list = text[i].split(' ')
        i_list = []
        for j in text_list:
            try:
                i_list.append(importance_dict[j])
            except:
                pass
        i_list.sort(reverse = True)
        score = sum(i_list[:max_length])
        social_score = useful[i] + funny[i] + cool[i]
        score += np.log(social_score+1)*0.01
        scores.append(score)
    
    df_new = df.copy()
    df_new['scores'] = scores
    df_new = df_new.loc[df_new['stars'].isin([1.0, 2.0])]
    df_new = df_new.sort_values('scores', ascending = False)

    del useful
    del funny
    del cool

    return df_new

def text_processing(text):
    
    text = text.lower()
    text = re.sub(r'[^a-zA-z0-9]', ' ', text)
    text = word_tokenize(text)
    text = [w for w in text if w not in stopwords.words('english')]
    text = [WordNetLemmatizer().lemmatize(w, pos = 'v') for w in text]
    text = ' '.join(text)
    
    return text

def extract_sentence(df_business, df_city_scores, df_review_original, city, importance_dict, n_review = 3, max_length = 10):
    df_city_business = df_business[df_business['city']==city].copy()
    df_city_business = df_city_business[['business_id', 'name']]
    business = df_city_business.business_id.tolist()
    important_sentence = []
    important_indexes = []
    
    def word_filter(word_list, text):
        block_words = ['good', 'great', 'like', 'fuck', 'recommend', 'pretty', 
                       'excite', 'excellent', 'nice', 'decent', 'happy', 'love',
                       'prefer', 'enjoy']
        for b in block_words:
            if b in word_list:
                return True
        go_back = 0
        for g in ['worst', 'never', 'go', 'back', 'return']:
            if g in word_list:
                go_back += 1
        if go_back >=2:
            return True
        if 'order' in word_list:
            if 'in order to' not in text:
                return True
        return False

    for bus in business:
        df_temp = df_city_scores[df_city_scores['business_id']==bus].copy()
        indexes = list(df_temp['index'])[:n_review]
        important_indexes.append(str(indexes))
        if indexes == []:
            important_sentence.append('Currently your restaurant does not have 1 or 2 star reviews. Keep great!')
        else:
            texts = df_review_original.loc[df_review_original['index'].isin(indexes)].text.tolist()
            sentence_list = []
            for text in texts:
                text_list = re.split('\.|\n|\r|\!', text)
                processed_list =  []
                scores = []
                for sentence in text_list:
                    processed_list.append(text_processing(sentence))
                for processed in processed_list:
                    word_list = processed.split(' ')
                    if word_filter(word_list, text):
                        score = 0
                    else:
                        i_list = []
                        for word in word_list:
                            try:
                                i_list.append(importance_dict[word])
                            except:
                                pass
                        i_list.sort(reverse = True)
                        score = sum(i_list[:max_length])
                    scores.append(score)
                sentence = text_list[scores.index(max(scores))]
                sentence = sentence.replace('\n', ' ')
                sentence = sentence.replace('\r', ' ')
                sentence = sentence.lstrip()
                sentence = sentence.rstrip()
                sentence = sentence+'.'
                sentence_list.append(sentence)
            important_sentence.append(str(sentence_list))
    
    df_city_business['important_sentence'] = important_sentence
    df_city_business['indexes'] = important_indexes

    return df_city_business

def generate_csv(df, city):
    df_city, importance_dict = get_importance(df, city)
    df_city_scores = get_important_review(df_city, importance_dict)
    df_city_business = extract_sentence(df_business, df_city_scores, df_review_original, city, importance_dict)

    df_city_business.to_csv('/content/drive/My Drive/STAT628/data/ImportantSentence/sentence_'+str(city)+'.csv', index = False)

    return

## Extract by City

### Tunning

In [0]:
city = 'Phoenix'
df_new = df[df['city']==city].copy()

np.random.RandomState(0)
rand = np.random.choice(df_new.shape[0], size = 50000, replace=False)
text = df_new.text.values[rand].tolist()
y = df_new.stars.values[rand].tolist()

vectorizer = CountVectorizer(max_features = 5000)
word_vector = vectorizer.fit_transform(text[:len(text)]).toarray()
word_list = vectorizer.get_feature_names()

del vectorizer

In [0]:
word_vector.shape

(50000, 5000)

In [0]:
p = 40000
clf = RandomForestClassifier(random_state=0, n_estimators=20, max_depth = 100)
clf.fit(word_vector[:p], y[:p])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=100, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [0]:
print('Train Accuracy: %3f' % (np.sum(clf.predict(word_vector[:p]) == y[:p])/len(y[:p])))
print('Test Accuracy: %3f' % (np.sum(clf.predict(word_vector[p:]) == y[p:])/len(y[p:])))

Train Accuracy: 0.986675
Test Accuracy: 0.555000


In [0]:
city = 'Phoenix'

df_city, importance_dict = get_importance(df, city)
df_city_scores = get_important_review(df_city, importance_dict)

In [0]:
df_city_business = extract_sentence(df_business, df_city_scores, df_review_original, city, importance_dict)

In [0]:
sen = df_city_business.important_sentence.tolist()
indexes = df_city_business.indexes.tolist()

for i in range(100):
    if sen[i] == 'Currently your restaurant does not have 1 or 2 star reviews. Keep great!':
        sen_list = [sen[i]]
    else:
        sen_list = eval(sen[i])
    index = indexes[i]
    for j in sen_list:
       print(j)
    print(index)
    print()

He asked how much fire sauce I wanted and I said a small handful and he gave me 3 for 6 items lol.
In addition, while I was waiting for the equally  burnt quesadillas, one of the employees (female with dirty blonde hair) came out to ask "Who was it??" With an attitude as if I was asking them to do something other than their jobs, she proceeded to laugh about it.
Taco Bell, Taco Bell, Taco hell how does your garden grow, with all the grease you serve in your food I just don't know.
[27350, 20431, 25702]

You walk in no one greets you, you sit down literally 20 minutes passed before waitress came another  40 minutes to get served  there was only 3 people in restaurant rude customer service Will Not Be coming here again.
I will actually say that service was bad when they open and it's gotten worse as time passes.
Once we paid (which, in hindsight, we really shouldn't have), she hovered over the table the whole time we were signing the receipts as in literally waiting to see what we would 

In [0]:
iii = 75418
print(df_review_original[df_review_original['index']==iii].stars.tolist()[0])
print(df_review_original[df_review_original['index']==iii].text.tolist()[0])

2.0
This was almost a one star review... I really enjoy the food burrito Bandito creates and I'm craving one right now! 
This happened a while back but the experience was so hilarious I must tell what ensued. 
I go in to order 2 breakfast burritos, one with chorizo and egg, the other with bean, egg and cheese. 

It's only 10am and they tell me that they haven't made any chorizo, the cook got a late start. OK I can work with you... make the chorizo burrito with bacon instead. OK. 

I wait and wait. The burritos finally come out. They are in the bag and the girl asks me if I want salsa... I say yes but she just ignores me and hands me the bag. I tell her again that I want salsa and she then gives me 2 cups. 

I drive all the way to work where my coworker is meeting me in the parking lot. I open a burrito since it isn't marked and see that there is ham in it. For some reason neither I nor my coworker enjoy ham though we love our bacon. I was aggravated. They didn't have chorizo at 10 am w

### Csv Generating

In [0]:
cities = ['Las Vegas', 'Phoenix', 'Scottsdale', 'Toronto', 'Charlotte', 'Mesa', 'Henderson', 'Tempe', 'Pittsburgh', 'Chandler']
for city in cities:
    generate_csv(df, city)

In [0]:
city = 'Phoenix'
generate_csv(df, city)

In [0]:
city = 'Las Vegas'
generate_csv(df, city)

In [5]:
df_phoenix = pd.read_csv('/content/drive/My Drive/STAT628/data/ImportantSentence/sentence_Phoenix.csv')

sen = df_phoenix.important_sentence.tolist()
indexes = df_phoenix.indexes.tolist()

for i in range(df_phoenix.shape[0]):
    if sen[i] == 'Currently your restaurant does not have 1 or 2 star reviews. Keep great!':
        sen_list = [sen[i]]
    else:
        sen_list = eval(sen[i])
    index = indexes[i]
    for j in sen_list:
       print(j)
    print(index)
    print()

He asked how much fire sauce I wanted and I said a small handful and he gave me 3 for 6 items lol.
In addition, while I was waiting for the equally  burnt quesadillas, one of the employees (female with dirty blonde hair) came out to ask "Who was it??" With an attitude as if I was asking them to do something other than their jobs, she proceeded to laugh about it.
Taco Bell, Taco Bell, Taco hell how does your garden grow, with all the grease you serve in your food I just don't know.
[27350, 20431, 25702]

You walk in no one greets you, you sit down literally 20 minutes passed before waitress came another  40 minutes to get served  there was only 3 people in restaurant rude customer service Will Not Be coming here again.
I will actually say that service was bad when they open and it's gotten worse as time passes.
I'm always down to try a new sushi spot, but definitely made a mistake saying yes to this place.
[36718, 27403, 5288]

I came out with a fish taco & chimichanga platter & went in

## ..

In [0]:
df_city_scores.head()

Unnamed: 0,index,review_id,user_id,business_id,stars,date,text,useful,funny,cool,city,scores
67255,67257,PGL7_NFPJwJFvGaJ3WjBJA,W8zauC9YPUURGeBU3dyxSA,WVCSiyXjEORiWEet8asURA,2.0,2014-03-27 03:53:51,well disappoint seriously disappoint guess sta...,7,4,3,Chandler,0.223885
399349,399365,C6eRj087EiDTW5U3vfFpPA,c3JYQjsOjc-Em8-Z3j6wog,0vdw9E5zzYDNsW_smjBmHA,2.0,2008-11-23 20:46:36,el sol bakery one place everyone say go omg go...,15,25,6,Chandler,0.221132
395889,395905,65MFDvH6o0V6JOFqEq-CVw,8m-vdsUAmDAfBVAvtI6BOA,wdCH53icp_R2jJDrCZk42g,1.0,2014-03-30 04:56:25,possible would give place zero star actually n...,13,5,1,Chandler,0.212903
144947,144951,wiKjTQ1lG6Xc0RN57XAAxg,25W8CVIdQDIkyPb8ISCGpQ,3SGa6kcFCBHr367NFAiQBg,1.0,2010-05-31 08:37:32,know reviewers talk live parallel universe eit...,4,3,4,Chandler,0.204668
293284,293296,iu6AL0j3UWv7OEuSjXrcsg,0mWRAT95QOSkc_PqKTURbg,xD4bjFpAcvKOWh_inZvkWg,1.0,2015-07-18 00:27:08,oh boy start place reccommend place location w...,7,3,4,Chandler,0.203227


In [0]:
df_city_scores.index[:3]

Int64Index([67255, 399349, 395889], dtype='int64')

In [0]:
df_city_business = df_business[df_business['city']==city].copy()
df_city_business = df_city_business[['business_id', 'name']]
df_city_business.head()

Unnamed: 0,business_id,name
3,voZnDQs6Hs3YpNcS-9TALg,New Mexican Grill
40,QvyG5fZ0mxo5yeLY8jabCA,Dos Gringos
70,51K92JdkWICGDvvVCBYhFA,Rubio's Coastal Grill
99,j3csEfGzkwnXATdRoZDT-A,Casa Reynoso
108,xDHuJaOQ5HqaLP0zgpJD9w,Taco Bell


In [0]:
business = df_city_business.business_id.tolist()

In [0]:
k = business[0]
df_temp = df_city_scores[df_city_scores['business_id']==k]
indexes = list(df_temp['index'])[:3]

In [0]:
indexes

[40126, 33324, 5979]

In [0]:
df_review_original.loc[df_review_original['index'].isin(indexes)]

Unnamed: 0,index,review_id,user_id,business_id,stars,date,text,useful,funny,cool
5979,5979,7xnCCIQ9xEfB3A410-eUZw,Q-qnahcPBo0XIR4NdUTQug,voZnDQs6Hs3YpNcS-9TALg,2.0,2014-11-15 18:52:56,I love the food at this restaurant. I usually ...,1,0,0
33324,33324,c-DZNRPi5KnF_FRe6NJpng,uUZPs0TGCqNdUA5cWGh_LQ,voZnDQs6Hs3YpNcS-9TALg,1.0,2014-09-20 22:43:28,"Okay, obviously anyone who likes this place i...",1,0,0
40126,40126,yTOvj_1sCi8rYyOmezHiVw,kJJzWREJlVJM0WjuGbXFQg,voZnDQs6Hs3YpNcS-9TALg,2.0,2015-01-03 15:30:12,This review is for the location on Gilbert Roa...,0,0,0


In [0]:
texts = df_review_original.loc[df_review_original['index'].isin(indexes)].text.tolist()

In [0]:
texts

["I love the food at this restaurant. I usually go to the one on Lindsay rd.  However,  today I had the worse experience ever with them. I called in an order before 11 am, they told me it would be ready in 15 mins. I got there and waited until 11:25 and still had no breakfast burrito. I asked them about it, they said it was on the way out in a minute. I waited another 5 mins. Needless to say, I left at 11:30 with no food and a refund, since they haven't even started cooking it yet. Not sure I will come back to this location again. The only reason they are getting 2 stars is because the woman up front who I believe is the manager,  was very nice apologized and refunded me my money and provided me a $5 gift card for my time.",
 'Okay,  obviously anyone who likes this place isn\'t a native Arizonan who knows what real Mexican food is!  First indication that it was not going to be great was the salsa tasted like it came from a jar.   We ordered a quesadilla which was pretty good,  however 

In [0]:
i_list

[]

In [0]:
importance_dict = dict(zip(word_list, word_importance))