In [8]:
!pip install -q transformers

In [9]:
import pandas as pd
import numpy as np
from transformers import pipeline

In [2]:
# Import business data
yelp_business = pd.read_json('Data/yelp_academic_dataset_business.json', lines=True)
# Rename to avoid two columns called 'stars'
yelp_business = yelp_business.rename(columns={"stars": "business_avgStars"})
yelp_business.head(3)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,business_avgStars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."


In [3]:
# Import reviews data
yelp_reviews = pd.read_json('Data/yelp_academic_dataset_review.json', lines=True)
yelp_reviews.head(3)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,3,0,0,0,Family diner. Had the buffet. Eclectic assortm...,2014-02-05 20:30:30


In [4]:
# Look at one particular review
yelp_reviews['text'][0]

"If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. \n\nThe food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker."

In [5]:
# Combine both business & reviews data
yelp_reviews_business = pd.concat((yelp_reviews, yelp_business), axis=1)
yelp_reviews_business = yelp_reviews_business[['review_id', 'user_id', 'business_id','stars','text',
                                                                 'categories', 'business_avgStars']]
# For now drop lines with blanks for error handling, can figure out how to deal with this later
yelp_reviews_business = yelp_reviews_business.dropna()
yelp_reviews_business.head(3)

Unnamed: 0,review_id,user_id,business_id,business_id.1,stars,text,categories,business_avgStars
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,Pns2l4eNsfO8kk83dixA6A,3,"If you decide to eat here, just be aware it is...","Doctors, Traditional Chinese Medicine, Naturop...",5.0
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,mpf3x-BjTdTEA3yCZrAYPw,5,I've taken a lot of spin classes over the year...,"Shipping Centers, Local Services, Notaries, Ma...",3.0
2,saUsX_uimxRlCVr67Z4Jig,8g_iMtfSiwikVnbP2etR0A,YjUWPpI6HXG530lwP-fb2A,tUFrWirKiKi_TAnsVWINQQ,3,Family diner. Had the buffet. Eclectic assortm...,"Department Stores, Shopping, Fashion, Home & G...",3.5


In [58]:
# Filter down to only those that contain 'food' in categories
yelp_reviews_business_filtered = yelp_reviews_business[yelp_reviews_business['categories'].str.contains('Food')]

In [59]:
yelp_reviews_business_filtered.shape

(33405, 8)

In [60]:
yelp_reviews_business_filtered["Sentiment_stars"] = yelp_reviews_business_filtered["stars"].apply(lambda score: "positive" if score >= 3 else "negative")
yelp_reviews_business_filtered = yelp_reviews_business_filtered.reset_index()
yelp_reviews_business_filtered.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  yelp_reviews_business_filtered["Sentiment_stars"] = yelp_reviews_business_filtered["stars"].apply(lambda score: "positive" if score >= 3 else "negative")


Unnamed: 0,index,review_id,user_id,business_id,business_id.1,stars,text,categories,business_avgStars,Sentiment_stars
0,3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,MTSW4McQd7CbVtyjqoe9mw,5,"Wow! Yummy, different, delicious. Our favo...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...",4.0,positive
1,4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,mWMc6_wTdE0EUBKIGXDVfA,4,Cute interior and owner (?) gave us tour of up...,"Brewpubs, Breweries, Food",4.5,positive
2,5,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,CF33F8-E6oudUQ46HnavjQ,1,I am a long term frequent customer of this est...,"Burgers, Fast Food, Sandwiches, Food, Ice Crea...",2.0,negative


In [10]:
sentiment_pipeline = pipeline("sentiment-analysis")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All model checkpoint layers were used when initializing TFDistilBertForSequenceClassification.

All the layers of TFDistilBertForSequenceClassification were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


In [74]:
preds = pd.DataFrame()

for i in range(yelp_reviews_business_filtered.shape[0]):
    data = yelp_reviews_business_filtered['text'][i][:512]
    df_pred = pd.DataFrame(sentiment_pipeline(data))
    preds = pd.concat((preds,df_pred))

preds = preds.reset_index()
yelp_sentiment_distilbert = pd.concat((yelp_reviews_business_filtered,preds), axis = 1)

In [77]:
yelp_sentiment_distilbert = pd.concat((yelp_reviews_business_filtered,preds), axis = 1)

In [78]:
yelp_sentiment_distilbert.head(7)

Unnamed: 0,index,review_id,user_id,business_id,business_id.1,stars,text,categories,business_avgStars,Sentiment_stars,index.1,label,score
0,3,AqPFMleE6RsU23_auESxiA,_7bHUi9Uuf5__HHc_Q8guQ,kxX2SOes4o-D3ZQBkiMRfA,MTSW4McQd7CbVtyjqoe9mw,5,"Wow! Yummy, different, delicious. Our favo...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...",4.0,positive,0,POSITIVE,0.999712
1,4,Sx8TMOWLNuJBWer-0pcmoA,bcjbaE6dDog4jkNY91ncLQ,e4Vwtrqf-wpJfwesgvdgxQ,mWMc6_wTdE0EUBKIGXDVfA,4,Cute interior and owner (?) gave us tour of up...,"Brewpubs, Breweries, Food",4.5,positive,0,POSITIVE,0.998531
2,5,JrIxlS1TzJ-iCu79ul40cQ,eUta8W_HdHMXPzLBBZhL1A,04UD14gamNjLY0IDYVhHJg,CF33F8-E6oudUQ46HnavjQ,1,I am a long term frequent customer of this est...,"Burgers, Fast Food, Sandwiches, Food, Ice Crea...",2.0,negative,0,NEGATIVE,0.997217
3,9,pUycOfUwM8vqX7KjRRhUEA,59MxRhNVhU9MYndMkz0wtw,gebiRewfieSdtt17PTW6Zg,bBDDEgkFA1Otx9Lfe7BZUQ,3,Had a party of 6 here for hibachi. Our waitres...,"Ice Cream & Frozen Yogurt, Fast Food, Burgers,...",1.5,positive,0,POSITIVE,0.976225
4,11,l3Wk_mvAog6XANIuGQ9C7Q,ZbqSHbgCjzVAqaa7NKWn5A,EQ-TZ2eeD_E0BHuvoaeG5Q,eEOYSgkmpB90uNA7lDOMRA,4,"Locals recommended Milktooth, and it's an amaz...","Vietnamese, Food, Restaurants, Food Trucks",4.0,positive,0,POSITIVE,0.999862
5,14,UBp0zWyH60Hmw6Fsasei7w,4Uh27DgGzsp6PqrH913giQ,otQS34_MymijPTdNBoBdCw,0bPLkL0QhhPO5kt1_EXmNQ,4,The bun makes the Sonoran Dog. It's like a snu...,"Food, Delis, Italian, Bakeries, Restaurants",4.5,positive,0,NEGATIVE,0.986714
6,20,vBK79c3_1Ff_oqkh5VpfGg,Ohhrhu1RkqfVciIVx_W5HQ,nRKndeZLQ3eDL10UMwS2rQ,WKMJwqnfZKsAae75RMP6jA,5,HOLY SMOKES!\n\nactual pumpkin pie mixed in wi...,"Coffee & Tea, Food, Cafes, Bars, Wine Bars, Re...",4.0,positive,0,POSITIVE,0.995198


In [81]:
yelp_sentiment_distilbert.to_csv("sentiment_results_distilbert.csv")

In [92]:
sentiment_sameCount = yelp_sentiment_distilbert[yelp_sentiment_distilbert['label'].str.lower()==yelp_sentiment_distilbert['Sentiment_stars'].str.lower()].shape[0]

In [93]:
sentiment_diffCount = yelp_sentiment_distilbert[yelp_sentiment_distilbert['label'].str.lower()!=yelp_sentiment_distilbert['Sentiment_stars'].str.lower()].shape[0]

In [94]:
print("Same sentiment label: ", sentiment_sameCount)
print("Different sentiment label: ", sentiment_diffCount)

Same sentiment label:  29254
Different sentiment label:  4151


In [95]:
df_diffSentiment = yelp_sentiment_distilbert[yelp_sentiment_distilbert['label'].str.lower()!=yelp_sentiment_distilbert['Sentiment_stars'].str.lower()]

In [97]:
df_diffSentiment['text'][5]

"The bun makes the Sonoran Dog. It's like a snuggie for the pup. A first, it seems ridiculous and almost like it's going to be too much, exactly like everyone's favorite blanket with sleeves. Too much softness, too much smush, too indulgent.  Wrong. It's warm, soft, chewy, fragrant, and it succeeds where other famed Sonoran Dogs fail. \n\nThe hot dog itself is flavorful, but I would prefer that it or the bacon have a little more bite or snap to better hold their own against the dominant mustard and onions. \n\nI'm with the masses on the carne asada caramelo. Excellent tortilla, salty, melty cheese, and great carne. \n\nSuper cheap and you can drive through."

In [100]:
df_diffSentiment[['stars', 'label', 'score']]

Unnamed: 0,stars,label,score
5,4,NEGATIVE,0.986714
10,1,POSITIVE,0.962190
13,5,NEGATIVE,0.984514
31,1,POSITIVE,0.999411
39,5,NEGATIVE,0.966219
...,...,...,...
33367,3,NEGATIVE,0.980247
33380,3,NEGATIVE,0.997115
33390,3,NEGATIVE,0.986983
33398,4,NEGATIVE,0.695079


In [101]:
df_diffSentiment['text'][10]

'If you want to pay for everything a la carte this is the place for you.  \nFood wasn\'t terrible not impressive.\nThey brought a basket of chips and some tomato sauce which I asked politely for something spicier and some pico de gallo.  She brought them happily to me and the salsa was much better.  When asked what we would like to drink I asked for a coke and she brought out a bottle which I stated I wanted the fountain drink.  She said "oh that\'s only Pepsi".  Never mentioned that they only had bottle drinks for coke.  \nWe ordered our food which was reasonably priced, asked for sour cream and also to put cheese on the taco.  She let us know cheese was extra.  \nIt was $2.50 extra for another basket of chips.  \nWhen I received the bill we paid more for the condiments then the actual food.  Side of sour cream 2.00, pico de gallo 2.50, salsa 2.00, chips 2.50, cheese 1.00 and the bottled coke that we didn\'t want 5.00.  \nJust a suggestion...when you order anything make sure to ask if