In [1]:
import json
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np

In [2]:
tds = pd.read_json('yelp_academic_dataset_tip.json', lines = True)
bds = pd.read_json('yelp_academic_dataset_business.json', lines = True)
tds1 = tds.drop(['date', 'compliment_count', 'user_id'], axis = 1)
bds1 = bds.drop(['hours', 'is_open', 'review_count', 'latitude', 'longitude', 'postal_code', 'city', 'address', 'stars', 'name'], axis = 1)
merged = pd.merge(bds1, tds1, on = 'business_id')

## Task 1

In [3]:
#### Task 1a

In [4]:
from textblob import TextBlob
merged['polarity'] = merged['text'].apply(lambda x: TextBlob(x).sentiment.polarity)
merged['subjectivity'] = merged['text'].apply(lambda x: TextBlob(x).sentiment.subjectivity)

#### Task 1b

In [5]:
meanprep = merged.drop(['business_id', 'attributes', 'categories', 'text'], axis = 1)
statemeans = meanprep.groupby('state').mean()
statemeans

Unnamed: 0_level_0,polarity,subjectivity
state,Unnamed: 1_level_1,Unnamed: 2_level_1
ABE,0.55,0.475
AL,-0.35,0.6
AZ,0.423214,0.607579
BC,0.253967,0.475706
CA,0.37,0.355
CO,0.331751,0.511477
DC,0.045055,0.460769
DE,0.522917,0.670833
FL,0.302845,0.490437
GA,0.286886,0.488676


#### Task 1c

In [12]:
import plotly
import plotly.graph_objects as go

fig = go.Figure(data=go.Choropleth(
    locations=statemeans.index, # Spatial coordinates
    z = statemeans['polarity'].astype(float), # Data to be color-coded
    locationmode = 'USA-states', # set of locations match entries in `locations`
    colorscale = 'Reds',
    colorbar_title = "Review Sentiment",
))

fig.update_layout(
    title_text = 'Sentiment of Yelp reviews by State',
    geo_scope='usa', # limite map scope to USA
)

fig.show()
plotly.offline.plot(fig, filename='map.html')

'map.html'

## Task 2

#### Task 2a

In [7]:
MAds = merged.loc[merged['state'] == 'MA']
MAds1 = MAds[MAds['categories'].notna()]
MAds1['is_Restaurant'] = MAds1['categories'].apply(lambda x: 1 if 'Restaurants' in x else 0)
MAds1

Unnamed: 0,business_id,state,attributes,categories,text,polarity,subjectivity,is_Restaurant
94,HPA_qyMEddpAEtFof02ixg,MA,"{'RestaurantsGoodForGroups': 'True', 'HasTV': ...","Food, Pizza, Restaurants",Homemade bread is out of this world,0.000000,0.000000,1
95,HPA_qyMEddpAEtFof02ixg,MA,"{'RestaurantsGoodForGroups': 'True', 'HasTV': ...","Food, Pizza, Restaurants","So as I'm sitting here, I love the decor! Look...",0.257623,0.492424,1
96,HPA_qyMEddpAEtFof02ixg,MA,"{'RestaurantsGoodForGroups': 'True', 'HasTV': ...","Food, Pizza, Restaurants",Don't use the bathroom. They keep the cornmeal...,-0.044444,0.066667,1
97,HPA_qyMEddpAEtFof02ixg,MA,"{'RestaurantsGoodForGroups': 'True', 'HasTV': ...","Food, Pizza, Restaurants",The pizza here is delicious! My favorite in th...,0.600000,0.760000,1
124,6fT0lYr_UgWSCZs_w1PBTQ,MA,,"Specialty Schools, Massage Schools, Middle Sch...",Scam,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...
1161965,yQL8SrSETbbCI1U5esVJQw,MA,"{'GoodForKids': 'True', 'RestaurantsPriceRange...","Restaurants, Pizza, Italian",Best Pizza in Boston....Period.,1.000000,0.300000,1
1161966,yQL8SrSETbbCI1U5esVJQw,MA,"{'GoodForKids': 'True', 'RestaurantsPriceRange...","Restaurants, Pizza, Italian",Food is good but small portions !,0.193750,0.500000,1
1161967,yQL8SrSETbbCI1U5esVJQw,MA,"{'GoodForKids': 'True', 'RestaurantsPriceRange...","Restaurants, Pizza, Italian",Wow! Tiny place but the vibe was cool and past...,0.368750,0.787500,1
1161968,yQL8SrSETbbCI1U5esVJQw,MA,"{'GoodForKids': 'True', 'RestaurantsPriceRange...","Restaurants, Pizza, Italian",Great creative oizza,0.650000,0.875000,1


#### Task 2b

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(MAds1['text'])

In [9]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='saga')
x = vectors
y = MAds1['is_Restaurant']
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
print(cross_val_score(logreg, x, y, cv=3, scoring='roc_auc'))

[0.89407209 0.89034793 0.89682848]


#### Task 2c

In [11]:
print("For TF-IDF to work better as a classifier for if a restaurant is open or not, we could filter out more stopwords, given many stores could be described as good or awesome. Further, if we want to there are barely any non-restaurant data so we should probably add more of those data to make the model identify words not associated with restaurants better. Both of these things would possibly improve model accuracy")

For TF-IDF to work better as a classifier for if a restaurant is open or not, we could filter out more stopwords, given many stores could be described as good or awesome. Further, if we want to there are barely any non-restaurant data so we should probably add more of those data to make the model identify words not associated with restaurants better. Both of these things would possibly improve model accuracy
