In [None]:
import tarfile

In [None]:
# Open, extract and close the Yelp archive in one step. The context manager
# closes the file handle even if extraction raises part-way through.
with tarfile.open('/content/drive/MyDrive/yelp_dataset.tar') as my_tar:
    my_tar.extractall('./yelp_dataset')

In [None]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from pathlib import Path
import json
from time import time
from IPython.display import clear_output
import warnings
warnings.filterwarnings('ignore')



# **CONVERTING JSON TO CSV (USERS, BUSINESS, REVIEWS) FOR BETTER ANALYSIS**

In [None]:
# Load the business records (one JSON object per line) and replace every
# missing value with the placeholder string 'NA'.
data_business = pd.read_json(
    '/content/yelp_dataset/yelp_academic_dataset_business.json',
    lines=True,
).fillna('NA')
print('Final Shape: ', data_business.shape)

We only need the restaurant rows of the business data, so the other businesses can be removed. The rows left out here could still be considered for sentiment analysis later.

In [None]:
# Keep only the businesses whose category string mentions 'Restaurants'.
is_restaurant = data_business['categories'].str.contains('Restaurants')
data_business = data_business.loc[is_restaurant]
print('Final Shape: ', data_business.shape)

In [None]:
# Stream the review file in chunks of 100,000 lines instead of loading it whole.
data_review_one = pd.read_json('/content/yelp_dataset/yelp_academic_dataset_review.json', chunksize=100000, lines=True)

# Only the first MAX_REVIEW_CHUNKS chunks are read, which caps the sample size.
MAX_REVIEW_CHUNKS = 4
restaurant_ids = data_business['business_id']  # loop-invariant, looked up once

# Collect the filtered chunks in a list and concatenate once at the end:
# concatenating inside the loop re-copies every accumulated row on each pass.
review_chunks = []
i = 0
for i, df in enumerate(data_review_one, start=1):
    # keep only reviews that belong to a restaurant retained in data_business
    df = df[df['business_id'].isin(restaurant_ids)]
    review_chunks.append(df)
    print(i)  # progress: number of chunks processed so far
    if i == MAX_REVIEW_CHUNKS:
        break

data_review = pd.concat(review_chunks) if review_chunks else pd.DataFrame()

In [None]:
# Drop businesses that have no review in the sampled review data.
reviewed_ids = data_review['business_id']
data_business = data_business.loc[data_business['business_id'].isin(reviewed_ids)]

In [None]:
print('Final businesses shape: ', data_business.shape)
print('Final review shape: ', data_review.shape)

In [None]:
csv_name = "yelp_business_final.csv"
data_business.to_csv(csv_name, index=False)

In [None]:
csv_name = "yelp_review_final.csv"
data_review.to_csv(csv_name, index=False)

In [None]:
# Read the business CSV back using the same relative path it was written to,
# so the notebook does not depend on the working directory being /content.
df = pd.read_csv("yelp_business_final.csv")
df.head()

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.describe()

In [None]:
# Load the final reviews dataset from the same relative path it was written to,
# so the notebook does not depend on the working directory being /content.
df2 = pd.read_csv("yelp_review_final.csv")

In [None]:
df2.shape

In [None]:
df2.columns

In [None]:
df2.info()

In [None]:
df2.head()

In [None]:
# Missing values in the business table (df): count and share of rows per column.
mis_value1 = df.isnull().sum()
mis_val1_percent = 100 * mis_value1 / len(df)

# Put counts and percentages side by side and give the columns readable names.
mis_val_table = pd.concat([mis_value1, mis_val1_percent], axis=1)
mis_val_table_ren_columns = mis_val_table.rename(
    columns={0: 'Missing Values', 1: '% of Total Values'}
)

# Show only columns that actually have missing values, worst first.
has_missing = mis_val_table_ren_columns['% of Total Values'] != 0
mis_val_table_ren_columns = (
    mis_val_table_ren_columns[has_missing]
    .sort_values('% of Total Values', ascending=False)
    .round(1)
)
mis_val_table_ren_columns

In [None]:
# Remove the columns that are not used in the analysis.
# ('address' is intentionally kept; its drop was disabled in the original cell.)
df.drop(columns=['attributes', 'postal_code'], inplace=True)

In [None]:
df.shape

In [None]:
df.head()

In [None]:
# Missing values in the reviews table (df2): count and share of rows per column.
mis_value2 = df2.isnull().sum()
# The percentage must be relative to the number of review rows (df2), not business rows.
mis_val2_percent = 100 * mis_value2 / len(df2)
print(mis_val2_percent)

# Pair the review counts with the review percentages (not the business counts).
mis_val_table = pd.concat([mis_value2, mis_val2_percent], axis=1)
mis_val_table_ren_columns = mis_val_table.rename(columns = {0 : 'Missing Values', 1 : '% of Total Values'})

# Show only columns that actually have missing values, worst first.
mis_val_table_ren_columns = mis_val_table_ren_columns[mis_val_table_ren_columns.iloc[:,1] != 0].sort_values('% of Total Values', ascending=False).round(1)
mis_val_table_ren_columns

There are no missing values in the review dataset, so no columns need to be removed.

In [None]:
# Combine business and review rows that share a business_id (inner join).
merge_data = df.merge(df2, how='inner', on='business_id')
merge_data

In [None]:
merge_data.shape

In [None]:
# Inner join of the business table (df) and the review table (df2) on business_id.
dffinal = pd.merge(df, df2, how='inner', on='business_id')
dffinal.head()

In [None]:
dffinal.shape

# **EXPLORATORY DATA ANALYSIS**

In [None]:
# The 15 restaurant names that occur most often in the business table.
# value_counts() already returns counts in descending order.
res_count = df['name'].value_counts().head(15)

# plot
fig, ax = plt.subplots(figsize=(8, 6))
# x/y are passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.barplot(x=res_count.index, y=res_count.values, ax=ax)
ax.set_title("Restaurants with High Occurences", fontsize=20)
ax.tick_params(axis='x', labelrotation=60)  # long names overlap unless rotated
ax.set_ylabel('Number of Restaurants', fontsize=12)
ax.set_xlabel('Restaurant', fontsize=12);


In [None]:
# top 10 cities having restaurants
top_cities = df['city'].value_counts().head(10)

# Name the index and the counts explicitly. The labels produced by
# value_counts() + reset_index() differ between pandas versions
# ('index'/'city' before 2.0, 'city'/'count' from 2.0), so renaming by the
# old labels leaves no 'City' column on newer pandas.
top_city_df = top_cities.rename_axis('City').reset_index(name='Count')

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x="City", y='Count', hue='City', data=top_city_df, ax=ax, dodge=False)

In [None]:
sns.jointplot(data=df, x='latitude', y='longitude')

We see that the business locations are concentrated in clusters. These clusters most likely correspond to large cities.

In [None]:
# Top 10 business categories
! pip install basemap
from mpl_toolkits.basemap import Basemap
fig = plt.figure(figsize=(10, 6))
plt.title("Geographic View of Restaurant Locations",fontsize = 20)
m=Basemap(projection='cyl', lon_0 = 0, lat_0=0, resolution='c')
m.fillcontinents(color='#FAFFCA',lake_color='#003875')
m.drawmapboundary(fill_color='#003875') 
m.drawcountries(linewidth=0.2, color="black")
m_coords = m(df["longitude"].tolist(), df["latitude"].tolist())
m.scatter(m_coords[0], m_coords[1], s=5, c='red', lw=3, zorder=5)

We see that our data contains businesses from certain U.S. cities only, not from all over the U.S.

In [None]:
# Bar chart of the 20 businesses with the highest review_count, coloured by
# their category string.
most_reviewed = df.nlargest(20, 'review_count')
ax = sns.catplot(
    data=most_reviewed,
    x="review_count",
    y="name",
    hue="categories",
    kind="bar",
    dodge=False,
    height=10,
)

# Leave room above the axes for the figure-level title.
ax.fig.subplots_adjust(top=0.9)
ax.fig.suptitle('Top 20 Most Reviewed Businesses And Categories Lables Used')




In [None]:
# Rating and review count both matter for recommending a restaurant, so plot
# one against the other to see how they relate.
fig, scatter_ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='stars', y='review_count', ax=scatter_ax)
scatter_ax.set_title("Reviews vs Rating", fontsize=20)
scatter_ax.set_ylabel('Number of Reviews', fontsize=12)
scatter_ax.set_xlabel('Rating', fontsize=12);


In [None]:
# The ten 5-star restaurants with the most reviews.
toprating_df = (
    df.loc[df["stars"] == 5]
    .sort_values('review_count', ascending=False)
    .head(10)
)

plt.figure(figsize=(15, 7))
p = sns.barplot(data=toprating_df, x='name', y="review_count", color="r")
# Restaurant names are long, so draw them vertically in a small font.
p.set_xticklabels(p.get_xticklabels(), rotation=90, fontsize=8)
p.set_title("Top 5 star-rated Restuarants sorted by review count")
p.set(xlabel="Restaurant", ylabel="Review Count")

In [None]:
# Parse the review timestamp, then expose each calendar/clock component as
# its own column on df2.
df2['date'] = pd.to_datetime(df2['date'])
for part in ('day', 'month', 'year', 'hour', 'minute', 'second'):
    df2[part] = getattr(df2['date'].dt, part)

df2.head()


In [None]:
# Count of reviews per calendar year.
g = sns.catplot(data=df2, x='year', kind='count', aspect=2)
ax = g.ax  # the grid has a single facet, so this is the only axes
ax.set_title('Number of reviews by Year')

We see that the number of reviews increased exponentially over the years.

In [None]:
# Count of reviews per hour of the day.
hour_grid = sns.catplot(data=df2, x='hour', kind='count', aspect=2)
ax = hour_grid.ax  # the grid has a single facet, so this is the only axes
ax.set_title('Review Time')

The fewest reviews are written in the morning hours.

In [None]:
m_df=dffinal[['user_id', 'business_id', 'stars_y']]
m_df.head()


Dataset on the basis of business categories

In [None]:
# Businesses whose category string contains "Rest"; rows with a missing
# category string are excluded.
city = df[df['categories'].str.contains('Rest.*', na=False)]

# One 0/1 indicator column per comma-separated category label.
d_rest = city['categories'].str.get_dummies(',')

# Remove the leading whitespace left by the ", " separator first, so that
# e.g. "Food" and " Food" share one label before anything is dropped.
d_rest.columns = d_rest.columns.str.lstrip()

# "Restaurants" and "Food" carry no information in a restaurant-only analysis.
# Dropping by the cleaned label removes every variant and does not fail when
# one of them is absent.
d_rest = d_rest.drop(columns=["Restaurants", "Food"], errors='ignore')

# Merge columns that now share a name by summing them. Grouping the transposed
# frame avoids groupby(axis=1), which is deprecated in recent pandas.
ff_rest = d_rest.T.groupby(level=0).sum().T
     

Indian restaurants plot

In [None]:
plt.figure(figsize=(15,6))

# Join only the city column and sum only the 'Indian' indicator. Summing every
# joined column would also try to aggregate the text columns of df.
indian_by_city = (
    ff_rest[['Indian']]
    .join(df['city'])
    .groupby('city')['Indian']
    .sum()
)

# Ascending sort + tail(10) puts the largest city at the top of the horizontal chart.
indian_by_city.sort_values(ascending=True).tail(10).plot(kind='barh', color='b')
plt.title('Top Cities in which  Indian Restaurants are peresnt',fontsize=18, pad=25.0)
plt.xlabel('Counts', fontsize=15)
plt.ylabel('City', fontsize=15)
plt.show()

In [None]:
# Reduced copy of the reviews without the free text, the vote counters and
# the date-derived columns.
unused_review_cols = [
    'text', 'useful', 'cool', 'date', 'funny',
    'day', 'month', 'year', 'hour', 'minute', 'second',
]
df2_reduced = df2.drop(columns=unused_review_cols)
df2_reduced.head()
     

In [None]:
# Attach city and categories from the business table to every review
# (outer join on business_id).
business_cols = df[['city', 'categories', 'business_id']]
merged_df_df2 = pd.merge(df2_reduced, business_cols, how='outer', on='business_id')
merged_df_df2.head()

In [None]:

merged_df_df2.to_csv("merged.csv",index=False)

In [None]:
merged_df_df2.shape

In [None]:
# Size of the user x business interaction matrix (missing ids are not counted).
n_users = merged_df_df2['user_id'].nunique()
n_items = merged_df_df2['business_id'].nunique()

# Number of distinct (user, business) pairs that actually have a review.
n_ratings = (
    merged_df_df2[['user_id', 'business_id']]
    .dropna()
    .drop_duplicates()
    .shape[0]
)

# Density is the filled share of the matrix; sparsity is the empty share.
n_cells = n_users * n_items
density = n_ratings / n_cells if n_cells else 0.0

print('Number of users: {}'.format(n_users))
print('Number of items: {}'.format(n_items))
print('Sparsity: {:4.3f}%'.format((1 - density) * 100))

In [None]:
# checking if there are duplicate reviews
# Count the non-null values of every remaining df2 column per (business_id, user_id)
# pair. agg(['count']) yields two-level column labels: (original column, 'count').
data_dup = df2.groupby(['business_id','user_id']).agg(['count']).reset_index()

# computing users with more than one review
# data_dup.useful selects the sub-frame whose only column is 'count', i.e. how many
# reviews each pair has (as counted through the 'useful' column).

duplicates = data_dup.useful.sort_values(by = 'count',ascending = False).reset_index()
# Shape of the subset where one user reviewed the same business more than once.
duplicates[duplicates['count'] >1].shape

Review share

In [None]:
# Share of reviews per star rating.
sizes = merged_df_df2["stars"].value_counts()

# Build the labels from the index of the same value_counts() result, so each
# wedge is always labelled with the rating it represents.
labels = ['{}-Star{}'.format(int(star), '' if star == 1 else 's') for star in sizes.index]
colors = ['b', 'r', 'g', 'lightpink','y']

# Plot
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%')
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

In [None]:
# baseline item based collaborative filtering
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [None]:
df.head()

In [None]:


# Keep businesses whose category string mentions any food or drink related label.
# The pattern is a regex alternation ("Nightlife" was previously misspelled and
# could never match).
food_pattern = "Food|Coffee|Tea|Restaurants|Bakeries|Bars|Sports Bar|Pubs|Nightlife"

# .copy() gives an independent frame, so later column changes on bus_df do not
# operate on a slice of df.
bus_df = df[df['categories'].str.contains(food_pattern)].copy()
bus_df.head()

In [None]:
# Remove location/opening-hour columns that the rating model does not use.
# Reassigning instead of dropping in place avoids mutating a slice of df, and
# errors='ignore' lets the cell be re-run after the columns are already gone.
bus_df = bus_df.drop(
    columns=['address', 'hours', 'is_open', 'latitude', 'longitude', 'state'],
    errors='ignore',
)
bus_df.head()

In [None]:
# Reviews without the review_id column.
review_df = df2_reduced.drop(columns='review_id')
review_df.head()


In [None]:
restaurant_rating = pd.merge(bus_df, review_df, on='business_id')

In [None]:
restaurant_rating.head()

In [None]:
# The capitalised name is kept bound for any cell that still refers to it, but
# it reuses the merge already stored in restaurant_rating instead of recomputing it.
Restaurant_rating = restaurant_rating

# Number of rating rows per restaurant name.
restaurant_ratingCount = (
    restaurant_rating
    .groupby('name')['stars_x']
    .count()
    .reset_index(name='totalRatingCount')
)
restaurant_ratingCount.head()

In [None]:
# Attach each restaurant's total rating count to every one of its rating rows.
rating_with_totalRatingCount = pd.merge(
    restaurant_rating,
    restaurant_ratingCount,
    on='name',
    how='left',
)
rating_with_totalRatingCount.head()

In [None]:
populatity_threshold = rating_with_totalRatingCount['totalRatingCount'].quantile(0.90)

In [None]:
# Keep only rating rows of restaurants at or above the popularity threshold.
is_popular = rating_with_totalRatingCount['totalRatingCount'] >= populatity_threshold
rating_popular_rest = rating_with_totalRatingCount[is_popular]
rating_popular_rest.shape
     

In [None]:
rating_popular_rest['city'].value_counts()

In [None]:
# Restrict the popular restaurants to a fixed set of cities. The names are
# combined into one regex alternation and matched as substrings of 'city'.
target_cities = [
    "New Orleans", "Philadelphia", "Nashville", "Santa Barbara", "Tucson",
    "Columbus", "Saint Louis", "Saint Petersburg", "Indianapolis", "Cherry Hill",
]
city_pattern = "|".join(target_cities)
us_city_user_rating = rating_popular_rest[rating_popular_rest['city'].str.contains(city_pattern)]

In [None]:
us_city_user_rating

In [None]:
# pivot() needs at most one row per (restaurant name, user) pair.
us_city_user_rating = us_city_user_rating.drop_duplicates(['user_id', 'name'])

# Restaurant x user matrix of ratings; 0 means "user has not rated this restaurant".
# stars_y is the star value from the review table (the user's own rating).
# stars_x is the business-level star value, identical for every user of a
# restaurant, so it carries no per-user signal.
restaurant_features = us_city_user_rating.pivot(index = 'name', columns = 'user_id', values = 'stars_y').fillna(0)

In [None]:
restaurant_features

In [None]:

restaurant_features_matrix = csr_matrix(restaurant_features.values)

In [None]:
restaurant_features_matrix

**FITTING THE BASELINE KNN MODEL**




In [None]:
knn_recomm = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
knn_recomm.fit(restaurant_features_matrix)

In [None]:
# Pick one restaurant (row of the feature matrix) at random as the query.
randomChoice = np.random.choice(restaurant_features.shape[0])

# Ask for the query itself plus up to 10 neighbours, never more than the
# number of rows in the feature matrix (kneighbors raises otherwise).
n_neighbors = min(11, restaurant_features.shape[0])
distances, indices = knn_recomm.kneighbors(
    restaurant_features.iloc[randomChoice].values.reshape(1, -1),
    n_neighbors=n_neighbors,
)

print('Recommendations for Restaurant {0} on priority basis:\n'.format(restaurant_features.index[randomChoice]))
# Position 0 is skipped as the query restaurant itself; the rest are printed
# in order of increasing cosine distance.
for rank, neighbour in enumerate(indices.flatten()[1:], start=1):
    print('{0}: {1}'.format(rank, restaurant_features.index[neighbour]))

In [None]:
rating_df = df2[['user_id','business_id','stars']].copy()

In [None]:
rating_df.head()

In [None]:
rating_df.shape

# **Recommendation using Collaborative Filtering with Matrix Factorization**

In [None]:
#splitting data
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation/test partition is the same on every run.
SPLIT_SEED = 42

# 80% train, then the remaining 20% is halved into test and validation.
X_train, X, y_train, y = train_test_split(
    rating_df.drop('stars', axis=1), rating_df.stars,
    train_size=.8, random_state=SPLIT_SEED,
)
X_test, X_val, y_test, y_val = train_test_split(
    X, y, train_size=.5, random_state=SPLIT_SEED,
)
del X, y  # the intermediate hold-out is no longer needed

print(f"Train Size: {round(X_train.shape[0]/rating_df.shape[0]*100)}%")
print("X train shape: ", X_train.shape)
print("y train shape: ", y_train.shape)

print(f"\nValidation Size: {round(X_val.shape[0]/rating_df.shape[0]*100)}%")
print("X val   shape: ", X_val.shape)
print("y val   shape: ", y_val.shape)

print(f"\nTest Size: {round(X_test.shape[0]/rating_df.shape[0]*100)}%")
print("X test  shape: ", X_test.shape)
print("y test  shape: ", y_test.shape)

**Average Baseline Accuracy**
The average model always predicts the mean of all training ratings.

In [None]:
# Baseline: predict the mean training rating for every sample and report the MSE.
from sklearn.metrics import mean_squared_error as mse
mean_rating = y_train.mean()

# Score each split against a constant prediction of the same length.
train_baseline, val_baseline, test_baseline = (
    mse(target, np.full(target.shape[0], mean_rating))
    for target in (y_train, y_val, y_test)
)

baseline_report = f"""Baseline MSE using mean rating:\n
          Train Data: {train_baseline:.4f},
          Val   Data: {val_baseline:.4f},
          Test  Data: {test_baseline:.4f}"""
print(baseline_report)