In [1]:
import pandas as pd
import json
from tqdm import tqdm
import numpy as np

In [2]:
# Importing review data
# A single path is used for both the line count and the parse loop so the
# progress-bar total always describes the file that is actually read.
review_path = "../data/review.json"
with open(review_path) as f:
    # Count lines lazily: the handle is closed afterwards and the file is
    # never materialised in memory just to obtain its length.
    line_count = sum(1 for _ in f)

user_ids, business_ids, stars, dates = [], [], [], []
with open(review_path) as f:
    # The file is parsed one JSON document per line.
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        user_ids.append(blob["user_id"])
        business_ids.append(blob["business_id"])
        stars.append(blob["stars"])
        dates.append(blob["date"])
ratings = pd.DataFrame(
    {"user_id": user_ids, "business_id": business_ids, "rating": stars, "date": dates}
)

# Users with at least 5 reviews in the whole review file.
user_counts = ratings["user_id"].value_counts()
active_users = user_counts.loc[user_counts >= 5].index.tolist()

# Businesses with at least 5 reviews in the whole review file.
business_counts = ratings["business_id"].value_counts()
popular_business = business_counts.loc[business_counts >= 5].index.tolist()



# Importing business data
# A single path is used for both the line count and the parse loop so the
# progress-bar total always describes the file that is actually read.
business_path = "../data/business.json"
with open(business_path) as f:
    # Count lines lazily; the handle is closed once the count is known.
    line_count = sum(1 for _ in f)

business_ids, name, city, avg_rating, review_cnt = [], [], [], [], []
categories, latitude, longitude, is_open, attributes = [], [], [], [], []

with open(business_path) as f:
    # The file is parsed one JSON document per line.
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        business_ids.append(blob["business_id"])
        name.append(blob["name"])
        city.append(blob["city"])
        avg_rating.append(blob["stars"])
        review_cnt.append(blob["review_count"])
        categories.append(blob["categories"])
        latitude.append(blob["latitude"])
        longitude.append(blob["longitude"])
        is_open.append(blob["is_open"])
        attributes.append(blob["attributes"])
business = pd.DataFrame(
    {
        "business_id": business_ids,
        "name": name,
        "city": city,
        "avg_rating": avg_rating,
        "review_cnt": review_cnt,
        "categories": categories,
        "latitude": latitude,
        "longitude": longitude,
        "is_open": is_open,
        "attributes": attributes,
    }
)


# Importing user data
# A single path is used for both the line count and the parse loop so the
# progress-bar total always describes the file that is actually read.
user_path = "../data/user.json"
with open(user_path) as f:
    # Count lines lazily; the handle is closed once the count is known.
    line_count = sum(1 for _ in f)

user_id, review_cnt, yelping_since = [], [], []
useful, funny, cool, fans = [], [], [], []
avg_user_rating, yrs_elite = [], []
compliment_hot, compliment_more, compliment_profile, compliment_cute = [], [], [], []
compliment_list, compliment_note, compliment_plain, compliment_cool = [], [], [], []
compliment_funny, compliment_writer, compliment_photos = [], [], []

with open(user_path) as f:
    # The file is parsed one JSON document per line.
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        user_id.append(blob["user_id"])
        review_cnt.append(blob["review_count"])
        yelping_since.append(blob["yelping_since"])
        useful.append(blob["useful"])
        funny.append(blob["funny"])
        cool.append(blob["cool"])
        fans.append(blob["fans"])
        avg_user_rating.append(blob["average_stars"])
        yrs_elite.append(blob["elite"])
        compliment_hot.append(blob["compliment_hot"])
        compliment_more.append(blob["compliment_more"])
        compliment_profile.append(blob["compliment_profile"])
        compliment_cute.append(blob["compliment_cute"])
        compliment_list.append(blob["compliment_list"])
        compliment_note.append(blob["compliment_note"])
        compliment_plain.append(blob["compliment_plain"])
        # `compliment_cool` was declared alongside the other compliment lists
        # but never filled, which left it out of the frame entirely.
        # Assumes every user record carries this key like its siblings; TODO confirm.
        compliment_cool.append(blob["compliment_cool"])
        compliment_funny.append(blob["compliment_funny"])
        compliment_writer.append(blob["compliment_writer"])
        compliment_photos.append(blob["compliment_photos"])

# NOTE: `review_cnt` is also a column name in the `business` frame, so a
# merge of the two produces suffixed columns.
user = pd.DataFrame(
    {
        "user_id": user_id,
        "review_cnt": review_cnt,
        "yelping_since": yelping_since,
        "useful": useful,
        "funny": funny,
        "cool": cool,
        "fans": fans,
        "avg_user_rating": avg_user_rating,
        "yrs_elite": yrs_elite,
        "compliment_hot": compliment_hot,
        "compliment_more": compliment_more,
        "compliment_profile": compliment_profile,
        "compliment_cute": compliment_cute,
        "compliment_list": compliment_list,
        "compliment_note": compliment_note,
        "compliment_plain": compliment_plain,
        "compliment_funny": compliment_funny,
        "compliment_writer": compliment_writer,
        "compliment_photos": compliment_photos,
        # Appended last so the position of the pre-existing columns is unchanged.
        "compliment_cool": compliment_cool,
    }
)


# Calculating the number of years each user was elite.
# `yrs_elite` is split on commas; an empty string splits to [''] and counts as
# zero years. Plain Python ints are stored (not 0-d numpy arrays), and values
# that did not split into a list (e.g. missing data) count as zero instead of
# raising.
user['years_elite'] = user['yrs_elite'].str.split(',').apply(
    lambda years: len(years) if isinstance(years, list) and years != [''] else 0
)


# Merging ratings and business data
data_1 = ratings.merge(business, how='left', on='business_id')

# Merging ratings, business and user data.
# Both `business` and `user` carry a `review_cnt` column, so pandas applies its
# default suffixes here: `review_cnt_x` (business) and `review_cnt_y` (user).
data_1 = data_1.merge(user, how='left', on='user_id')

# Filtering data using date: keep reviews strictly after 2017-01-01
data_1['date'] = pd.to_datetime(data_1['date'])
data_1 = data_1[data_1.date > '2017-01-01']

# Filtering on city and Restaurants (rows without categories are dropped via na=False)
is_restaurant = data_1.categories.str.contains('Restaurants', na=False)
in_las_vegas = data_1.city == "Las Vegas"
data_2 = data_1[is_restaurant & in_las_vegas]

# Keep only users with at least 5 reviews inside the filtered subset
user_counts = data_2['user_id'].value_counts()
active_users = user_counts.loc[user_counts >= 5].index.tolist()

data_2 = data_2[(data_2.user_id.isin(active_users))]

# Converting the categories to columns (one-hot encoded)
from sklearn.preprocessing import MultiLabelBinarizer
categorylist = data_2['categories'].tolist()
categories = [st.split(', ') for st in categorylist]
mlb = MultiLabelBinarizer()
categoryarray = mlb.fit_transform(categories)
names = mlb.classes_
cat_df = pd.DataFrame(categoryarray, columns=names)
# reset_index() gives data_2 a fresh 0..n-1 index so it lines up row-for-row
# with cat_df; the previous row labels are kept in an `index` column.
merged_data = pd.concat([data_2.reset_index(), cat_df], axis=1)

100%|██████████| 6685900/6685900 [01:03<00:00, 104541.68it/s]
100%|██████████| 192609/192609 [00:03<00:00, 51348.27it/s]
100%|██████████| 1637138/1637138 [00:29<00:00, 56058.88it/s]


In [3]:
from ast import literal_eval

In [4]:
def function_dict_to_list(str1,str2,str3,merged_data):
    """Extract one entry of a nested attribute as a list of integers.

    For each row, the value in column ``str1`` is indexed with ``str2``. If
    that entry is a string it is parsed with ``literal_eval`` (e.g.
    ``"{'garage': False}"``); the result is then indexed with ``str3`` and
    converted with ``int``.

    Parameters
    ----------
    str1 : column of ``merged_data`` holding the per-row attribute mapping.
    str2 : key of the nested attribute inside that mapping.
    str3 : key to read from the nested attribute.
    merged_data : pandas DataFrame containing column ``str1``.

    Returns
    -------
    list of int
        One value per row, in row order. Rows where the mapping, either key,
        or a usable value is missing yield 0.
    """
    flags = []
    # Iterate the column once instead of doing a chained lookup per row label.
    for attrs in merged_data[str1]:
        try:
            nested = attrs[str2]
            if isinstance(nested, str):
                nested = literal_eval(nested)
            flags.append(int(nested[str3]))
        except (KeyError, IndexError, TypeError, ValueError, SyntaxError):
            # Data-shape problems only: missing mapping or key, unparseable
            # text, or a None value. Anything else (a NameError from a missing
            # import, KeyboardInterrupt, ...) propagates instead of silently
            # filling the column with zeros.
            flags.append(0)
    return flags

In [5]:
# One integer column per BusinessParking entry, read from the `attributes` column.
for parking_kind in ('garage', 'street', 'lot', 'valet', 'validated'):
    merged_data[parking_kind] = function_dict_to_list(
        'attributes', 'BusinessParking', parking_kind, merged_data
    )

In [6]:
def function_single_values(str1,str2,merged_data):
    """Extract one top-level attribute per row as a list of integers.

    For each row, the value in column ``str1`` is indexed with ``str2`` and
    converted to an integer. The strings ``"True"`` and ``"False"`` map to 1
    and 0: ``int("True")`` raises ``ValueError``, so without this mapping every
    textual boolean would collapse to 0. (Presumably the attributes arrive
    string-encoded, as the nested ones parsed elsewhere in this notebook do;
    verify against the data.)

    Parameters
    ----------
    str1 : column of ``merged_data`` holding the per-row attribute mapping.
    str2 : key of the attribute to read from that mapping.
    merged_data : pandas DataFrame containing column ``str1``.

    Returns
    -------
    list of int
        One value per row, in row order. Rows where the mapping or key is
        missing, or the value is not integer-like, yield 0.
    """
    bool_words = {"True": 1, "False": 0}
    values = []
    # Iterate the column once instead of doing a chained lookup per row label.
    for attrs in merged_data[str1]:
        try:
            raw = attrs[str2]
            if isinstance(raw, str) and raw.strip() in bool_words:
                values.append(bool_words[raw.strip()])
            else:
                values.append(int(raw))
        except (KeyError, IndexError, TypeError, ValueError):
            # Data-shape problems only (missing mapping/key, None, free text);
            # unrelated errors are allowed to surface.
            values.append(0)
    return values

In [7]:
# One integer column per top-level attribute, read from the `attributes` column.
# Each attribute is listed exactly once; 'BusinessAcceptsCreditCards' used to be
# computed twice with identical arguments, which only repeated the full row scan.
for attribute_name in (
    'OutdoorSeating',
    'RestaurantsTakeOut',
    'HasTV',
    'GoodForKids',
    'RestaurantsReservations',
    'BusinessAcceptsCreditCards',
    'RestaurantsPriceRange2',
):
    merged_data[attribute_name] = function_single_values(
        'attributes', attribute_name, merged_data
    )

In [8]:
# One integer column per GoodForMeal entry, read from the `attributes` column.
for meal_kind in ('dessert', 'lunch', 'brunch', 'breakfast', 'latenight'):
    merged_data[meal_kind] = function_dict_to_list(
        'attributes', 'GoodForMeal', meal_kind, merged_data
    )

In [9]:
# One integer column per Ambience entry, read from the `attributes` column.
for ambience_kind in ('romantic', 'upscale', 'intimate', 'hipster', 'casual'):
    merged_data[ambience_kind] = function_dict_to_list(
        'attributes', 'Ambience', ambience_kind, merged_data
    )

In [10]:
# Persist the current merged frame (row index included, pandas' default) as CSV.
merged_data.to_csv(path_or_buf='../data/merged_data_small.csv')

In [11]:
# Calculating the number of years each user was elite.
# `yrs_elite` is split on commas; an empty string splits to [''] and counts as
# zero years. Plain Python ints are stored (not 0-d numpy arrays), and values
# that did not split into a list (e.g. missing data) count as zero instead of
# raising.
user['years_elite'] = user['yrs_elite'].str.split(',').apply(
    lambda years: len(years) if isinstance(years, list) and years != [''] else 0
)


# Merging ratings and business data
data_1 = ratings.merge(business, how='left', on='business_id')

# Merging ratings, business and user data.
# Both `business` and `user` carry a `review_cnt` column, so pandas applies its
# default suffixes here: `review_cnt_x` (business) and `review_cnt_y` (user).
data_1 = data_1.merge(user, how='left', on='user_id')

# Filtering data using date: keep reviews strictly after 2014-01-01
data_1['date'] = pd.to_datetime(data_1['date'])
data_1 = data_1[data_1.date > '2014-01-01']

# Filtering on city and Restaurants (rows without categories are dropped via na=False)
is_restaurant = data_1.categories.str.contains('Restaurants', na=False)
in_las_vegas = data_1.city == "Las Vegas"
data_2 = data_1[is_restaurant & in_las_vegas]

# Keep only users with at least 5 reviews inside the filtered subset
user_counts = data_2['user_id'].value_counts()
active_users = user_counts.loc[user_counts >= 5].index.tolist()

data_2 = data_2[(data_2.user_id.isin(active_users))]

# Converting the categories to columns (one-hot encoded)
from sklearn.preprocessing import MultiLabelBinarizer
categorylist = data_2['categories'].tolist()
categories = [st.split(', ') for st in categorylist]
mlb = MultiLabelBinarizer()
categoryarray = mlb.fit_transform(categories)
names = mlb.classes_
cat_df = pd.DataFrame(categoryarray, columns=names)
# reset_index() gives data_2 a fresh 0..n-1 index so it lines up row-for-row
# with cat_df; the previous row labels are kept in an `index` column.
merged_data = pd.concat([data_2.reset_index(), cat_df], axis=1)

In [12]:
def function_dict_to_list(str1,str2,str3,merged_data):
    """Extract one entry of a nested attribute as a list of integers.

    For each row, the value in column ``str1`` is indexed with ``str2``. If
    that entry is a string it is parsed with ``literal_eval`` (e.g.
    ``"{'garage': False}"``); the result is then indexed with ``str3`` and
    converted with ``int``.

    Parameters
    ----------
    str1 : column of ``merged_data`` holding the per-row attribute mapping.
    str2 : key of the nested attribute inside that mapping.
    str3 : key to read from the nested attribute.
    merged_data : pandas DataFrame containing column ``str1``.

    Returns
    -------
    list of int
        One value per row, in row order. Rows where the mapping, either key,
        or a usable value is missing yield 0.
    """
    flags = []
    # Iterate the column once instead of doing a chained lookup per row label.
    for attrs in merged_data[str1]:
        try:
            nested = attrs[str2]
            if isinstance(nested, str):
                nested = literal_eval(nested)
            flags.append(int(nested[str3]))
        except (KeyError, IndexError, TypeError, ValueError, SyntaxError):
            # Data-shape problems only: missing mapping or key, unparseable
            # text, or a None value. Anything else (a NameError from a missing
            # import, KeyboardInterrupt, ...) propagates instead of silently
            # filling the column with zeros.
            flags.append(0)
    return flags

# One integer column per BusinessParking entry, read from the `attributes` column.
for parking_kind in ('garage', 'street', 'lot', 'valet', 'validated'):
    merged_data[parking_kind] = function_dict_to_list(
        'attributes', 'BusinessParking', parking_kind, merged_data
    )

def function_single_values(str1,str2,merged_data):
    """Extract one top-level attribute per row as a list of integers.

    For each row, the value in column ``str1`` is indexed with ``str2`` and
    converted to an integer. The strings ``"True"`` and ``"False"`` map to 1
    and 0: ``int("True")`` raises ``ValueError``, so without this mapping every
    textual boolean would collapse to 0. (Presumably the attributes arrive
    string-encoded, as the nested ones parsed elsewhere in this notebook do;
    verify against the data.)

    Parameters
    ----------
    str1 : column of ``merged_data`` holding the per-row attribute mapping.
    str2 : key of the attribute to read from that mapping.
    merged_data : pandas DataFrame containing column ``str1``.

    Returns
    -------
    list of int
        One value per row, in row order. Rows where the mapping or key is
        missing, or the value is not integer-like, yield 0.
    """
    bool_words = {"True": 1, "False": 0}
    values = []
    # Iterate the column once instead of doing a chained lookup per row label.
    for attrs in merged_data[str1]:
        try:
            raw = attrs[str2]
            if isinstance(raw, str) and raw.strip() in bool_words:
                values.append(bool_words[raw.strip()])
            else:
                values.append(int(raw))
        except (KeyError, IndexError, TypeError, ValueError):
            # Data-shape problems only (missing mapping/key, None, free text);
            # unrelated errors are allowed to surface.
            values.append(0)
    return values

# One integer column per top-level attribute, read from the `attributes` column.
# Each attribute is listed exactly once; 'BusinessAcceptsCreditCards' used to be
# computed twice with identical arguments, which only repeated the full row scan.
for attribute_name in (
    'OutdoorSeating',
    'RestaurantsTakeOut',
    'HasTV',
    'GoodForKids',
    'RestaurantsReservations',
    'BusinessAcceptsCreditCards',
    'RestaurantsPriceRange2',
):
    merged_data[attribute_name] = function_single_values(
        'attributes', attribute_name, merged_data
    )

# One integer column per GoodForMeal entry, read from the `attributes` column.
for meal_kind in ('dessert', 'lunch', 'brunch', 'breakfast', 'latenight'):
    merged_data[meal_kind] = function_dict_to_list(
        'attributes', 'GoodForMeal', meal_kind, merged_data
    )

# One integer column per Ambience entry, read from the `attributes` column.
for ambience_kind in ('romantic', 'upscale', 'intimate', 'hipster', 'casual'):
    merged_data[ambience_kind] = function_dict_to_list(
        'attributes', 'Ambience', ambience_kind, merged_data
    )

In [13]:
# Persist the current merged frame (row index included, pandas' default) as CSV.
merged_data.to_csv(path_or_buf='../data/merged_data.csv')