## **Imports**


In [306]:
import json
import pprint
import requests
import numpy as np
import pandas as pd

## **Overriding Defaults**


In [307]:
# Cap rendered DataFrames at 20 rows so large frames do not flood the output.
pd.options.display.max_rows = 20

# **Business Dataset**


## **Loading Dataset**


In [308]:
# Each line of the dataset file is parsed as its own JSON document.
# The context manager closes the file handle as soon as parsing finishes
# (the previous bare open() left it open for the life of the kernel).
with open("./data/yelp_academic_dataset_business.json", encoding="utf-8") as data_file:
    data = [json.loads(line) for line in data_file]

# One DataFrame row per parsed record.
businessDf = pd.DataFrame(data)

In [310]:
# Show which columns the loaded business records expose.
businessDf.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours'],
      dtype='object')

## **Transforming Dataset**


In [313]:
# Columns kept for the restaurant pipeline.
selected_columns = ['business_id', 'name', 'city', 'address', 'longitude', 'latitude', 'stars', 'review_count', 'categories']

# Only 'stars' needs renaming. The previous mapping also listed
# 'count' -> 'review_count', but no 'count' column is selected, so it was a
# silent no-op; 'review_count' already carries its final name.
newDf = businessDf[selected_columns].rename(columns={'stars': 'rating'})

In [314]:
# Case-insensitive substring match on the categories text; rows with a
# missing categories value count as non-matches (na=False).
is_restaurant = newDf['categories'].str.contains('Restaurants', case=False, na=False)
restaurants_df = newDf[is_restaurant]
print("Restaurants found:", len(restaurants_df))

Restaurants found: 52268


In [316]:
# Row-wise flags: any null cell, or any cell equal to the empty string.
missing_values = restaurants_df.isna().any(axis=1)
empty_strings = restaurants_df.eq('').any(axis=1)

# Display every row that trips either flag.
restaurants_df.loc[missing_values | empty_strings]

Unnamed: 0,business_id,name,city,address,longitude,latitude,rating,review_count,categories
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,Tampa Bay,,-82.456320,27.955269,4.0,10,"Vietnamese, Food, Restaurants, Food Trucks"
93,RK6-cJ9hj53RzOlCBmpT-g,Impasto,Tampa,,-82.502346,27.890814,5.0,5,"Restaurants, Italian, Food Trucks, Food"
320,FF45pKN_lzqG8Bqk-_HQvw,Go! Gyro! Go!,Saint Louis,,-90.266699,38.584487,4.5,49,"Street Vendors, Food, Food Trucks, Greek, Food..."
435,bPPy7yNFfb-6N_bHkn1qAw,Taste of the Caribbean,Indianapolis,,-86.086060,39.773392,3.0,17,"Restaurants, Caribbean, Food Trucks, Food"
553,adATTqggIQX5xxLDISkFTw,Just Churros,Tucson,,-110.992075,32.271231,5.0,25,"Food Trucks, Restaurants, Caterers, Event Plan..."
...,...,...,...,...,...,...,...,...,...
148069,OUjISvf-zceQAg2uvuPLWw,Ciao Down,Tucson,,-110.974715,32.222601,4.5,12,"Food Trucks, Pizza, Italian, Restaurants, Food"
148323,7uMwPg2lmF_KscKo9lFCwQ,American Express century and lounge,Philadelphia,,-75.245329,39.876141,5.0,5,"Nightlife, Food Court, Restaurants, Airport Lo..."
148627,RzKHpILaR1nAKDHeKxVY8A,Wanderlust Pizza,Saint Louis,,-90.225220,38.649590,4.0,19,"Restaurants, Street Vendors, Food, Food Trucks..."
149513,eTg443zUsuwc1u63RKtXWg,Gateway Dog House,Saint Louis,,-90.199404,38.627003,4.0,7,"Barbeque, Food, Street Vendors, Hot Dogs, Rest..."


In [317]:
# Turn empty strings into NaN so a single dropna() removes both kinds of
# incomplete rows.
restaurants_df = (
    restaurants_df
    .replace('', np.nan)
    .dropna()
)

#### **Keep only the categories that are relevant to Restaurants or Food**


In [321]:
# Provided list of categories
# Allow-list of category labels: the later `isin(categories)` filter keeps only
# exploded rows whose category appears here. Entries carrying a parenthesised
# qualifier (e.g. "American (Traditional)") are matched verbatim at that point;
# the qualifier is stripped afterwards by `remove_parentheses`.
categories = [
    "Sandwiches", "American (Traditional)", "Pizza", 
    "Fast Food", "Breakfast & Brunch", "American (New)", "Burgers", "Mexican", 
    "Italian", "Coffee & Tea", "Seafood", "Chinese", "Salad", "Chicken Wings", 
    "Cafes", "Delis", "Bakeries", "Desserts", "Japanese", "Sushi Bars", "Barbeque",
    "Asian Fusion", "Steakhouses", "Diners", "Cocktail Bars", "Mediterranean",
    "Vegetarian", "Ice Cream & Frozen Yogurt", "Soup", "Tacos", "Juice Bars & Smoothies",
    "Southern", "Thai", "Cajun/Creole", "Tex-Mex", "Vegan", "Vietnamese", "Indian",
    "Latin American", "Chicken Shop", "Greek", "Hot Dogs", "Cheesesteaks",
    "Bagels", "Caribbean", "Middle Eastern", "Soul Food", "Ethnic Food",
    "French", "Korean", "Donuts", "Noodles", "Halal", "Wraps", "Spanish",
    "Cuban", "Bubble Tea", "Canadian (New)", "Pakistani", "Ramen", "Irish",
    "Fish & Chips", "Waffles", "Poke", "Hawaiian", "Acai Bowls", "Dim Sum",
    "Modern European", "German", "Fruits & Veggies", "African",
    "Szechuan", "New Mexican Cuisine", "Filipino", "Falafel", "Pretzels",
    "Puerto Rican", "Cupcakes", "Cantonese", "Gelato", "Kebab", "Turkish",
    "Lebanese", "Peruvian", "Taiwanese", "Brazilian", "Donairs", "Hot Pot",
    "British", "Kosher", "Pan Asian", "Colombian", "Ethiopian", "Salvadoran",
    "Patisserie/Cake Shop", "Empanadas", "Moroccan", "Venezuelan", "Laotian",
    "Afghan", "Dominican", "Polish", "Russian", "Persian/Iranian", "Basque",
    "Teppanyaki", "Mongolian", "Arabic", "Argentine", "Portuguese", "Malaysian",
    "Fondue", "Poutineries", "Honduran", "Belgian", "Indonesian", "Himalayan/Nepalese",
    "Haitian", "Burmese", "Macarons", "Ukrainian", "Cambodian", "Trinidadian",
    "Shanghainese", "Egyptian", "Armenian", "Pancakes", "Bangladeshi", "Australian",
    "Scandinavian", "Iberian", "Syrian", "Singaporean", "Uzbek", "Tuscan",
    "South African", "Czech", "Hungarian", "Senegalese", "Nicaraguan", "Austrian",
    "Scottish", "Sardinian", "Georgian", "Sri Lankan"
]

# Print the number of allow-listed categories as a quick sanity check.
print("Categories Required:",len(categories)) 

Categories Required: 145


In [None]:
# Split the comma-separated categories text into a list, then explode it so
# each (business, category) pair sits on its own row.
explodedDf = restaurants_df.assign(
    categories=restaurants_df['categories'].str.split(', ')
).explode('categories')

# Keep only allow-listed categories. The explicit .copy() makes filteredDf an
# independent frame, so the later in-place `.loc[:, 'categories'] = ...`
# assignment does not raise SettingWithCopyWarning.
filteredDf = explodedDf[explodedDf['categories'].isin(categories)].copy()
filteredDf.head(5)

In [323]:
def remove_parentheses(text):
    """Return ``text`` truncated just before its first ``' ('`` sequence.

    Strings without any ``'('`` are returned unchanged. A string that
    contains ``'('`` but never the two-character sequence space + ``'('`` is
    also returned unchanged, because the split finds nothing to cut on.
    """
    if '(' not in text:
        return text
    prefix = text.split(' (', 1)[0]
    return prefix

# Strip the parenthesised qualifier from every category label in place.
stripped_categories = filteredDf['categories'].map(remove_parentheses)
filteredDf.loc[:, 'categories'] = stripped_categories

#### **Dropping Duplicates**


In [324]:
# Rows whose (business_id, categories) pair already appeared earlier in the frame.
is_repeat = filteredDf.duplicated(subset=['business_id', 'categories'])
duplicate_rows = filteredDf[is_repeat]
duplicate_rows.shape

(1886, 9)

In [325]:
# Keep the first occurrence of each (business_id, categories) pair
# (drop_duplicates keeps the first occurrence by default).
filteredDf = filteredDf.drop_duplicates(subset=['business_id', 'categories'])

In [326]:
# Every column except the grouping key and the category label is taken from
# the first row of each business.
columns_to_aggregate = filteredDf.columns.difference(['categories', 'business_id'])
agg_functions = dict.fromkeys(columns_to_aggregate, 'first')

# Category labels of one business are joined back into a single
# comma-separated string.
agg_functions['categories'] = lambda labels: ', '.join(labels)

clean_restaurants_df = (
    filteredDf
    .groupby('business_id')
    .agg(agg_functions)
    .reset_index()
)

print("Restaurants after filter:", len(clean_restaurants_df))

clean_restaurants_df.head(5)

Restaurants after filter: 50843


Unnamed: 0,business_id,address,city,latitude,longitude,name,rating,review_count,categories
0,---kPU91CF4Lq2-WlRu9Lw,4903 State Rd 54,New Port Richey,28.217288,-82.733344,Frankie's Raw Bar,4.5,24,"Seafood, Latin American"
1,--0iUa4sNDFiZFrAdIWhZQ,6 S White Horse Pike,Clementon,39.81785,-74.993364,Pupuseria Y Restaurant Melba,3.0,14,"Mexican, Ethnic Food"
2,--7PUidqRWpRSpXebiyxTg,9910 108A Avenue,Edmonton,53.554659,-113.49304,Humpty's Family Restaurant,2.0,12,Breakfast & Brunch
3,--8IbOsAAxjKRoYsBFL-PA,4706 Paris Ave,Gentilly,30.006341,-90.074523,The Original Italian Pie,3.0,27,Italian
4,--ZVrH2X2QXBFdCilbirsw,1531 W Wynnewood Rd,Ardmore,39.997299,-75.292207,Chris's Sandwich Shop,4.5,32,"American, Pizza, Sandwiches, Wraps, Delis, Salad"


In [345]:
# Look only at the first 150 rows, the same slice the insert loop uses.
first_rows = clean_restaurants_df.iloc[:150]
names = first_rows['name']

# keep=False flags every member of a repeated name, not just the later ones.
duplicate_names = first_rows[names.duplicated(keep=False)].sort_values(by='name')

print("Duplicate rows within the first 150 rows:")
duplicate_names

Duplicate rows within the first 150 rows:


Unnamed: 0,business_id,address,city,latitude,longitude,name,rating,review_count,categories
106,-7AGjO0qnOld_GZK9dNNkQ,627 Cross Keys Rd,Sicklerville,39.735013,-75.0074,Buffalo Wild Wings,2.0,122,"American, Chicken Wings"
136,-9Kxik9BRNkft5rPcVvSsw,1424 Central Park Cir,O'Fallon,38.577906,-89.933815,Buffalo Wild Wings,2.0,115,"American, Chicken Wings"
62,-3725FZiIIYdwQtM4MKEIA,991 Baltimore Pike,Glen Mills,39.883915,-75.536518,Domino's Pizza,1.5,20,"Pizza, Sandwiches, Chicken Wings"
134,-99CauTgdResVER5NvRTfw,7940 Michigan Rd,Indianapolis,39.897838,-86.217208,Domino's Pizza,1.5,32,"Pizza, Chicken Wings, Sandwiches"
37,-1owBLC2h6DF5n_j77oq3g,101 Lineberry Blvd,Mount Juliet,36.231426,-86.510603,Hardee's,1.5,7,"Breakfast & Brunch, Burgers, Chicken Wings, Fa..."
85,-4wykApCGL78JzWYICOQzw,10909 Saint Charles Rock Rd,Bridgeton,38.734927,-90.39352,Hardee's,2.0,9,"Fast Food, Burgers"
137,-9QWlFM3KH0DBS1wiHZJXA,110 E Hwy 50,O Fallon,38.585132,-89.909162,Hardee's,2.5,13,"American, Fast Food, Burgers"
112,-7Rx5jVeQmlVoAU_oXrzew,24 W Marlton Pike,Cherry Hill,39.914764,-75.012779,McDonald's,1.0,11,"Burgers, Fast Food, Coffee & Tea"
144,-9yzQQ0d_rcOD2CzdTNO_Q,4240 Louisa St,New Orleans,30.004244,-90.036407,McDonald's,2.0,20,"Fast Food, Coffee & Tea, Burgers"
44,-2CPhK6ik9ZBgFX_F-dkxQ,"14961 N Florida Ave, Ste 14961",Tampa,28.087029,-82.45831,Subway,2.0,5,"Sandwiches, Fast Food"


## **Workers**


### **Generate User**

In [346]:
# users_dict:      user email      -> user details (including the auth token)
# restuarant_dict: restaurant name -> email of the owning user
users_dict, restuarant_dict = {}, {}

In [347]:

# fetch random user
def fetch_random_user():
    """Fetch one generated profile from randomuser.me and shape it as a user.

    Returns:
        dict: ``firstName``, ``lastName`` and ``email`` taken from the first
        entry of the API's ``results`` list, plus a fixed ``password`` and
        the ``"Manager"`` role.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.RequestException: on network failure or timeout.
    """
    # A timeout prevents an unresponsive API from hanging the notebook forever.
    response = requests.get('https://randomuser.me/api/', timeout=30)
    # Fail loudly on a 4xx/5xx instead of hitting a confusing KeyError or
    # JSON decode error while parsing an error page.
    response.raise_for_status()

    data = response.json()['results'][0]
    user = {
        "firstName": data['name']['first'],
        "lastName": data['name']['last'],
        "email": data['email'],
        # NOTE(review): shared hardcoded password for every generated user —
        # acceptable only for throwaway seed data; confirm before reuse.
        "password": "Mujtaba@123",
        "role": "Manager",
    }
    return user


# insert user in the DineEase database
def insert_user(user):
    """Register ``user`` with the DineEase API, then log in to get a token.

    Args:
        user (dict): registration payload, as built by ``fetch_random_user``.

    Returns:
        dict | None: the same ``user`` dict with ``id`` and ``token`` added,
        or ``None`` if registration or the follow-up login fails (the status
        code is printed in that case).
    """
    response = requests.post('http://dine-ease.dev/api/auth/register', json=user, timeout=30)

    if response.status_code != 201:
        print(f"Error creating user. Status code: {response.status_code}")
        return None

    # NOTE(review): registration goes to /api/auth/register but login goes to
    # /api/login — confirm the login route is correct.
    login_response = requests.post('http://dine-ease.dev/api/login', json=user, timeout=30)

    # Previously the login status was never checked, so a failed login
    # surfaced as a KeyError or JSON decode error further down.
    if not login_response.ok:
        print(f"Error logging in user. Status code: {login_response.status_code}")
        return None

    login_data = login_response.json()

    user['id'] = login_data['details']['id']
    user['token'] = login_data['token']

    return user


# append user in dictionaries
def create_user(restaurant_name):
    """Create an owner account for ``restaurant_name`` and record it.

    Fetches random users until one with an unseen email turns up, registers
    it, then stores it in ``users_dict`` (keyed by email) and maps the
    restaurant name to that email in ``restuarant_dict``.

    Args:
        restaurant_name (str): key under which the owner's email is stored.

    Raises:
        RuntimeError: if the user could not be registered. Previously a
            failed registration stored ``None`` in ``users_dict``, which only
            surfaced later as a ``TypeError`` when the token was read.
    """
    while True:
        user = fetch_random_user()
        email = user['email']

        if email in users_dict:
            print(email, ": is duplicated")
            continue

        new_user = insert_user(user)
        if new_user is None:
            raise RuntimeError(
                f"Could not register an owner for restaurant {restaurant_name!r}"
            )

        users_dict[email] = new_user
        restuarant_dict[restaurant_name] = email
        break

In [348]:
# Pretty-print both registries to inspect their current contents.
for registry in (users_dict, restuarant_dict):
    pprint.pprint(registry)

{}
{}


### **Insert Restaurant**

In [349]:
# Only the first 150 restaurants are inserted here; to insert the whole
# dataset iterate over clean_restaurants_df.iterrows() instead.
for index, row in clean_restaurants_df.iloc[:150].iterrows():

    payload = {
        'name': row['name'],
        # The row index, zero-padded to 13 characters, is sent as the tax id.
        'taxId': str(index).zfill(13),
        # NOTE(review): row['categories'] is one comma-joined string, so this
        # sends a single-element list — confirm the API does not expect one
        # list entry per category.
        'categories': [row['categories']],
        'address': row['address'],
        'location': {
            'coordinates': [row['longitude'], row['latitude']],
            # NOTE(review): the city is sent under the 'country' key — confirm.
            'country': row['city'],
        },
    }

    # Restaurants sharing a name share one owner: create a user only the
    # first time a name is seen.
    if not restuarant_dict.get(row['name']):
        create_user(row['name'])

    user = users_dict.get(restuarant_dict.get(row['name']))

    # Guard against a missing owner (e.g. registration failed); without this
    # the token lookup below raises an opaque TypeError on None.
    if not user:
        print(f"No registered owner for row {index + 1}; stopping.")
        break

    headers = { 'Authorization': 'Bearer ' + user['token'], 'Content-Type': 'application/json'}
    # The timeout keeps one stalled request from hanging the whole run.
    response = requests.post('http://dine-ease.dev/api/restaurant', json=payload, headers=headers, timeout=30)

    if response.status_code == 201:
        data = response.json()

        # Record the identifiers returned by the API next to the source row.
        clean_restaurants_df.at[index, 'slug'] =  data['slug']
        clean_restaurants_df.at[index, 'id'] = data['id']
        clean_restaurants_df.at[index, 'userId'] = user['id']
    else:
        print(f"Request failed for row {index + 1}. Status code: {response.status_code}")
        break

In [351]:
# Write the restaurants frame to CSV without the positional index column.
clean_restaurants_df.to_csv(
    './data/restaurants.csv',
    index=False,
)