In [1]:
import pandas as pd

# Load the full Yelp business dataset (JSON Lines: one record per line)
business_path = './original_datasets/yelp_academic_dataset_business.json'
df = pd.read_json(business_path, lines=True)

# Preview the first five businesses
business_preview = df.head(5)
print(business_preview)

              business_id                      name   
0  Pns2l4eNsfO8kk83dixA6A  Abby Rappoport, LAC, CMQ  \
1  mpf3x-BjTdTEA3yCZrAYPw             The UPS Store   
2  tUFrWirKiKi_TAnsVWINQQ                    Target   
3  MTSW4McQd7CbVtyjqoe9mw        St Honore Pastries   
4  mWMc6_wTdE0EUBKIGXDVfA  Perkiomen Valley Brewery   

                           address           city state postal_code   
0           1616 Chapala St, Ste 2  Santa Barbara    CA       93101  \
1  87 Grasso Plaza Shopping Center         Affton    MO       63123   
2             5255 E Broadway Blvd         Tucson    AZ       85711   
3                      935 Race St   Philadelphia    PA       19107   
4                    101 Walnut St     Green Lane    PA       18054   

    latitude   longitude  stars  review_count  is_open   
0  34.426679 -119.711197    5.0             7        0  \
1  38.551126  -90.335695    3.0            15        1   
2  32.223236 -110.880452    3.5            22        0   
3  39.9555

In [2]:
# Number of businesses per city, most frequent city first
business_cities = df['city']
city_counts = business_cities.value_counts()

# Show the ten cities with the most businesses
top_cities = city_counts.head(10)
print(top_cities)

city
Philadelphia     14569
Tucson            9250
Tampa             9050
Indianapolis      7540
Nashville         6971
New Orleans       6209
Reno              5935
Edmonton          5054
Saint Louis       4827
Santa Barbara     3829
Name: count, dtype: int64


In [3]:
# Keep only the businesses whose city is exactly 'Reno' and save them as JSON Lines
is_reno = df['city'].eq('Reno')
reno_business = df[is_reno]
reno_business.to_json(
    './datasets/business_dataset_reno.json',
    orient='records',
    lines=True,
)

In [4]:
# Keep only the businesses whose city is exactly 'New Orleans' and save them as JSON Lines
is_new_orleans = df['city'].eq('New Orleans')
new_orleans_business = df[is_new_orleans]
new_orleans_business.to_json(
    './datasets/business_dataset_new_orleans.json',
    orient='records',
    lines=True,
)

In [5]:
# Keep only the businesses whose city is exactly 'Nashville' and save them as JSON Lines
is_nashville = df['city'].eq('Nashville')
nashville_business = df[is_nashville]
nashville_business.to_json(
    './datasets/business_dataset_nashville.json',
    orient='records',
    lines=True,
)

### User Filtering

In [6]:
def read_chunks(file, cols, chunk_size=500000, data_dir='original_datasets'):
    """Load selected columns of a JSON Lines file by reading it in chunks.

    Parameters
    ----------
    file : str
        File name without the ``.json`` extension, looked up inside ``data_dir``.
    cols : list of str
        Columns to keep from every chunk.
    chunk_size : int, default 500000
        Number of lines parsed per chunk.
    data_dir : str, default 'original_datasets'
        Directory that contains the file.

    Returns
    -------
    pandas.DataFrame
        All chunks stacked row-wise with a fresh 0..n-1 index, or an empty
        frame with the requested columns when the file holds no records.

    Raises
    ------
    KeyError
        If a chunk lacks any of the requested columns.
    """
    # The reader is used as a context manager so the file handle is released
    # even when selecting the columns from a chunk raises part-way through.
    with pd.read_json(
        path_or_buf=f'{data_dir}/{file}.json', chunksize=chunk_size, lines=True
    ) as reader:
        chunk_list = [chunk[cols] for chunk in reader]

    # pd.concat raises ValueError on an empty list, so a file without records
    # is handled explicitly.
    if not chunk_list:
        return pd.DataFrame(columns=cols)

    return pd.concat(chunk_list, ignore_index=True, join='outer', axis=0)

In [None]:
# Columns to keep from the user dataset
user_cols = [
    'user_id', 'name', 'review_count', 'yelping_since',
    'useful', 'funny', 'cool', 'elite', 'friends', 'fans', 'average_stars',
    'compliment_hot', 'compliment_more', 'compliment_profile',
    'compliment_cute', 'compliment_list', 'compliment_note',
    'compliment_plain', 'compliment_cool', 'compliment_funny',
    'compliment_writer', 'compliment_photos',
]
users = read_chunks('yelp_academic_dataset_user', user_cols)

# Preview the first five users
users_preview = users.head(5)
print(users_preview)

In [None]:
# Columns to keep from the review dataset
review_cols = [
    'review_id', 'user_id', 'business_id',
    'stars', 'useful', 'funny', 'cool',
    'text', 'date',
]
reviews = read_chunks('yelp_academic_dataset_review', review_cols)

# Preview the first five reviews
reviews_preview = reviews.head(5)
print(reviews_preview)

In [12]:
# Select the reviews whose business_id appears in the New Orleans business subset,
# then save them as JSON Lines
new_orleans_business_ids = new_orleans_business['business_id']
is_new_orleans_review = reviews['business_id'].isin(new_orleans_business_ids)
new_orleans_reviews = reviews[is_new_orleans_review]
new_orleans_reviews.to_json(
    './datasets/review_dataset_new_orleans.json',
    orient='records',
    lines=True,
)

In [None]:
# Select the users whose user_id appears among the New Orleans reviews,
# then save them as JSON Lines
new_orleans_reviewer_ids = new_orleans_reviews['user_id']
is_new_orleans_reviewer = users['user_id'].isin(new_orleans_reviewer_ids)
new_orleans_users = users[is_new_orleans_reviewer]
new_orleans_users.to_json(
    './datasets/user_dataset_new_orleans.json',
    orient='records',
    lines=True,
)