In [1]:
from collections import defaultdict

import json
import pandas as pd

In [17]:
def read_yelp_dataset(file_path):
    mo_business = []
    for chunk in pd.read_json(file_path, lines=True, chunksize=10000):
        filtered_chunk = chunk[chunk['state'].isin(['mo', 'MO', 'Mo', 'mO'])]
        chunk_json = filtered_chunk.to_dict(orient="records")
        mo_business.extend(chunk_json)
    return mo_business

def write_json(file_name, ext, contents):
    try:
        with open(file_name + "."+ ext, 'w') as f:
            json.dump(contents, f, indent=4)
        print(f"Successfully written to {file_name}.{ext}")
    except Exception as e:
        print("Error: ", e)
        raise
        
def write_csv(df, csv_name):
    df.to_csv(csv_name + '.csv',index=False)

# Sampling for Montreal

In [18]:
file_path = "../data/raw/business.json" 
mo_business = read_yelp_dataset(file_path)

print(f"Successfully loaded {len(mo_business)} entries from the Business dataset.")

if mo_business:
    print("\nFirst entry in the dataset:")
    print(json.dumps(mo_business[0], indent=2))

Successfully loaded 10913 entries from the Business dataset.

First entry in the dataset:
{
  "business_id": "mpf3x-BjTdTEA3yCZrAYPw",
  "name": "The UPS Store",
  "address": "87 Grasso Plaza Shopping Center",
  "city": "Affton",
  "state": "MO",
  "postal_code": "63123",
  "latitude": 38.551126,
  "longitude": -90.335695,
  "stars": 3.0,
  "review_count": 15,
  "is_open": 1,
  "attributes": {
    "BusinessAcceptsCreditCards": "True"
  },
  "categories": "Shipping Centers, Local Services, Notaries, Mailbox Centers, Printing Services",
  "hours": {
    "Monday": "0:0-0:0",
    "Tuesday": "8:0-18:30",
    "Wednesday": "8:0-18:30",
    "Thursday": "8:0-18:30",
    "Friday": "8:0-18:30",
    "Saturday": "8:0-14:0"
  }
}


In [6]:
write_json("mo_business", "json", mo_business)

Successfully written to mo_busi.json


In [19]:
mo_business_ids = set()

for row in mo_business:        
    if row['business_id']:
        mo_business_ids.add(row['business_id'])

In [20]:
len(mo_business_ids)

10913

In [22]:
mo_business_ids

{'_Xe-dBjtCD3rmC7tbk08dQ',
 'oAsQSIYjp56Z6woPunfLxw',
 'UU503tMXdZR4oDZXuoKuGA',
 'EvgRJ-09uavss9YNgnAq7A',
 '6u51kembCosi-fLsPwUbCA',
 'u5_9rsVeGl7imoYB6I2ahw',
 'uasMzMp00F9azoVpqq-xOw',
 'VxY4CW0ijTBTlDvNhGUffw',
 'S2t65PFkp4tUM36jUH0pZA',
 'BOjpe0u5u_cm9SWMS2KLFg',
 'mUSuT-LtuGUjqmijPIozGg',
 'hRL0ckObJEJOJ2irlOAXDQ',
 'IslZvM5WfAw-iWBvaXWbNA',
 'uTAAG6zfdoApH7-1ryCxqQ',
 'KzCRmb7dpEk7FUTHyynanQ',
 'HGR-bEHTZak4bCVmIR-aKQ',
 'WY0ji9m-qAOc_6lK8h4pNw',
 'XTc_ZwG9zt7L_dmwZahGOQ',
 'HFLQ1jmfyfICzk95skEDDw',
 '068DMaEKghlGsSzJPuLucA',
 'AAUZbyoJDezAh53xQP3Tbg',
 'f_N7S_M86uHnfgBL0EPU3Q',
 '-NF_wBObzf5IdLzWyea7XA',
 'TNtcjnta11CpDebuBNdoug',
 'm6Q6z3SAkXVdidNqiKmBrQ',
 'MDe5ehdhf6YkEw1sLmeWDw',
 'kDJ7OtOx4VLUoN4v1m8ikA',
 'w_PuO0-yKhdLTMkNesX1dQ',
 'ShG57pFFwmHGpNK42NmUUg',
 'VKyfcuU-qa8IPhKQJvoNyA',
 '9fajAXTAT1A_pcGakz64hA',
 '_geBSCsu-QEoO64rpEZYiA',
 'yQrXytT_zKbex3-CmDz6vA',
 '_0ki9SYy99XWYWT2quHPNA',
 '4LPR7oVxoT0gWi8_c2zDgA',
 'vqupAQF3TJCO25nC7f2E6Q',
 'wt-nZcKEihRl6liR0P1ALw',
 

In [10]:
chunk_size = 10000

mo_business_reviews = []

for chunk in pd.read_json("../data/raw/review.json", lines=True, chunksize=chunk_size):
    filtered_chunk = chunk[chunk['business_id'].isin(mo_business_ids)]
    chunk_json = filtered_chunk.to_dict(orient="records")
    mo_business_reviews.extend(chunk_json)

In [11]:
len(mo_business_reviews)

502385

In [13]:
mo_business_reviews[4]

{'review_id': '-up4mW6WdqzGrRh7t_pLmA',
 'user_id': 'xbybLiQockAzC4xAlzFrGg',
 'business_id': 'EpREWeEpmR8f1qLHzzF0AA',
 'stars': 5,
 'useful': 0,
 'funny': 0,
 'cool': 0,
 'text': "After living in the STL area for way over 10 years now, I am both ashamed and remorseful to admit that I ate here for the FIRST time just a few weeks ago. I am ashamed because it's a St. Louis tradition and remorseful because I missed out on it for so..many...years! \n\nThis place is amazing. What is not to love! It's a factory where you can get a tour (Friday-Sunday, noon-5pm on the hour), there's a shop, bar, and dining area. They also host many event with live music on the weekends. It's fun to just kind of wander and look around - don't worry, you'll get a chance because there will likely be a wait before you dine. The beer is, of course, amazing. The Hefeweizen is my personal favorite and a great start for anyone who is not a huge beer drinker (like myself). Additionally, the food is homegrown and damn

In [20]:
business_review_freq = defaultdict(int)

for review in mo_business_reviews:
    if review["business_id"]:
        business_review_freq[review["business_id"]] += 1

print(sorted(business_review_freq.items(), key=lambda l:-l[1])[:5])

[('I_3LMZ_1m2mzR0oLIOePIg', 4093), ('iRIHK8-EwpeffwvoO4nzIA', 2170), ('2BMk_drsikKWslJCXmQtjQ', 2023), ('R8t9g5nvi7VFyS8zsgmj8Q', 1788), ('cQIh4YJlVtZI9TLF5_smOg', 1781)]


In [24]:
mo_business[0]

{'business_id': 'mpf3x-BjTdTEA3yCZrAYPw',
 'name': 'The UPS Store',
 'address': '87 Grasso Plaza Shopping Center',
 'city': 'Affton',
 'state': 'MO',
 'postal_code': '63123',
 'latitude': 38.551126,
 'longitude': -90.335695,
 'stars': 3.0,
 'review_count': 15,
 'is_open': 1,
 'attributes': {'BusinessAcceptsCreditCards': 'True'},
 'categories': 'Shipping Centers, Local Services, Notaries, Mailbox Centers, Printing Services',
 'hours': {'Monday': '0:0-0:0',
  'Tuesday': '8:0-18:30',
  'Wednesday': '8:0-18:30',
  'Thursday': '8:0-18:30',
  'Friday': '8:0-18:30',
  'Saturday': '8:0-14:0'}}

In [25]:
mo_business_reviews[0]

{'review_id': 'XW_LfMv0fV21l9c6xQd_lw',
 'user_id': '9OAtfnWag-ajVxRbUTGIyg',
 'business_id': 'lj-E32x9_FA7GmUrBGBEWg',
 'stars': 4,
 'useful': 0,
 'funny': 0,
 'cool': 0,
 'text': "Love going here for happy hour or dinner!  Great patio with fans to beat the StL heat!   Also...very accomodating at this location.  I like the Veal Milanese but with mixed greens instead of pasta!  they'll modify the menu to suit your taste!",
 'date': Timestamp('2014-06-27 22:44:01')}

In [29]:
mo_business_reviews_df = pd.DataFrame(mo_business_reviews)

In [30]:
mo_business_reviews_df['date'] = mo_business_reviews_df['date'].astype(str)

In [33]:
mo_business_reviews_df.to_csv('mo_business_reviews.csv',index=False)

In [43]:
mo_business_reviews_df.to_json('mo_business_reviews.json', orient="records", indent=4)

# New Statistics

# Writing Style