In [None]:
import json
import os
from glob import glob

import pandas as pd

In [None]:
# Show the full content of text columns instead of truncating it with "...".
pd.set_option("display.max_colwidth", None)

In [None]:
# Folder that holds the raw Yelp JSON files, and the sub-folder that
# receives the CSV files produced by this notebook.
resource_path = "Dataset/"
result_path = f"{resource_path}Done/"

# Dataset

[Yelp Dataset](https://www.yelp.com/dataset)
* yelp_academic_dataset_business.json
* yelp_academic_dataset_review.json
* yelp_academic_dataset_user.json
* yelp_academic_dataset_checkin.json
* yelp_academic_dataset_tip.json
* Dataset/photos.json

# Business Data
Contains business data including location data, attributes, and categories.
* yelp_academic_dataset_business.json

#### 所有店家資料

In [None]:
# Load every business record; lines=True reads the file as one JSON object per line.
all_shops = pd.read_json(
    f"{resource_path}yelp_academic_dataset_business.json",
    lines=True,
)

In [None]:
# Print the column names and the row count, then preview the `attributes` column.
rule = "\n-------------------------------------------------------------\n"
print(all_shops.keys(), end=rule)
print(f"資料筆數:{len(all_shops)}", end=rule)
all_shops[['attributes']].head(20)

In [None]:
# `all_shops` is already loaded by the business-data cell earlier in the
# notebook, so the JSON file is not parsed a second time here.

# Print the column names and the row count, then preview the main descriptive columns.
print(all_shops.keys(),end="\n-------------------------------------------------------------\n")
print("資料筆數:" + str(len(all_shops)),end="\n-------------------------------------------------------------\n")
all_shops[['business_id', 'name', 'address', 'city', 'state','stars', 'review_count']].head()

#### 取出餐廳資料

In [None]:
# Keep every business whose `categories` text mentions "Restaurants".
# regex=False makes this a literal substring test; na=False treats businesses
# with no categories as non-matching instead of propagating a missing value
# into the mask.
is_restaurant = all_shops['categories'].str.contains("Restaurants", regex=False, na=False)

# One boolean-mask selection replaces the row-by-row loop: it does not depend
# on the index being 0..n-1 and yields an empty frame (rather than raising)
# when nothing matches.
restaurants = all_shops[is_restaurant].reset_index(drop=True)

In [None]:
# Sanity checks on the restaurant subset: row count, repeated business IDs,
# and whether any cell is missing.
has_duplicate_ids = restaurants['business_id'].duplicated().any()
has_missing_values = restaurants.isnull().values.any()
print(f"資料筆數:{len(restaurants)}")
print(f"資料是否有重複:{has_duplicate_ids}")
print(f"資料是否有空值:{has_missing_values}")

In [None]:
# Save all restaurant rows to a single CSV file.
# to_csv does not create missing folders, so make sure the output folder exists.
os.makedirs(result_path, exist_ok=True)
restaurants.to_csv(result_path + "restaurants.csv", index=False)

In [None]:
# Series of restaurant IDs; the review and tip cells merge on it to keep
# restaurant rows only.
business_id = restaurants.loc[:, 'business_id']

# Reviews Data
Contains full review text data including the user_id that wrote the review and the business_id the review is written for.
* yelp_academic_dataset_review.json

#### 所有店家評論

In [None]:
# Open the review file as a chunked reader (100,000 rows per chunk) instead of
# parsing it in one call.
rv_chunk_data = pd.read_json(
    f"{resource_path}yelp_academic_dataset_review.json",
    lines=True,
    chunksize=100000,
)

In [None]:
# Collect every chunk produced by the reader into a list of DataFrames.
# NOTE(review): the reader is presumably single-pass, so re-running this cell
# without re-creating rv_chunk_data would give an empty list - confirm.
rv_chunk_list = list(rv_chunk_data)

# Number of chunks read.
len(rv_chunk_list)

In [None]:
# Print the review columns and the total number of reviews, then preview the
# first rows of the first chunk.
print(rv_chunk_list[0].keys(),end="\n-------------------------------------------------------------\n")
# Sum the real chunk lengths rather than assuming every chunk but the last
# holds exactly 100000 rows; the count stays right if the chunk size changes.
review_total = sum(len(review_chunk) for review_chunk in rv_chunk_list)
print("評論筆數:" + str(review_total),end="\n-------------------------------------------------------------\n")
rv_chunk_list[0].loc[0:10, ['review_id', 'user_id', 'business_id', 'stars', 'text', 'date']]

#### 取出餐廳評論

In [None]:
# Write the restaurant reviews of each chunk to its own CSV file.
reviews_dir = result_path + "reviews/"
# to_csv does not create missing folders, so create the target folder up front.
os.makedirs(reviews_dir, exist_ok=True)

counts = 0
for i, review_chunk in enumerate(rv_chunk_list):
    # The inner merge keeps only reviews whose business_id is a restaurant ID.
    reviews = pd.merge(business_id, review_chunk, on="business_id", how="inner")
    reviews.to_csv(reviews_dir + f"reviews{i+1}.csv", index=False)
    counts += len(reviews)

# Total number of restaurant reviews written across all files.
print("評論筆數:" + str(counts))

In [None]:
# Combine the per-chunk review CSV files into a single CSV file.
# glob returns paths in arbitrary order; sorting by (length, text) puts the
# numbered files in numeric order (reviews2 before reviews10), so the combined
# file has the same row order on every run.
files = sorted(
    glob(result_path + "reviews/reviews*.csv"),
    key=lambda path: (len(path), path),
)
print(len(files))
df = pd.concat((pd.read_csv(file) for file in files)).reset_index(drop=True)
df.to_csv(result_path + "reviews.csv", index=False)

# User Data
User data including the user's friend mapping and all the metadata associated with the user.
* yelp_academic_dataset_user.json

In [None]:
# Open the user file as a chunked reader (100,000 rows per chunk).
# NOTE(review): the `ci_` prefix is kept because the following cells use this name.
ci_chunk_data = pd.read_json(
    f"{resource_path}yelp_academic_dataset_user.json",
    lines=True,
    chunksize=100000,
)

In [None]:
# Collect every user chunk produced by the reader into a list of DataFrames.
ci_chunk_list = list(ci_chunk_data)

In [None]:
# Print the user columns and the total number of users, then preview the
# first rows of the first chunk.
print(ci_chunk_list[0].keys(),end="\n-------------------------------------------------------------\n")
# Sum the real chunk lengths rather than assuming every chunk but the last
# holds exactly 100000 rows; the count stays right if the chunk size changes.
user_total = sum(len(user_chunk) for user_chunk in ci_chunk_list)
print("資料筆數:" + str(user_total),end="\n-------------------------------------------------------------\n")
ci_chunk_list[0].loc[0:10, ['user_id', 'name', 'review_count', 'yelping_since', 'average_stars']]

In [None]:
# Write each chunk of user data to its own CSV file.
user_dir = result_path + "user/"
# to_csv does not create missing folders, so create the target folder up front.
os.makedirs(user_dir, exist_ok=True)

for i, user_chunk in enumerate(ci_chunk_list):
    user_chunk.to_csv(user_dir + f"user{i+1}.csv", index=False)

In [None]:
# Combine the per-chunk user CSV files into a single CSV file.
# glob returns paths in arbitrary order; sorting by (length, text) puts the
# numbered files in numeric order (user2 before user10), so the combined file
# has the same row order on every run.
files = sorted(
    glob(result_path + "user/user*.csv"),
    key=lambda path: (len(path), path),
)
print(len(files))
df = pd.concat((pd.read_csv(file) for file in files)).reset_index(drop=True)
df.to_csv(result_path + "user.csv", index=False)

# Check-in Data
Check-ins recorded for a business.
* yelp_academic_dataset_checkin.json

In [None]:
# Load the check-in file; lines=True reads it as one JSON object per line.
check = pd.read_json(
    f"{resource_path}yelp_academic_dataset_checkin.json",
    lines=True,
)

In [None]:
# Print the check-in columns and the row count, then preview two rows.
rule = "\n-------------------------------------------------------------\n"
print(check.keys(), end=rule)
print(f"資料筆數:{len(check)}", end=rule)
check.head(2)

# Tips Data
Tips written by a user on a business. Tips are shorter than reviews and tend to convey quick suggestions.
* yelp_academic_dataset_tip.json

#### 所有店家Tips

In [None]:
# Load every tip; lines=True reads the file as one JSON object per line.
all_tips = pd.read_json(
    f"{resource_path}yelp_academic_dataset_tip.json",
    lines=True,
)

In [None]:
# Print the tip columns and the row count, then preview five rows.
rule = "\n-------------------------------------------------------------\n"
print(all_tips.keys(), end=rule)
print(f"資料筆數:{len(all_tips)}", end=rule)
all_tips.head(5)

#### 餐廳Tips

In [None]:
# Keep only the tips written for restaurants: an inner join against the
# restaurant IDs drops tips for every other business.
restaurants_tips = pd.merge(
    left=business_id,
    right=all_tips,
    how="inner",
    on="business_id",
)
rule = "\n-------------------------------------------------------------\n"
print(f"資料筆數:{len(restaurants_tips)}", end=rule)
restaurants_tips[['user_id', 'business_id', 'text', 'date']].head(10)

# Photos Data
Contains photo data including the caption and classification (one of "food", "drink", "menu", "inside" or "outside").
* Dataset/photos.json

In [None]:
# Load the photo metadata; lines=True reads the file as one JSON object per line.
# The path is built from resource_path, like every other load in this notebook,
# so moving the dataset folder only requires changing that one setting.
photos = pd.read_json(resource_path + "photos.json", lines=True)

In [None]:
# Row count, followed by a preview of the first rows.
print(photos.shape[0])
photos.head()

In [None]:
# One example row per distinct label: keep the first occurrence of each value.
photos.drop_duplicates(subset='label')