In [None]:
import pandas as pd

In [None]:
# Lift pandas' display limits so wide cells, wide frames and long frames
# are rendered without truncation.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Dataset
* Dataset/Done/restaurants.csv
* Dataset/Done/reviews.csv
* Dataset/Done/user.csv
* Dataset/photos.json

# Restaurants

In [None]:
# Load the restaurant table; a comma is already read_csv's default delimiter.
restaurants = pd.read_csv("Dataset/Done/restaurants.csv")

In [None]:
# Column names, row count, and a preview of the first rows.
print(restaurants.columns)
print(f"資料筆數: {len(restaurants)}")
restaurants.head()

In [None]:
# Are any restaurant names repeated, and which columns contain missing values?
has_repeated_names = restaurants['name'].duplicated().any()
null_flag_per_column = restaurants.isnull().any()
print(f"資料是否有重複: {has_repeated_names}")
print(f"資料是否有空值:\n{null_flag_per_column}")

In [None]:
# Count the rows whose name is repeated; the repeats turn out to be chain restaurants.
repeated_name_mask = restaurants['name'].duplicated()
display(repeated_name_mask.sum(), restaurants[repeated_name_mask].head())

In [None]:
# Count the restaurant rows whose address is null and preview a few of them.
missing_address_mask = restaurants['address'].isnull()
display(missing_address_mask.sum(), restaurants[missing_address_mask].head())

### Get all restaurants in Portland

In [None]:
# Keep only the rows whose city is Portland, renumber them from 0, and show the count.
is_portland = restaurants['city'] == 'Portland'
Portland = restaurants.loc[is_portland].reset_index(drop=True)
len(Portland)

In [None]:
# Number of Portland rows that have no address.
int(Portland['address'].isnull().sum())

In [None]:
# Discard Portland rows that lack an address, renumber from 0, and show the new count.
Portland = Portland[Portland['address'].notnull()].reset_index(drop=True)
len(Portland)

In [None]:
# Names whose rows are removed from the Portland table.
ls = ["Safeway", "Hongs Restaurant Equipment & Supplies", "Living Health"]

# A single vectorised membership filter removes every listed name at once.
Portland = Portland[~Portland['name'].isin(ls)].reset_index(drop=True)
len(Portland)

In [None]:
# Persist the cleaned Portland table as a JSON array with one object per row.
Portland.to_json(path_or_buf='Dataset/Done/portland.json', orient="records")

# Reviews

In [None]:
# Load the review table; a comma is already read_csv's default delimiter.
reviews = pd.read_csv("Dataset/Done/reviews.csv")

In [None]:
# Column names and a preview of the review table.
# (The call was previously misspelled `diplay`, which raised NameError.)
display(reviews.keys())
reviews.head()

In [None]:
# Date range, row count, and duplicate/missing checks on the review text.
review_dates = reviews['date']
review_texts = reviews['text']
print("最早評論時間: " + review_dates.min())
print("最晚評論時間: " + review_dates.max())
print(f"資料筆數: {len(reviews)}")
print(f"資料是否有重複: {review_texts.duplicated().any()}")
print(f"資料是否有空值: {review_texts.isnull().any()}")

#### 處理重複資料

In [None]:
# Count the reviews whose text repeats an earlier review and preview a few of them.
repeated_text_mask = reviews['text'].duplicated()
display(repeated_text_mask.sum(), reviews[repeated_text_mask].head())

In [None]:
# Some users leave the very same review text at one restaurant or at several;
# show every row for two such texts.
example_texts = [
    "The best cafe ever . fresh food and super clean place . fresh fruit smoothies and fresh baked pastries everyday with a fantastic coffee",
    "Great prices and really good food. \n\nHad a beef Torta and beef tacos. The bread and tortillas were freshly made that day. \n\nThe sauce on the Torta was the perfect mixture of spice and flavor. \n\nThe beef has a great flavor, which tends to get lost at most the taquerias in this area. \n\nHad it Sunday night and two nights later we're going back for more!!",
]
for example_text in example_texts:
    display(reviews[reviews['text'] == example_text])

In [None]:
# Remove reviews whose text is repeated within the same restaurant, then confirm.
reviews = reviews.drop_duplicates(subset=['business_id', 'text']).reset_index(drop=True)
# The confirmation used to be a bare expression in the middle of the cell, so
# its result was never shown; asserting makes the check actually take effect.
assert not reviews.duplicated(subset=['business_id', 'text']).any(), \
    "duplicate (business_id, text) pairs remain after drop_duplicates"
print("資料筆數: " + str(len(reviews)))

In [None]:
# Keep only the columns we need.
wanted_columns = ['business_id', 'review_id', 'user_id', 'stars', 'text', 'date']
reviews = reviews[wanted_columns]
reviews.head()

#### 查看哪個城市擁有最多評論

In [None]:
# Attach a city to every review by left-joining the restaurant table on business_id.
city_lookup = restaurants[['business_id', 'city']]
final_reviews = reviews.merge(city_lookup, how="left", on="business_id")
final_reviews.head(3)

In [None]:
# Review counts per city, largest first; show the top five.
print("各城市所有評論數:")
reviews_per_city = final_reviews['city'].value_counts()
reviews_per_city.head(5)

In [None]:
# Restaurant count for three cities. Each name is left-padded to 8 characters
# so the printed lines stay aligned exactly as before.
for city_name in ("Austin", "Portland", "Atlanta"):
    restaurant_count = len(restaurants[restaurants['city'] == city_name])
    print(f"{city_name:<8}有 {restaurant_count} 間餐廳")

# User

In [None]:
# Load the user table.
user = pd.read_csv(filepath_or_buffer="Dataset/Done/user.csv")

In [None]:
# Column names, row count, and a preview of the first rows.
print(user.columns)
print(f"資料筆數: {len(user)}")
user.head()

In [None]:
# Row count plus duplicate/missing checks on user_id.
user_ids = user['user_id']
print(f"資料筆數: {len(user)}")
print(f"資料是否有重複: {user_ids.duplicated().any()}")
print(f"資料是否有空值: {user_ids.isnull().any()}")

# Photos 

In [None]:
# Load the photo metadata; lines=True parses the file as one JSON object per line.
photos = pd.read_json(path_or_buf="Dataset/photos.json", lines=True)

In [None]:
# Row count and a preview of the photo table.
print(f"資料筆數: {len(photos)}")
photos.head()

#### Get all photos of restaurants in Portland

In [None]:
# Reload the Portland table from the JSON file written by the Restaurants section.
Portland = pd.read_json(path_or_buf="Dataset/Done/portland.json")

In [None]:
# Left-join the photo id and label onto each Portland restaurant via business_id.
photo_columns = photos[['business_id', 'photo_id', 'label']]
Portland = Portland.merge(photo_columns, how='left', on='business_id')

In [None]:
# Row count after the merge and a preview of the first ten rows.
print(Portland.shape[0])
Portland.head(10)

In [None]:
# Rows with a null photo_id came through the left join without a matching photo;
# rows repeating an earlier business_id are extra photos of the same restaurant.
no_photo_mask = Portland['photo_id'].isnull()
repeated_business_mask = Portland.duplicated('business_id')
print(f"在Portland是否有餐廳沒有相片: {no_photo_mask.any()}")
print(f"在Portland有幾間餐廳沒有相片: {no_photo_mask.sum()}")
print(f"在Portland是否有一間餐廳擁有多張相片: {repeated_business_mask.any()}")
print(f"在Portland共有多少張相片集中在部分餐廳: {repeated_business_mask.sum()}")

# Clear all variables to release memory

In [None]:
# Clear every user-defined name from the kernel namespace to release memory.
# This is the explicit form of the `%reset -f` magic: it does not depend on
# automagic being enabled, and -f skips the interactive confirmation prompt so
# "Run All" is not left waiting for input.
get_ipython().run_line_magic("reset", "-f")