In [None]:
%pip install --upgrade jupyter notebook ipykernel

In [35]:
# 统计函数
import json
from collections import Counter, defaultdict
from tabulate import tabulate

def data_load(file_name:str):

    with open(f'{file_name}.json', 'r', encoding='utf-8') as f:
        data = [json.loads(line) for line in f]
    
    if file_name == 'user':
        print('\n'.join(str({k: v for k, v in d.items() if k not in ['friends', 'elite']}) for d in data[:2]))
    else:
        print('\n'.join([str(d) for d in data[:2]]))
    return data

def process_data(data):
    # 预处理 data_user，删除值为空字符串、空列表或空字典的键
    cleaned_data = []
    for d in data:
        cleaned_data.append({k: v for k, v in d.items() if v not in ["", [], {}]})

    key_counter = defaultdict(Counter)
    for d in cleaned_data:
        key_counter[d['source']].update(d.keys())

    for k, v in key_counter.items():
        # v_dict = dict(sorted(v.items()))
        v_dict = dict(v.items())
        # print(f"\n{k}:")
        # for field, count in v_dict.items():
        #     print(f"  {field}: {count}")

        total = max(v_dict.values())
        # 生成表格数据
        table_data = [
            [field, count, f"{(1 - count / total) * 100:.2f}%", '']
            for field, count in v_dict.items()
        ]

        # 打印表格
        print(f"\n{k}:")
        print(tabulate(table_data, headers=["Field", "Count", "Percentage of Missing", "待补充的分析"], tablefmt="github"))

    return cleaned_data, key_counter

# 🌟User数据分析
`user.json`文件数据分析，其中仅yelp数据集中的user数据包含大量有效信息，amazon和goodreads数据集中的user数据仅包含user_id和source信息

分析结果：
- yelp的user数据中，friends字段包含太多user_id，而且难以从中提取有效信息，应当删除
- yelp的user数据中，elite字段缺失太多，仅存36875/558111=6.6%，其可能表示用户活跃的年份，参考价值可能不大，删除

In [33]:
data_user = data_load('user')

{'user_id': 'qVc8ODYU5SZjKXVBgXdI7w', 'name': 'Walker', 'review_count': 585, 'yelping_since': '2007-01-25 16:47:26', 'useful': 7217, 'funny': 1259, 'cool': 5994, 'fans': 267, 'average_stars': 3.91, 'compliment_hot': 250, 'compliment_more': 65, 'compliment_profile': 55, 'compliment_cute': 56, 'compliment_list': 18, 'compliment_note': 232, 'compliment_plain': 844, 'compliment_cool': 467, 'compliment_funny': 467, 'compliment_writer': 239, 'compliment_photos': 180, 'source': 'yelp'}
{'user_id': 'j14WgRoU_-2ZE1aw1dXrJg', 'name': 'Daniel', 'review_count': 4333, 'yelping_since': '2009-01-25 04:35:42', 'useful': 43091, 'funny': 13066, 'cool': 27281, 'fans': 3138, 'average_stars': 3.74, 'compliment_hot': 1145, 'compliment_more': 264, 'compliment_profile': 184, 'compliment_cute': 157, 'compliment_list': 251, 'compliment_note': 1847, 'compliment_plain': 7054, 'compliment_cool': 3131, 'compliment_funny': 3131, 'compliment_writer': 1521, 'compliment_photos': 1946, 'source': 'yelp'}


In [36]:
cleaned_data_user, key_counter_user = process_data(data_user)


yelp:
| Field              |   Count | Percentage of Missing   | 待补充的分析   |
|--------------------|---------|-------------------------|----------------|
| user_id            |  558111 | 0.00%                   |                |
| name               |  558111 | 0.00%                   |                |
| review_count       |  558111 | 0.00%                   |                |
| yelping_since      |  558111 | 0.00%                   |                |
| useful             |  558111 | 0.00%                   |                |
| funny              |  558111 | 0.00%                   |                |
| cool               |  558111 | 0.00%                   |                |
| elite              |   36875 | 93.39%                  |                |
| friends            |  558111 | 0.00%                   |                |
| fans               |  558111 | 0.00%                   |                |
| average_stars      |  558111 | 0.00%                   |                |
| complimen

# 🌟Item数据分析

In [28]:
data_item = data_load('item')

{'item_id': 'tUFrWirKiKi_TAnsVWINQQ', 'name': 'Target', 'address': '5255 E Broadway Blvd', 'city': 'Tucson', 'state': 'AZ', 'postal_code': '85711', 'latitude': 32.223236, 'longitude': -110.880452, 'stars': 3.5, 'review_count': 22, 'is_open': 0, 'attributes': {'BikeParking': 'True', 'BusinessAcceptsCreditCards': 'True', 'RestaurantsPriceRange2': '2', 'CoatCheck': 'False', 'RestaurantsTakeOut': 'False', 'RestaurantsDelivery': 'False', 'Caters': 'False', 'WiFi': "u'no'", 'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}", 'WheelchairAccessible': 'True', 'HappyHour': 'False', 'OutdoorSeating': 'False', 'HasTV': 'False', 'RestaurantsReservations': 'False', 'DogsAllowed': 'False', 'ByAppointmentOnly': 'False'}, 'categories': 'Department Stores, Shopping, Fashion, Home & Garden, Electronics, Furniture Stores', 'hours': {'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', 'Wednesday': '8:0-22:0', 'Thursday': '8:0-22:0', 'Friday': '8:0-23:0', 'Satu

In [37]:
cleaned_data_item, key_counter_item = process_data(data)


yelp:
| Field        |   Count | Percentage of Missing   | 待补充的分析   |
|--------------|---------|-------------------------|----------------|
| item_id      |   32869 | 0.00%                   |                |
| name         |   32869 | 0.00%                   |                |
| address      |   31568 | 3.96%                   |                |
| city         |   32869 | 0.00%                   |                |
| state        |   32869 | 0.00%                   |                |
| postal_code  |   32854 | 0.05%                   |                |
| latitude     |   32869 | 0.00%                   |                |
| longitude    |   32869 | 0.00%                   |                |
| stars        |   32869 | 0.00%                   |                |
| review_count |   32869 | 0.00%                   |                |
| is_open      |   32869 | 0.00%                   |                |
| attributes   |   32869 | 0.00%                   |                |
| categories   |   

# 🌟Review数据分析

In [39]:
data_review = data_load('review')

{'review_id': 'BiTunyQ73aT9WBnpR9DZGw', 'user_id': 'OyoGAe7OKpv6SyGZT5g77Q', 'item_id': '7ATYjTIgM3jUlt4UM3IypQ', 'stars': 5.0, 'useful': 1, 'funny': 0, 'cool': 1, 'text': "I've taken a lot of spin classes over the years, and nothing compares to the classes at Body Cycle. From the nice, clean space and amazing bikes, to the welcoming and motivating instructors, every class is a top notch work out.\n\nFor anyone who struggles to fit workouts in, the online scheduling system makes it easy to plan ahead (and there's no need to line up way in advanced like many gyms make you do).\n\nThere is no way I can write this review without giving Russell, the owner of Body Cycle, a shout out. Russell's passion for fitness and cycling is so evident, as is his desire for all of his clients to succeed. He is always dropping in to classes to check in/provide encouragement, and is open to ideas and recommendations from anyone. Russell always wears a smile on his face, even when he's kicking your butt in 

In [40]:
cleaned_data_review, key_counter_review = process_data(data)


yelp:
| Field        |   Count | Percentage of Missing   | 待补充的分析   |
|--------------|---------|-------------------------|----------------|
| item_id      |   32869 | 0.00%                   |                |
| name         |   32869 | 0.00%                   |                |
| address      |   31568 | 3.96%                   |                |
| city         |   32869 | 0.00%                   |                |
| state        |   32869 | 0.00%                   |                |
| postal_code  |   32854 | 0.05%                   |                |
| latitude     |   32869 | 0.00%                   |                |
| longitude    |   32869 | 0.00%                   |                |
| stars        |   32869 | 0.00%                   |                |
| review_count |   32869 | 0.00%                   |                |
| is_open      |   32869 | 0.00%                   |                |
| attributes   |   32869 | 0.00%                   |                |
| categories   |   