## 依赖的外部项目：[MediaCrawler](https://github.com/NanmiCoder/MediaCrawler)（需预先部署；本脚本通过命令行调用其 `main.py` 抓取小红书搜索结果）

In [None]:
import ast
import json
import pandas as pd
import os
import numpy as np
import subprocess
import shutil
import matplotlib.pyplot as plt

# Data directories
# data_dir holds the input JSON files, the type-code CSV, the crawl log and
# the per-category output folders; xhs_data_dir is where the crawler's
# output is picked up from after each run.
data_dir = r"D:\Desktop\gaode"
xhs_data_dir = r"D:\Desktop\Coding\MediaCrawler\data\xhs"
log_file = os.path.join(data_dir, 'crawl_log.txt')


def parse_crawl_log(lines):
    """Split crawl-log lines into sets of succeeded and failed entry names.

    A well-formed line is ``STATUS,entry name``. Only the first comma is a
    separator, so entry names may themselves contain commas. Blank lines and
    lines without any comma are skipped instead of raising, so a truncated
    or hand-edited log cannot abort the run. Lines whose status is neither
    ``SUCCESS`` nor ``FAILED`` are ignored.

    Args:
        lines: Iterable of raw log lines (trailing newlines allowed).

    Returns:
        A ``(succeeded, failed)`` tuple of sets of entry names.
    """
    succeeded = set()
    failed = set()
    for raw_line in lines:
        status, separator, entry = raw_line.strip().partition(',')
        if not separator:
            # Blank or malformed line: nothing to record.
            continue
        if status == 'SUCCESS':
            succeeded.add(entry)
        elif status == 'FAILED':
            failed.add(entry)
    return succeeded, failed


# Load log file to keep track of crawl status.
# NOTE: the log is opened with the platform default encoding, the same way
# the append calls later in this script open it; change both together if the
# log is ever switched to an explicit encoding.
if os.path.exists(log_file):
    with open(log_file, 'r') as lf:
        log_data = lf.readlines()
else:
    log_data = []

# Parse log data to determine which entries succeeded or failed
success_set, failed_set = parse_crawl_log(log_data)

# Load files: every *.json file directly inside data_dir is treated as one
# input batch for the crawl loop below.
print("Loading JSON and CSV files...")
files = [f for f in os.listdir(data_dir) if f.endswith('.json')]

# Load type codes. The CSV lives next to the JSON inputs, so its path is
# derived from data_dir rather than repeating the directory literal; the
# 'NEW_TYPE' and 'Sub Category' columns are the ones read later on.
print("Loading type codes from CSV file...")
type_code_pd = pd.read_csv(os.path.join(data_dir, 'type_code.csv'))
print("Type codes loaded successfully.")
# Iterate through each JSON file to process the data.
#
# For every entry in every input file the crawler is run once as a
# subprocess, its output is moved into a per-shop folder, and the outcome is
# appended to the crawl log so that a later run can skip finished entries.


def _record_status(status, entry_name):
    """Append one ``STATUS,entry name`` line to the crawl log."""
    # Opened with the platform default encoding, matching how the log is
    # read back at startup.
    with open(log_file, 'a') as lf:
        lf.write(f"{status},{entry_name}\n")


def _ensure_dir(path, created_label):
    """Create ``path`` if it is missing and announce it as ``created_label: path``."""
    if not os.path.exists(path):
        os.makedirs(path)
        print(f"{created_label}: {path}")


def _salvage_captcha_results(stderr_text, shop_dir, entry_name):
    """Save the search payload the crawler logged before a captcha stopped it.

    Scans ``stderr_text`` for the first line containing 'Search notes res:',
    parses whatever follows the first '{' on that line and writes it as JSON
    to ``<shop_dir>/json/<entry_name>.json``. Only that first matching line
    is considered. This is best-effort: parse or write failures are reported
    and otherwise ignored.
    """
    marker = 'Search notes res:'
    for stderr_line in stderr_text.split('\n'):
        if marker not in stderr_line:
            continue

        json_str = stderr_line[stderr_line.find(marker) + len(marker):].strip()
        # Echo the extracted text so its content can be inspected.
        print(f"提取的 JSON 字符串: {json_str}")

        # Drop anything that precedes the opening brace of the payload.
        brace_idx = json_str.find('{')
        if brace_idx == -1:
            print("未找到 '{'，无法解析 JSON 数据。")
            return
        json_str = json_str[brace_idx:]

        try:
            # literal_eval rather than json.loads: presumably the crawler
            # logs a Python repr of the dict, not strict JSON -- confirm
            # against the crawler's logging.
            crawled_data = ast.literal_eval(json_str)
            json_data = json.dumps(crawled_data, ensure_ascii=False, indent=4)

            json_dest = os.path.join(shop_dir, 'json')
            _ensure_dir(json_dest, "创建 JSON 目录")

            json_filename = os.path.join(json_dest, f"{entry_name}.json")
            with open(json_filename, 'w', encoding='utf-8') as json_file:
                json_file.write(json_data)
            print(f"已将 JSON 数据保存到 {json_filename}")
        except Exception as ex:
            # Deliberately broad: salvaging is optional and must never stop
            # the crawl loop.
            print(f"解析数据失败: {ex}")
            print(f"数据内容: {json_str}")
        return


def _move_crawl_output(src_dir, dest_dir, label):
    """Move every item in ``src_dir`` into ``dest_dir``; ``label`` names the kind in messages."""
    if not os.path.isdir(src_dir):
        # The crawler may not have produced this kind of output at all.
        print(f"No {label} output directory at {src_dir}; nothing to move.")
        return
    for name in os.listdir(src_dir):
        print(f"Moving {label} file: {name} to {dest_dir}")
        shutil.move(os.path.join(src_dir, name), dest_dir)


for file in files:
    print(f"Processing file: {file}")
    with open(os.path.join(data_dir, file), 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"Loaded {len(data)} entries from {file}.")

    # Match type_code_str in type_code_pd: the code is the part of the file
    # name before the first underscore, with one leading '0' dropped.
    type_code_str = file.split('_')[0]
    if type_code_str.startswith('0'):
        type_code_str = type_code_str[1:]
    print(f"Matching type code: {type_code_str}")
    try:
        type_code = int(type_code_str)
    except ValueError:
        # A stray .json file without a numeric prefix must not abort the run.
        print(f"Error: File name {file} does not start with a numeric type code.")
        continue
    type_name = type_code_pd[type_code_pd['NEW_TYPE'] == type_code]['Sub Category'].tolist()

    # Ensure the type code is valid
    if not type_name:
        print(f"Error: Type code {type_code_str} not found in type codes.")
        continue

    type_name = type_name[0]
    print(f"Type name matched: {type_name}")

    # Create relative directory for type
    relative_dir = os.path.join(data_dir, type_name)
    _ensure_dir(relative_dir, "Created directory")

    # Iterate through data to extract information
    for record in data:
        entry_name = record['name']
        if entry_name in success_set:
            print(f"Skipping successful entry: {entry_name}")
            continue

        # Create directory for each shop
        i_dir = os.path.join(relative_dir, entry_name)
        _ensure_dir(i_dir, "Created directory for shop")

        search_command = [
            # Raw string: the backslashes are path separators, not escapes.
            "python", r"D:\Desktop\Coding\MediaCrawler\main.py",
            "--platform", "xhs",
            "--lt", "qrcode",
            "--type", "search",
            "--keywords", f"武汉{entry_name}"
        ]
        print(f"Executing command: {' '.join(search_command)}")
        try:
            result = subprocess.run(search_command, check=True, text=True, capture_output=True)
        except FileNotFoundError as e:
            # Raised when the "python" executable itself cannot be launched.
            print(f"Error: {e}. Please ensure that the command and paths are correct.")
            _record_status("FAILED", entry_name)
            continue  # Continue processing next entry
        except subprocess.CalledProcessError as e:
            # check=True turns a non-zero exit status into this exception.
            print(f"命令执行失败，错误信息: {e}。请检查命令输出以获取详细信息。")
            print(f"错误输出: {e.stderr}")

            stderr_text = e.stderr or ""
            # When the crawler was stopped by a captcha, try to keep the
            # search payload it had already logged.
            if '出现验证码' in stderr_text:
                _salvage_captcha_results(stderr_text, i_dir, entry_name)
            else:
                print("错误信息中未找到验证码提示。")

            # Record the failure and move on to the next shop.
            # NOTE(review): the move step below is skipped on failure, so
            # anything the crawler already wrote under xhs_data_dir stays
            # there and will be moved together with the next successful
            # entry's output.
            _record_status("FAILED", entry_name)
            continue

        print(result.stdout)

        # Move search results to the corresponding folder
        images_src = os.path.join(xhs_data_dir, 'images')
        json_src = os.path.join(xhs_data_dir, 'json')
        images_dest = os.path.join(i_dir, 'images')
        json_dest = os.path.join(i_dir, 'json')

        _ensure_dir(images_dest, "Created images directory")
        _ensure_dir(json_dest, "Created JSON directory")

        # Move images and JSON results
        _move_crawl_output(images_src, images_dest, "image")
        _move_crawl_output(json_src, json_dest, "JSON")

        # Log success only once the results are in place, so an entry whose
        # move failed is not skipped on the next run.
        _record_status("SUCCESS", entry_name)


# Visualization of Ratings
# Collect "<folder>/<folder>_summary.csv" from every sub-directory of
# data_dir, then plot a histogram of the numeric 'rating_gaode' column.
print("Starting visualization of ratings...")
all_summaries = []
for folder in os.listdir(data_dir):
    folder_path = os.path.join(data_dir, folder)
    if not os.path.isdir(folder_path):
        continue
    summary_file = os.path.join(folder_path, f"{folder}_summary.csv")
    if not os.path.exists(summary_file):
        continue
    print(f"Loading summary file: {summary_file}")
    summary_frame = pd.read_csv(summary_file)
    all_summaries.append(summary_frame)

if all_summaries:
    print("Combining all summary data...")
    combined_summary = pd.concat(all_summaries, ignore_index=True)
    # Values that cannot be parsed as numbers become NaN and are dropped
    # before plotting.
    combined_summary['rating_gaode'] = pd.to_numeric(
        combined_summary['rating_gaode'], errors='coerce'
    )

    print("Plotting histogram of Gaode ratings...")
    valid_ratings = combined_summary['rating_gaode'].dropna()
    plt.figure(figsize=(10, 6))
    plt.hist(valid_ratings, bins=20, edgecolor='black')
    plt.xlabel('Gaode Rating')
    plt.ylabel('Frequency')
    plt.title('Distribution of Gaode Ratings')
    plt.show()

print("Process completed.")
