In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import os
import json
import pandas as pd
from tqdm.notebook import tqdm

def map_score_to_5(score):
    """Scale *score* by a factor of five and round to one decimal place.

    Args:
        score: Numeric value to rescale (a 0-1 input yields a 0-5 output).

    Returns:
        ``score * 5`` rounded to one decimal place.
    """
    scaled = 5 * score
    return round(scaled, 1)

# Load the tokenizer and sequence-classification model for the checkpoint
# named below, then wrap both in a "sentiment-analysis" pipeline.
model_name = "uer/roberta-base-finetuned-jd-binary-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)

# Input directory holding the JSON files, plus the CSV that is later queried
# by its 'NEW_TYPE' and 'Sub Category' columns.
data_dir = r"D:\Desktop\gaode"
type_code_path = r"D:\Desktop\gaode\type_code.csv"
type_code_pd = pd.read_csv(type_code_path)

# Maps type name -> {shop name: average rating}; filled by the processing loop.
type_results = {}

# Only the JSON files directly inside data_dir are processed.
files = [
    entry_name
    for entry_name in os.listdir(data_dir)
    if entry_name.endswith('.json')
]

def _positive_probability(prediction):
    """Return the positive-class probability for one pipeline prediction.

    The pipeline reports the winning label together with *that label's*
    probability, so a confident negative prediction also carries a score
    close to 1. For a binary classifier the positive-class probability of a
    negative prediction is ``1 - score``.

    Args:
        prediction: Mapping with the 'label' and 'score' keys produced by the
            classifier.

    Returns:
        Probability of the positive class as a float in [0, 1].
    """
    # NOTE(review): the model card lists the labels as
    # "negative (stars 1, 2 and 3)" / "positive (stars 4 and 5)" — confirm
    # against the loaded model's config.id2label.
    if str(prediction['label']).lower().startswith('negative'):
        return 1.0 - prediction['score']
    return prediction['score']


def _score_comments(comments):
    """Return the 0-5 sentiment ratings for the usable comments of one shop.

    Args:
        comments: The 'content' value of a shop entry. Presumably a list of
            comment strings (verify against the data files); a bare string is
            treated as a single comment instead of being iterated per
            character.

    Returns:
        List of ratings, one per non-blank string comment.
    """
    if isinstance(comments, str):
        comments = [comments]

    scores = []
    for comment in comments:
        # Blank or non-string items carry no sentiment; skip them instead of
        # letting the classifier fail on them.
        if not isinstance(comment, str) or not comment.strip():
            continue
        # Truncate explicitly: reviews longer than the model's 512-token
        # limit would otherwise raise inside the model.
        result = classifier(comment, truncation=True, max_length=512)
        if result:
            scores.append(map_score_to_5(_positive_probability(result[0])))
    return scores


# Walk every JSON file, resolve its shop type from the file-name prefix and
# store the average comment rating of each shop in type_results.
for file in tqdm(files, desc="Processing files"):
    print(f"Processing file: {file}")
    file_path = os.path.join(data_dir, file)
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"Loaded {len(data)} entries from {file}.")

    # Match the shop type: the text before the first '_' in the file name is
    # the numeric type code (int() already ignores leading zeros).
    type_code_str = file.split('_')[0]
    try:
        type_code = int(type_code_str)
    except ValueError:
        # A stray JSON file without a numeric prefix must not abort the run.
        print(f"Error: Cannot parse type code from file name: {file}")
        continue
    print(f"Matching type code: {type_code}")
    type_name_list = type_code_pd[type_code_pd['NEW_TYPE'] == type_code]['Sub Category'].tolist()

    if not type_name_list:
        print(f"Error: Type code {type_code} not found in type codes.")
        continue

    type_name = type_name_list[0]
    print(f"Type name matched: {type_name}")

    # Create the per-type result container on first use.
    shop_ratings = type_results.setdefault(type_name, {})

    # Process every shop in the file.
    for entry in data:
        shop_name = entry['name']
        comments = entry.get('content')
        if not comments:
            print(f"No comments found for shop: {shop_name}")
            continue

        # Run sentiment analysis over all comments of the shop.
        sentiment_scores = _score_comments(comments)
        if not sentiment_scores:
            print(f"No valid sentiment scores for shop: {shop_name}")
            continue

        # Average rating of the shop (plain mean; no numpy needed).
        avg_rating = round(sum(sentiment_scores) / len(sentiment_scores), 1)
        print(f"Shop: {shop_name}, Average Rating: {avg_rating}")

        # Store under the shop name; a later shop with the same name in the
        # same type replaces the earlier value.
        shop_ratings[shop_name] = avg_rating

# Write one CSV per shop type into data_dir, named "<type name>_ratings.csv",
# with one row per shop.
for type_name, shops in type_results.items():
    csv_name = f"{type_name}_ratings.csv"
    output_path = os.path.join(data_dir, csv_name)
    rows = list(shops.items())  # (shop name, rating) pairs
    df = pd.DataFrame(rows, columns=['Shop Name', 'Rating'])
    # 'utf-8-sig' writes a BOM at the start of the file.
    df.to_csv(output_path, index=False, encoding='utf-8-sig')

print("All results have been processed and saved.")

