In [1]:
import pandas as pd
import openai
import tiktoken
import json
import os
from datetime import datetime
from pathlib import Path
from jsonschema import validate, ValidationError
import re
import math
import time
import csv
import sys
import chatgpt_helpers as ch
from openai import OpenAI
from typing import List, Any, Dict, Tuple

In [2]:
game_url = "https://www.taptap.cn/app/219082?os=android"

In [3]:
# Extract app id
match = re.search(r"/app/(\d+)", game_url)
if not match:
    raise ValueError(f"No TapTap app id found in URL: {game_url}")
app_id = match.group(1)

In [4]:
# Get the path to the review csv file
review_csv_path = "output/{}/reviews_{}.csv".format(app_id, app_id)

In [5]:
# Load review data
df = pd.read_csv(review_csv_path)
reviews = df[['review_id', 'review_content_raw_text']].dropna().to_dict(orient="records")

In [6]:
# Generate analysis_summary report
print("Generating analysis summary...")

Generating analysis summary...


In [7]:
out_dir = Path(f"output/{app_id}/report")
out_dir.mkdir(parents=True, exist_ok=True)    
detailed_review_analysis_output_path = out_dir / f"detailed_review_analysis_{app_id}.csv"

In [8]:
df_review_analysis_results_for_synthesis = pd.read_csv(detailed_review_analysis_output_path)

In [9]:
df_reviews = pd.DataFrame(reviews)
df_review_analysis_results_for_synthesis = pd.merge(
    df_review_analysis_results_for_synthesis,
    df_reviews,
    left_on="q0",
    right_on="review_id"
)

In [10]:
synthesis_prompts = dict()

In [11]:
for q, p in ch.synthesis_prompts_core_parts.items():
    current_full_prompt = p + "\n \n" + ch.rules
    synthesis_prompts[q] = current_full_prompt

print(synthesis_prompts)

{'q2': 'You are an analyst. ONLY use the reviews in INPUT. Summarize what players say about sensitive content.\nSensitive categories: Sexual Content/Nudity, Violence/Gore, Drugs/Alcohol/Tobacco, Religious/Political.\n \nRules:\n- Output bulleted points only (no paragraphs, no JSON).\n- Group points into short, factual insights (≤20 words each).\n- Each bullet must end with review IDs in square brackets, e.g. [37988997, 35631039].\n- Include ≤1 short quote (≤12 words) if useful for clarity.\n- Output must strictly be in English\n- If INPUT is empty, output: • No reviews matched.\n', 'q3': "You are an analyst. ONLY use the reviews provided in INPUT.\nSummarize what players say about the game's core combat mechanics.\nDefinition: Core combat mechanics = primary battle systems, controls, pacing, balance, skill systems, roles/classes, resource usage in combat, and related player strategy.\n \nRules:\n- Output bulleted points only (no paragraphs, no JSON).\n- Group points into short, factual

In [12]:
synthesis_prompts.keys()

dict_keys(['q2', 'q3', 'q5', 'q6', 'q8', 'q10', 'q12', 'q14', 'q15', 'q18', 'q20', 'q22', 'q24', 'q26', 'q28', 'q30', 'q32', 'q34', 'q36', 'q38', 'q40', 'q41'])

In [13]:
print(synthesis_prompts["q2"])

You are an analyst. ONLY use the reviews in INPUT. Summarize what players say about sensitive content.
Sensitive categories: Sexual Content/Nudity, Violence/Gore, Drugs/Alcohol/Tobacco, Religious/Political.
 
Rules:
- Output bulleted points only (no paragraphs, no JSON).
- Group points into short, factual insights (≤20 words each).
- Each bullet must end with review IDs in square brackets, e.g. [37988997, 35631039].
- Include ≤1 short quote (≤12 words) if useful for clarity.
- Output must strictly be in English
- If INPUT is empty, output: • No reviews matched.



In [14]:
df_rows_with_nonna_values_for_current_question = df_review_analysis_results_for_synthesis[
    df_review_analysis_results_for_synthesis["q2"].notna()
][["review_id", "review_content_raw_text"]]

In [15]:
df_rows_with_nonna_values_for_current_question

Unnamed: 0,review_id,review_content_raw_text
274,31736948,李清照都齐刘海漏大腿了，就给我个漏胸漏腹肌的李白在封面，还是你游也是要走个媚男路线不要女玩家...
556,27369929,期待，画风好温柔！不知道玩法是什么（不希望是卡牌🙏）然后希望可以科普历史？(●—●)
574,26895121,先给个四星观望一下吧。\n四星好评的原因就一个，是游戏介绍上说可以操作中国和其它所有文明争锋...
601,26594372,一点玩法和介绍都没有，画风还是偏女性向的，这你真敢叫华夏，能过审吗？
618,26498013,别像江南百景图一样吃历史饭还侮辱历史
619,26477738,希望我们可以是一个历史的旁观人，而不是历史的参与者。
638,26137664,三星吧，不给五星是因为你们只有这几张可以称为照片的东西，并没有实机演示，我也不好判断游戏的好...
640,26075629,别夹私货，颠倒黑白，江南百景图是前车之鉴。
641,26074897,期待，希望尊重历史
644,26003876,虽然不知道高丽有什么资格跟其他几个放在一起，但看在服饰的份上还是先打个五星期待一下吧


In [16]:
relevant_reviews = df_rows_with_nonna_values_for_current_question.to_dict(orient="records")

In [17]:
relevant_reviews

[{'review_id': 31736948,
  'review_content_raw_text': '李清照都齐刘海漏大腿了，就给我个漏胸漏腹肌的李白在封面，还是你游也是要走个媚男路线不要女玩家？别整这恶心人的事嗷。'},
 {'review_id': 27369929,
  'review_content_raw_text': '期待，画风好温柔！不知道玩法是什么（不希望是卡牌🙏）然后希望可以科普历史？(●—●)'},
 {'review_id': 26895121,
  'review_content_raw_text': '先给个四星观望一下吧。\n四星好评的原因就一个，是游戏介绍上说可以操作中国和其它所有文明争锋。\n不过不知道具体是什么模式，是《文明》这个游戏那样，还是像光荣的《三国志》做全球地图？'},
 {'review_id': 26594372,
  'review_content_raw_text': '一点玩法和介绍都没有，画风还是偏女性向的，这你真敢叫华夏，能过审吗？'},
 {'review_id': 26498013, 'review_content_raw_text': '别像江南百景图一样吃历史饭还侮辱历史'},
 {'review_id': 26477738,
  'review_content_raw_text': '希望我们可以是一个历史的旁观人，而不是历史的参与者。'},
 {'review_id': 26137664,
  'review_content_raw_text': '三星吧，不给五星是因为你们只有这几张可以称为照片的东西，并没有实机演示，我也不好判断游戏的好坏，画风还可以，我想这些可以戳中一些lsp了，不过，各个时代的历史人物都应该有自我的风格，三皇五帝，王八之气还是有的，太阴柔了，不符合帝王的气质，但是请尊重历史，时代的演变造就了精彩绝伦人人沉迷的历史，说实话，现在有些国风，就跟抄袭江南百景图一样(应该我记着是这样，狗头保命)，换汤不换药，玩家不是傻子，都能看的出来游戏的好坏，那怕你这游戏是自我的突破在同行内(就那最近的我看好的游戏来说《小小五千年》，和最近要出来的《无悔华夏》感觉一下get到了我的点)存在很多bug，这些都可以原谅，慢慢的改善也行，就像孩子的成长一样总会有磕磕绊绊很正常，不要抄袭，再说

In [18]:
current_synthesized_answer = ch.synthesize_long_form_answers_with_ai(relevant_reviews, synthesis_prompts["q2"])

⚙️  Synthesizing in 1 chunk(s)...
   ✅ Chunk 1/1 summarized (≈158 tokens).


In [19]:
current_synthesized_answer

'- Sexualized character art criticized: thigh-revealing Li Qingzhao and shirtless/abs Li Bai. [31736948]\n- Art seen as pandering to lecherous players and feminizing emperors. [26137664]\n- Requests to respect history; avoid distortion, bias, or agenda insertion. [26498013, 26075629, 26074897, 25970191, 26137664]\n- Concern that female-oriented art under "Huaxia" may face censorship. [26594372]\n- Debate over including Goryeo with other civilizations; national sensitivity noted. [26003876]\n- Prefer being historical observers, not participants, to reduce rewriting concerns. [26477738]'

In [27]:
# Synthesizing long form answers

In [28]:
items = relevant_reviews
prompt = synthesis_prompts["q2"]

In [43]:
prompt

'You are an analyst. ONLY use the reviews in INPUT. Summarize what players say about sensitive content.\nSensitive categories: Sexual Content/Nudity, Violence/Gore, Drugs/Alcohol/Tobacco, Religious/Political.\n \nRules:\n- Output bulleted points only (no paragraphs, no JSON).\n- Group points into short, factual insights (≤20 words each).\n- Each bullet must end with review IDs in square brackets, e.g. [37988997, 35631039].\n- Include ≤1 short quote (≤12 words) if useful for clarity.\n- Output must strictly be in English\n- If INPUT is empty, output: • No reviews matched.\n'

In [30]:
chunks = ch._chunk_items_by_tokens(items, prompt)

In [48]:
partial_summaries = []

In [51]:
for idx, chunk in enumerate(chunks, 1):
    user_content = "INPUT:\n" + ch._json({"items": chunk})
    print(ch.CHUNK_OUTPUT_MAX)
    summary = ch._call_response(ch.MODEL, prompt, user_content, ch.CHUNK_OUTPUT_MAX)
    partial_summaries.append(summary)
    print(f"   ✅ Chunk {idx}/{len(chunks)} summarized (≈{ch.count_tokens(summary)} tokens).")

800
   ✅ Chunk 1/1 summarized (≈7 tokens).


In [52]:
partial_summaries

['', '']

In [40]:
user_content = "INPUT:\n" + ch._json({"items": chunk})

In [42]:
print(user_content)

INPUT:
{"items":[{"review_id":31736948,"review_content_raw_text":"李清照都齐刘海漏大腿了，就给我个漏胸漏腹肌的李白在封面，还是你游也是要走个媚男路线不要女玩家？别整这恶心人的事嗷。"},{"review_id":27369929,"review_content_raw_text":"期待，画风好温柔！不知道玩法是什么（不希望是卡牌🙏）然后希望可以科普历史？(●—●)"},{"review_id":26895121,"review_content_raw_text":"先给个四星观望一下吧。\n四星好评的原因就一个，是游戏介绍上说可以操作中国和其它所有文明争锋。\n不过不知道具体是什么模式，是《文明》这个游戏那样，还是像光荣的《三国志》做全球地图？"},{"review_id":26594372,"review_content_raw_text":"一点玩法和介绍都没有，画风还是偏女性向的，这你真敢叫华夏，能过审吗？"},{"review_id":26498013,"review_content_raw_text":"别像江南百景图一样吃历史饭还侮辱历史"},{"review_id":26477738,"review_content_raw_text":"希望我们可以是一个历史的旁观人，而不是历史的参与者。"},{"review_id":26137664,"review_content_raw_text":"三星吧，不给五星是因为你们只有这几张可以称为照片的东西，并没有实机演示，我也不好判断游戏的好坏，画风还可以，我想这些可以戳中一些lsp了，不过，各个时代的历史人物都应该有自我的风格，三皇五帝，王八之气还是有的，太阴柔了，不符合帝王的气质，但是请尊重历史，时代的演变造就了精彩绝伦人人沉迷的历史，说实话，现在有些国风，就跟抄袭江南百景图一样(应该我记着是这样，狗头保命)，换汤不换药，玩家不是傻子，都能看的出来游戏的好坏，那怕你这游戏是自我的突破在同行内(就那最近的我看好的游戏来说《小小五千年》，和最近要出来的《无悔华夏》感觉一下get到了我的点)存在很多bug，这些都可以原谅，慢慢的改善也行，就像孩子的成长一样总会有磕磕绊绊很正常，不要抄袭，再说一遍，不要抄袭！当然我也期待你们游戏的内测，我已经预约了，

In [44]:
ch.CHUNK_OUTPUT_MAX

800

In [47]:
ch.count_tokens(partial_summaries[-1],ch.MODEL)

255