In [28]:
import numpy as np
from langchain.llms import Tongyi
from volcenginesdkarkruntime import Ark
from openai import OpenAI
from pandasql import sqldf
import pandas as pd
import os
import csv
import json
import requests
import math
import pickle,time
import traceback
import api_key

In [2]:
# Publish the provider credentials held in the local `api_key` module as
# process environment variables so the SDK clients can pick them up.
os.environ.update({
    "DASHSCOPE_API_KEY": api_key.DASHSCOPE_API_KEY,  # Tongyi (DashScope) API key
    "ARK_API_KEY": api_key.ARK_API_KEY,  # Doubao (Ark) API key
    "DEEPSEEK_API_KEY": api_key.DEEPSEEK_API_KEY,  # DeepSeek API key
})

In [3]:
class llm_connect:
    """Route a single-turn prompt to one of the supported LLM backends.

    Parameters
    ----------
    llm_name : str
        Backend selector used by :meth:`answer`; the recognised values are
        ``"tongyi"``, ``"doubao"`` and ``"deepseek"``.
    model : str
        Model / endpoint id that overrides the backend's own default. The
        sentinel ``'default'`` keeps the default declared on the backend method.
    """

    def __init__(self, llm_name, model='default'):
        self.llm_name = llm_name
        self.model = model

    def _pick_model(self, backend_default):
        """Return the instance-level model unless it is the ``'default'`` sentinel."""
        return self.model if self.model != 'default' else backend_default

    def _ark_chat(self, prompt, model):
        """Send ``prompt`` as a single user message through the Ark SDK and return the reply text."""
        client = Ark(api_key=os.environ.get("ARK_API_KEY"))
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt},
            ],
        )
        return completion.choices[0].message.content

    # Tongyi Qianwen
    def tongyi(self, prompt, model='qwen-max'):
        """Invoke the LangChain ``Tongyi`` wrapper with ``prompt`` and return its result.

        The default used to be spelled ``'qwan-max'``; the Qwen model family is
        named ``qwen-*``, so the typo is corrected here.
        """
        return Tongyi(model=self._pick_model(model)).invoke(prompt)

    # Doubao
    def doubao(self, prompt, model='ep-20250123181249-vxkxv'):  # 128k: ep-20250123181249-vxkxv  256k: ep-20250121172803-l9tmp
        """Query a Doubao endpoint through the Ark client and return the reply text."""
        return self._ark_chat(prompt, self._pick_model(model))

    # DeepSeek
    def deepseek(self, prompt, model='ep-20250208150917-z5c9z'):
        """Query the DeepSeek endpoint and return the reply text.

        The request goes through the same Ark client (and ``ARK_API_KEY``) as
        :meth:`doubao`; this class never reads ``DEEPSEEK_API_KEY``.
        """
        return self._ark_chat(prompt, self._pick_model(model))

    def answer(self, prompt='hello'):
        """Send ``prompt`` to the backend named by ``self.llm_name``.

        Returns the backend's reply. For an unrecognised ``llm_name`` a
        message is printed and ``None`` is returned (unchanged legacy
        behaviour).
        """
        backends = {
            "tongyi": self.tongyi,
            "doubao": self.doubao,
            "deepseek": self.deepseek,
        }
        backend = backends.get(self.llm_name)
        if backend is None:
            print("大模型选择错误")
            return None
        # Previously the tongyi branch called ``self.tongyi.invoke(prompt)``;
        # ``self.tongyi`` is a bound method, so that always raised
        # AttributeError. Every backend is now called the same way.
        return backend(prompt=prompt)

In [19]:
class area_clean:
    def __init__(self, input_data:pd.DataFrame, llm_name, model='default'):
        self.df_fail = None
        self.df_success = None
        self.input_data = input_data
        self.df_all = None
        self.transed_area_json_str_list = None
        self.cleaned_area_json_str_list = None
        self.llm_name = llm_name
        self.model = model
        self.llm = llm_connect(llm_name, model)
        self.df_all_clean = None

        #省市码表(proj_cbi.idl_ff_city)
        csv_reader1 = csv.reader(open("data/map_city.csv", encoding='utf-8-sig'))
        map_city = []
        for row in csv_reader1:
            map_city.append(row)
        self.df_map_city = pd.DataFrame(map_city[1:], columns=map_city[0])

        #区码表(proj_cbi.idl_ff_town)
        csv_reader2 = csv.reader(open("data/map_town_distinct.csv", encoding='utf-8-sig'))
        map_town = []
        for row in csv_reader2:
            map_town.append(row)
        self.df_map_town = pd.DataFrame(map_town[1:], columns=map_town[0])

        #已经处理的码表，无需再处理
        csv_reader3 = csv.reader(open("results/cleaned_area.csv", encoding='utf-8-sig'))
        cleaned_area = []
        for row in csv_reader3:
            cleaned_area.append(row)
        self.df_cleaned_area = pd.DataFrame(cleaned_area[1:], columns=cleaned_area[0])

        #需要处理的数据
        df_input_join_cleaned = pd.merge(self.input_data, self.df_cleaned_area, left_on=['zstate1','zcity3'], right_on=['zstate1','zcity3'], how='left', indicator=True)
        df_area_to_clean_all = df_input_join_cleaned[df_input_join_cleaned['_merge'] == 'left_only']
        self.df_area_to_clean = df_area_to_clean_all[['zstate1', 'zcity3']].drop_duplicates()


    def area_clean(self, area):
        prompt = f"""
以json格式提供多个中国省和市的拼音，以||分隔，其中可能有错误，请按要求的格式返回正确的省、市、区的全称、简称和推理过程。
现根据输入确定省市区的全称，再根据全称获取简称。如果无法完全确定答案，请不要进行推测和更正，直接返回原输入。

要求：1.输出的省是中国省份名或直辖市，市是中国地级行政区或直辖市或省直辖县级市，区为地级行政区以下的区域（包含县级市和区等），如果根据输入无法确定区，则返回null
     2.返回的省、市全称、市简称、区全称、区简称不要包含标点符号和空格，推理中不包含单引号
     3.返回的市简称、区简称，不要包含省、市、自治州、旗等后缀，如果是盟，则包含后缀
     4.如果输入市为区县名、县级市或街道等，根据输入的省将找到该地区隶属的地级行政区并填充到市，将原输入的市处理后填充到区
     5.如果输入市为no data或其他无法识别的信息，返回原输入
     6.返回的区一定要隶属于市，市一定要隶属于省，如果根据输入可以推断则填写，否则返回原输入
     7.结果以json字符串格式返回，中间以||分隔，不要包含任何标识符和换行符

以下是示例：
输入：{{"省":"guangdong","市":"futian district"}}||{{"省":"jiangsu","市":"Su zhou shi"}}||{{"省":"hebei","市":"jinzhou"}}||{{"省":"hebei","市":"miyun district"}}||{{"省":"sichuan","市":"barkam"}}
输出：{{"推理":"guangdong指广东。futian district指广东深圳的福田区，所属地级行政是深圳市，全称深圳市，简称深圳。福田区全称福田区，简称福田","省":"广东","市全称":"深圳市","市简称":"深圳","区全称":"福田区","区简称":"福田"}}||{{"推理":"jiangsu指江苏。Su zhou shi指江苏的苏州市，苏州市是地级行政区，全称苏州市，简称苏州。输入没有提供地区名，填充为null。","省":"江苏","市全称":"苏州市","市简称":"苏州","区全称":"null","区简称":"null"}}||{{"推理":"hebei是指河北。jinzhou是指河北的晋州市, 晋州市是县级市, 所属地级行政区是石家庄市, 全称是石家庄市, 简称石家庄。晋州市, 简称晋州，拼音简称是晋州, 填充到区。","省":"河北","市全称":"石家庄市","市简称":"石家庄","区全称":"晋州市","区简称":"晋州"}}||{{"推理":"hebei是指河北。miyun district在河北内无法确定具体地区，省和市都返回原输入，null填充到区。","省":"hebei","市全称":"miyun district","市简称":"miyun district","区全称":"null","区简称":"null"}}||{{"推理":"sichuan是指四川。barkam指四川的的马尔康市，马尔康市是县级市，所属地级行政区是阿坝藏族羌族自治州，全称阿坝藏族羌族自治州，简称阿坝。马尔康市的全称是马尔康市，简称马尔康，填充到区。。","省":"四川","市全称":"阿坝藏族羌族自治州","市简称":"阿坝","区全称":"马尔康市","区简称":"马尔康"}}

开始回答
输入：{area}
输出：
"""
        print(prompt)
        cleaned_area_chinese = self.llm.answer(prompt)
        return cleaned_area_chinese


    def area_to_pinyin(self, cleaned_area_chinese):
        cleaned_area_list = ""
        for i in range(len(cleaned_area_chinese)):
            area_json_str = cleaned_area_chinese[i]
            area_json = json.loads(area_json_str)
            province = area_json["省"]
            city = area_json["市全称"]
            cityshort = area_json["市简称"]
            town = area_json["区全称"]
            townshort = area_json["区简称"]
            cleaned_area_list += f"""{{"省":"{province}","市全称":"{city}","市简称":"{cityshort}","区全称":"{town}","区简称":"{townshort}"}}"""
            if i != len(cleaned_area_chinese)-1: cleaned_area_list += '||'
        prompt = f"""
以json格式提供多个中国省、市、区的中文，以||分隔，请返回其对应的汉语拼音和翻译过程。

要求：1.按照顺序将中国行政区划名转换成汉语拼音，省对应province，市全称对应city，市简称对应cityshort，区全称对应town，区简称对应townshort
     2.返回的拼音中不包含空格和特殊符号
     3.如果遇到多音字，请根据该地名的读音来转换
     4.输入如果是null或其他非中文的字符，该项返回null
     5.结果以json字符串格式返回，中间以||分隔，不要包含任何标识符和换行符

以下是示例：
输入：{{"省":"河北","市全称":"邢台市","市简称":"邢台","区全称":"信都区","区简称":"信都"}}||{{"省":"安徽","市全称":"六安市","市简称":"六安","区全称":"null","区简称":"null"}}||{{"省":"江西","市全称":"nanchangshi","市简称":"nanchang","区全称":"no data","区简称":"no data"}}
输出：{{"翻译过程":"河北拼音hebei，邢台市拼音xingtaishi，邢台拼音xingtai，信都区拼音xinduqu，信都拼音xindu","province":"hebei","city":"xingtaishi","cityshort":"xingtai","town":"xinduqu","townshort":"xindu"}}||{{"翻译过程":"安徽拼音anhui，六安市的六是多音字，用于该地名通常念lu，所以六安市拼音luanshi，六安拼音luan，null非中文返回null，null非中文返回null","province":"anhui","city":"luanshi","cityshort":"luan","town":"null","townshort":"null"}}||{{"翻译过程":"江西拼音jiangxi，nanchangshi非中文返回null，nanchang非中文返回null，no data非中文返回null，no data非中文返回null","province":"jiangxi","city":"null","cityshort":"null","town":"null","townshort":"null"}}

开始回答
输入：{cleaned_area_list}
输出：
"""
        print(prompt)
        cleaned_area_pinyin = self.llm.answer(prompt)
        return cleaned_area_pinyin


    def clean_batch(self, path, batch_number=10):
        #处理成输入格式
        area_list = []
        df_area_to_clean = self.df_area_to_clean
        for index,row in df_area_to_clean.iterrows():
            zstate1 = row['zstate1']
            zcity3 = row['zcity3']
            area_list.append([zstate1, zcity3])
        print(f"area个数：{len(area_list)}, 每批次{batch_number}个, 共{math.ceil(len(area_list)/batch_number)}个批次")
        all_batch = []
        area_batch = ""
        for i in range(len(area_list)):
            area = area_list[i]
            state = area[0]
            city = area[1]
            if ((i+1) % batch_number == 0) or (i == len(area_list)-1):
                area_batch += f"""{{"省":"{state}","市":"{city}"}}"""
                all_batch.append(area_batch)
                area_batch = ""
            else:
                area_batch += f"""{{"省":"{state}","市":"{city}"}}||"""

        cleaned_area_json_str_list = []
        if not os.path.exists(path):
            os.makedirs(path)
        for i in range(len(all_batch)):
            rerun = 1
            while 5 > rerun > 0:
                try:
                    cleaned_area = self.area_clean(all_batch[i])
                    print(cleaned_area)
                    cleaned_area_json_str_list += cleaned_area.split("||")
                    #保存到文件防止程序中断
                    with open(f'{path}/clean_result{i+1}.pickle', 'wb' ) as file_name:
                        pickle.dump(cleaned_area, file_name)
                except:
                    print("clean批次"+str(i+1)+"失败，重新运行")
                    traceback.print_exc()
                    rerun += 1
                else:
                    rerun = 0
                    print("clean批次"+str(i+1)+"完成")

            time.sleep(5)
        self.cleaned_area_json_str_list = cleaned_area_json_str_list


    def to_pinyin_batch(self, path, batch_number=10):
        cleaned_area_chinese = self.cleaned_area_json_str_list
        cleaned_area_chinese_batch = [cleaned_area_chinese[i:i+batch_number] for i in range(0,len(cleaned_area_chinese),batch_number)]
        print(f"area个数：{len(cleaned_area_chinese)}, 每批次{batch_number}个, 共{math.ceil(len(cleaned_area_chinese)/batch_number)}个批次")
        transed_area_json_str_list = []
        if not os.path.exists(path):
            os.makedirs(path)
        for i in range(len(cleaned_area_chinese_batch)):
            rerun = 1
            while 5 > rerun > 0:
                try:
                    transed_area = self.area_to_pinyin(cleaned_area_chinese_batch[i])
                    #print(transed_area)
                    transed_area_json_str_list += transed_area.split("||")
                    #保存到文件防止程序中断
                    with open(f'{path}/trans_result{i+1}.pickle', 'wb' ) as file_name:
                        pickle.dump(transed_area, file_name)
                except:
                    print("翻译批次"+str(i+1)+"失败，重新运行")
                    traceback.print_exc()
                    rerun += 1
                else:
                    rerun = 0
                    print("翻译批次"+str(i+1)+"完成")

        self.transed_area_json_str_list =  transed_area_json_str_list


    def clean_and_trans_batch(self, bin_path, results_path, batch_number=10):
        if not os.path.exists(bin_path):
            os.makedirs(bin_path)
        if not os.path.exists(results_path):
            os.makedirs(results_path)
        self.clean_batch(bin_path, batch_number=batch_number)
        self.to_pinyin_batch(bin_path, batch_number=batch_number)
        self.json_decode()
        self.test()
        self.save_results(results_path)

    # 从文件恢复
    def recover(self, path, name, start, end):
        area_list = []
        for i in range(start, end+1):
            with open(f'{path}/{name}{i}.pickle', 'rb' ) as file_name:
                unpickled_dict = pickle.load(file_name)
                area_list  +=  unpickled_dict.split("||")
        return area_list


    def json_decode(self):
        df_area_to_clean = self.df_area_to_clean
        cleaned_area_json_str_list = self.cleaned_area_json_str_list
        transed_area_json_str_list = self.transed_area_json_str_list
        cleaned_area_list = []
        if len(cleaned_area_json_str_list) != len(df_area_to_clean) or len(transed_area_json_str_list) != len(df_area_to_clean):
            print("长度不一致")
        for i in range(len(df_area_to_clean)):
            transed_area_json_str = json.loads(transed_area_json_str_list[i])
            chinese_area_json_str = json.loads(cleaned_area_json_str_list[i])
            area_json_str = {}
            area_json_str.update(transed_area_json_str)
            area_json_str.update(chinese_area_json_str)
            cleaned_area_list.append(area_json_str)
        df_clean_area = pd.DataFrame(cleaned_area_list)
        df_all = pd.concat([df_area_to_clean[['zstate1','zcity3']],df_clean_area],axis=1).reindex(columns=['zstate1', 'zcity3', '推理', '省', '市全称', '市简称', '区全称', '区简称', '翻译过程', 'province', 'city', 'cityshort', 'town', 'townshort'])
        self.df_all = df_all


    def test(self):
        df_all = self.df_all
        df_map_city = self.df_map_city
        df_province_map = pd.DataFrame({
            "province_pinyin":["xizang","xizangzizhiqu","neimenggu","neimengguzizhiqu","guangxizhuangzuzizhiqu","guangxizhuang","ningxiahuizuzizhiqu","xinjiangweiwuerzizhiqu","xinjiangweiwuerzuzizhiqu"],
            "province_eng":["tibet","tibet","inner mongolia","inner mongolia","guangxi","guangxi","ningxia","xinjiang","xinjiang"]
        })
        df_all['province_no_space'] = df_all['province'].str.replace(" ","")
        df_all_clean = pd.merge(df_all,df_province_map, left_on='province_no_space', right_on="province_pinyin", how='left')
        df_all_clean["province_clean"] = df_all_clean["province_eng"].combine_first(df_all_clean["province_no_space"])
        df_all_clean = df_all_clean[['province_clean', 'zstate1', 'zcity3', '推理', '省', '市全称', '市简称', '区全称', '区简称', '翻译过程', 'province', 'city', 'cityshort', 'town', 'townshort']]
        df_all_clean = df_all_clean.map(lambda x: np.NaN if x is None or x == 'null' else x)
        self.df_all_clean = df_all_clean

        df_fail = sqldf(f"""
                with fail as(
                select --count(*)
                a.*, b.provinceshortname,b.provincename,b.cityname,b.cityshortname,b.citypinyin,b.cityshortpinyin,b.provincepinyin--, c.cityname, c.cityshortpinyin
                , row_number() over(partition by a.zstate1, a.zcity3 order by b.cityshortname) as rn
                FROM df_all_clean a
                left join df_map_city b
                    on (replace(a.cityshort,' ','') = lower(b.cityshortpinyin) or replace(a.city,' ','') = lower(b.citypinyin))
                    and (a.province_clean = lower(b.provincepinyin) or a.zstate1 = lower(b.provincepinyin))
                --left join df_map_town c
                --    on a.区简称 = lower(c.cityshortpinyin)
                --    and b.provincename = c.provincename
                --    and b.cityname = c.cityareaname
                where --d.activation_country is not null -- ludp处理后数据
                 b.cityshortpinyin is not null -- 码表匹配成功
                )
                select *
                from fail a
                where rn = 1
                """, locals())

        df_success = sqldf(f"""
                with success as(
                select --count(*)
                a.*, b.provinceshortname,b.provincename,b.cityname,b.cityshortname,b.citypinyin,b.cityshortpinyin,b.provincepinyin--, c.cityname, c.cityshortpinyin
                , row_number() over(partition by a.zstate1, a.zcity3 order by b.cityshortname) as rn
                from df_all_clean a
                left join df_map_city b
                    on (replace(a.cityshort,' ','') = lower(b.cityshortpinyin) or replace(a.city,' ','') = lower(b.citypinyin))
                    and (a.province_clean = lower(b.provincepinyin) or a.zstate1 = lower(b.provincepinyin))
                --left join df_map_town c
                --    on a.区简称 = lower(c.cityshortpinyin)
                --    and b.provincename = c.provincename
                --    and b.cityname = c.cityareaname
                where --d.activation_country is not null -- ludp处理后数据
                 b.cityshortpinyin is not null -- 码表匹配成功
                )
                select *
                from success a
                where rn = 1
                """, locals())
        self.df_success = df_success
        self.df_fail = df_fail
        return df_fail, df_success


    def save_results(self, path='results'):
        self.df_success.to_csv(path+'/cleaned_area.csv', mode='a', index=False, header=None)
        self.df_fail.to_csv(path+'/cleaned_area_fail.csv', index=False)


# 首次清洗

In [26]:
#原始数据
# Raw input data: read every cell as a string, first row is the header.
# The `with` block closes the file handle (it was previously left open);
# newline='' is what the csv module expects for files it parses.
with open("data/all_fail.csv", encoding='utf-8-sig', newline='') as all_fail_file:
    csv_reader = csv.reader(all_fail_file)
    area_list = list(csv_reader)
df_area_list = pd.DataFrame(area_list[1:], columns=area_list[0])

In [27]:
# Display the raw input frame loaded from data/all_fail.csv.
df_area_list

Unnamed: 0,zstate1,zcity3
0,guangxi zhuang,longsheng various nationalities autonomo
1,jiangsu,yuhuatai district
2,inner mongolia,xuejiawan
3,guangxi zhuang,bama yaozu aut county
4,beijing,shunyi district
...,...,...
701,qinghai,da qaidam
702,shaanxi,qindu district
703,zhejiang,longwan district
704,anhui,tianjia an district


In [22]:
# Round 1: build the cleaner on the raw input, using the "deepseek" backend.
# The constructor reads the map CSVs and results/cleaned_area.csv from disk.
area_clean_1 = area_clean(df_area_list,'deepseek')

In [23]:
# Smoke test of the cleaning prompt on four hand-picked inputs
# (the method prints the full prompt before calling the LLM).
cleaned_area_chinese = area_clean_1.area_clean("""{"省":"sichuan","市":"barkam"}||{"省":"jiangxi","市":"xiongshi"}||{"省":"qinghai","市":"maqên"}||{"省":"guangxi zhuang","市":"rong county"}""")


以json格式提供多个中国省和市的拼音，以||分隔，其中可能有错误，请按要求的格式返回正确的省、市、区的全称、简称和推理过程。
现根据输入确定省市区的全称，再根据全称获取简称。如果无法完全确定答案，请不要进行推测和更正，直接返回原输入。

要求：1.输出的省是中国省份名或直辖市，市是中国地级行政区或直辖市或省直辖县级市，区为地级行政区以下的区域（包含县级市和区等），如果根据输入无法确定区，则返回null
     2.返回的省、市全称、市简称、区全称、区简称不要包含标点符号和空格，推理中不包含单引号
     3.返回的市简称、区简称，不要包含省、市、自治州、旗等后缀，如果是盟，则包含后缀
     4.如果输入市为区县名、县级市或街道等，根据输入的省将找到该地区隶属的地级行政区并填充到市，将原输入的市处理后填充到区
     5.如果输入市为no data或其他无法识别的信息，返回原输入
     6.返回的区一定要隶属于市，市一定要隶属于省，如果根据输入可以推断则填写，否则返回原输入
     7.结果以json字符串格式返回，中间以||分隔，不要包含任何标识符和换行符

以下是示例：
输入：{"省":"guangdong","市":"futian district"}||{"省":"jiangsu","市":"Su zhou shi"}||{"省":"hebei","市":"jinzhou"}||{"省":"hebei","市":"miyun district"}||{"省":"sichuan","市":"barkam"}
输出：{"推理":"guangdong指广东。futian district指广东深圳的福田区，所属地级行政是深圳市，全称深圳市，简称深圳。福田区全称福田区，简称福田","省":"广东","市全称":"深圳市","市简称":"深圳","区全称":"福田区","区简称":"福田"}||{"推理":"jiangsu指江苏。Su zhou shi指江苏的苏州市，苏州市是地级行政区，全称苏州市，简称苏州。输入没有提供地区名，填充为null。","省":"江苏","市全称":"苏州市","市简称":"苏州","区全称":"null","区简称":"null"}||{"推理":"hebei是指河北。jinzhou是指河北的晋州市, 晋州市是县级市, 

In [24]:
# Split the raw LLM reply into one JSON string per area, the same way clean_batch does.
cleaned_area_json_str_list = cleaned_area_chinese.split("||")
cleaned_area_json_str_list

['\n\n{"推理":"sichuan指四川。barkam指四川的马尔康市，是县级市，所属地级行政区是阿坝藏族羌族自治州，全称阿坝藏族羌族自治州，简称阿坝。马尔康市全称马尔康市，简称马尔康，填充到区。","省":"四川","市全称":"阿坝藏族羌族自治州","市简称":"阿坝","区全称":"马尔康市","区简称":"马尔康"}',
 '{"推理":"jiangxi指江西。xiongshi无法识别对应的地级行政区或县级市，返回原输入。区填充为null。","省":"江西","市全称":"xiongshi","市简称":"xiongshi","区全称":"null","区简称":"null"}',
 '{"推理":"qinghai指青海。maqên指青海的玛沁县，所属地级行政区是果洛藏族自治州，全称果洛藏族自治州，简称果洛。玛沁县全称玛沁县，简称玛沁，填充到区。","省":"青海","市全称":"果洛藏族自治州","市简称":"果洛","区全称":"玛沁县","区简称":"玛沁"}',
 '{"推理":"guangxi zhuang指广西。rong county指广西的容县，所属地级行政区是玉林市，全称玉林市，简称玉林。容县全称容县，简称容，填充到区。","省":"广西","市全称":"玉林市","市简称":"玉林","区全称":"容县","区简称":"容"}']

In [25]:
# Smoke test of the pinyin prompt on the four cleaned records from the previous cell.
cleaned_area_pinyin = area_clean_1.area_to_pinyin(cleaned_area_json_str_list)
cleaned_area_pinyin


以json格式提供多个中国省、市、区的中文，以||分隔，请返回其对应的汉语拼音和翻译过程。

要求：1.按照顺序将中国行政区划名转换成汉语拼音，省对应province，市全称对应city，市简称对应cityshort，区全称对应town，区简称对应townshort
     2.返回的拼音中不包含空格和特殊符号
     3.如果遇到多音字，请根据该地名的读音来转换
     4.输入如果是null或其他非中文的字符，该项返回null
     5.结果以json字符串格式返回，中间以||分隔，不要包含任何标识符和换行符

以下是示例：
输入：{"省":"河北","市全称":"邢台市","市简称":"邢台","区全称":"信都区","区简称":"信都"}||{"省":"安徽","市全称":"亳州市","市简称":"亳州","区全称":"null","区简称":"null"}||{"省":"江西","市全称":"nanchangshi","市简称":"nanchang","区全称":"no data","区简称":"no data"}
输出：{"翻译过程":"河北拼音hebei，邢台市拼音xingtaishi，邢台拼音xingtai，信都区拼音xinduqu，信都拼音xindu","province":"hebei","city":"xingtaishi","cityshort":"xingtai","town":"xinduqu","townshort":"xindu"}||{"翻译过程":"安徽拼音anhui，亳州市的亳是多音字，用于该地名通常念bo，所以亳州市拼音bozhoushi，亳州拼音bozhou，null非中文返回null，null非中文返回null","province":"anhui","city":"bozhoushi","cityshort":"bozhou","town":"null","townshort":"null"}||{"翻译过程":"江西拼音jiangxi，nanchangshi非中文返回null，nanchang非中文返回null，no data非中文返回null，no data非中文返回null","province":"jiangxi","city":"null","cityshort":"null","town":"nu

'\n\n{"翻译过程":"四川拼音sichuan，阿坝藏族羌族自治州拼音abazangzuqiangzuzizhizhou，阿坝拼音aba，马尔康市拼音maerkangshi，马尔康拼音maerkang","province":"sichuan","city":"abazangzuqiangzuzizhizhou","cityshort":"aba","town":"maerkangshi","townshort":"maerkang"}||{"翻译过程":"江西拼音jiangxi，xiongshi非中文返回null，xiongshi非中文返回null，null非中文返回null，null非中文返回null","province":"jiangxi","city":"null","cityshort":"null","town":"null","townshort":"null"}||{"翻译过程":"青海拼音qinghai，果洛藏族自治州拼音guoluozangzuzizhizhou，果洛拼音guoluo，玛沁县拼音maqinxian，玛沁拼音maqin","province":"qinghai","city":"guoluozangzuzizhizhou","cityshort":"guoluo","town":"maqinxian","townshort":"maqin"}||{"翻译过程":"广西拼音guangxi，玉林市拼音yulinshi，玉林拼音yulin，容县拼音rongxian，容拼音rong","province":"guangxi","city":"yulinshi","cityshort":"yulin","town":"rongxian","townshort":"rong"}'

In [9]:
# Round 1 full run: clean -> pinyin -> decode -> match -> save.
# Per-batch pickles go to bin_path; the result CSVs are written into results_path.
area_clean_1.clean_and_trans_batch(bin_path='results/final_class/round_1', results_path='results', batch_number=10)

area个数：19, 每批次10个, 共2个批次

以json格式提供多个中国省和市的拼音，以||分隔，其中可能有错误，请按要求的格式返回正确的省、市、区的全称、简称和推理过程。
现根据输入确定省市区的全称，再根据全称获取简称。如果无法完全确定答案，请不要进行推测和更正，直接返回原输入。

要求：1.输出的省是中国省份名或直辖市，市是中国地级行政区或直辖市或省直辖县级市，区为地级行政区以下的区域（包含县级市和区等），如果根据输入无法确定区，则返回null
     2.返回的省、市全称、市简称、区全称、区简称不要包含标点符号和空格，推理中不包含单引号
     3.返回的市简称、区简称，不要包含省、市、自治州、旗等后缀，如果是盟，则包含后缀
     4.如果输入市为区县名、县级市或街道等，根据输入的省将找到该地区隶属的地级行政区并填充到市，将原输入的市处理后填充到区
     5.如果输入市为no data或其他无法识别的信息，返回原输入
     6.返回的区一定要隶属于市，市一定要隶属于省，如果根据输入可以推断则填写，否则返回原输入
     7.结果以json字符串格式返回，中间以||分隔，不要包含任何标识符和换行符

以下是示例：
输入：{"省":"guangdong","市":"futian district"}||{"省":"jiangsu","市":"Su zhou shi"}||{"省":"hebei","市":"jinzhou"}||{"省":"hebei","市":"miyun district"}||{"省":"sichuan","市":"barkam"}
输出：{"推理":"guangdong指广东。futian district指广东深圳的福田区，所属地级行政是深圳市，全称深圳市，简称深圳。福田区全称福田区，简称福田","省":"广东","市全称":"深圳市","市简称":"深圳","区全称":"福田区","区简称":"福田"}||{"推理":"jiangsu指江苏。Su zhou shi指江苏的苏州市，苏州市是地级行政区，全称苏州市，简称苏州。输入没有提供地区名，填充为null。","省":"江苏","市全称":"苏州市","市简称":"苏州","区全称":"null","区简称":"null"}||{"推理":"hebei是指河北。j

In [23]:
# Pull the two result frames that the round-1 run stored on the instance.
df_fail = area_clean_1.df_fail
df_success = area_clean_1.df_success

In [24]:
# Display the round-1 success frame.
df_success

Unnamed: 0,zstate1,zcity3,推理,省,市全称,市简称,区全称,区简称,翻译过程,province,...,town,townshort,provinceshortname,provincename,cityname,cityshortname,citypinyin,cityshortpinyin,provincepinyin,rn
0,hebei,lutai district,hebei指河北。lutai district应为唐山市芦台区，所属地级市为唐山市，全称唐山...,河北,唐山市,唐山,芦台区,芦台,河北拼音hebei，唐山市拼音tangshanshi，唐山拼音tangshan，芦台区拼音l...,hebei,...,lutai qu,lutai,河北省,河北,唐山市,唐山,tangshanshi,Tangshan,Hebei,1
1,heilongjiang,wucui district,heilongjiang指黑龙江。wucui district可能为乌翠区，所属地级市是伊春...,黑龙江,伊春市,伊春,乌翠区,乌翠,黑龙江拼音heilongjiang，伊春市拼音yichunshi，伊春拼音yichun，乌翠...,heilongjiang,...,wucuiqu,wucui,黑龙江省,黑龙江,伊春市,伊春,yichunshi,Yichun,Heilongjiang,1
2,henan,xinyan,henan指河南。xinyan可能指信阳市的新县，信阳市是地级市简称信阳，新县全称新县简称新县。,河南,信阳市,信阳,新县,新县,河南拼音henan，信阳市拼音xinyangshi，信阳拼音xinyang，新县拼音xinx...,henan,...,xinxian,xinxian,河南省,河南,信阳市,信阳,xinyangshi,Xinyang,Henan,1


In [26]:
# Display the round-1 fail frame.
df_fail

Unnamed: 0,zstate1,zcity3,推理,省,市全称,市简称,区全称,区简称,翻译过程,province,...,cityshort,town,townshort,provinceshortname,provincename,cityname,cityshortname,citypinyin,cityshortpinyin,provincepinyin
0,No Data,No Data,省输入No Data，无法确定，返回原输入。市输入No Data，无法确定，返回原输入。区无...,No Data,No Data,No Data,,,省No Data非中文返回null，市全称No Data非中文返回null，市简称No Da...,,...,,,,,,,,,,
1,hubei,shaodong,hubei指湖北。shaodong在湖北省内无对应地级行政区，无法确定隶属关系，返回原输入。,湖北,shaodong,shaodong,,,湖北拼音hubei，市全称shaodong非中文返回null，市简称shaodong非中文返...,hubei,...,,,,,,,,,,
2,jiangxi,zhancai,jiangxi指江西。zhancai在江西省内无对应地级行政区，无法确定隶属关系，返回原输入。,江西,zhancai,zhancai,,,江西拼音jiangxi，市全称zhancai非中文返回null，市简称zhancai非中文返...,jiangxi,...,,,,,,,,,,
3,chongqing,No Data,chongqing是直辖市。市输入No Data无法识别，返回原输入。区无数据返回null。,重庆,No Data,No Data,,,重庆拼音chongqing，市全称No Data非中文返回null，市简称No Data非中...,chongqing,...,,,,,,,,,,
4,shandong,jiulinzi,shandong指山东。jiulinzi在山东省内无对应地级行政区，无法确定隶属关系，返回原输入。,山东,jiulinzi,jiulinzi,,,山东拼音shandong，市全称jiulinzi非中文返回null，市简称jiulinzi非...,shandong,...,,,,,,,,,,
5,shaanxi,shagedu,shaanxi指陕西。shagedu在陕西省内无对应地级行政区，无法确定隶属关系，返回原输入。,陕西,shagedu,shagedu,,,陕西拼音shaanxi(注意与山西shānxī区分)，市全称shagedu非中文返回null...,shaanxi,...,,,,,,,,,,
6,zhejiang,zhuangshi,zhejiang指浙江。zhuangshi在浙江省内无对应地级行政区，无法确定隶属关系，返回...,浙江,zhuangshi,zhuangshi,,,浙江拼音zhejiang，市全称zhuangshi非中文返回null，市简称zhuangsh...,zhejiang,...,,,,,,,,,,
7,hebei,tanghaizhensi team,hebei指河北。tanghaizhensi team无法对应有效行政区，返回原输入。区无数...,hebei,tanghaizhensi team,tanghaizhensi team,,,省hebei非中文返回null，市全称tanghaizhensi team非中文返回null...,,...,,,,,,,,,,
8,heilongjiang,horqin district,heilongjiang指黑龙江。horqin district属于内蒙古通辽市，与黑龙江省...,heilongjiang,horqin district,horqin district,,,省heilongjiang非中文返回null，市全称horqin district非中文返回...,,...,,,,,,,,,,
9,hebei,xugezhuang,hebei指河北。xugezhuang可能指徐各庄镇，属于唐山市丰润区，但无法确定地级行政区...,河北,xugezhuang,xugezhuang,,,河北拼音hebei，市全称xugezhuang非中文返回null，市简称xugezhuang...,hebei,...,,,,,,,,,,


# 二次清洗

In [38]:
# Round 2: feed round 1's df_fail back in as the input, this time with the "doubao" backend.
area_clean_2 = area_clean(df_fail,'doubao')

In [39]:
# Round 2 full run.
# NOTE(review): results_path is 'results/final_class' here but 'results' in round 1, so this
# round's successes are appended to results/final_class/cleaned_area.csv, whereas
# area_clean.__init__ reads results/cleaned_area.csv. Confirm this is intended.
area_clean_2.clean_and_trans_batch(bin_path='results/final_class/round_2', results_path='results/final_class', batch_number=20)

area个数：2, 每批次20个, 共1个批次
clean批次1完成
area个数：2, 每批次20个, 共1个批次
翻译批次1完成


In [41]:
# Keep round 2's fail frame as the input for round 3.
df_fail2 = area_clean_2.df_fail

In [42]:
# Display the round-2 fail frame.
df_fail2

Unnamed: 0,province_clean,zstate1,zcity3,推理,省,市全称,市简称,区全称,区简称,翻译过程,...,cityshort,town,townshort,provinceshortname,provincename,cityname,cityshortname,citypinyin,cityshortpinyin,provincepinyin
0,inner mongolia,inner mongolia,ud district,inner mongolia指内蒙古。ud district在内蒙古内无法确定具体地区，省和...,内蒙古,ud district,ud district,,,内蒙古拼音neimenggu，ud district非中文返回null，ud distric...,...,,,,,,,,,,
1,heilongjiang,heilongjiang,hondlon district,heilongjiang指黑龙江。hondlon district在黑龙江内无法确定具体地区...,黑龙江,hondlon district,hondlon district,,,黑龙江拼音heilongjiang，hondlon district非中文返回null，ho...,...,,,,,,,,,,


# 三次清洗

In [43]:
# Round 3: same procedure as round 2, applied to round 2's fail frame ("doubao" backend again).
# NOTE(review): rounds 2 and 3 are copy-pasted; a loop over rounds would remove the duplication.
area_clean_3 = area_clean(df_fail2, 'doubao')
area_clean_3.clean_and_trans_batch(bin_path='results/final_class/round_3', results_path='results/final_class', batch_number=20)
df_fail3 = area_clean_3.df_fail


area个数：2, 每批次20个, 共1个批次
clean批次1完成
area个数：2, 每批次20个, 共1个批次
翻译批次1完成


In [44]:
# Display the round-3 fail frame.
df_fail3

Unnamed: 0,province_clean,zstate1,zcity3,推理,省,市全称,市简称,区全称,区简称,翻译过程,...,cityshort,town,townshort,provinceshortname,provincename,cityname,cityshortname,citypinyin,cityshortpinyin,provincepinyin
0,inner mongolia,inner mongolia,ud district,inner mongolia指内蒙古自治区。ud district在内蒙古自治区内无法确定具...,内蒙古自治区,ud district,ud district,,,内蒙古自治区拼音neimengguzizhiqu，ud district非中文返回null，...,...,,,,,,,,,,
1,heilongjiang,heilongjiang,hondlon district,heilongjiang指黑龙江。hondlon district在黑龙江省内无法确定具体地...,黑龙江,hondlon district,hondlon district,,,黑龙江拼音heilongjiang，hondlon district非中文返回null，ho...,...,,,,,,,,,,
