In [None]:
import pandas as pd
from urllib.request import urlopen
from collections import Counter
from pypinyin import lazy_pinyin, pinyin

# Raise the pandas display limit so cell outputs may render up to this many rows.
MAX_DISPLAY_ROWS = 500
pd.set_option('display.max_rows', MAX_DISPLAY_ROWS)

In [None]:
# Remote gender-labelled name corpus. The first three lines of the file are
# skipped before parsing; later cells rely on the resulting `dict` and `sex`
# columns.
MING_CORPUS_URL = "https://raw.githubusercontent.com/wainshine/Chinese-Names-Corpus/master/Chinese_Names_Corpus/Chinese_Names_Corpus_Gender%EF%BC%88120W%EF%BC%89.txt"
df_ming = pd.read_csv(MING_CORPUS_URL, skiprows=3)

In [None]:
# Drop entries whose gender is "未知" (unknown). The explicit .copy() makes the
# result an independent frame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
df_ming = df_ming[df_ming["sex"] != "未知"].copy()

In [None]:
# Preview the first five rows of the filtered corpus.
df_ming.head(n=5)

In [None]:
def _expand_given_name(given_name):
    """Split a given name into a list of its characters.

    A one-character name is repeated so that it yields two entries; any other
    name is returned as the list of its characters.
    """
    if len(given_name) == 1:
        return [given_name] * 2
    return list(given_name)


# `.str[1:]` drops the first character of each entry (presumably a
# single-character family name -- TODO confirm against the corpus).
# `assign` returns a new frame instead of writing a column into a filtered
# slice, which avoids pandas' chained-assignment warning.
df_ming = df_ming.assign(
    name_expanded=df_ming["dict"].str[1:].map(_expand_given_name)
)
df_ming

In [None]:
# Remote Excel workbook with the family-name list (downloaded as raw bytes via `?raw=true`).
XING_CORPUS_URL = "https://github.com/wainshine/Chinese-Names-Corpus/blob/master/Chinese_Names_Corpus/Chinese_Family_Name%EF%BC%881k%EF%BC%89.xlsx?raw=true"
df_xing = pd.read_excel(XING_CORPUS_URL)

In [None]:
# Only family names whose frequency value is strictly above this threshold are kept.
# NOTE(review): the unit of the frequency column is not visible here -- confirm.
MIN_XING_FREQUENCY = 6

# Give the two workbook columns stable names.
df_xing.columns = ["xing", "frequency"]
# .copy() makes the filtered result an independent frame, so the column
# assignments in later cells do not raise SettingWithCopyWarning.
df_xing = df_xing[df_xing["frequency"] > MIN_XING_FREQUENCY].copy()
df_xing.head()

In [None]:
# Split the expanded given names by gender label ("男" = male, "女" = female).
is_male = df_ming["sex"] == "男"
is_female = df_ming["sex"] == "女"
names_list_m = df_ming.loc[is_male, "name_expanded"]
names_list_f = df_ming.loc[is_female, "name_expanded"]

In [None]:
# Display the expanded given names selected for the "男" rows as a sanity check.
names_list_m

In [None]:
# For each gender key, build two Counters: index 0 tallies the first element
# of every expanded name, index 1 tallies the second element.
char_counter = {
    gender_key: [
        Counter(expanded[slot] for expanded in expanded_names)
        for slot in (0, 1)
    ]
    for gender_key, expanded_names in (("m", names_list_m), ("f", names_list_f))
}

In [None]:
# Flatten the per-gender, per-position Counters into one long table with a row
# per (gender, position, character). Rows are generated in the order
# gender -> position -> Counter insertion order, then sorted: gender
# descending, position ascending, count descending.
char_rows = [
    (gender_key, tally, character, slot)
    for gender_key in ("m", "f")
    for slot in (1, 2)
    for character, tally in char_counter[gender_key][slot - 1].items()
]

df = pd.DataFrame(
    char_rows,
    columns=["gender", "count", "char", "position"],
).sort_values(
    by=["gender", "position", "count"],
    ascending=[False, True, False],
    ignore_index=True,
)
df

In [None]:
# Attach romanisations for every character: `pinyin` takes the first element
# returned by lazy_pinyin, `pinyin_tone` the first reading returned by pinyin.
# The `count` column was only needed for the sort in the previous cell.
# Built as a non-mutating chain so re-running the cell does not fail with a
# KeyError on an already-dropped `count` column.
df = df.assign(
    pinyin=df["char"].map(lambda char: lazy_pinyin(char)[0]),
    pinyin_tone=df["char"].map(lambda char: pinyin(char)[0][0]),
).drop(columns="count", errors="ignore")
df

In [None]:
# Write the given-name character table to disk, leaving out the row index.
df.to_csv(path_or_buf="ming_chars.csv", index=False)

In [None]:
# Normalise the family-name table in one non-mutating chain:
#   * rename `frequency` to `count` and cast it to int,
#   * `pinyin`: first element returned by lazy_pinyin for the surname,
#   * `pinyins_tone`: every reading pypinyin reports (heteronym=True) for the
#     first character of the surname.
# Writing this as rename + assign avoids attribute-style column assignment on
# a filtered slice (SettingWithCopyWarning), and a re-run of this cell no
# longer fails because `frequency` was already renamed.
df_xing = (
    df_xing
    .rename(columns={"frequency": "count"})
    .assign(
        count=lambda frame: frame["count"].astype("int"),
        pinyin=lambda frame: frame["xing"].map(lambda xing: lazy_pinyin(xing)[0]),
        pinyins_tone=lambda frame: frame["xing"].map(
            lambda xing: pinyin(xing, heteronym=True)[0]
        ),
    )
)
df_xing

In [None]:
# Show the surnames for which the number of listed readings is not exactly one.
reading_counts = df_xing["pinyins_tone"].str.len()
df_xing.loc[reading_counts.ne(1)]

In [None]:
# Manually specified readings, one (surname, plain pinyin, tone-marked pinyin)
# record per row. Presumably these are the surname pronunciations chosen for
# characters with several readings -- verify against a reference if the list
# is extended.
_XING_READINGS = [
    ("曾", "zeng", "zēng"),
    ("肖", "xiao", "xiāo"),
    ("任", "ren", "rén"),
    ("葛", "ge", "gě"),
    ("覃", "qin", "qín"),
    ("翟", "zhai", "zhái"),
    ("宁", "ning", "nìng"),
    ("单", "shan", "shàn"),
    ("纪", "ji", "jǐ"),
    ("华", "hua", "huà"),
    ("解", "xie", "xiè"),
    ("缪", "miao", "miào"),
    ("卜", "bu", "bǔ"),
    ("仇", "qiu", "qiú"),
    ("查", "zha", "zhā"),
    ("鞠", "ju", "jú"),
    ("乐", "yue", "yuè"),
    ("阚", "kan", "kàn"),
    ("区", "ou", "ōu"),
    ("朴", "piao", "piáo"),
    ("盖", "ge", "gě"),
    ("占", "zhan", "zhān"),
    ("都", "du", "dū"),
    ("阙", "que", "què"),
    ("薄", "bo", "bó"),
    ("那", "na", "nā"),
    ("尉", "yu", "yù"),
    ("郗", "chi", "chī"),
]
df_xing_multi_pinyins = pd.DataFrame(
    _XING_READINGS,
    columns=["xing", "pinyin", "pinyin_tone"],
)
df_xing_multi_pinyins

In [None]:
# Left-join the manual readings onto the surname table. The overlapping
# `pinyin` column from the left side is suffixed `_l`; the right-hand (manual)
# values keep the plain names. `validate` makes the merge raise if a surname
# appears more than once in the manual table, which would duplicate rows.
df_xing = df_xing.merge(
    df_xing_multi_pinyins,
    on="xing",
    how="left",
    suffixes=("_l", ""),
    validate="many_to_one",
)
# Where no manual reading exists, fall back to the first automatically listed
# tone-marked reading and to the automatically derived plain pinyin.
# Assigning the result back (instead of `df[col].fillna(..., inplace=True)`)
# avoids chained in-place assignment, which is deprecated and has no effect
# under pandas Copy-on-Write.
df_xing["pinyin_tone"] = df_xing["pinyin_tone"].fillna(df_xing["pinyins_tone"].str[0])
df_xing["pinyin"] = df_xing["pinyin"].fillna(df_xing["pinyin_l"])
# Drop the helper columns; the final table has xing, pinyin and pinyin_tone.
df_xing = df_xing.drop(columns=["count", "pinyin_l", "pinyins_tone"])
df_xing

In [None]:
# Write the surname table to disk, leaving out the row index.
df_xing.to_csv(path_or_buf="xings.csv", index=False)