In [1]:
import pandas as pd
from urllib.request import urlopen
from collections import Counter
from pypinyin import lazy_pinyin

In [2]:
# Read the gender-labelled names corpus directly from GitHub, skipping the
# first three lines of the file.
df_ming = pd.read_csv(
    filepath_or_buffer=(
        "https://raw.githubusercontent.com/wainshine/Chinese-Names-Corpus/master/"
        "Chinese_Names_Corpus/Chinese_Names_Corpus_Gender%EF%BC%88120W%EF%BC%89.txt"
    ),
    skiprows=3,
)

In [3]:
# Keep only the rows whose `sex` value is something other than "未知".
df_ming = df_ming.loc[df_ming["sex"].ne("未知")]

In [4]:
# Preview the first rows of the names table after the "未知" rows were filtered out.
df_ming.head()

Unnamed: 0,dict,sex
0,阿安,男
2,阿斌,男
3,阿滨,男
4,阿冰,女
5,阿冰冰,女


In [5]:
# Drop the first character of each full name (treated here as the surname) and
# expand the remainder into a list of characters. A single remaining character
# is duplicated so that it fills both positions.
# NOTE(review): a compound (two-character) surname would leave its second
# character in the given name -- confirm the corpus has none, or strip
# surnames using the surname list instead.
# `assign` builds a new frame rather than writing a column into the frame that
# came out of boolean filtering, which avoids pandas' SettingWithCopyWarning.
df_ming = df_ming.assign(
    name_expanded=df_ming["dict"].str[1:].apply(
        lambda name: [name] * 2 if len(name) == 1 else list(name)
    )
)
df_ming

Unnamed: 0,dict,sex,name_expanded
0,阿安,男,"[安, 安]"
2,阿斌,男,"[斌, 斌]"
3,阿滨,男,"[滨, 滨]"
4,阿冰,女,"[冰, 冰]"
5,阿冰冰,女,"[冰, 冰]"
...,...,...,...
1145004,佐腾,男,"[腾, 腾]"
1145005,佐威,男,"[威, 威]"
1145006,佐为,男,"[为, 为]"
1145007,佐樱,女,"[樱, 樱]"


In [6]:
# Read the family-name workbook directly from GitHub (`?raw=true` requests the raw file).
df_xing = pd.read_excel(
    io=(
        "https://github.com/wainshine/Chinese-Names-Corpus/blob/master/"
        "Chinese_Names_Corpus/Chinese_Family_Name%EF%BC%881k%EF%BC%89.xlsx?raw=true"
    )
)

In [7]:
# Name the two columns of the surname sheet, then keep only the surnames whose
# frequency is greater than 6.
df_xing.columns = ["xing", "frequency"]
df_xing = df_xing.loc[df_xing["frequency"].gt(6)]
df_xing.head()

Unnamed: 0,xing,frequency
0,王,9520.0
1,李,9340.0
2,张,8960.0
3,刘,6770.0
4,陈,6130.0


In [8]:
# Split the expanded given names by recorded sex: "男" -> *_m, "女" -> *_f.
names_list_m = df_ming.loc[df_ming["sex"].eq("男"), "name_expanded"]
names_list_f = df_ming.loc[df_ming["sex"].eq("女"), "name_expanded"]

In [9]:
# Inspect the Series of expanded given names selected for sex == "男".
names_list_m

0          [安, 安]
2          [斌, 斌]
3          [滨, 滨]
6          [兵, 兵]
8          [超, 超]
            ...  
1145003    [山, 山]
1145004    [腾, 腾]
1145005    [威, 威]
1145006    [为, 为]
1145008    [子, 子]
Name: name_expanded, Length: 675460, dtype: object

In [10]:
# For each gender key, build two Counters: one over the characters at index 0
# of every expanded name and one over the characters at index 1.
char_counter = {
    gender_key: [
        Counter(chars[position] for chars in expanded_names)
        for position in (0, 1)
    ]
    for gender_key, expanded_names in (("m", names_list_m), ("f", names_list_f))
}

In [11]:
# Flatten the per-gender, per-position counters into one row per
# (gender, position, character), visiting the counters in the order
# m/0, m/1, f/0, f/1.
records = [
    (gender, count, char, position + 1)  # positions are reported 1-based
    for gender in ("m", "f")
    for position in range(2)
    for char, count in char_counter[gender][position].items()
]

df = pd.DataFrame(
    records,
    columns=["gender", "count", "char", "position"],
).sort_values(
    by=["gender", "position", "count"],
    # "m" before "f", position 1 before 2, largest count first.
    ascending=[False, True, False],
    ignore_index=True,
)
df

Unnamed: 0,gender,count,char,position
0,m,13711,文,1
1,m,12760,志,1
2,m,11750,国,1
3,m,11692,建,1
4,m,11484,永,1
...,...,...,...,...
6070,f,1,烁,2
6071,f,1,师,2
6072,f,1,岸,2
6073,f,1,而,2


In [12]:
# Add a romanised form for every character: call lazy_pinyin on the character
# and keep the first element of its result.
df["pinyin"] = df["char"].map(lambda ch: lazy_pinyin(ch)[0])
df

Unnamed: 0,gender,count,char,position,pinyin
0,m,13711,文,1,wen
1,m,12760,志,1,zhi
2,m,11750,国,1,guo
3,m,11692,建,1,jian
4,m,11484,永,1,yong
...,...,...,...,...,...
6070,f,1,烁,2,shuo
6071,f,1,师,2,shi
6072,f,1,岸,2,an
6073,f,1,而,2,er


In [13]:
# Write the character table to ming_chars.csv, leaving out the index column.
df.to_csv(path_or_buf="ming_chars.csv", index=False)

In [14]:
# Finalise the surname table: rename `frequency` to `count`, cast it to int,
# and add a `pinyin` column.
#
# The rename comes first and everything is done in one chain that returns a new
# frame, so the cell can be re-run safely: `rename` ignores labels that are
# already gone by default, and no step reads the old `frequency` attribute.
# Building a new frame also avoids writing columns into the frame produced by
# the earlier boolean filter, which triggers pandas' SettingWithCopyWarning.
#
# NOTE(review): `lazy_pinyin(xing)[0]` keeps only the first element of the
# result, so a multi-character surname would get just its first syllable --
# confirm whether that is intended.
df_xing = (
    df_xing
    .rename(columns={"frequency": "count"})
    .assign(
        count=lambda frame: frame["count"].astype("int"),
        pinyin=lambda frame: frame["xing"].apply(lambda xing: lazy_pinyin(xing)[0]),
    )
)
df_xing

Unnamed: 0,xing,count,pinyin
0,王,9520,wang
1,李,9340,li
2,张,8960,zhang
3,刘,6770,liu
4,陈,6130,chen
...,...,...,...
417,濮,7,pu
418,水,7,shui
419,蔚,7,wei
420,郗,7,xi


In [15]:
# Write the surname table to xings.csv, leaving out the index column.
df_xing.to_csv(path_or_buf="xings.csv", index=False)