In [None]:
import time
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import itertools
import collections
import matplotlib.pyplot as plt

# Load the raw corpus
corpus_path = "Chinese_Names_Corpus_Gender（120W）.txt"
# header=2: the third line of the file supplies the column names
df = pd.read_csv(corpus_path, header=2)
df

In [None]:
# Discard rows whose gender is recorded as "未知" (unknown)
df = df.loc[df["sex"] != "未知"]
names = df["dict"].values

# Count how often each single character occurs across all names
chars = list(map(list, names))
chars_flatten = list(itertools.chain.from_iterable(chars))
freq = (
    pd.DataFrame(collections.Counter(chars_flatten).items(), columns=["char", "freq"])
    .sort_values(by="freq", ascending=False)
)
freq

In [None]:
# Rank-frequency curve of single characters.
# `freq` is sorted by descending count, so rank 0 is the most frequent character.
char_rank = np.arange(freq.shape[0])
char_freq = freq["freq"].values
plt.plot(char_rank, char_freq)
# Label the axes so the figure can be read without the surrounding code.
plt.xlabel("character rank")
plt.ylabel("occurrences")
plt.title("Character frequency by rank")
plt.show()

In [None]:
# Keep only the names made up entirely of the most common characters
dict_size = 500
# NOTE: `dict` shadows the builtin for the rest of the notebook. The name is kept
# because char2index reads it as a global.
dict = list(freq["char"].values[:dict_size])
dict_set = set(dict)
# (row position, name) pairs for names whose characters are all in the vocabulary
filtered = [(idx, name) for idx, name in enumerate(names) if dict_set.issuperset(name)]
ind = [idx for idx, _ in filtered]
# Take an explicit copy so that adding the label column below modifies an
# independent frame and does not raise SettingWithCopyWarning.
dat = df.iloc[ind].copy()
# Binary target: 0 for "男" (male), 1 for every other remaining value
# (presumably female, since unknown-gender rows were removed earlier).
dat["y"] = np.where(dat["sex"] == "男", 0, 1)
dat

In [None]:
# Train/test split
train = dat.sample(n=10000, random_state=123)
# Sample the test rows only from rows that were NOT drawn for training.
# Sampling both sets independently from `dat` lets the same row land in both,
# which inflates the reported test accuracy.
test = dat.drop(index=train.index).sample(n=1000, random_state=321)

# One-hot encoding helpers
def char2index(char):
    """Return the position of `char` in the global vocabulary list `dict`.

    Raises ValueError (from ``list.index``) when the character is not in the list.
    """
    position = dict.index(char)
    return position

def name2index(name):
    """Map every character of `name` to its vocabulary index, in order."""
    return list(map(char2index, name))

def name2tensor(name):
    """One-hot encode `name` as a tensor of shape (len(name), 1, dict_size).

    Row `i` holds a single 1 at the vocabulary index of the i-th character;
    all other entries are 0.
    """
    encoded = torch.zeros(len(name), 1, dict_size)
    for step, column in enumerate(map(char2index, name)):
        encoded[step, 0, column] = 1
    return encoded

# Sanity check: show the vocabulary index of the character "李"
char2index("李")

In [None]:
# Sanity check: vocabulary indices of each character in a two-character name
name2index("李兴")

In [None]:
# Sanity check: one-hot tensor for the same name, shape (2, 1, dict_size)
name2tensor("李兴")

In [None]:
# Model definition
class RNN(nn.Module):
    """Minimal recurrent cell with a scalar sigmoid read-out.

    One call to ``forward`` consumes one time step: the step's input is
    concatenated with the previous hidden state, passed through a linear
    layer and tanh to form the new hidden state, and that state is projected
    to a single value in (0, 1) via a linear layer and sigmoid.
    """

    def __init__(self, input_size, hidden_size):
        """Create the two linear layers.

        input_size:  width of the per-step input vector.
        hidden_size: width of the hidden state.
        """
        super().__init__()
        self.hidden_size = hidden_size
        # i2h is created before h2o so that seeded parameter initialisation
        # draws random numbers in the same order as before.
        self.i2h = nn.Linear(in_features=input_size + hidden_size, out_features=hidden_size)
        self.h2o = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, input, hidden):
        """Advance one time step.

        `input` and `hidden` are 2-D tensors that are concatenated along
        dim 1. Returns ``(output, new_hidden)``.
        """
        joined = torch.cat([input, hidden], dim=1)
        new_hidden = self.i2h(joined).tanh()
        prob = self.h2o(new_hidden).sigmoid()
        return prob, new_hidden

    def init_hidden(self):
        """Return an all-zero initial hidden state of shape (1, hidden_size)."""
        return torch.zeros((1, self.hidden_size))

In [None]:
# Smoke-test the model's input and output on a single name
n_hidden = 128
rnn = RNN(dict_size, n_hidden)
input = name2tensor("李兴")
hidden = rnn.init_hidden()
# Feed only the first character; `input` and `next_hidden` are reused by the next cell
output, next_hidden = rnn(input[0], hidden)
print(output)
print(next_hidden)

In [None]:
# Second time step: carry the hidden state forward and feed the second character
hidden = next_hidden
output, next_hidden = rnn(input[1], hidden)
print(output)
print(next_hidden)

In [None]:
# Full training run
np.random.seed(123)
torch.random.manual_seed(123)

n = train.shape[0]
n_hidden = 64
nepoch = 5
bs = 100

rnn = RNN(dict_size, n_hidden)
opt = torch.optim.Adam(rnn.parameters(), lr=0.001)
# Binary cross-entropy averaged over the mini-batch. BCELoss clamps its log
# terms, so a saturated sigmoid output (exactly 0 or 1) no longer produces
# inf/NaN the way a hand-written -y*log(p) - (1-y)*log(1-p) does.
criterion = nn.BCELoss()
train_ind = np.arange(n)
losses = []

t1 = time.time()
for k in range(nepoch):
    # New random sample order every epoch
    np.random.shuffle(train_ind)
    # Mini-batch loop
    for j in range(0, n, bs):
        # Build the mini-batch (the last one may be shorter than bs)
        mb = train.iloc[train_ind[j:(j + bs)]]
        mb_size = mb.shape[0]
        # Pull the columns out once per batch instead of once per sample
        mb_names = mb["dict"].values
        targets = torch.tensor(mb["y"].values, dtype=torch.float32).reshape(-1, 1)
        outputs = []
        # Loop over every name in the mini-batch
        for i in range(mb_size):
            input = name2tensor(mb_names[i])
            hidden = rnn.init_hidden()
            # Loop over every character of the name; only the output after
            # the final character is used as the prediction
            for s in range(input.shape[0]):
                output, hidden = rnn(input[s], hidden)
            outputs.append(output)

        # Each output is (1, 1); stack to (mb_size, 1) to line up with targets
        loss = criterion(torch.cat(outputs, dim=0), targets)
        opt.zero_grad()
        loss.backward()
        opt.step()

        losses.append(loss.item())
        # Report every 10th mini-batch
        if j // bs % 10 == 0:
            print(f"epoch {k}, batch {j // bs}, loss = {loss.item()}")
t2 = time.time()
# Wall-clock training time in seconds
print(t2 - t1)

In [None]:
# Training loss recorded after every mini-batch update
plt.plot(losses)
# Label the axes so the figure can be read without the surrounding code.
plt.xlabel("mini-batch update")
plt.ylabel("mean binary cross-entropy")
plt.title("Training loss")
plt.show()

In [None]:
# Predict on the test set
ntest = test.shape[0]
true_label = test["y"].values
# Pull the name column out once instead of on every iteration
test_names = test["dict"].values
pred = np.zeros(ntest)
rnn.eval()
for i in range(ntest):
    input = name2tensor(test_names[i])
    hidden = rnn.init_hidden()
    with torch.no_grad():
        # Run the whole name through the cell; the final output is the prediction
        for s in range(input.shape[0]):
            output, hidden = rnn(input[s], hidden)
    pred[i] = output.item()
    if i % 100 == 0:
        print(f"processed {i}")
# A float32 sigmoid can return exactly 0.0 or 1.0; log(0) = -inf and
# 0 * -inf = nan would then make the whole mean NaN. Clip the probabilities
# used for the loss only; `pred` itself is left untouched for thresholding.
eps = 1e-7
pred_clipped = np.clip(pred, eps, 1.0 - eps)
loss = -np.mean(true_label * np.log(pred_clipped) + (1.0 - true_label) * np.log(1.0 - pred_clipped))
print(loss)
# Classify as 1 when the predicted probability exceeds 0.5, then report accuracy
pred_label = (pred > 0.5).astype(int)
print(np.mean(pred_label == true_label))

In [None]:
# Draw a random sample of test rows for inspection
np.random.seed(123)
torch.random.manual_seed(123)
# 10 row positions in [0, ntest), drawn with replacement
ind = np.random.choice(ntest, size=10)
# Threshold the predicted probabilities at 0.5 to get 0/1 labels
ypred = np.where(pred[ind] > 0.5, 1, 0)
test.iloc[ind]

In [None]:
# True labels of the sampled rows
print(test["y"].values[ind])

# Predicted labels of the same rows
print(ypred)

In [None]:
# Show how the prediction changes as a name grows character by character.
# Uses its own variable names: rebinding `names` (the corpus array) or `pred`
# (the test-set probability array) would break re-running earlier cells.
demo_names = ["李", "李雪", "李雪峰"]
for name in demo_names:
    input = name2tensor(name)
    hidden = rnn.init_hidden()
    with torch.no_grad():
        # The output after the final character is the prediction for the name
        for s in range(input.shape[0]):
            output, hidden = rnn(input[s], hidden)
    prob = output.item()
    print(f"name: {name}, P(female) = {prob}")