In [24]:
from io import open
import glob
import os
def findFiles(path): return glob.glob(path)
findFiles('../input/multilingual-names/*.txt')

['../input/multilingual-names/Vietnamese.txt',
 '../input/multilingual-names/Greek.txt',
 '../input/multilingual-names/Japanese.txt',
 '../input/multilingual-names/Dutch.txt',
 '../input/multilingual-names/Irish.txt',
 '../input/multilingual-names/Russian.txt',
 '../input/multilingual-names/Korean.txt',
 '../input/multilingual-names/Scottish.txt',
 '../input/multilingual-names/Czech.txt',
 '../input/multilingual-names/Italian.txt',
 '../input/multilingual-names/Arabic.txt',
 '../input/multilingual-names/Portuguese.txt',
 '../input/multilingual-names/Spanish.txt',
 '../input/multilingual-names/Chinese.txt',
 '../input/multilingual-names/French.txt',
 '../input/multilingual-names/English.txt',
 '../input/multilingual-names/German.txt',
 '../input/multilingual-names/Polish.txt']

In [25]:
import unicodedata
import string
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)
# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )
unicodeToAscii('Ślusàrski')

'Slusarski'

In [26]:
# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles('../input/multilingual-names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)

In [27]:
category_lines['Chinese'][:5]

['Ang', 'AuYong', 'Bai', 'Ban', 'Bao']

In [28]:
def split_line(name:str):
    return list(name)

split_line("Slusarski")

['S', 'l', 'u', 's', 'a', 'r', 's', 'k', 'i']

In [29]:
data_pairs = []
for category_name, lines in category_lines.items():
    for name in lines:
        data_pairs.append((split_line(name), category_name))

data_pairs[:5]

[(['N', 'g', 'u', 'y', 'e', 'n'], 'Vietnamese'),
 (['T', 'r', 'o', 'n'], 'Vietnamese'),
 (['L', 'e'], 'Vietnamese'),
 (['P', 'h', 'a', 'm'], 'Vietnamese'),
 (['H', 'u', 'y', 'n', 'h'], 'Vietnamese')]

In [30]:
import random
random.seed(114514)

index = list(range(len(data_pairs)))

random.shuffle(index)

valid_index = index[:2000]
test_index = index[2000:4000]
train_index = index[4000:]

valid_pairs = [data_pairs[i] for i in valid_index]
test_pairs = [data_pairs[i] for i in test_index]
train_pairs = [data_pairs[i] for i in train_index]

valid_pairs[:5]

[(['L', 'e', 'z', 'h', 'e', 'b', 'o', 'k', 'o', 'v'], 'Russian'),
 (['R', 'a', 'k', 'h', 'm', 'a', 'n', 'i', 'n', 'o', 'v'], 'Russian'),
 (['A', 'v', 'e', 'r', 'k', 'o', 'v', 'i', 'c', 'h'], 'Russian'),
 (['T', 'a', 'l', 'o', 'v', 'i', 'r', 'k', 'o'], 'Russian'),
 (['D', 'j', 'u', 'm', 'a', 'e', 'v'], 'Russian')]

In [31]:
class WordDict:
    def __init__(self) -> None:
        self.index2word = {}
        self.word2index = {}
        self.dict_size = 0
    def add_word(self, word:str):
        if word not in self.word2index:
            self.word2index[word] = self.dict_size
            self.index2word[self.dict_size] = word
            self.dict_size += 1
    def index(self, word:str):
        if word in self.word2index:
            return self.word2index[word]
        else:
            return -1
    def word(self, index:int):
        if index in self.index2word:
            return self.index2word[index]
        else:
            return "<unk>"
name_dict = WordDict()
category_dict = WordDict()

name_dict.add_word("<pad>")
for name, category_name in data_pairs:
    for char in name:
        name_dict.add_word(char)
    category_dict.add_word(category_name)

In [32]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from typing import Tuple, List

class NameDataset(Dataset):
    def __init__(self, pairs:Tuple[List[str], str], name_dict:WordDict, category_dict:WordDict) -> None:
        super().__init__()
        self.pairs = pairs
        self.name_dict = name_dict
        self.category_dict = category_dict

    def __getitem__(self, index):
        name, category = self.pairs[index]
        x = torch.LongTensor([self.name_dict.index(char) for char in name])
        y = torch.LongTensor([self.category_dict.index(category)])
        return (x,y)

    def __len__(self):
        return len(self.pairs)
valid_dataset = NameDataset(valid_pairs, name_dict, category_dict)
test_dataset = NameDataset(test_pairs, name_dict, category_dict)
train_dataset = NameDataset(train_pairs, name_dict, category_dict)

valid_dataset[5]

(tensor([24,  9, 31, 29, 30, 46,  9,  8, 35, 12,  4]), tensor([15]))

In [33]:
#由于不同的姓名长度不统一，下面定义一个函数填充长度不足的词组
def collate_fn(pair_list):
    input_list, label_list = zip(*pair_list)

    max_len = max([len(input_tensor) for input_tensor in input_list])

    collated_input = []
    for input_tensor in input_list:
        padding_len = max_len - len(input_tensor)
        padding_tensor = torch.zeros((padding_len,), dtype=torch.long)
        padding_tensor[:] = name_dict.index("<pad>")
        collated_input.append(torch.cat([padding_tensor,input_tensor], dim=0))

    collated_input = torch.stack(collated_input, dim=0)
    collated_label = torch.cat(label_list, dim=0)
    return collated_input, collated_label

In [34]:
from torch.utils.data import DataLoader
valid_dataloader = DataLoader(valid_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)
for data,label in valid_dataloader:
    print(data)
    print(label)

tensor([[ 0,  0, 10,  5, 36, 12,  5, 42,  9, 32,  9, 33],
        [ 0, 43, 13, 32, 12, 14, 13,  6, 19,  6,  9, 33],
        [ 0,  0, 20, 33,  5,  8, 32,  9, 33, 19, 21, 12],
        [ 0,  0,  0,  7, 13, 31,  9, 33, 19,  8, 32,  9],
        [ 0,  0,  0,  0,  0, 17, 40,  3, 14, 13,  5, 33],
        [ 0, 24,  9, 31, 29, 30, 46,  9,  8, 35, 12,  4],
        [ 0,  0,  0,  0,  7,  3,  8,  3, 30, 12,  5, 33],
        [ 0,  0,  0, 20, 36, 12, 19,  2, 13,  6,  9, 33],
        [ 0,  0,  0,  0,  0,  0,  0,  0,  7,  9, 14, 13],
        [ 0,  0,  0,  0,  0, 26, 31,  5, 21, 12, 19,  6],
        [ 0,  0,  0,  0,  0,  0, 10, 19,  6, 30, 42,  4],
        [ 0,  0,  0,  0,  0,  0,  0, 43, 19, 29,  2,  5],
        [ 0,  0,  0,  0,  0,  0,  0,  1,  9,  4, 21,  5],
        [ 0,  0,  0,  0,  0,  0, 11, 19,  8, 30, 32,  4],
        [ 0,  0,  0,  7, 30, 13, 31, 31, 13,  2,  9, 33],
        [ 0,  0,  0,  0,  0,  0,  7,  9, 42, 19, 13, 30],
        [ 0,  0,  0,  7, 36, 13,  8,  5, 33, 30, 32,  4],
        [ 0,  

In [35]:
for batch in valid_dataloader:
    print(batch[0][:4])
    print(batch[1][:4])
    break

tensor([[ 0,  0, 10,  5, 36, 12,  5, 42,  9, 32,  9, 33],
        [ 0, 43, 13, 32, 12, 14, 13,  6, 19,  6,  9, 33],
        [ 0,  0, 20, 33,  5,  8, 32,  9, 33, 19, 21, 12],
        [ 0,  0,  0,  7, 13, 31,  9, 33, 19,  8, 32,  9]])
tensor([5, 5, 5, 5])


In [36]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [37]:
drop_out=0.1
batch_size=512
# dataloader
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn) # shuffle=True打乱采样顺序（随机抽取Batch）
loss_func = nn.CrossEntropyLoss(reduction = "sum").to(device) # 先把每个Batch的loss加起来不平均
# [len(data[0])]*128

In [38]:
# epoch_n = 100
# for i in range(epoch_n):
#     print(f'Epoch: {i+1:02} ')
#     train(model, optimizer, loss_func, train_dataloader)
#     evaluate(model, valid_dataloader)

In [39]:
class RNN(nn.Module):
    def __init__(self,vocab_size,embedding_dim,hidden_dim,out_dim,bidirectional,dropout):
        super().__init__()
        self.embedding=nn.Embedding(vocab_size,embedding_dim)
        self.rnn=nn.LSTM(embedding_dim,hidden_dim,num_layers=4,bidirectional=bidirectional,dropout=dropout,batch_first=True)
        self.fc=nn.Linear(hidden_dim*4,out_dim)
        self.dropout=nn.Dropout(dropout)
    def forward(self,text):
        embedded=self.dropout(self.embedding(text))
        #packed_embedded=nn.utils.rnn.pack_padded_sequence(embedded,text_lengths)
        packed_out,(hidden,cell)=self.rnn(embedded)
        hidden=self.dropout(torch.cat((hidden[-4,:,:],hidden[-3,:,:],hidden[-2,:,:],hidden[-1,:,:]),dim=1))
        return self.fc(hidden)
model2=RNN(vocab_size=name_dict.dict_size,embedding_dim=64,hidden_dim=128,out_dim=category_dict.dict_size,dropout=drop_out,bidirectional=True)
model2=model2.to(device)
optimizer=torch.optim.Adam(params=model2.parameters())

In [40]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model2):,} trainable parameters')
count_parameters(model2)

The model has 1,397,266 trainable parameters


1397266

In [41]:
def train_RNN(model:nn.Module, optimizer:torch.optim.Optimizer, loss_func, dataloader):
    sum_loss = 0
    train_num = 0
    model.train() # 将model设置为训练状态
    for input_data, label in dataloader:
        optimizer.zero_grad() # 初始化梯度

        pred = model(input_data.to(device))
        loss:Tensor = loss_func(pred, label.to(device))
        sum_loss += loss.detach()
        train_num += len(input_data)
        # 对loss求导数
        loss.backward()
        optimizer.step() # 更新参数
    print(f"\tAvg loss:{(sum_loss/train_num).item()}")
def evaluate_RNN(model:nn.Module, dataloader):
    acc_num = 0
    all_num = 0
    model.eval() # 将model设置为评估状态
    for input_data, label in dataloader:
        # text_lengths=[len(input_data[0])]*batch_size
        pred = model(input_data.to(device))
        acc_num += (pred.max(dim=1)[1] == label.to(device)).sum().item()
        all_num += len(input_data)
    print(f"\tacc rate: {acc_num/all_num}")

In [43]:

for i in range(20):
    print(f'Epoch: {i+1:02} ')
    train_RNN(model2, optimizer, loss_func, train_dataloader)
    evaluate_RNN(model2, valid_dataloader)



Epoch: 01 
	Avg loss:0.30550628900527954
	acc rate: 0.828
Epoch: 02 
	Avg loss:0.2902951240539551
	acc rate: 0.8315
Epoch: 03 
	Avg loss:0.2727334797382355
	acc rate: 0.831
Epoch: 04 
	Avg loss:0.2527172863483429
	acc rate: 0.8305
Epoch: 05 
	Avg loss:0.2378111332654953
	acc rate: 0.8265
Epoch: 06 
	Avg loss:0.22893038392066956
	acc rate: 0.8295
Epoch: 07 
	Avg loss:0.21017734706401825
	acc rate: 0.833
Epoch: 08 
	Avg loss:0.1972125768661499
	acc rate: 0.832
Epoch: 09 
	Avg loss:0.1853923499584198
	acc rate: 0.833
Epoch: 10 
	Avg loss:0.16778559982776642
	acc rate: 0.8315
Epoch: 11 
	Avg loss:0.1575263887643814
	acc rate: 0.8245
Epoch: 12 
	Avg loss:0.1601351797580719
	acc rate: 0.8205
Epoch: 13 
	Avg loss:0.15196548402309418
	acc rate: 0.8265
Epoch: 14 
	Avg loss:0.14000365138053894
	acc rate: 0.835
Epoch: 15 
	Avg loss:0.13331280648708344
	acc rate: 0.832
Epoch: 16 
	Avg loss:0.1283184140920639
	acc rate: 0.8185
Epoch: 17 
	Avg loss:0.12164068222045898
	acc rate: 0.8325
Epoch: 18 
	A