In [1]:
from io import open
import glob
import os
def findFiles(path):
    """Return the paths matching the glob pattern ``path``, sorted by name.

    ``glob.glob`` reports matches in whatever order the file system yields
    them; sorting makes the order (and everything derived from it) the same
    on every machine.
    """
    return sorted(glob.glob(path))

findFiles('../input/multilingual-names/*.txt')

In [2]:
import unicodedata
import string
# Characters kept after normalisation: ASCII letters plus a few punctuation marks
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)


def unicodeToAscii(s):
    """Reduce a Unicode string to the characters contained in ``all_letters``.

    The text is NFD-decomposed so that accents become separate combining
    marks (category ``Mn``); those marks and every character outside
    ``all_letters`` are dropped.
    Approach taken from https://stackoverflow.com/a/518232/2809427
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue
        if ch not in all_letters:
            continue
        kept.append(ch)
    return ''.join(kept)


unicodeToAscii('Ślusàrski')

In [3]:
# category_lines maps a language to its list of names;
# all_categories keeps the languages in the order they were loaded.
category_lines, all_categories = {}, []

# Read a file and split into lines
def readLines(filename):
    """Read ``filename`` as UTF-8 and return its lines converted to ASCII.

    Whitespace around the whole file content is stripped before splitting on
    newlines; every line is then passed through ``unicodeToAscii``.
    """
    # The context manager closes the handle instead of leaving it open.
    with open(filename, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

# Load every language file; the file name without extension is the category
for path in findFiles('../input/multilingual-names/*.txt'):
    stem, _ext = os.path.splitext(os.path.basename(path))
    all_categories.append(stem)
    category_lines[stem] = readLines(path)

n_categories = len(all_categories)

In [4]:
# Sanity check: show the first five names stored under the 'Chinese' category
category_lines['Chinese'][:5]

In [5]:
def split_line(name: str):
    """Split ``name`` into a list of its individual characters."""
    return [ch for ch in name]

split_line("Slusarski")

In [6]:
# Flatten the per-language name lists into (characters, language) samples
data_pairs = [
    (split_line(name), category_name)
    for category_name, lines in category_lines.items()
    for name in lines
]

data_pairs[:5]

In [7]:
import random
random.seed(114514)

# Shuffle the sample positions once, then carve out the three splits:
# the first 2000 for validation, the next 2000 for testing, the rest for training.
index = list(range(len(data_pairs)))
random.shuffle(index)

valid_index, test_index, train_index = index[:2000], index[2000:4000], index[4000:]

valid_pairs, test_pairs, train_pairs = (
    [data_pairs[i] for i in positions]
    for positions in (valid_index, test_index, train_index)
)

valid_pairs[:5]

In [8]:
class WordDict:
    """Two-way mapping between words and integer ids assigned in insertion order."""

    def __init__(self) -> None:
        self.index2word = {}
        self.word2index = {}
        self.dict_size = 0

    def add_word(self, word: str):
        """Register ``word`` under the next free id; known words are left untouched."""
        if word in self.word2index:
            return
        new_id = self.dict_size
        self.word2index[word] = new_id
        self.index2word[new_id] = word
        self.dict_size = new_id + 1

    def index(self, word: str):
        """Return the id of ``word``, or -1 if it was never added."""
        return self.word2index.get(word, -1)

    def word(self, index: int):
        """Return the word stored under ``index``, or "<unk>" if there is none."""
        return self.index2word.get(index, "<unk>")
name_dict, category_dict = WordDict(), WordDict()

# "<pad>" is added first so that it receives the first id of the character vocabulary
name_dict.add_word("<pad>")
for chars, language in data_pairs:
    category_dict.add_word(language)
    for ch in chars:
        name_dict.add_word(ch)

In [9]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from typing import Tuple, List

class NameDataset(Dataset):
    """Dataset of (characters, category) samples encoded as id tensors."""

    def __init__(self, pairs: List[Tuple[List[str], str]], name_dict: "WordDict", category_dict: "WordDict") -> None:
        """Store the samples and the vocabularies used to encode them.

        Args:
            pairs: samples as ``(characters, category_name)`` tuples.
            name_dict: vocabulary whose ``index`` encodes single characters.
            category_dict: vocabulary whose ``index`` encodes category names.
        """
        super().__init__()
        self.pairs = pairs
        self.name_dict = name_dict
        self.category_dict = category_dict

    def __getitem__(self, index):
        """Return ``(x, y)`` for sample ``index``.

        ``x`` is a 1-D long tensor with one id per character of the name;
        ``y`` is a one-element long tensor holding the category id.
        """
        name, category = self.pairs[index]
        x = torch.LongTensor([self.name_dict.index(char) for char in name])
        y = torch.LongTensor([self.category_dict.index(category)])
        return (x, y)

    def __len__(self):
        """Return the number of samples."""
        return len(self.pairs)
# One dataset per split, all sharing the same two vocabularies
valid_dataset, test_dataset, train_dataset = (
    NameDataset(split, name_dict, category_dict)
    for split in (valid_pairs, test_pairs, train_pairs)
)

valid_dataset[5]

In [10]:
# Names differ in length, so shorter ones are padded up to the longest one in the batch
def collate_fn(pair_list):
    """Merge a list of ``(input, label)`` samples into one batch.

    Every input is left-padded with the id of "<pad>" up to the length of the
    longest input in ``pair_list`` and the results are stacked along a new
    first dimension; the label tensors are concatenated.

    Returns:
        tuple: ``(collated_input, collated_label)``.
    """
    inputs, labels = zip(*pair_list)
    longest = max(len(seq) for seq in inputs)
    pad_id = name_dict.index("<pad>")

    padded = [
        torch.cat([torch.full((longest - len(seq),), pad_id, dtype=torch.long), seq], dim=0)
        for seq in inputs
    ]
    return torch.stack(padded, dim=0), torch.cat(labels, dim=0)

In [11]:
from torch.utils.data import DataLoader
# Small loader used only to eyeball what collate_fn produces for each batch
valid_dataloader = DataLoader(
    valid_dataset,
    batch_size=64,
    shuffle=False,
    collate_fn=collate_fn,
)
for data, label in valid_dataloader:
    print(data, label, sep="\n")

In [12]:
# Print the first four inputs and labels of the first batch, then stop
for batch in valid_dataloader:
    print(batch[0][:4], batch[1][:4], sep="\n")
    break

In [13]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

In [37]:
# Hyper-parameters shared by the experiments below
drop_out = 0.1
batch_size = 512

# Data loaders for the three splits
valid_dataloader = DataLoader(
    valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn
)
test_dataloader = DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn
)
# shuffle=True randomises the sampling order, so training batches are drawn at random
train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
)

# Sum the losses within each batch instead of averaging them
loss_func = nn.CrossEntropyLoss(reduction="sum").to(device)

In [26]:
# epoch_n = 100
# for i in range(epoch_n):
#     print(f'Epoch: {i+1:02} ')
#     train(model, optimizer, loss_func, train_dataloader)
#     evaluate(model, valid_dataloader)

In [27]:
class RNN(nn.Module):
    """Embedding -> 4-layer LSTM -> linear classifier on the last four final hidden states."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, out_dim, bidirectional, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=4,
            bidirectional=bidirectional,
            dropout=dropout,
            batch_first=True,
        )
        # forward() concatenates four hidden-state slices, hence hidden_dim * 4 inputs
        self.fc = nn.Linear(hidden_dim * 4, out_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return the classifier output for a batch of id sequences.

        ``text`` is assumed to be a (batch, seq_len) tensor of token ids, since
        the LSTM is built with ``batch_first=True`` — TODO confirm with callers.
        """
        embedded = self.dropout(self.embedding(text))
        _, (hidden, _) = self.rnn(embedded)
        # Join the last four entries of the final hidden state along the feature axis
        last_four = torch.cat([hidden[k] for k in range(-4, 0)], dim=1)
        return self.fc(self.dropout(last_four))
# Bidirectional model with 64-d embeddings and 128-d hidden states, optimised with Adam's default settings
model2 = RNN(
    vocab_size=name_dict.dict_size,
    embedding_dim=64,
    hidden_dim=128,
    out_dim=category_dict.dict_size,
    dropout=drop_out,
    bidirectional=True,
).to(device)
optimizer = torch.optim.Adam(model2.parameters())

In [28]:
def count_parameters(model):
    """Return the total number of elements over all parameters of ``model`` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the size of model2 (with thousands separators), then show the raw count as the cell output
print(f'The model has {count_parameters(model2):,} trainable parameters')
count_parameters(model2)

In [29]:
def train_RNN(model:nn.Module, optimizer:torch.optim.Optimizer, loss_func, dataloader):
    """Train ``model`` for one pass over ``dataloader``.

    Prints the accumulated loss divided by the number of samples seen (the
    per-sample mean when ``loss_func`` sums over the batch), or ``nan`` when
    the loader yields no batches.

    Args:
        model: network to optimise; it is switched to training mode.
        optimizer: optimizer bound to the parameters of ``model``.
        loss_func: callable ``(pred, label) -> scalar loss tensor``.
        dataloader: iterable of ``(input_data, label)`` batches.

    Returns:
        list of float: the loss of every 10th batch.
    """
    # Run on the device the model lives on instead of relying on a notebook global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    sum_loss = 0
    train_num = 0
    loss_list = []
    model.train()  # put the model into training mode
    for counter, (input_data, label) in enumerate(dataloader, start=1):
        optimizer.zero_grad()  # reset gradients left over from the previous step

        pred = model(input_data.to(device))
        loss: torch.Tensor = loss_func(pred, label.to(device))
        if counter % 10 == 0:
            # Record only every 10th batch to keep the loss curve small
            loss_list.append(loss.item())
        sum_loss += loss.detach()
        train_num += len(input_data)
        loss.backward()   # back-propagate
        optimizer.step()  # update the parameters

    if train_num == 0:
        # Nothing was processed: avoid dividing by zero
        print("\tAvg loss:nan")
        return loss_list
    print(f"\tAvg loss:{(sum_loss/train_num).item()}")
    return loss_list
def evaluate_RNN(model:nn.Module, dataloader):
    """Measure the classification accuracy of ``model`` on ``dataloader``.

    The model is switched to evaluation mode and run without gradient
    tracking. The accuracy is printed and also returned.

    Args:
        model: network to evaluate.
        dataloader: iterable of ``(input_data, label)`` batches.

    Returns:
        float: fraction of correctly classified samples, or ``nan`` when the
        loader yields no samples.
    """
    # Run on the device the model lives on instead of relying on a notebook global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    acc_num = 0
    all_num = 0
    model.eval()  # put the model into evaluation mode
    # No gradients are needed for evaluation, so skip building the autograd graph
    with torch.no_grad():
        for input_data, label in dataloader:
            pred = model(input_data.to(device))
            acc_num += (pred.max(dim=1)[1] == label.to(device)).sum().item()
            all_num += len(input_data)
    acc = acc_num / all_num if all_num else float('nan')
    print(f"\tacc rate: {acc}")
    return acc

In [30]:
# Train for 20 epochs, collecting the sampled batch losses and validating after every epoch
total_loss_list = []
for epoch in range(20):
    print(f'Epoch: {epoch+1:02} ')
    total_loss_list.extend(train_RNN(model2, optimizer, loss_func, train_dataloader))
    evaluate_RNN(model2, valid_dataloader)



In [31]:
import pandas as pd
import matplotlib.pyplot as plt
def plot_progress(progress):
    """Plot the loss values in ``progress`` against their position in the sequence."""
    frame = pd.DataFrame(progress, columns=['loss'])
    # ylim=0 fixes only the lower bound of the y axis
    frame.plot(ylim=0, figsize=(16, 8), alpha=0.4, marker='.', grid=True)
    plt.title('Loss Changes')

plot_progress(total_loss_list)


In [42]:
em_dim=[32,64,128,256]
hi_dim=[32,64,128,256]
def crazy_train(em_dim_list,hi_dim_list,n_epochs=20):
    """Train a fresh bidirectional RNN for every (embedding, hidden) size pair.

    Args:
        em_dim_list: embedding dimensions to try (outer loop).
        hi_dim_list: hidden dimensions to try (inner loop).
        n_epochs: number of epochs each model is trained for.

    Returns:
        list: one list of sampled batch losses per configuration, ordered
        with the embedding dimension varying slowest.
    """
    loss_list = []
    for embedding_dim in em_dim_list:
        for hidden_dim in hi_dim_list:
            model = RNN(
                vocab_size=name_dict.dict_size,
                embedding_dim=embedding_dim,
                hidden_dim=hidden_dim,
                out_dim=category_dict.dict_size,
                dropout=drop_out,
                bidirectional=True,
            ).to(device)
            optimizer = torch.optim.Adam(params=model.parameters())
            run_losses = []
            # The epoch counter must not share a name with the embedding-size
            # loop variable: doing so overwrites the embedding size used for
            # every hidden size after the first one.
            for epoch in range(n_epochs):
                print(f'Epoch: {epoch+1:02} ')
                run_losses.extend(train_RNN(model, optimizer, loss_func, train_dataloader))
                evaluate_RNN(model, valid_dataloader)
            loss_list.append(run_losses)
    return loss_list
loss_list=crazy_train(em_dim,hi_dim)

In [52]:
plt.rcParams['font.family'] = 'serif'
plt.figure(figsize=(16,10),dpi=400)
# loss_list is filled with the embedding size as the outer loop and the hidden
# size as the inner loop, so the same nesting recovers each run's configuration.
run_configs = [(emb, hid) for emb in em_dim for hid in hi_dim]
# Only the first 12 runs (embedding dims 32, 64 and 128) are drawn;
# raise the limit to include the remaining configurations.
N_PLOTTED_RUNS = 12
for (emb, hid), run_losses in zip(run_configs[:N_PLOTTED_RUNS], loss_list):
    # Each curve uses its own length for the x axis
    plt.plot(range(len(run_losses)), run_losses, label=f'Embedding dim={emb},Hidden dim={hid}')
plt.legend()