In [1]:
from io import open
import glob
import os
def findFiles(path): return glob.glob(path)
findFiles('../input/multilingual-names/*.txt')


['../input/multilingual-names/Vietnamese.txt',
 '../input/multilingual-names/Greek.txt',
 '../input/multilingual-names/Japanese.txt',
 '../input/multilingual-names/Dutch.txt',
 '../input/multilingual-names/Irish.txt',
 '../input/multilingual-names/Russian.txt',
 '../input/multilingual-names/Korean.txt',
 '../input/multilingual-names/Scottish.txt',
 '../input/multilingual-names/Czech.txt',
 '../input/multilingual-names/Italian.txt',
 '../input/multilingual-names/Arabic.txt',
 '../input/multilingual-names/Portuguese.txt',
 '../input/multilingual-names/Spanish.txt',
 '../input/multilingual-names/Chinese.txt',
 '../input/multilingual-names/French.txt',
 '../input/multilingual-names/English.txt',
 '../input/multilingual-names/German.txt',
 '../input/multilingual-names/Polish.txt']

In [2]:
import unicodedata
import string
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)
# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )
unicodeToAscii('Ślusàrski')

'Slusarski'

In [3]:
# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles('../input/multilingual-names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)

In [4]:
category_lines['Chinese'][:5]

['Ang', 'AuYong', 'Bai', 'Ban', 'Bao']

In [5]:
def split_line(name:str):
    return list(name)

split_line("Slusarski")

['S', 'l', 'u', 's', 'a', 'r', 's', 'k', 'i']

In [6]:
data_pairs = []
for category_name, lines in category_lines.items():
    for name in lines:
        data_pairs.append((split_line(name), category_name))

data_pairs[:5]

[(['N', 'g', 'u', 'y', 'e', 'n'], 'Vietnamese'),
 (['T', 'r', 'o', 'n'], 'Vietnamese'),
 (['L', 'e'], 'Vietnamese'),
 (['P', 'h', 'a', 'm'], 'Vietnamese'),
 (['H', 'u', 'y', 'n', 'h'], 'Vietnamese')]

In [7]:
import random
random.seed(114514)

index = list(range(len(data_pairs)))

random.shuffle(index)

valid_index = index[:2000]
test_index = index[2000:4000]
train_index = index[4000:]

valid_pairs = [data_pairs[i] for i in valid_index]
test_pairs = [data_pairs[i] for i in test_index]
train_pairs = [data_pairs[i] for i in train_index]

valid_pairs[:5]

[(['L', 'e', 'z', 'h', 'e', 'b', 'o', 'k', 'o', 'v'], 'Russian'),
 (['R', 'a', 'k', 'h', 'm', 'a', 'n', 'i', 'n', 'o', 'v'], 'Russian'),
 (['A', 'v', 'e', 'r', 'k', 'o', 'v', 'i', 'c', 'h'], 'Russian'),
 (['T', 'a', 'l', 'o', 'v', 'i', 'r', 'k', 'o'], 'Russian'),
 (['D', 'j', 'u', 'm', 'a', 'e', 'v'], 'Russian')]

In [8]:
class WordDict:
    def __init__(self) -> None:
        self.index2word = {}
        self.word2index = {}
        self.dict_size = 0
    def add_word(self, word:str):
        if word not in self.word2index:
            self.word2index[word] = self.dict_size
            self.index2word[self.dict_size] = word
            self.dict_size += 1
    def index(self, word:str):
        if word in self.word2index:
            return self.word2index[word]
        else:
            return -1
    def word(self, index:int):
        if index in self.index2word:
            return self.index2word[index]
        else:
            return "<unk>"
name_dict = WordDict()
category_dict = WordDict()

name_dict.add_word("<pad>")
for name, category_name in data_pairs:
    for char in name:
        name_dict.add_word(char)
    category_dict.add_word(category_name)

In [9]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from typing import Tuple, List

class NameDataset(Dataset):
    def __init__(self, pairs:Tuple[List[str], str], name_dict:WordDict, category_dict:WordDict) -> None:
        super().__init__()
        self.pairs = pairs
        self.name_dict = name_dict
        self.category_dict = category_dict

    def __getitem__(self, index):
        name, category = self.pairs[index]
        x = torch.LongTensor([self.name_dict.index(char) for char in name])
        y = torch.LongTensor([self.category_dict.index(category)])
        return (x,y)

    def __len__(self):
        return len(self.pairs)
valid_dataset = NameDataset(valid_pairs, name_dict, category_dict)
test_dataset = NameDataset(test_pairs, name_dict, category_dict)
train_dataset = NameDataset(train_pairs, name_dict, category_dict)

valid_dataset[5]

(tensor([24,  9, 31, 29, 30, 46,  9,  8, 35, 12,  4]), tensor([15]))

In [10]:
#由于不同的姓名长度不统一，下面定义一个函数填充长度不足的词组
def collate_fn(pair_list):
    input_list, label_list = zip(*pair_list)

    max_len = max([len(input_tensor) for input_tensor in input_list])

    collated_input = []
    for input_tensor in input_list:
        padding_len = max_len - len(input_tensor)
        padding_tensor = torch.zeros((padding_len,), dtype=torch.long)
        padding_tensor[:] = name_dict.index("<pad>")
        collated_input.append(torch.cat([input_tensor, padding_tensor], dim=0))

    collated_input = torch.stack(collated_input, dim=0)
    collated_label = torch.cat(label_list, dim=0)
    return collated_input, collated_label

In [11]:
from torch.utils.data import DataLoader
valid_dataloader = DataLoader(valid_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

In [12]:
for batch in valid_dataloader:
    print(batch[0][:4])
    print(batch[1][:4])
    break


tensor([[10,  5, 36, 12,  5, 42,  9, 32,  9, 33,  0,  0],
        [43, 13, 32, 12, 14, 13,  6, 19,  6,  9, 33,  0],
        [20, 33,  5,  8, 32,  9, 33, 19, 21, 12,  0,  0],
        [ 7, 13, 31,  9, 33, 19,  8, 32,  9,  0,  0,  0]])
tensor([5, 5, 5, 5])


In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [14]:
from torch import Tensor
class NameCNN(nn.Module):
    def __init__(self, embedding_size, embedding_dim, out_features, dropout) -> None:
        super().__init__()

        self.embedding = nn.Embedding(embedding_size, embedding_dim)
        self.conv1 = nn.Conv1d(embedding_dim, 2*embedding_dim, 3, padding=1)
        self.conv2 = nn.Conv1d(2*embedding_dim, 4*embedding_dim, 3, padding=1)
        self.conv3 = nn.Conv1d(4*embedding_dim, 8*embedding_dim, 3, padding=1)
        self.pool1 = nn.MaxPool1d(2)
        self.pool2 = nn.MaxPool1d(2)
        self.linear = nn.Linear(8*embedding_dim, out_features)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x:Tensor):
        x = self.embedding(x) # B x T -> B x T x C
        x = x.transpose(1,2) # B x T x C -> B x C x T
        x = self.conv1(x) # B x C x T -> B x 2C x T
        x = nn.functional.relu(x) # relu本身不带参数，使用nn.functional.relu不需要初始化一个relu层组件
        x = self.pool1(x) # B x 2C x T -> B x 2C x T/2
        x = self.dropout(x)
        x = self.conv2(x) # B x 2C x T/2 -> B x 4C x T/2
        x = nn.functional.relu(x)
        x = self.pool2(x) # B x 4C x T/2 -> B x 4C x T/4
        x = self.dropout(x)
        x = self.conv3(x) # B x 4C x T/4 -> B x 8C x T/4
        x = nn.functional.relu(x)
        x = x.max(dim=2)[0] # final max pool  B x 8C x T/4 -> B x 8C
        x = self.dropout(x)
        x = self.linear(x) # B x 8C -> B x class_num
        return x # 这里没有用softmax转化为概率，我们在loss function中完成这一步骤
    

In [15]:
# dataloader
valid_dataloader = DataLoader(valid_dataset, batch_size=128, shuffle=False, collate_fn=collate_fn)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False, collate_fn=collate_fn)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True, collate_fn=collate_fn) # shuffle=True打乱采样顺序（随机抽取Batch）
model = NameCNN(embedding_size=name_dict.dict_size, embedding_dim=64, out_features=category_dict.dict_size, dropout=0.2).to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=5e-4, momentum=0.9)
loss_func = nn.CrossEntropyLoss(reduction = "sum").to(device) # 先把每个Batch的loss加起来不平均

In [16]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')
count_parameters(model)

The model has 529,810 trainable parameters


529810

In [17]:
def train(model:nn.Module, optimizer:torch.optim.Optimizer, loss_func, dataloader):
   
    sum_loss = 0
    train_num = 0
    model.train() # 将model设置为训练状态
    for input_data, label in dataloader:
        optimizer.zero_grad() # 初始化梯度
        pred = model(input_data.to(device))
        loss:Tensor = loss_func(pred, label.to(device))
        sum_loss += loss.detach()
        
        train_num += len(input_data)
        # 对loss求导数
        loss.backward()
        optimizer.step() # 更新参数
    return (sum_loss/train_num).item()
def evaluate(model:nn.Module, dataloader):
    acc_num = 0
    all_num = 0

    model.eval() # 将model设置为评估状态
    for input_data, label in dataloader:
        input_data=input_data.to(device)
        label=label.to(device)
        pred = model(input_data)
        acc_num += (pred.max(dim=1)[1] == label).sum().item()
        all_num += len(input_data)
    
    return acc_num/all_num

In [18]:
from tqdm import tqdm
epoch_n = 20
loss_list=[]


for i in range(epoch_n):
    loss=train(model, optimizer, loss_func, train_dataloader)

    accuracy=evaluate(model, valid_dataloader)


    print(f'Epoch: {i+1:02} ')
    print(f'\tTrain Loss: {loss:.3f} | Train Acc: {accuracy*100:.2f}%')


Epoch: 01 
	Train Loss: 1.404 | Train Acc: 71.50%
Epoch: 02 
	Train Loss: 0.948 | Train Acc: 77.65%
Epoch: 03 
	Train Loss: 0.777 | Train Acc: 79.25%
Epoch: 04 
	Train Loss: 0.679 | Train Acc: 81.60%
Epoch: 05 
	Train Loss: 0.607 | Train Acc: 82.20%
Epoch: 06 
	Train Loss: 0.568 | Train Acc: 81.40%
Epoch: 07 
	Train Loss: 0.536 | Train Acc: 82.65%
Epoch: 08 
	Train Loss: 0.496 | Train Acc: 81.45%
Epoch: 09 
	Train Loss: 0.466 | Train Acc: 81.25%
Epoch: 10 
	Train Loss: 0.449 | Train Acc: 82.00%
Epoch: 11 
	Train Loss: 0.416 | Train Acc: 83.40%
Epoch: 12 
	Train Loss: 0.398 | Train Acc: 82.65%
Epoch: 13 
	Train Loss: 0.379 | Train Acc: 83.35%
Epoch: 14 
	Train Loss: 0.361 | Train Acc: 83.80%
Epoch: 15 
	Train Loss: 0.351 | Train Acc: 83.05%
Epoch: 16 
	Train Loss: 0.330 | Train Acc: 84.10%
Epoch: 17 
	Train Loss: 0.330 | Train Acc: 83.10%
Epoch: 18 
	Train Loss: 0.316 | Train Acc: 83.20%
Epoch: 19 
	Train Loss: 0.313 | Train Acc: 83.70%
Epoch: 20 
	Train Loss: 0.295 | Train Acc: 83.60%
