In [1]:
from io import open
import glob
import os
import unicodedata
import string

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torchvision import datasets,transforms

from sklearn.model_selection import train_test_split

def findFiles(path): return glob.glob(path)
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

In [2]:
n_letters

57

In [3]:
all_letters

"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'"

In [5]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )
print(unicodeToAscii('Elgün'))

Elgun


In [6]:
category_lines = {}
all_categories = []

data_df = pd.DataFrame()


def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles('data/names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    index = all_categories.index(category)
    data_df = pd.concat([data_df, pd.DataFrame({'Name':lines,'Country':category,'index':index})])
    category_lines[category] = lines

n_categories = len(all_categories)

In [7]:
data_df.reset_index(drop=True,inplace=True)

In [8]:
data_df

Unnamed: 0,Name,Country,index
0,Khoury,Arabic,0
1,Nahas,Arabic,0
2,Daher,Arabic,0
3,Gerges,Arabic,0
4,Nazari,Arabic,0
...,...,...,...
20069,Truong,Vietnamese,17
20070,Van,Vietnamese,17
20071,Vinh,Vietnamese,17
20072,Vuong,Vietnamese,17


In [9]:
data_df['Country'].value_counts()

Country
Russian       9408
English       3668
Arabic        2000
Japanese       991
German         724
Italian        709
Czech          519
Spanish        298
Dutch          297
French         277
Chinese        268
Irish          232
Greek          203
Polish         139
Scottish       100
Korean          94
Portuguese      74
Vietnamese      73
Name: count, dtype: int64

In [10]:
from imblearn.over_sampling import RandomOverSampler

In [11]:
ros = RandomOverSampler(random_state=12)

In [12]:
x,y = ros.fit_resample(np.array(data_df['Name']).reshape(-1,1),np.array(data_df['index']).reshape(-1,1))

In [13]:
data_df_balance = pd.DataFrame({'name':x.reshape(1,-1)[0].tolist(),'index':y.reshape(1,-1)[0].tolist()})
data_df_balance['Country'] = data_df_balance.apply(lambda row: all_categories[row[1]], axis = 1)
data_df_balance

  data_df_balance['Country'] = data_df_balance.apply(lambda row: all_categories[row[1]], axis = 1)


Unnamed: 0,name,index,Country
0,Khoury,0,Arabic
1,Nahas,0,Arabic
2,Daher,0,Arabic
3,Gerges,0,Arabic
4,Nazari,0,Arabic
...,...,...,...
169339,Dao,17,Vietnamese
169340,Giang,17,Vietnamese
169341,Tieu,17,Vietnamese
169342,Dinh,17,Vietnamese


In [14]:
data_df_balance['Country'] = data_df_balance.apply(lambda row: all_categories[row[1]], axis = 1)

  data_df_balance['Country'] = data_df_balance.apply(lambda row: all_categories[row[1]], axis = 1)


In [15]:
data_df_balance

Unnamed: 0,name,index,Country
0,Khoury,0,Arabic
1,Nahas,0,Arabic
2,Daher,0,Arabic
3,Gerges,0,Arabic
4,Nazari,0,Arabic
...,...,...,...
169339,Dao,17,Vietnamese
169340,Giang,17,Vietnamese
169341,Tieu,17,Vietnamese
169342,Dinh,17,Vietnamese


In [16]:
data_df_balance['Country'].value_counts()

Country
Arabic        9408
Chinese       9408
Spanish       9408
Scottish      9408
Russian       9408
Portuguese    9408
Polish        9408
Korean        9408
Japanese      9408
Italian       9408
Irish         9408
Greek         9408
German        9408
French        9408
English       9408
Dutch         9408
Czech         9408
Vietnamese    9408
Name: count, dtype: int64

In [17]:
n_categories

18

In [18]:
for category in category_lines:
    print(f'{category} length : {len(category_lines[category])}')

Arabic length : 2000
Chinese length : 268
Czech length : 519
Dutch length : 297
English length : 3668
French length : 277
German length : 724
Greek length : 203
Irish length : 232
Italian length : 709
Japanese length : 991
Korean length : 94
Polish length : 139
Portuguese length : 74
Russian length : 9408
Scottish length : 100
Spanish length : 298
Vietnamese length : 73


In [19]:
data, test_data = train_test_split(data_df_balance, test_size=0.10)
train_data, valid_data = train_test_split(data,test_size=0.10)

In [20]:
class CreateDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe
    
    def letterToIndex(self,letter):
        return all_letters.find(letter)

    def lineToTensor(self,line):
        tensor = torch.zeros(len(line), 1, n_letters)
        for li, letter in enumerate(line):
          tensor[li][0][self.letterToIndex(letter)] = 1
        return tensor   
         
    def __getitem__(self, index):
        row = self.dataframe.iloc[index]
        features = self.lineToTensor(row.iloc[0])
        label = torch.tensor(row.iloc[-2],dtype=torch.long)
        return features, label
    
    def __len__(self):
        return len(self.dataframe)

In [21]:
train_dataset = CreateDataset(dataframe=train_data)
train_loader = DataLoader(train_dataset, shuffle=True)

In [22]:
valid_data

Unnamed: 0,name,index,Country
50196,Reynders,3,Dutch
1589,Harb,0,Arabic
18478,Vazhenin,14,Russian
139700,Mendes,13,Portuguese
160088,Chung,17,Vietnamese
...,...,...,...
33388,Shi,1,Chinese
100707,Di antonio,9,Italian
140788,Rosario,13,Portuguese
166195,Banh,17,Vietnamese


In [23]:
valid_dataset = CreateDataset(dataframe=valid_data)
valid_loader = DataLoader(valid_dataset, shuffle=True)

In [24]:
test_dataset = CreateDataset(dataframe=test_data)
test_loader = DataLoader(test_dataset, shuffle=True)

In [25]:
example = iter(train_loader)
feature, label = next(example)

In [26]:
label

tensor([14])

In [27]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [28]:
torch.cuda.is_available()

True

In [28]:
import torch

In [165]:
all_categories

['Arabic',
 'Chinese',
 'Czech',
 'Dutch',
 'English',
 'French',
 'German',
 'Greek',
 'Irish',
 'Italian',
 'Japanese',
 'Korean',
 'Polish',
 'Portuguese',
 'Russian',
 'Scottish',
 'Spanish',
 'Vietnamese']

In [29]:
def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i

In [30]:
def category_from_output(output):
    category_idx = torch.argmax(output).item()
    return all_categories[category_idx]

In [31]:
class myRnn(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(myRnn, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self,input, hidden):
        combined = torch.cat((input,hidden),1)
        hidden = self.i2h(combined)
        output = self.h2o(hidden)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size).to(device)

n_hidden = 128
     

In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class RNN(nn.Module):
    """LSTM class"""
    def __init__(self, input_size, hidden_size, output_size):
        '''

        :param input_size: number of input coming in
        :param hidden_size: number of he hidden units
        :param output_size: size of the output
        '''
        super(RNN, self).__init__()

        self.hidden_size = hidden_size
        self.input_size = input_size

        #LSTM
        self.lstm = nn.LSTM(input_size, hidden_size).to(device)
        self.hidden2Cat = nn.Linear(hidden_size, output_size).to(device)
        self.hidden = self.init_hidden()

    def forward(self, input):

        lstm_out, self.hidden = self.lstm(input, self.hidden)
        output = self.hidden2Cat(lstm_out[-1]) #many to one
        output = F.log_softmax(output, dim=1)

        return output

    def init_hidden(self):
        return (torch.zeros(1, 1, self.hidden_size).to(device),
                torch.zeros(1, 1, self.hidden_size).to(device),)

In [33]:
criterion = nn.NLLLoss()

epochs = 11
learning_rate = 0.001 # If you set this too high, it might explode. If too low, it might not learn
n_hidden = 128
rnn = RNN(n_letters, n_hidden, n_categories) #LSTM model
optimizer = optim.Adam(rnn.parameters(), lr=learning_rate)

In [34]:
def train(category_tensor, line_tensor):
    rnn.zero_grad()
    rnn.hidden = rnn.init_hidden()
    output = rnn(line_tensor)[-1]
    loss = criterion(output.unsqueeze(0), category_tensor)
    loss.backward()
    optimizer.step()
    return output.unsqueeze(0), loss.item()

for epoch in range(epochs):

    current_loss  = 0
    acc = []

    for i, (features, labels) in enumerate(train_loader):
       
        features = features.to(device)
        labels = labels.to(device)
        
        out, loss = train(labels,features[0])

        top_n, top_i = out.topk(1)
        output_category = top_i[0]
        # print(output_category)
        acc.append(torch.sum(output_category == labels).item())

    current_loss += loss
    
    print(f'Epoch: {epoch}, Accuracy:  {np.mean(acc) * 100} % , Loss {current_loss}')
    # if (epoch % 10 == 0):
    #     print(f'Epoch: {epoch}')
    #     print(f'Loss: {current_loss}')

Epoch: 0, Accuracy:  81.64003266067887 % , Loss 0.23224204778671265
Epoch: 1, Accuracy:  92.7446634783623 % , Loss 9.298280929215252e-06
Epoch: 2, Accuracy:  93.79957424472181 % , Loss 5.483612312673358e-06
Epoch: 3, Accuracy:  94.19762626851744 % , Loss 0.008392306044697762
Epoch: 4, Accuracy:  94.43456199696723 % , Loss 0.0020698329899460077
Epoch: 5, Accuracy:  94.54100081651697 % , Loss 0.011267011985182762
Epoch: 6, Accuracy:  94.63504607488628 % , Loss 0.007473013363778591
Epoch: 7, Accuracy:  94.68170418756561 % , Loss 0.4784899353981018
Epoch: 8, Accuracy:  94.74294296045726 % , Loss 0.012747940607368946
Epoch: 9, Accuracy:  94.82896885570979 % , Loss 0.0019702562130987644
Epoch: 10, Accuracy:  94.82313659162487 % , Loss 0.006849026307463646


In [52]:
def evaluate(line_tensor):

    rnn.hidden = rnn.init_hidden()
    output = rnn(line_tensor)

    return output

In [36]:
    def letterToIndex(letter):
        return all_letters.find(letter)

    def lineToTensor(line):
        tensor = torch.zeros(len(line), 1, n_letters)
        for li, letter in enumerate(line):
          tensor[li][0][letterToIndex(letter)] = 1
        return tensor   

In [53]:


np.set_printoptions(suppress=True)
device_cpu = torch.device("cpu")

In [54]:
exp_normalize(t.to(device_cpu).numpy())

NameError: name 'exp_normalize' is not defined

In [39]:
probs = np.exp(t.to(device_cpu).numpy()) / (np.exp(t.to(device_cpu).numpy())).sum()
probs*100

NameError: name 't' is not defined

In [55]:
torch.set_printoptions(precision=2)
t = None
def predict(input_line, n_predictions=5):
    global t
    print('\n> %s' % input_line)
    with torch.no_grad():
        output = evaluate(lineToTensor(input_line).to(device))
        t = output
       
        # Get top N categories
        topv, topi = output.topk(5, 1, True)
        probs = np.exp(topv.to(device_cpu).numpy()) / (np.exp(topv.to(device_cpu).numpy())).sum()
        probs*100
        predictions = []
        for i in range(n_predictions):
            value = probs[0][i].item()
            category_index = topi[0][i].item()
            print('(%.2f) %s' % (value, all_categories[category_index]))
            predictions.append([value, all_categories[category_index]])

In [122]:
predict('Smith')


> Smith
(0.92) Scottish
(0.07) Czech
(0.01) English
(0.00) German
(0.00) Chinese


In [128]:
predict('Rose')


> Rose
(0.98) French
(0.01) German
(0.01) English
(0.00) Scottish
(0.00) Dutch


In [127]:
predict("Shannon")


> Shannon
(0.99) Irish
(0.00) English
(0.00) Russian
(0.00) Spanish
(0.00) Korean


In [29]:
from sklearn.metrics import classification_report

In [30]:
y_true = []
pred = []

In [47]:
valid_acc = []
for i, (features, labels) in enumerate(valid_loader):
       
        features = features.to(device)
        labels = labels.to(device)
        y_true.append(labels.item())
        rnn.hidden = rnn.init_hidden()
        output = rnn(features[0])[-1]

        top_n, top_i = output.topk(1)
        output_category = top_i[0]
        pred.append(output_category.item())
        
        valid_acc.append(torch.sum(output_category == labels).item())

np.mean(valid_acc) * 100

94.58040810970408

In [48]:
print(classification_report(y_true,pred, target_names=all_categories))

              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       835
     Chinese       0.96      0.90      0.93       823
       Czech       0.97      0.97      0.97       811
       Dutch       0.94      0.96      0.95       831
     English       0.92      0.84      0.88       834
      French       0.92      0.95      0.93       846
      German       0.93      0.90      0.91       796
       Greek       0.99      1.00      1.00       868
       Irish       0.98      0.95      0.96       806
     Italian       0.97      0.95      0.96       849
    Japanese       0.98      0.99      0.98       875
      Korean       0.93      0.91      0.92       879
      Polish       0.97      0.99      0.98       847
  Portuguese       0.87      1.00      0.93       899
     Russian       0.98      0.91      0.94       863
    Scottish       0.92      0.99      0.96       857
     Spanish       0.95      0.81      0.88       863
  Vietnamese       0.90    

In [49]:
test_acc = []
y_true = []
pred = []
for i, (features, labels) in enumerate(test_loader):
       
        features = features.to(device)
        labels = labels.to(device)
        y_true.append(labels.item())
        
        rnn.hidden = rnn.init_hidden()
        output = rnn(features[0])[-1]

        top_n, top_i = output.topk(1)
        output_category = top_i[0]
        pred.append(output_category.item())
        
        test_acc.append(torch.sum(output_category == labels).item())

np.mean(test_acc) * 100

94.89223501623856

In [50]:
print(classification_report(y_true,pred, target_names=all_categories))

              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       968
     Chinese       0.97      0.91      0.94       984
       Czech       0.97      0.97      0.97       901
       Dutch       0.95      0.97      0.96       910
     English       0.93      0.84      0.88       933
      French       0.91      0.96      0.93      1005
      German       0.95      0.93      0.94       905
       Greek       0.99      1.00      1.00       975
       Irish       0.98      0.94      0.96       939
     Italian       0.96      0.95      0.96       971
    Japanese       0.99      0.98      0.98       936
      Korean       0.89      0.92      0.91       893
      Polish       0.95      0.99      0.97       907
  Portuguese       0.89      1.00      0.94       928
     Russian       0.97      0.92      0.94       938
    Scottish       0.93      0.99      0.96       971
     Spanish       0.96      0.84      0.89       947
  Vietnamese       0.92    

In [None]:
test_acc = []
y_true = []
pred = []
for i, (features, labels) in enumerate(train_loader):
       
        features = features.to(device)
        labels = labels.to(device)
        y_true.append(labels.item())
        
        rnn.hidden = rnn.init_hidden()
        output = rnn(features[0])[-1]

        top_n, top_i = output.topk(1)
        output_category = top_i[0]
        pred.append(output_category.item())
        
        test_acc.append(torch.sum(output_category == labels).item())

np.mean(test_acc) * 100

95.21535635133559

In [None]:
print(classification_report(y_true,pred, target_names=all_categories))

              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00      7584
     Chinese       0.94      0.91      0.93      7693
       Czech       0.98      0.96      0.97      7612
       Dutch       0.95      0.95      0.95      7562
     English       0.93      0.91      0.92      7591
      French       0.94      0.94      0.94      7555
      German       0.95      0.93      0.94      7696
       Greek       1.00      0.99      1.00      7658
       Irish       0.96      0.96      0.96      7616
     Italian       0.95      0.95      0.95      7667
    Japanese       1.00      0.99      0.99      7570
      Korean       0.93      0.90      0.91      7696
      Polish       0.97      0.99      0.98      7610
  Portuguese       0.87      0.99      0.93      7673
     Russian       0.99      0.97      0.98      7614
    Scottish       0.94      0.96      0.95      7594
     Spanish       0.93      0.83      0.88      7582
  Vietnamese       0.91    

In [134]:
def evaluate(line_tensor):

    model_1.hidden = model_1.init_hidden()
    output = model_1(line_tensor)

    return output

In [135]:
torch.set_printoptions(precision=2)
t = None
def predict(input_line, n_predictions=5):
    global t
    print('\n> %s' % input_line)
    with torch.no_grad():
        output = evaluate(lineToTensor(input_line).to(device))
        t = output
       
        # Get top N categories
        topv, topi = output.topk(5, 1, True)
        probs = np.exp(topv.to(device_cpu).numpy()) / (np.exp(topv.to(device_cpu).numpy())).sum()
        probs*100
        predictions = []
        for i in range(n_predictions):
            value = probs[0][i].item()
            category_index = topi[0][i].item()
            print('(%.2f) %s' % (value, all_categories[category_index]))
            predictions.append([value, all_categories[category_index]])

In [164]:
predict('Markiz')


> Markiz
(0.42) Korean
(0.34) Russian
(0.17) Polish
(0.06) Spanish
(0.01) English


In [166]:
torch.save(rnn.to(torch.device('cpu')).state_dict(), './rnn.pth')

In [33]:
n_hidden = 128
n_letters = 57
n_categories = 18
model_1 = RNN(n_letters, n_hidden, n_categories)
model_1.load_state_dict(torch.load('./rnn.pth'))
model_1.eval()

RNN(
  (lstm): LSTM(57, 128)
  (hidden2Cat): Linear(in_features=128, out_features=18, bias=True)
)

In [34]:
model_1.to(device)

RNN(
  (lstm): LSTM(57, 128)
  (hidden2Cat): Linear(in_features=128, out_features=18, bias=True)
)

In [35]:
test_acc = []
y_true = []
pred = []
for i, (features, labels) in enumerate(test_loader):
       
        features = features.to(device)
        labels = labels.to(device)
        y_true.append(labels.item())
        
        model_1.hidden = model_1.init_hidden()
        output = model_1(features[0])[-1]
        top_n, top_i = output.topk(1)
        output_category = top_i[0]
        # print(output_category)
        # print(labels)
        pred.append(output_category.item())
        
        test_acc.append(torch.sum(output_category == labels).item())

np.mean(test_acc) * 100

95.31739002066726

In [170]:
print(classification_report(y_true,pred, target_names=all_categories))

              precision    recall  f1-score   support

      Arabic       1.00      1.00      1.00       968
     Chinese       0.97      0.91      0.94       984
       Czech       0.97      0.97      0.97       901
       Dutch       0.95      0.97      0.96       910
     English       0.93      0.84      0.88       933
      French       0.91      0.96      0.93      1005
      German       0.95      0.93      0.94       905
       Greek       0.99      1.00      1.00       975
       Irish       0.98      0.94      0.96       939
     Italian       0.96      0.95      0.96       971
    Japanese       0.99      0.98      0.98       936
      Korean       0.89      0.92      0.91       893
      Polish       0.95      0.99      0.97       907
  Portuguese       0.89      1.00      0.94       928
     Russian       0.97      0.92      0.94       938
    Scottish       0.93      0.99      0.96       971
     Spanish       0.96      0.84      0.89       947
  Vietnamese       0.92    

In [172]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )
print(unicodeToAscii('Estévez'))

Estevez


In [37]:
from sklearn.metrics import classification_report
from  sklearn.metrics import precision_recall_fscore_support

In [61]:
def pandas_classification_report(y_true, y_pred):
    metrics_summary = precision_recall_fscore_support(
            y_true=y_true, 
            y_pred=y_pred)
    
    avg = list(precision_recall_fscore_support(
            y_true=y_true, 
            y_pred=y_pred,
            average='weighted'))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(
        list(metrics_summary),
        index=metrics_sum_index)
    
    support = class_report_df.loc['support']
    total = support.sum() 
    avg[-1] = total
    
    class_report_df['avg / total'] = avg

    return class_report_df.T

In [63]:
df_class_report.drop(df_class_report.tail(1).index,inplace=True)

In [64]:
df_class_report['Country'] = all_categories

In [62]:
df_class_report = pandas_classification_report(y_true,pred,)


In [42]:
df_class_report.to_csv('report.csv')

In [68]:
df_class_report.set_index('Country').to_csv('report.csv')