## Classifiying surnames
with muti-class logistic regression and bag of letters

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import random

In [2]:
def unpack_dataset():
    ! wget https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_train.csv.gz 
    ! wget https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_test.csv.gz 
    ! mkdir -p data
    ! gunzip names_train.csv.gz 
    ! gunzip names_test.csv.gz
    ! mv names*.csv data

In [3]:
unpack_dataset()

--2021-03-30 11:00:21--  https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_train.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 50237 (49K) [application/octet-stream]
Saving to: ‘names_train.csv.gz’


2021-03-30 11:00:21 (4.71 MB/s) - ‘names_train.csv.gz’ saved [50237/50237]

--2021-03-30 11:00:21--  https://raw.githubusercontent.com/hunkim/PyTorchZeroToAll/master/data/names_test.csv.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27541 (27K) [application/octet-stream]
Saving to: ‘names_test.cs

In [4]:
PATH = Path("data")
list(PATH.iterdir())

[PosixPath('data/names_train.csv'), PosixPath('data/names_test.csv')]

## Processing data
Here we split every last name into letters and assign every letter an id. We represent a last name by a vector of letter frequencies.

In [5]:
df = pd.read_csv(PATH/"names_train.csv", header=None)

In [6]:
val = pd.read_csv(PATH/"names_test.csv", header=None)

In [7]:
df.head()

Unnamed: 0,0,1
0,Adsit,Czech
1,Ajdrna,Czech
2,Antonowitsch,Czech
3,Antonowitz,Czech
4,Ballalatak,Czech


In [8]:
## vocab is a list of unique letters
letters = [list(l) for l in df[0].values]
vocab = sorted(list(set(np.concatenate(np.array(letters)))))
vocab[:10]

  vocab = sorted(list(set(np.concatenate(np.array(letters)))))


[' ', "'", ',', 'A', 'B', 'C', 'D', 'E', 'F', 'G']

In [9]:
vocab

[' ',
 "'",
 ',',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [10]:
## vocab2id is a dictionary mapping letters to a unique number
vocab2id = {key:i for i, key in enumerate(vocab)}
#vocab2id

In [11]:
## label2id is a dictionary mapping classes to ids
labels = sorted(df[1].unique())
label2id = {key:i for i, key in enumerate(labels)}
label2id

{'Arabic': 0,
 'Chinese': 1,
 'Czech': 2,
 'Dutch': 3,
 'English': 4,
 'French': 5,
 'German': 6,
 'Greek': 7,
 'Irish': 8,
 'Italian': 9,
 'Japanese': 10,
 'Korean': 11,
 'Polish': 12,
 'Portuguese': 13,
 'Russian': 14,
 'Scottish': 15,
 'Spanish': 16,
 'Vietnamese': 17}

In [12]:
len(label2id)

18

In [13]:
num_letters = len(vocab)
num_letters

55

In [14]:
def encode_data(df, vocab2id, label2id, num_letters):
    """ Returns encoded data
    
    outputs:
    data: a np array of shape (df.shape[0], num_letters)
          data[i, j] counts the number of times letter vocab[j]
          is on observation j
    y: np array of len df.shape[0]. Id of the labels of each observation.
    """
    data = np.zeros((df.shape[0], num_letters))
    y = np.zeros(df.shape[0])
    for i, row in df.iterrows():
        y[i] = label2id[row[1]]
        for c in list(row[0]):
            data[i][vocab2id[c]] +=1
    return data, y

In [15]:
x_train, y_train = encode_data(df, vocab2id, label2id, num_letters)
x_valid, y_valid = encode_data(val, vocab2id, label2id, num_letters)
x_train.shape, x_valid.shape

((13374, 55), (6700, 55))

In [16]:
y_train[0]

2.0

In [17]:
x_train[0]

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0.,
       0., 0., 0., 0.])

In [18]:
y_train[10]

2.0

In [19]:
# Checking
[vocab[i] for i, v in enumerate(x_train[0]) if v==1 ]

['A', 'd', 'i', 's', 't']

In [20]:
[vocab[i] for i, v in enumerate(x_train[1]) if v==1 ]

['A', 'a', 'd', 'j', 'n', 'r']

## Model
We are going to write a multiclass logistic regression model. Here are the equations:

\begin{align}
z_1 & = a_{11}x_1 + \dots a_{1D}x_D + b_1\\
z_2 & = a_{21}x_1 + \dots a_{2D}x_D + b_2 \\
& \dots \\
z_K & = a_{K1}x_1 + \dots a_{KD}x_D + b_K
\end{align}

$$\hat{y}_k = \frac{e^{z_k}}{ \sum_{i=1}^K e^{z_i}}$$


Here the observations are $D$ dimensional vectors $x = (x_1, \dots, x_D)$.

In order to get multiclass logistic regression, we do a linear transformation and then a softmax transformation.

For numerical reasons, it is better not to apply the softmax directly after the linear transformation but to apply it together with the loss function. The loss function `F.cross_entropy` combines log_softmax and nll_loss in a single function. Therefore to write the model just do the linear transformation with the appropriate parameters.

In [21]:
class MultiLogisticRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MultiLogisticRegression, self).__init__()
        self.linear = nn.Linear(input_dim,output_dim)
        
    def forward(self, x):
        ## CODE HERE
        x = self.linear(x)
        return x

## Training loop

`loss.item()` to get a Python number from a tensor containing a single value.

In [22]:
def train_epochs(model, x_train, y_train, x_valid, y_valid, epochs, lr=0.01, wd=1e-4):
    ## get an optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
    ## convert your training data to pytorch tensors
    x = torch.FloatTensor(x_train)
    y = torch.LongTensor(y_train)
    for i in range(epochs):
        model.train()
        ## evaluate your training data to get y_hat
        y_hat = model(x)
        ## compute your loss
        loss = F.cross_entropy(y_hat,y)
        ## zero_grad
        optimizer.zero_grad()
        ## compute gradients
        loss.backward()
        ## call gradient descent
        optimizer.step()
        ## call valid_metrics(model, x_valid, y_valid)
        ## print train loss, valid loss and potentially valid accuracy
        val_loss, val_acc = valid_metrics(model, x_valid, y_valid)
        if i%10 == 1:
            print("train loss %.3f val loss %.3f and val accuracy %.3f" % 
                  (loss.item(), val_loss, val_acc))

In [23]:
def valid_metrics(model, x_valid, y_valid):
    model.eval()
    x = torch.FloatTensor(x_train)
    y = torch.LongTensor(y_train)
    y_out = model(x)
    loss = F.cross_entropy(y_out,y)
    _,y_hat = torch.max(y_out,1)
    val_acc = y_hat.eq(y).sum().float()/ y.size(0)
    
    return loss.item(), float(val_acc)

In [24]:
y = torch.LongTensor(y_train)


y.size(0)

13374

In [34]:
model = MultiLogisticRegression(55,18)
train_epochs(model, x_train, y_train, x_valid, y_valid, 1000, lr=0.01, wd=1e-4)

train loss 2.769 val loss 2.674 and val accuracy 0.235
train loss 2.008 val loss 1.955 and val accuracy 0.501
train loss 1.644 val loss 1.625 and val accuracy 0.514
train loss 1.511 val loss 1.502 and val accuracy 0.521
train loss 1.428 val loss 1.421 and val accuracy 0.568
train loss 1.367 val loss 1.362 and val accuracy 0.594
train loss 1.319 val loss 1.315 and val accuracy 0.603
train loss 1.280 val loss 1.277 and val accuracy 0.614
train loss 1.249 val loss 1.246 and val accuracy 0.620
train loss 1.222 val loss 1.220 and val accuracy 0.631
train loss 1.199 val loss 1.197 and val accuracy 0.635
train loss 1.180 val loss 1.178 and val accuracy 0.643
train loss 1.162 val loss 1.161 and val accuracy 0.647
train loss 1.147 val loss 1.146 and val accuracy 0.650
train loss 1.133 val loss 1.132 and val accuracy 0.653
train loss 1.121 val loss 1.120 and val accuracy 0.656
train loss 1.110 val loss 1.109 and val accuracy 0.659
train loss 1.101 val loss 1.100 and val accuracy 0.662
train loss