# RNN City Classification

## Objective
The goal of this notebook is to use a recurrent neural network to predict the country of a city, using only the ASCII representation of the city's name as input to the RNN.

See `rnn.py` for implementation of the PyTorch RNN.

In [None]:
import pandas as pd
import string
import torch
import numpy as np
from rnn import RNN
import matplotlib.pyplot as plt

In [None]:
# Load the world-cities CSV into a pandas DataFrame.
df = pd.read_csv('data/worldcities.csv')

# Dataset exploration: show the first row, then the column/dtype summary.
print(df.head(1))
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit an extra "None" line after the summary.
df.info()

In [None]:
# Keep only rows that have both an ASCII city name and a country; a row
# without a country has no usable label. The .copy() makes `dataset` an
# independent frame, so the column assignment below does not write into a
# filtered slice of `df`.
dataset = df.dropna(subset=['city_ascii', 'country']).copy()

# Keep ASCII letters and spaces only. These are the characters (after
# lowercasing) that the one-hot vocabulary in this notebook has an index for;
# str.isalpha() on its own would also keep non-ASCII letters.
allowed_chars = set(string.ascii_letters + ' ')
dataset['city_ascii'] = dataset['city_ascii'].apply(
    lambda city: ''.join(c for c in city if c in allowed_chars)
)

# Drop names with no characters left: they would give the RNN an empty
# input sequence.
dataset = dataset[dataset['city_ascii'].str.len() > 0]

# Zip city_ascii and country together into (city, country) examples,
# converting both to lowercase, and store them as a numpy array.
examples = np.array(
    list(zip(dataset['city_ascii'].str.lower(), dataset['country'].str.lower()))
)

In [None]:
# Split the examples into a training set (first 80%) and a test set
# (remaining 20%) after putting them in random order.
rng = np.random.default_rng()
rand_examples = rng.permutation(examples)

num_examples = len(examples)
split_idx = int(num_examples * 0.8)

train_set, test_set = rand_examples[:split_idx], rand_examples[split_idx:]

### Helpers for creating encoding of examples

In [None]:
"""
Methods and utilities for encoding city names as tensors.
"""
# Vocabulary: the 26 lowercase ASCII letters followed by the space character.
all_chars = string.ascii_lowercase + ' '
num_chars = len(all_chars)

# Maps each vocabulary character to its position in the one-hot vector.
char_encodings = {char: c_idx for c_idx, char in enumerate(all_chars)}


def encode_city_name(name):
    """
    Encode a city name as a one-hot tensor of shape (length, 1, num_chars).

    The name is lowercased and characters outside the vocabulary
    (anything other than ASCII letters and spaces) are dropped before
    encoding, so raw names such as "St. Louis" can be passed directly
    instead of raising KeyError. Names that are already lowercase
    letters/spaces encode exactly as before.

    `length` is the number of characters kept; it is 0 for an empty name.
    """
    # Resolve every kept character to its vocabulary index up front so the
    # tensor can be sized to the number of characters actually encoded.
    indices = [char_encodings[char] for char in name.lower() if char in char_encodings]

    encoding = torch.zeros((len(indices), 1, num_chars))
    for position, char_index in enumerate(indices):
        encoding[position, 0, char_index] = 1

    return encoding


"""
Methods and utilities for getting categorical index of country.
"""
# Distinct country labels that occur in the examples.
country_set = {country for _city, country in examples}

# Two-way mapping between a country name and its categorical index.
num_countries = len(country_set)
country_to_idx = {}
idx_to_country = {}

# Enumerate the countries in sorted order. Iterating the set directly follows
# string hashing, which can differ from one interpreter session to the next,
# so the same country could otherwise receive a different index on each run.
for country_idx, country in enumerate(sorted(country_set)):
    country_to_idx[country] = country_idx
    idx_to_country[country_idx] = country


def get_country_index(country):
    """
    Look up the categorical index assigned to a country name.

    Raises KeyError if the country is not in `country_to_idx`.
    """
    index = country_to_idx[country]
    return index


def get_country(index):
    """
    Look up the country name that a categorical index stands for.

    Raises KeyError if the index is not in `idx_to_country`.
    """
    country_name = idx_to_country[index]
    return country_name

### Helpers for model evaluation.

In [None]:
def evaluate(rnn: RNN, city: str) -> str:
    """
    Returns the predicted country for a city name.

    The city name is encoded with `encode_city_name`, fed through `rnn` one
    character at a time starting from `rnn.get_init_hidden()`, and the index
    of the highest-scoring output after the last character is mapped back to
    a country name.

    Raises ValueError if the city name encodes to an empty sequence.
    """
    city_encoding = encode_city_name(city)
    if city_encoding.size(0) == 0:
        # With no characters there is no forward step and so no output to rank.
        raise ValueError("cannot evaluate an empty city name")

    hidden = rnn.get_init_hidden()

    # Inference only: no gradients are needed, so skip autograd tracking.
    with torch.no_grad():
        # Pass each step exactly as the training loop does (city_encoding[i]),
        # rather than the extra-indexed 1-D slice used previously.
        for char_encoding in city_encoding:
            output, hidden = rnn(char_encoding, hidden)

    return idx_to_country[torch.topk(output, k=1)[1].item()]

## TRAIN RNN

In [None]:
# Hyperparameters.
N_EPOCHS = 15
N_HIDDEN = 512
LEARNING_RATE = 0.001

rnn = RNN(num_chars, N_HIDDEN, num_countries)
rnn.train()

loss_fn = torch.nn.NLLLoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=LEARNING_RATE)

# `losses` receives the mean loss of every TRACK_LOSS_ITER training steps.
losses = []
TRACK_LOSS_ITER = 300
total_loss = 0
# Step counter; named `iteration` rather than `iter` so the builtin iter()
# is not shadowed for the rest of the session.
iteration = 0

for epoch in range(N_EPOCHS):
    # Visit the training examples in a new random order each epoch.
    rng.shuffle(train_set)

    for city, country in train_set:
        city_encoding = encode_city_name(city)
        if city_encoding.size(0) == 0:
            # No characters means no forward step; without this guard the
            # loss below would be computed on the previous example's output.
            continue

        # Target as a one-element batch of class indices.
        gold_label = torch.tensor([get_country_index(country)], dtype=torch.long)

        hidden = rnn.get_init_hidden()

        rnn.zero_grad()

        # Feed the name one character at a time; only the output after the
        # final character is scored.
        for char_encoding in city_encoding:
            output, hidden = rnn(char_encoding, hidden)

        loss = loss_fn(output, gold_label)
        total_loss += loss.item()
        loss.backward()

        optimizer.step()

        iteration += 1
        if iteration % TRACK_LOSS_ITER == 0:
            losses.append(total_loss / TRACK_LOSS_ITER)
            total_loss = 0

rnn.eval()

## Evaluate Model

In [None]:
"""
Print Accuracy on Test Set.
"""
# Count the test examples whose predicted country equals the gold country.
num_correct = sum(
    1 for city, country in test_set if evaluate(rnn, city) == country
)
num_test_ex = len(test_set)

# Report the raw count and the accuracy as a percentage.
test_acc_output = "Test Accuracy: {num_correct} correct out of {num_test_ex}. Total Accuracy: {acc:.2f}%"
print(
    test_acc_output.format(
        num_correct=num_correct,
        num_test_ex=num_test_ex,
        acc=(num_correct / num_test_ex * 100),
    )
)

In [None]:
"""
Plots Negative Log Likelihood
"""

# Each entry of `losses` is the mean loss over TRACK_LOSS_ITER training
# iterations, so the k-th point belongs at iteration (k + 1) * TRACK_LOSS_ITER.
# Plotting against these values makes the x-axis agree with its "Iteration"
# label instead of showing the sample index.
iterations = [TRACK_LOSS_ITER * (k + 1) for k in range(len(losses))]

plt.figure()
plt.title("Negative Log Likelihood vs Iterations")
plt.ylabel("NLL")
plt.xlabel("Iteration")
plt.plot(iterations, losses)
plt.show()