In [None]:
# one off install
#%pip install torch

In [None]:
# Exercise - display a 3 dimensional (random) tensor: five 3*2 matrices 


In [None]:
# import libraries and helper functions used in this Case Study
import torch

import PyTorch_Functions as ptf

#### Data Prep

In [None]:
# Exercise - 

# a) download data from zip file: https://download.pytorch.org/tutorial/data.zip
# b) extract it
# c) rename folder "PyTorch-NLPdata"
# d) put it in Jupyter working directory

In [None]:
# aside Jupyter working directory
# pwd

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import os

# Locate the per-language name files once and reuse the result below
# (list(...) so the result can be printed, tested for emptiness and iterated)
names_glob = 'PyTorch-NLPdata/data/names/*.txt'
name_files = list(ptf.findFiles(names_glob))
print(name_files)

# Fail early with a clear message if the data folder is missing or misplaced;
# otherwise n_categories would be 0 and later cells would fail far from the cause
if not name_files:
    raise FileNotFoundError(
        "No files match '%s' - download and extract the data into the "
        "Jupyter working directory first (see the exercise above)" % names_glob
    )

# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

for filename in name_files:
    # the language is the file name without its folder and extension
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = ptf.readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)

In [None]:
# the dictionary keys are the categories (one per names file read above)
category_lines.keys()

In [None]:
# the dictionary values are the lists of names read from each file;
# preview only the first 5 names per language rather than dumping every name
{category: names[:5] for category, names in category_lines.items()}

In [None]:
# Exercise display last 5 Portuguese names


#### Creating the network

In [None]:
# Optional: uncomment the next line (before the network is created) if you
# want to reproduce results from run to run
# torch.manual_seed(0)

# hidden size handed to the RNN constructor; also used for the initial hidden state below
n_hidden = 128

# Build the network from the helper module
# (presumably input size, hidden size, output size - confirm in PyTorch_Functions)
rnn = ptf.RNN(ptf.n_letters, n_hidden, n_categories)

In [None]:
# run a network step (one forward pass) for a single letter
# NB the tensor is called letter_tensor rather than `input`, so that the
# Python built-in input() is not shadowed for the rest of the session
letter_tensor = ptf.letterToTensor('A')
hidden = torch.zeros(1, n_hidden)  # zeroed initial hidden state

output, next_hidden = rnn(letter_tensor, hidden)
print(output)

# most likely language for the letter A (from Albert)
print(ptf.categoryFromOutput(all_categories, output))

The output is a tensor holding the (log) probability/likelihood of the letter A
belonging to each of the 18 languages. The tuple printed at the end is the most likely language
(the one with the highest log value) together with its index.

NB the cell above gives different results each time the RNN network is recreated without "seeding" - add torch.manual_seed(0) before creating the network to reproduce the results

#### Model Prep

In [None]:
# Draw a few random training examples; each one is a name plus its language
n_examples = 10

for i in range(n_examples):
    # one call per example; the two tensors are not needed for this display
    category, line, category_tensor, line_tensor = ptf.randomTrainingExample(
        all_categories, category_lines
    )
    # category is the language, line is the name
    print('category =', category, '/ line =', line)

#### Training the Network

Training iterations below will:

- Create input and target tensors
- Create a zeroed initial hidden state
- Read each letter in and
- Keep hidden state for next letter
- Compare final output to target
- Back-propagate
- Return the output and loss

In [None]:
# Learning rate passed to ptf.train in the training loop below.
# If you set this too high, it might explode. If too low, it might not learn.
learning_rate = 0.05

In [None]:
# train with a bunch of examples
# NB re-running this cell continues training the existing rnn; recreate the
# network first if you want to start from scratch
import time

start = time.time()

n_iters = 5000  # kept small so training is quick; in practice use e.g. 100000 iterations

# Reporting intervals are derived from n_iters so they stay in step when it is
# changed; max(1, ...) avoids a modulo-by-zero for very small n_iters
print_every = max(1, n_iters // 10)  # 500 when n_iters = 5000
plot_every = max(1, n_iters // 50)   # 100 when n_iters = 5000

# Keep track of losses for plotting
current_loss = 0
all_losses = []

# NB the loop variable is called iteration (not iter) so that the Python
# built-in iter() is not shadowed once this cell has run
for iteration in range(1, n_iters + 1):
    category, line, category_tensor, line_tensor = ptf.randomTrainingExample(all_categories, category_lines)
    output, loss = ptf.train(category_tensor,
                             line_tensor,
                             rnn,
                             learning_rate)
    current_loss += loss

    # Print iteration number, % complete, elapsed time, loss, name and guess
    if iteration % print_every == 0:
        guess, guess_i = ptf.categoryFromOutput(all_categories, output)
        correct = '✓' if guess == category else '✗ (%s)' % category
        print('%d %d%% (%s) %.4f %s / %s %s' % (iteration, iteration / n_iters * 100, ptf.timeSince(start), loss, line, guess, correct))

    # Add current loss avg to list of losses, then restart the running total
    if iteration % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0

#### Plotting the Results

In [None]:
# number of averaged loss values recorded (one is appended every plot_every iterations)
len(all_losses)

In [None]:
import matplotlib.pyplot as plt

# Plot the averaged training loss; each point is the mean loss over
# plot_every consecutive training iterations
fig, ax = plt.subplots()
ax.plot(all_losses)
ax.set_xlabel('checkpoint (one per plot_every iterations)')
ax.set_ylabel('average loss')
ax.set_title('Training loss')
plt.show()

#### Evaluating the Results

In [None]:
# evaluate surnames and produce a confusion matrix
# comparing actual language and predicted language
# NB the names are drawn with ptf.randomTrainingExample, i.e. from the same
# category_lines data that was used for training

import pandas as pd
import numpy as np

import matplotlib.ticker as ticker

# Keep track of guesses in a confusion matrix (rows = actual, columns = predicted)
confusion = torch.zeros(n_categories, n_categories)
n_confusion = 1000

column_names = ["record", "name", "actual", "predicted", "predicted-lng"]  # information we want to isolate/export

# Collect plain Python rows inside the loop and build each DataFrame once
# afterwards - growing a DataFrame cell by cell with .loc is very slow
records = []          # one dict per evaluated name
likelihood_rows = []  # full output array per name
prob_rows = []        # one value per language per name

# Go through a bunch of examples and record which are correctly guessed
for i in range(n_confusion):
    category, line, category_tensor, line_tensor = ptf.randomTrainingExample(all_categories, category_lines)
    output = ptf.evaluate(line_tensor, rnn)
    guess, guess_i = ptf.categoryFromOutput(all_categories, output)
    category_i = all_categories.index(category)
    confusion[category_i][guess_i] += 1

    # extract actual and predicted language for every sample
    records.append({
        "record": i,
        "name": line,
        "actual": category_i,    # actual language index
        "predicted": guess_i,    # predicted language index
        "predicted-lng": guess,  # predicted language
    })

    # transform the likelihood tensor into a numpy array
    log_probs = output.detach().numpy()
    likelihood_rows.append(log_probs)
    # easier way to extract language probabilities: row 0 holds one value per language
    prob_rows.append(log_probs[0])

mydf = pd.DataFrame(records, columns=column_names)
# dtype=object keeps each array as a single cell value
likelihood_df = pd.DataFrame({"likelihood": pd.Series(likelihood_rows, dtype=object)})
allProbs = pd.DataFrame(prob_rows, columns=range(n_categories))

# Normalize by dividing every row by its sum; a language that was never drawn
# has a row total of 0, so clamp the divisor to 1 to keep that row at 0 (not NaN)
row_totals = confusion.sum(dim=1, keepdim=True)
confusion = confusion / row_totals.clamp(min=1)

# Set up plot
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(confusion.numpy())
fig.colorbar(cax)

# Put one tick on every row/column first, then label the ticks, so that each
# language name lines up with its own row/column
tick_positions = np.arange(n_categories)
ax.xaxis.set_major_locator(ticker.FixedLocator(tick_positions))
ax.yaxis.set_major_locator(ticker.FixedLocator(tick_positions))
ax.set_xticklabels(all_categories, rotation=90)
ax.set_yticklabels(all_categories)
ax.set_xlabel('predicted language')
ax.set_ylabel('actual language')

plt.show()

NB the confusion matrix is built from each name's single most likely (predicted) language:
every evaluated name adds 1 to the cell (actual language, predicted language), and each row is
then divided by its total. A bright diagonal therefore shows the overall picture of a good model
over many iterations, but the matrix alone does not show how close a wrong guess was. The network also outputs a (log) likelihood for each of the 18 languages for every name, so even when the predicted language is wrong, the likelihood of the actual language can still be relatively high (e.g. top 3 / top 5). This is what we confirm in the export below, i.e. whether, even for incorrect classifications, the likelihood of the actual class (language) is still one of the highest
for each name

In [None]:
# Exercise - 
# a) extract actual and predicted language for all 1000 samples above and 
# b) export to a csv
