<a href="https://colab.research.google.com/github/DotSlash-A/Pytorch/blob/main/classifying_names_using_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget https://download.pytorch.org/tutorial/data.zip

--2023-09-29 10:59:13--  https://download.pytorch.org/tutorial/data.zip
Resolving download.pytorch.org (download.pytorch.org)... 108.156.60.129, 108.156.60.77, 108.156.60.94, ...
Connecting to download.pytorch.org (download.pytorch.org)|108.156.60.129|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2882130 (2.7M) [application/zip]
Saving to: ‘data.zip’


2023-09-29 10:59:14 (93.0 MB/s) - ‘data.zip’ saved [2882130/2882130]



In [2]:
!unzip data.zip

Archive:  data.zip
   creating: data/
  inflating: data/eng-fra.txt        
   creating: data/names/
  inflating: data/names/Arabic.txt   
  inflating: data/names/Chinese.txt  
  inflating: data/names/Czech.txt    
  inflating: data/names/Dutch.txt    
  inflating: data/names/English.txt  
  inflating: data/names/French.txt   
  inflating: data/names/German.txt   
  inflating: data/names/Greek.txt    
  inflating: data/names/Irish.txt    
  inflating: data/names/Italian.txt  
  inflating: data/names/Japanese.txt  
  inflating: data/names/Korean.txt   
  inflating: data/names/Polish.txt   
  inflating: data/names/Portuguese.txt  
  inflating: data/names/Russian.txt  
  inflating: data/names/Scottish.txt  
  inflating: data/names/Spanish.txt  
  inflating: data/names/Vietnamese.txt  


In [6]:
import torch

In [3]:
from io import open
import glob
import os

def findFiles(path): return glob.glob(path)

print(findFiles('data/names/*.txt'))

import unicodedata
import string

all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

print(unicodeToAscii('Ślusàrski'))

# Build the category_lines dictionary, a list of names per language
category_lines = {}
all_categories = []

# Read a file and split into lines
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

for filename in findFiles('data/names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)

['data/names/Scottish.txt', 'data/names/Spanish.txt', 'data/names/German.txt', 'data/names/French.txt', 'data/names/Chinese.txt', 'data/names/Italian.txt', 'data/names/Czech.txt', 'data/names/Irish.txt', 'data/names/English.txt', 'data/names/Polish.txt', 'data/names/Portuguese.txt', 'data/names/Japanese.txt', 'data/names/Vietnamese.txt', 'data/names/Korean.txt', 'data/names/Dutch.txt', 'data/names/Greek.txt', 'data/names/Russian.txt', 'data/names/Arabic.txt']
Slusarski


In [None]:
category_lines['Korean']

In [None]:
def letter_to_tensor(letter):
  n=len(letter)
  y=torch.zeroes(1,n)


In [7]:
# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    return all_letters.find(letter)

# Just for demonstration, turn a letter into a <1 x n_letters> Tensor
def letterToTensor(letter):
    tensor = torch.zeros(1, n_letters)
    tensor[0][letterToIndex(letter)] = 1
    return tensor

# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
def lineToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

In [8]:
print(letterToTensor('J'))
print(lineToTensor('Jones').size())

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0.]])
torch.Size([5, 1, 57])


In [9]:
import torch.nn as nn


In [10]:
n_hidden = 128

In [11]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
      super(RNN, self).__init__()
      self.hidden_size=hidden_size

      self.i_to_h=nn.Linear(input_size+hidden_size,hidden_size)
      self.h_to_o=nn.Linear(hidden_size,output_size)
      self.softmax=nn.LogSoftmax(dim=1)


    def forward(self, input, hidden):
      combined=torch.cat((input,hidden),1)
      hidden=self.i_to_h(combined)
      output=self.h_to_o(hidden)
      return self.softmax(output)


    def initHidden(self):
      tensor = torch.zeros(1, self.hidden_size)
      return tensor


In [12]:
rnn = RNN(n_letters,n_hidden,n_categories)