In [1]:
import os
import random
from string import ascii_letters

import torch
from torch import nn
import torch.nn.functional as F
from unidecode import unidecode

torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
data_dir = "./data/names"

lang2label = {
    file_name.split(".")[0]: torch.tensor([i], dtype=torch.long)
    for i, file_name in enumerate(os.listdir(data_dir))
}
num_langs = len(lang2label)

In [6]:
char2idx = {letter: i for i, letter in enumerate(ascii_letters + " .,:;-'")}
num_letters = len(char2idx)

59

In [7]:
def name2tensor(name):
    tensor = torch.zeros(len(name), 1, num_letters)
    for i, char in enumerate(name):
        tensor[i][0][char2idx[char]] = 1
    return tensor

In [10]:
tensor_names = []
target_langs = []

for file in os.listdir(data_dir):
    with open(os.path.join(data_dir, file)) as f:
        lang = file.split(".")[0]
        names = [unidecode(line.rstrip()) for line in f]
        for name in names:
            try:
                tensor_names.append(name2tensor(name))
                target_langs.append(lang2label[lang])
            except KeyError:
                pass

In [11]:
from sklearn.model_selection import train_test_split

train_idx, test_idx = train_test_split(
    range(len(target_langs)), 
    test_size=0.1, 
    shuffle=True, 
    stratify=target_langs
)

train_dataset = [
    (tensor_names[i], target_langs[i])
    for i in train_idx
]

test_dataset = [
    (tensor_names[i], target_langs[i])
    for i in test_idx
]

In [13]:
class VanillaRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(VanillaRNN, self).__init__()
        self.hidden_size = hidden_size
        self.in2hidden = nn.Linear(input_size + hidden_size, hidden_size)
        self.in2output = nn.Linear(input_size + hidden_size, output_size)
    
    def forward(self, x, hidden_state):
        combined = torch.cat((x, hidden_state), 1)
        hidden = torch.sigmoid(self.in2hidden(combined))
        output = self.in2output(combined)
        return output, hidden
    
    def init_hidden(self):
        return nn.init.kaiming_uniform_(torch.empty(1, self.hidden_size))

In [14]:
hidden_size = 256
learning_rate = 0.001

model = VanillaRNN(input_size=num_letters, hidden_size=hidden_size, output_size=num_langs)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [15]:
num_epochs = 2
print_interval = 3000

for epoch in range(num_epochs):
    random.shuffle(train_dataset)
    for i, (name, label) in enumerate(train_dataset):
        hidden_state = model.init_hidden()
        for char in name:
            output, hidden_state = model(char, hidden_state)
        loss = criterion(output, label)

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
        
        if (i + 1) % print_interval == 0:
            print(
                f"Epoch [{epoch + 1}/{num_epochs}], "
                f"Step [{i + 1}/{len(train_dataset)}], "
                f"Loss: {loss.item():.4f}"
            )

Epoch [1/2], Step [3000/18063], Loss: 1.6980
Epoch [1/2], Step [6000/18063], Loss: 1.8498
Epoch [1/2], Step [9000/18063], Loss: 0.0086
Epoch [1/2], Step [12000/18063], Loss: 0.2665
Epoch [1/2], Step [15000/18063], Loss: 0.1882
Epoch [1/2], Step [18000/18063], Loss: 0.0002
Epoch [2/2], Step [3000/18063], Loss: 0.1571
Epoch [2/2], Step [6000/18063], Loss: 0.1640
Epoch [2/2], Step [9000/18063], Loss: 0.7492
Epoch [2/2], Step [12000/18063], Loss: 4.0861
Epoch [2/2], Step [15000/18063], Loss: 0.2719
Epoch [2/2], Step [18000/18063], Loss: 0.0030


In [16]:
from rnn_dataset import get_loader

dataloader,dataset = get_loader("./data/", "small_sentiments.csv", batch_size=1)

In [18]:
sentiment_classifier = VanillaRNN(input_size=len(dataset.vocab), hidden_size=hidden_size, output_size=2)

In [20]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(sentiment_classifier.parameters(), lr=learning_rate)

num_epochs = 100
print_interval = 1

for epoch in range(num_epochs):
    for i, (sentence, sentiment) in enumerate(dataloader):
        hidden_state = sentiment_classifier.init_hidden()
        
        for word in sentence:
            output, hidden_state = sentiment_classifier(word, hidden_state)
        loss = criterion(output, sentiment)

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(sentiment_classifier.parameters(), 1)
        optimizer.step()
        
        if (i + 1) % print_interval == 0:
            print(
                f"Epoch [{epoch + 1}/{num_epochs}], "
                f"Step [{i + 1}/{len(train_dataset)}], "
                f"Loss: {loss.item():.4f}"
            )

Epoch [1/100], Step [1/18063], Loss: 0.4731
Epoch [1/100], Step [2/18063], Loss: 1.3778
Epoch [2/100], Step [1/18063], Loss: 0.2976
Epoch [2/100], Step [2/18063], Loss: 1.5016
Epoch [3/100], Step [1/18063], Loss: 0.2578
Epoch [3/100], Step [2/18063], Loss: 1.5688
Epoch [4/100], Step [1/18063], Loss: 1.5449
Epoch [4/100], Step [2/18063], Loss: 0.2681
Epoch [5/100], Step [1/18063], Loss: 0.2702
Epoch [5/100], Step [2/18063], Loss: 1.4990
Epoch [6/100], Step [1/18063], Loss: 0.2541
Epoch [6/100], Step [2/18063], Loss: 1.5402
Epoch [7/100], Step [1/18063], Loss: 0.2430
Epoch [7/100], Step [2/18063], Loss: 1.5707
Epoch [8/100], Step [1/18063], Loss: 0.2349
Epoch [8/100], Step [2/18063], Loss: 1.5940
Epoch [9/100], Step [1/18063], Loss: 1.5756
Epoch [9/100], Step [2/18063], Loss: 0.2470
Epoch [10/100], Step [1/18063], Loss: 1.4990
Epoch [10/100], Step [2/18063], Loss: 0.2658
Epoch [11/100], Step [1/18063], Loss: 1.4408
Epoch [11/100], Step [2/18063], Loss: 0.2812
Epoch [12/100], Step [1/1806

Epoch [93/100], Step [1/18063], Loss: 0.1396
Epoch [93/100], Step [2/18063], Loss: 1.5988
Epoch [94/100], Step [1/18063], Loss: 1.5810
Epoch [94/100], Step [2/18063], Loss: 0.1435
Epoch [95/100], Step [1/18063], Loss: 0.1437
Epoch [95/100], Step [2/18063], Loss: 1.5428
Epoch [96/100], Step [1/18063], Loss: 0.1370
Epoch [96/100], Step [2/18063], Loss: 1.5595
Epoch [97/100], Step [1/18063], Loss: 1.5395
Epoch [97/100], Step [2/18063], Loss: 0.1411
Epoch [98/100], Step [1/18063], Loss: 0.1410
Epoch [98/100], Step [2/18063], Loss: 1.4983
Epoch [99/100], Step [1/18063], Loss: 1.4845
Epoch [99/100], Step [2/18063], Loss: 0.1435
Epoch [100/100], Step [1/18063], Loss: 1.4126
Epoch [100/100], Step [2/18063], Loss: 0.1518


In [22]:
num_correct = 0
num_samples = len(dataloader)

sentiment_classifier.eval()

with torch.no_grad():
    for i, (sentence, sentiment) in enumerate(dataloader):
        hidden_state = sentiment_classifier.init_hidden()
        for word in sentence:
            output, hidden_state = sentiment_classifier(word, hidden_state)
        _, pred = torch.max(output, dim=1)
        num_correct += bool(pred == sentiment)

print(f"Accuracy: {num_correct / num_samples * 100:.4f}%")


Accuracy: 50.0000%


In [41]:
dataset[1]["text"]

tensor([[0., 1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0.]])

In [46]:
def predict_sentiment(tensor_sentence):
    sentiment_classifier.eval()
    with torch.no_grad():
        hidden_state = sentiment_classifier.init_hidden()
        for word in tensor_sentence:
            output, hidden_state = sentiment_classifier(word, hidden_state)
        print(output)
        _, pred = torch.max(output, dim=1)
    sentiment_classifier.train()
    return pred.item()

In [48]:
iterator = iter(dataloader)
print(predict_sentiment(next(iterator)[0]))
print(predict_sentiment(next(iterator)[0]))

tensor([[-0.8004,  1.0151]])
1
tensor([[-0.3499,  0.6979]])
1
