Steps:

1. Load the data.
2. Cast text to lower case.
3. Eliminate punctuation.
4. Get the maximum number of tokens in the data.
5. Tokenize the data.

In [1]:
import string
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
from unidecode import unidecode

In [2]:
data = pd.read_csv("data/train.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
data.head()

In [None]:
data['lyric'] = data['lyric'].str.lower()

In [None]:
data.head()

In [None]:
def clear_punctuation(text):
    """Return ``text`` with every character found in ``all_punct`` removed.

    ``all_punct`` is the notebook-level string of punctuation characters; it
    is looked up when the function is called, not when it is defined.
    """
    kept = (symbol for symbol in text if symbol not in all_punct)
    return ''.join(kept)

In [None]:
all_char = string.ascii_lowercase

In [None]:
all_char

In [None]:
all_punct = string.punctuation

In [None]:
all_punct

In [None]:
data['lyric'] = data['lyric'].apply(clear_punctuation)

In [None]:
data.head()

In [None]:
data['lyric_len'] = data['lyric'].apply(len)

In [None]:
data.head()

In [None]:
data.loc[data['lyric_len'] == max(data['lyric_len'])]

In [None]:
for row in data.iterrows():
    print(row[1])
    break

The texts do not all have the same length, so we need to pad (or truncate) each one to a fixed length.

In [None]:
def pad_text(text, unk = '?', limit = 400):
    """Return ``text`` forced to exactly ``limit`` characters.

    Texts shorter than ``limit`` are right-padded with ``unk``; longer texts
    are truncated. Every result therefore has the same length, which is
    required to stack the encoded texts into a batch.

    Args:
        text: The string to pad or truncate.
        unk: The filler string appended to short texts.
        limit: The exact length of the returned string.

    Returns:
        A string of length ``limit`` (provided ``unk`` is non-empty).
    """
    missing = limit - len(text)
    if missing > 0:
        # Slice after padding so a multi-character filler cannot overshoot.
        return (text + unk * missing)[:limit]
    # Keep exactly `limit` characters (the previous `limit - 1` slice
    # produced strings one character shorter than the padded ones).
    return text[:limit]

In [None]:
# Pad or truncate every lyric with pad_text, using '#' as the filler and 400 as the length limit.
data['lyric'] = data['lyric'].apply(pad_text, limit = 400, unk = "#")

In [None]:
data.head()

In [None]:
unidecode(data['lyric'][0])

In [None]:
# Vocabulary: the '#' padding symbol, the space, then the lowercase ASCII letters.
# Built from the constant alphabet instead of the current value of all_char,
# so re-running this cell does not prepend '#' and ' ' a second time
# (which would shift every index in the character dictionary).
all_char = '#' + ' ' + string.ascii_lowercase

In [None]:
all_char

In [None]:
# Map each vocabulary character to its integer index (its position in all_char).
char_dict = {symbol: position for position, symbol in enumerate(all_char)}

In [None]:
char_dict

In [None]:
class TextDataset(Dataset):
    """Character-level dataset over a DataFrame with 'lyric' and 'class' columns.

    Each item is a pair ``(indices, label)`` where ``indices`` is a tensor
    holding the vocabulary index of every character of the text.

    Args:
        file: DataFrame holding the text in ``'lyric'`` and the label in
            ``'class'``.
        vocab: Optional character -> index mapping. When omitted, the
            notebook-level ``char_dict`` is used.
        unk_index: Index substituted for characters that are missing from
            the vocabulary. The default 0 is the index of the '#' padding
            symbol in ``char_dict``.
    """
    def __init__(self, file, vocab = None, unk_index = 0):
        self.file = file
        self.texts = self.file['lyric']
        self.labels = self.file['class']
        self.vocab = char_dict if vocab is None else vocab
        self.unk_index = unk_index

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, ix):
        # Positional access: works whatever the DataFrame index is, and raises
        # IndexError past the end so plain iteration over the dataset stops.
        text = self.texts.iloc[ix]
        label = self.labels.iloc[ix]
        # Characters outside the vocabulary (digits, accented letters, ...)
        # fall back to unk_index instead of raising KeyError.
        lookup = self.vocab.get
        encoded = [lookup(c, self.unk_index) for c in text]
        return torch.tensor(encoded), label

In [None]:
text_data = TextDataset(data)

In [None]:
len(text_data)

In [None]:
text_data.texts

In [None]:
next(iter(text_data))

In [None]:
# Randomly hold out a fixed 10000 samples for testing; the remainder is the training set.
# NOTE(review): the hard-coded 10000 presumably assumes the dataset has more than 10000 rows — confirm.
train_ds, test_ds = torch.utils.data.random_split(text_data, lengths = [len(text_data) - 10000, 10000])

In [None]:
len(train_ds)

In [None]:
len(test_ds)

In [None]:
BATCH_SIZE = 32

In [None]:
train_dl = DataLoader(train_ds, batch_size = BATCH_SIZE, shuffle = True)

In [None]:
test_dl = DataLoader(test_ds, batch_size = BATCH_SIZE, shuffle = True)

In [None]:
class LyricModel(nn.Module):
    """Character-level (bi)LSTM binary classifier for fixed-length texts.

    Pipeline: embedding -> LSTM -> two linear layers applied at every time
    step (reducing each step to one scalar) -> one linear layer across the
    time axis -> sigmoid.

    Args:
        batch_size: Default batch size used by ``init_hidden_state``.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM is bidirectional.
        hidden_size: LSTM hidden size (stored as ``self.hidden``).
        length: Embedding dimension (also the LSTM input size).
        seq_len: Number of characters per input text; must match the padded
            text length because the last linear layer spans the time axis.
        vocab_size: Number of embeddings. When omitted, the size of the
            notebook-level ``char_dict`` is used.
    """
    def __init__(self, batch_size = 32, num_layers = 2, bidirectional = True, hidden_size = 128, length = 64,
                 seq_len = 400, vocab_size = None):
        # Initialise nn.Module first so attribute registration is set up
        # before anything is assigned on the instance.
        super(LyricModel, self).__init__()

        self.hidden = hidden_size
        self.batch_size = batch_size
        self.length = length
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.seq_len = seq_len
        self.num_directions = int(self.bidirectional) + 1
        self.in_features = self.hidden * self.num_directions

        if vocab_size is None:
            vocab_size = len(char_dict)

        self.embedder = nn.Embedding(num_embeddings = vocab_size, embedding_dim = self.length)

        self.lstm = nn.LSTM(input_size = self.length, hidden_size = self.hidden, batch_first = True,
                            num_layers = self.num_layers, bidirectional = self.bidirectional)
        self.linear1 = nn.Linear(self.in_features, self.in_features//2)
        self.linear2 = nn.Linear(self.in_features//2, 1)
        self.linear3 = nn.Linear(self.seq_len, 1)

    def forward(self, x, h = None):
        """
        Classify a batch of index-encoded texts.

        Args:
            x: Integer tensor of character indices, expected as
                (batch, seq_len).
            h: Optional ``(h_0, c_0)`` tuple for the LSTM; ``None`` lets
                ``nn.LSTM`` start from zeros.

        Returns:
            ``(probabilities, state)`` where ``probabilities`` has shape
            (batch, 1) and ``state`` is the LSTM's final ``(h_n, c_n)``.
        """
        leaky_relu = nn.functional.leaky_relu

        y = self.embedder(x)                      # (batch, seq_len, length)
        y, h = self.lstm(y, h)                    # (batch, seq_len, in_features)
        y = leaky_relu(self.linear1(y), .1)
        y = leaky_relu(self.linear2(y), .1)       # (batch, seq_len, 1)
        # Drop the trailing singleton so linear3 mixes across time steps.
        y = y.squeeze(-1)                         # (batch, seq_len)
        # NOTE(review): a leaky ReLU right before the sigmoid compresses
        # negative logits; kept as in the original design.
        y = leaky_relu(self.linear3(y), .1)       # (batch, 1)

        return torch.sigmoid(y), h

    def init_hidden_state(self, mean, stddev, batch_size = None):
        """
        Initialize hidden state and context tensors.

        Both tensors are sampled from Normal(mean, stddev) with shape
        (num_directions * num_layers, batch_size, hidden_size).

        Args:
            mean: Mean of the normal distribution.
            stddev: Standard deviation of the normal distribution.
            batch_size: Batch size of the state; defaults to the
                ``batch_size`` given to the constructor.

        Returns:
            The tuple ``(h, c)``.
        """
        if batch_size is None:
            batch_size = self.batch_size
        shape = (self.num_directions * self.num_layers, batch_size, self.hidden)
        sampler = torch.distributions.Normal(mean, stddev)
        h = sampler.sample(shape)
        c = sampler.sample(shape)

        return (h, c)

In [None]:
model = LyricModel()

In [None]:
EPOCHS = 25
lr = 2e-4
betas = (0.9, 0.999)

In [None]:
opt = optim.Adam(params = model.parameters(), lr = lr, betas = betas)

In [None]:
criterion = nn.BCELoss()

In [None]:
# Train for EPOCHS epochs; after each epoch evaluate on the test loader and report metrics.
for epoch in range(1, EPOCHS + 1):
    model.train()
    train_losses = 0
    train_correct = 0
    train_seen = 0
    for i, (X, y) in enumerate(train_dl, start = 1):
        opt.zero_grad()
        # The model returns (probabilities, lstm_state); passing None as the
        # state lets the LSTM start from zeros for every batch.
        pred, _ = model(X, None)
        # squeeze(-1) keeps the batch axis even when the last batch has one sample.
        pred = pred.squeeze(-1)
        loss = criterion(pred, y.float())
        loss.backward()
        opt.step()
        train_losses += loss.item()
        train_correct += ((pred > 0.5).long() == y).sum().item()
        train_seen += len(y)

    # Loss is the mean of the per-batch losses; accuracy is computed per
    # sample so a smaller final batch is not over-weighted.
    train_loss = train_losses/len(train_dl)
    train_acc = train_correct/train_seen

    test_losses = 0
    test_correct = 0
    test_seen = 0

    model.eval()
    with torch.no_grad():
        for X_test, y_test in test_dl:
            test_pred, _ = model(X_test, None)
            test_pred = test_pred.squeeze(-1)
            test_losses += criterion(test_pred, y_test.float()).item()
            test_correct += ((test_pred > 0.5).long() == y_test).sum().item()
            test_seen += len(y_test)

    test_loss = test_losses/len(test_dl)
    test_acc = test_correct/test_seen

    print(f"Epoch [{epoch}/{EPOCHS}]")
    print(f"\tIteration [{i}/{len(train_dl)}]")
    print(f"\t\tTrain loss : {train_loss: .3f} || Test loss : {test_loss: .3f}")
    print(f"\t\tTrain acc : {train_acc: .3f} || Test acc : {test_acc: .3f}")

In [None]:
print(nn.LSTM.__doc__)