Steps:

1. Load the data.
2. Cast text to lower case.
3. Eliminate punctuation.
4. Get the maximum number of tokens in the data.
5. Tokenize the data.

In [1]:
import os
import string
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F
from unidecode import unidecode

In [2]:
torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x1f5244bc850>

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
print(device)

cuda


In [5]:
DATA_PATH = os.path.join(os.getcwd().replace('notebooks', 'data'), 'train.csv')
data = pd.read_csv(DATA_PATH)

In [6]:
data.head()

Unnamed: 0,lyric,class
0,Can't drink without thinkin' about you,1
1,Now Lil Pump flyin' private jet (Yuh),0
2,"No, matter fact, you ain't help me when I had ...",0
3,"And you could find me, I ain't hidin'",0
4,From the way you talk to the way you move,1


In [7]:
data['lyric'] = data['lyric'].str.lower()

In [8]:
data.head()

Unnamed: 0,lyric,class
0,can't drink without thinkin' about you,1
1,now lil pump flyin' private jet (yuh),0
2,"no, matter fact, you ain't help me when i had ...",0
3,"and you could find me, i ain't hidin'",0
4,from the way you talk to the way you move,1


In [9]:
def clear_punctuation(text):
    text = ''.join([char for char in text if char not in all_punct])
    return text

In [10]:
all_char = string.ascii_lowercase

In [11]:
all_char

'abcdefghijklmnopqrstuvwxyz'

In [12]:
all_punct = string.punctuation

In [13]:
all_punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [14]:
data['lyric'] = data['lyric'].apply(clear_punctuation)

In [15]:
data.head()

Unnamed: 0,lyric,class
0,cant drink without thinkin about you,1
1,now lil pump flyin private jet yuh,0
2,no matter fact you aint help me when i had no ...,0
3,and you could find me i aint hidin,0
4,from the way you talk to the way you move,1


In [16]:
data['lyric_len'] = data['lyric'].apply(len)

In [17]:
data.head()

Unnamed: 0,lyric,class,lyric_len
0,cant drink without thinkin about you,1,36
1,now lil pump flyin private jet yuh,0,34
2,no matter fact you aint help me when i had no ...,0,51
3,and you could find me i aint hidin,0,34
4,from the way you talk to the way you move,1,41


In [18]:
data.loc[data['lyric_len'] == max(data['lyric_len'])]

Unnamed: 0,lyric,class,lyric_len
41158,although the recent concerts of some of the bi...,0,345


In [19]:
for row in data.iterrows():
    print(row[1])
    break

lyric        cant drink without thinkin about you
class                                           1
lyric_len                                      36
Name: 0, dtype: object


The length of the texts are not symmetrical, so we need to pad the text.

In [20]:
def pad_text(text, unk = '?', limit = 400):
    if len(text) < limit:
        text = text + unk*(limit - len(text))
    else:
        text = text[:limit-1]
    return text

In [21]:
data['lyric'] = data['lyric'].apply(pad_text, **{"limit" : 400, "unk" : "#"})

In [22]:
data.head()

Unnamed: 0,lyric,class,lyric_len
0,cant drink without thinkin about you##########...,1,36
1,now lil pump flyin private jet yuh############...,0,34
2,no matter fact you aint help me when i had no ...,0,51
3,and you could find me i aint hidin############...,0,34
4,from the way you talk to the way you move#####...,1,41


In [23]:
unidecode(data['lyric'][0])

'cant drink without thinkin about you############################################################################################################################################################################################################################################################################################################################################################################'

In [24]:
all_char = '#' + ' ' + all_char + ''.join([str(n) for n in range(10)])

In [25]:
all_char

'# abcdefghijklmnopqrstuvwxyz0123456789'

In [26]:
char_dict = dict(enumerate(all_char, start = 0))
char_dict = {v: k for k, v in char_dict.items()}

In [27]:
char_dict

{'#': 0,
 ' ': 1,
 'a': 2,
 'b': 3,
 'c': 4,
 'd': 5,
 'e': 6,
 'f': 7,
 'g': 8,
 'h': 9,
 'i': 10,
 'j': 11,
 'k': 12,
 'l': 13,
 'm': 14,
 'n': 15,
 'o': 16,
 'p': 17,
 'q': 18,
 'r': 19,
 's': 20,
 't': 21,
 'u': 22,
 'v': 23,
 'w': 24,
 'x': 25,
 'y': 26,
 'z': 27,
 '0': 28,
 '1': 29,
 '2': 30,
 '3': 31,
 '4': 32,
 '5': 33,
 '6': 34,
 '7': 35,
 '8': 36,
 '9': 37}

In [28]:
class TextDataset(Dataset):
    def __init__(self, file):
        self.file = file
        self.texts = self.file['lyric']
        self.labels = self.file['class']
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, ix):
        return torch.tensor([char_dict[c] for c in self.texts[ix]]), self.labels[ix]

In [29]:
text_data = TextDataset(data)

In [30]:
len(text_data)

51054

In [31]:
text_data.texts

0        cant drink without thinkin about you##########...
1        now lil pump flyin private jet yuh############...
2        no matter fact you aint help me when i had no ...
3        and you could find me i aint hidin############...
4        from the way you talk to the way you move#####...
                               ...                        
51049    i told her pour me some more then she went rig...
51050    hit the ground and crawl to the dresser#######...
51051    just keep breathin and breathin and breathin a...
51052    down go the system long live the king king####...
51053    if your mother knew all the things we do from ...
Name: lyric, Length: 51054, dtype: object

In [32]:
next(iter(text_data))

(tensor([ 4,  2, 15, 21,  1,  5, 19, 10, 15, 12,  1, 24, 10, 21,  9, 16, 22, 21,
          1, 21,  9, 10, 15, 12, 10, 15,  1,  2,  3, 16, 22, 21,  1, 26, 16, 22,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,

In [33]:
char_dict['3']

31

In [34]:
train_ds, test_ds = torch.utils.data.random_split(text_data, lengths = [len(text_data) - 10000, 10000])

In [35]:
len(train_ds)

41054

In [36]:
len(test_ds)

10000

In [37]:
BATCH_SIZE = 32

In [38]:
train_dl = DataLoader(train_ds, batch_size = BATCH_SIZE, shuffle = True)

In [39]:
test_dl = DataLoader(test_ds, batch_size = BATCH_SIZE, shuffle = True)

In [87]:
class LyricModel(nn.Module):
    def __init__(self, batch_size = 32, num_layers = 2, bidirectional = True, hidden_size = 128, length = 64):
        self.hidden = hidden_size
        self.batch_size = batch_size
        self.length = length
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.in_features = self.hidden * self.num_layers * (int(self.bidirectional) + 1)
        
        super(LyricModel, self).__init__()
        
        self.embedder = nn.Embedding(num_embeddings = len(char_dict), embedding_dim = self.length)
        
        self.lstm = nn.LSTM(input_size = self.length, hidden_size = self.hidden, batch_first = True,
                            num_layers = self.num_layers, bidirectional = self.bidirectional)
        self.linear1 = nn.Linear(self.in_features, self.in_features//2)
        self.linear2 = nn.Linear(self.in_features//2, 1)
        
    def forward(self, x, state):
        h, c = state
        y = self.embedder(x)
        print(y.shape)
        y, (h, c) = self.lstm(y, h)
        h = h.view(-1, self.in_features)
        y = F.leaky_relu(self.linear1(h), .1)
        y = F.leaky_relu(self.linear2(y), .1)
        
        return torch.sigmoid(y).squeeze(), (h, c)
    
    def init_hidden_state(self, mean, stddev):
        """
        Initialize hidden state and context tensors.
        """
        h = torch.distributions.Normal(mean, stddev).sample(((int(self.bidirectional) + 1)*self.num_layers,\
                                                             self.batch_size, self.hidden))
        c = torch.distributions.Normal(mean, stddev).sample(((int(self.bidirectional) + 1)*self.num_layers, \
                                                             self.batch_size, self.hidden))
        
        return (h.to(device), c.to(device))

In [88]:
model = LyricModel()
model.to(device)

LyricModel(
  (embedder): Embedding(38, 64)
  (lstm): LSTM(64, 128, num_layers=2, batch_first=True, bidirectional=True)
  (linear1): Linear(in_features=512, out_features=256, bias=True)
  (linear2): Linear(in_features=256, out_features=1, bias=True)
)

In [89]:
EPOCHS = 25
lr = 2e-4
betas = (0.9, 0.999)

In [90]:
opt = optim.Adam(params = model.parameters(), lr = lr, betas = betas)

In [91]:
criterion = nn.BCELoss().to(device)

In [92]:
TRAIN_DATA_LEN = len(train_dl)
TEST_DATA_LEN = len(test_dl)

print(f"Training for {EPOCHS} epochs i.e. {TRAIN_DATA_LEN} iterations...\n")

for epoch in range(1, EPOCHS + 1):
    state = model.init_hidden_state(0, .02)
    train_loss = []
    test_loss = []
    
    model.train()
    for i, (X, y) in enumerate(train_dl, start = 1):
        X, y = X.to(device), y.to(device)
        
        y_pred, (h, c) = model(X, state)
        loss = criterion(y_pred, y.float())
        loss.backward(retain_graph = True)
        opt.step()
        opt.zero_grad()
        train_loss.append(loss.item())

    with torch.grad():
        model.eval()
        state = model.init_hidden_state(0, 0.02)
        h, c = state
        
        for i, (X_, y_) in enumerate(test_dl, start = 1):
            X_, y_ = X_.to(device), y_.to(device)
            y_pred, (h, c) = model(X_, h)
            loss = criterion(y_pred, y_.float())
            test_loss.append(loss.item())

        print(f"Iteration [{i : .3f}/{TEST_DATA_LEN : .3f}]")
        print(f"   Train loss: {sum(train_loss)/len(train_loss) : .3f} || Test loss: {sum(test_loss)/len(test_loss) : .3f}")
    

Training for 25 epochs i.e. 1283 iterations...

torch.Size([32, 400, 64])


RuntimeError: For batched 3-D input, hx and cx should also be 3-D but got (2-D, 2-D) tensors

In [84]:
c.shape

torch.Size([4, 32, 128])

In [86]:
h.shape

torch.Size([4, 32, 128])

In [85]:
y_pred.shape

torch.Size([32])

In [None]:
y.shape