Steps:

1. Load the data.
2. Cast text to lower case.
3. El,iminate punctuation.
4. Get the maximum number of tokens in the data.
5. Tokenize the data.

In [1]:
import string
import os

import pandas as pd
import numpy as np

import torch
from torch import nn
from torch import optim
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F

from unidecode import unidecode

In [2]:
### Reproducibility
torch.manual_seed(42)
os.environ['PYTHONHASHSEED'] = str(42)

In [3]:
BATCH_SIZE = 32
SEQ_LEN = 400

In [4]:
data = pd.read_csv("train.csv")

In [5]:
data.head()

Unnamed: 0,lyric,class
0,Can't drink without thinkin' about you,1
1,Now Lil Pump flyin' private jet (Yuh),0
2,"No, matter fact, you ain't help me when I had ...",0
3,"And you could find me, I ain't hidin'",0
4,From the way you talk to the way you move,1


In [6]:
data['lyric'] = data['lyric'].str.lower()

In [7]:
data.head()

Unnamed: 0,lyric,class
0,can't drink without thinkin' about you,1
1,now lil pump flyin' private jet (yuh),0
2,"no, matter fact, you ain't help me when i had ...",0
3,"and you could find me, i ain't hidin'",0
4,from the way you talk to the way you move,1


In [8]:
def clear_punctuation(text):
    text = ''.join([char for char in text if char not in all_punct])
    return text

In [9]:
all_char = string.ascii_lowercase + string.digits

In [10]:
all_char

'abcdefghijklmnopqrstuvwxyz0123456789'

In [11]:
all_punct = string.punctuation

In [12]:
all_punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [13]:
data['lyric'] = data['lyric'].apply(clear_punctuation)

In [14]:
data.head()

Unnamed: 0,lyric,class
0,cant drink without thinkin about you,1
1,now lil pump flyin private jet yuh,0
2,no matter fact you aint help me when i had no ...,0
3,and you could find me i aint hidin,0
4,from the way you talk to the way you move,1


In [15]:
data['lyric_len'] = data['lyric'].apply(len)

In [16]:
data.head()

Unnamed: 0,lyric,class,lyric_len
0,cant drink without thinkin about you,1,36
1,now lil pump flyin private jet yuh,0,34
2,no matter fact you aint help me when i had no ...,0,51
3,and you could find me i aint hidin,0,34
4,from the way you talk to the way you move,1,41


In [17]:
data.loc[data['lyric_len'] == max(data['lyric_len'])]

Unnamed: 0,lyric,class,lyric_len
41158,although the recent concerts of some of the bi...,0,345


In [18]:
for row in data.iterrows():
    print(row[1])
    break

lyric        cant drink without thinkin about you
class                                           1
lyric_len                                      36
Name: 0, dtype: object


The length of the texts are not symmetrical, so we need to pad the text.

In [19]:
def pad_text(text, unk = '?', limit = 400):
    if len(text) < limit:
        text = text + unk*(limit - len(text))
    else:
        text = text[:limit-1]
    return text

In [20]:
data['lyric'] = data['lyric'].apply(pad_text, **{"limit" : SEQ_LEN, "unk" : "#"})

In [21]:
data.head()

Unnamed: 0,lyric,class,lyric_len
0,cant drink without thinkin about you##########...,1,36
1,now lil pump flyin private jet yuh############...,0,34
2,no matter fact you aint help me when i had no ...,0,51
3,and you could find me i aint hidin############...,0,34
4,from the way you talk to the way you move#####...,1,41


In [22]:
unidecode(data['lyric'][0])

'cant drink without thinkin about you############################################################################################################################################################################################################################################################################################################################################################################'

In [23]:
### Unique characters represented in the text
set(''.join(data['lyric'].tolist()))

{' ',
 '#',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z'}

In [24]:
### Account for space and trhe unknown token (#)
all_char = '#' + ' ' + all_char

In [25]:
all_char

'# abcdefghijklmnopqrstuvwxyz0123456789'

In [26]:
char_dict = dict(enumerate(all_char, start = 0))
char_dict = {v: k for k, v in char_dict.items()}

In [27]:
char_dict

{'#': 0,
 ' ': 1,
 'a': 2,
 'b': 3,
 'c': 4,
 'd': 5,
 'e': 6,
 'f': 7,
 'g': 8,
 'h': 9,
 'i': 10,
 'j': 11,
 'k': 12,
 'l': 13,
 'm': 14,
 'n': 15,
 'o': 16,
 'p': 17,
 'q': 18,
 'r': 19,
 's': 20,
 't': 21,
 'u': 22,
 'v': 23,
 'w': 24,
 'x': 25,
 'y': 26,
 'z': 27,
 '0': 28,
 '1': 29,
 '2': 30,
 '3': 31,
 '4': 32,
 '5': 33,
 '6': 34,
 '7': 35,
 '8': 36,
 '9': 37}

In [28]:
class TextDataset(Dataset):
    def __init__(self, file):
        self.file = file
        self.texts = self.file['lyric']
        self.labels = self.file['class']
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, ix):
        return torch.tensor([char_dict[c] for c in unidecode(self.texts[ix])]), self.labels[ix]

In [29]:
text_data = TextDataset(data)

In [30]:
len(text_data)

51054

In [31]:
text_data.texts

0        cant drink without thinkin about you##########...
1        now lil pump flyin private jet yuh############...
2        no matter fact you aint help me when i had no ...
3        and you could find me i aint hidin############...
4        from the way you talk to the way you move#####...
                               ...                        
51049    i told her pour me some more then she went rig...
51050    hit the ground and crawl to the dresser#######...
51051    just keep breathin and breathin and breathin a...
51052    down go the system long live the king king####...
51053    if your mother knew all the things we do from ...
Name: lyric, Length: 51054, dtype: object

In [32]:
next(iter(text_data))

(tensor([ 4,  2, 15, 21,  1,  5, 19, 10, 15, 12,  1, 24, 10, 21,  9, 16, 22, 21,
          1, 21,  9, 10, 15, 12, 10, 15,  1,  2,  3, 16, 22, 21,  1, 26, 16, 22,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,

In [33]:
train_ds, test_ds = torch.utils.data.random_split(text_data, lengths = [len(text_data) - 5000, 5000])

In [34]:
len(train_ds)

46054

In [35]:
len(test_ds)

5000

In [36]:
train_dl = DataLoader(train_ds, batch_size = BATCH_SIZE, shuffle = False, drop_last = True)

In [37]:
test_dl = DataLoader(test_ds, batch_size = BATCH_SIZE, shuffle = False, drop_last = True)

In [38]:
next(iter(test_dl))

[tensor([[ 5, 16, 15,  ...,  0,  0,  0],
         [ 7,  2,  4,  ...,  0,  0,  0],
         [10,  1, 12,  ...,  0,  0,  0],
         ...,
         [14,  2, 26,  ...,  0,  0,  0],
         [16, 15, 13,  ...,  0,  0,  0],
         [24,  9,  2,  ...,  0,  0,  0]]),
 tensor([0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
         1, 1, 1, 1, 0, 0, 1, 1])]

In [39]:
class LyricModel(nn.Module):
    def __init__(self, batch_size = 32, num_layers = 2, bidirectional = True, hidden_size = 128, length = 64):
        self.hidden = hidden_size
        self.batch_size = batch_size
        self.length = length
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        
        super(LyricModel, self).__init__()
        
        self.embedder = nn.Embedding(num_embeddings = len(char_dict), embedding_dim = self.length)
        
        self.linear1 = nn.Linear(self.length, self.length//2)
        self.linear2 = nn.Linear(self.length//2, 1)
        self.linear3 = nn.Linear(SEQ_LEN, 1)
        
    def forward(self, x):
        y = self.embedder(x)
        y = F.relu(self.linear1(y))
        y = F.relu(self.linear2(y))
        y = y.view(-1, SEQ_LEN)
        return self.linear3(y)

In [40]:
model = LyricModel(batch_size = BATCH_SIZE)

In [41]:
EPOCHS = 25
lr = 1e-3
betas = (0.9, 0.999)

In [42]:
opt = optim.Adam(params = model.parameters(), lr = lr, betas = betas)

In [43]:
criterion = nn.BCEWithLogitsLoss()

In [44]:
for epoch in range(1, EPOCHS + 1):
    train_losses = 0
    train_accs = 0
    for i, (X, y) in enumerate(train_dl, start = 1):
        opt.zero_grad()
        pred = model(X)
        loss = criterion(pred.squeeze(), y.float())
        loss.backward()
        opt.step()
        train_losses += loss.item()
        train_accs += torch.sum(torch.where(pred > 0.5, 1, 0).squeeze() == y)/len(y)
        
        if (i == len(train_dl)):
            train_loss = train_losses/len(train_dl)
            train_acc = train_accs/len(train_dl)
            
            test_losses = 0
            test_accs = 0
            
            with torch.no_grad():
                for X_test, y_test in test_dl:
                    test_pred = model(X_test)
                    test_loss = criterion(test_pred.squeeze(), y_test.float())
                    test_losses += test_loss.item()
                    test_accs += torch.sum(torch.where(test_pred > 0.5, 1, 0).squeeze() == y_test)/len(y_test)
                
                test_loss = test_losses/len(test_dl)
                test_acc = test_accs/len(test_dl)
                
            print(f"Epoch [{epoch}/{EPOCHS}]")
            print(f"\tIteration [{i}/{len(train_dl)}]")
            print(f"\t\tTrain loss : {train_loss: .3f} || Test loss : {test_loss: .3f}")
            print(f"\t\tTrain acc : {train_acc: .3f} || Test acc : {test_acc: .3f}")

Epoch [1/25]
	Iteration [1439/1439]
		Train loss :  0.647 || Test loss :  0.641
		Train acc :  0.572 || Test acc :  0.572
Epoch [2/25]
	Iteration [1439/1439]
		Train loss :  0.637 || Test loss :  0.636
		Train acc :  0.574 || Test acc :  0.578
Epoch [3/25]
	Iteration [1439/1439]
		Train loss :  0.633 || Test loss :  0.633
		Train acc :  0.582 || Test acc :  0.589
Epoch [4/25]
	Iteration [1439/1439]
		Train loss :  0.631 || Test loss :  0.632
		Train acc :  0.591 || Test acc :  0.596
Epoch [5/25]
	Iteration [1439/1439]
		Train loss :  0.629 || Test loss :  0.631
		Train acc :  0.598 || Test acc :  0.598
Epoch [6/25]
	Iteration [1439/1439]
		Train loss :  0.629 || Test loss :  0.630
		Train acc :  0.603 || Test acc :  0.600
Epoch [7/25]
	Iteration [1439/1439]
		Train loss :  0.628 || Test loss :  0.630
		Train acc :  0.607 || Test acc :  0.603
Epoch [8/25]
	Iteration [1439/1439]
		Train loss :  0.628 || Test loss :  0.630
		Train acc :  0.609 || Test acc :  0.605
Epoch [9/25]
	Iteration 