In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import torch
from torchtext.data import Field, TabularDataset, Iterator, BucketIterator
from sklearn.model_selection import train_test_split
import random




In [2]:
def split_data(csv_file):
    data = pd.read_csv(csv_file)[['genre','lyrics']].dropna()
    x = data.drop('genre',axis=1)
    y = data.genre
    X_train, X_test, y_train, y_test = train_test_split(x, y,test_size=0.5)
    #X_train.to_csv('data/X_train.csv', header=True)
    #y_train.to_csv('data/y_train.csv', header=True)
    #X_test.to_csv('data/X_test.csv', header=True)
    #y_test.to_csv('data/y_test.csv', header=True)
    train = X_train.join(y_train,how='outer')
    test = X_test.join(y_test,how ='outer')
    train.to_csv('data/train.csv', header=True)
    test.to_csv('data/test.csv', header = True)
    return train, test

In [3]:
train, test = split_data('data/lyrics.csv')


In [None]:
x_train = train["lyrics"].values

In [None]:
y_train = train['genre'].values


In [4]:
tokenize = lambda x: x.split()
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
LABEL = Field(sequential=False, use_vocab=False)

In [5]:
lyric_datafields = [("genre", LABEL),("lyrics",TEXT)]

In [6]:
trn, vld = TabularDataset.splits(
        path="data", # the root directory where the data lies
        train='train.csv', validation="test.csv",
        format='csv',
        skip_header=True, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data!
        fields=lyric_datafields)

In [7]:
TEXT.build_vocab(trn)


In [None]:
trn[0].__dict__.keys()

In [8]:
train_iter, val_iter = BucketIterator.splits(
        (trn, vld), # we pass in the datasets we want the iterator to draw data from
        batch_sizes=(64, 64),
        device=-1, # if you want to use the GPU, specify the GPU number here
        sort_key=lambda x: len(x.genre), # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=False,
        repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [9]:
batch = next(train_iter.__iter__()); batch


[torchtext.data.batch.Batch of size 64]
	[.genre]:[torch.LongTensor of size 64]
	[.lyrics]:[torch.LongTensor of size 671x64]

In [10]:
batch.__dict__.keys()


dict_keys(['batch_size', 'dataset', 'fields', 'input_fields', 'target_fields', 'genre', 'lyrics'])

In [13]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x and y
    
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            
            if self.y_vars is not None: # we will concatenate y into a single tensor
                y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
            else:
                y = torch.zeros((1))

            yield (x, y)
    
    def __len__(self):
        return len(self.dl)

In [14]:

train_dl = BatchWrapper(train_iter, "lyrics", ["genre"])

In [15]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [16]:
class SimpleBiLSTMBaseline(nn.Module):
    def __init__(self, hidden_dim, emb_dim=300,
                 spatial_dropout=0.05, recurrent_dropout=0.1, num_linear=1):
        super().__init__() # don't forget to call this!
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1, dropout=recurrent_dropout)
        self.linear_layers = []
        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
        self.linear_layers = nn.ModuleList(self.linear_layers)
        self.predictor = nn.Linear(hidden_dim, 1)
    
    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]
        for layer in self.linear_layers:
            feature = layer(feature)
        preds = self.predictor(feature)
        return preds

In [17]:
em_sz = 100
nh = 500
nl = 3
model = SimpleBiLSTMBaseline(nh, emb_dim=em_sz); model

  "num_layers={}".format(dropout, num_layers))


SimpleBiLSTMBaseline(
  (embedding): Embedding(552964, 100)
  (encoder): LSTM(100, 500, dropout=0.1)
  (linear_layers): ModuleList()
  (predictor): Linear(in_features=500, out_features=1, bias=True)
)

In [18]:
import tqdm

In [19]:
opt = optim.Adam(model.parameters(), lr=1e-2)
loss_func = nn.BCEWithLogitsLoss()

In [20]:
epochs = 2


In [None]:
%%time
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train() # turn on training mode
    for x, y in tqdm.tqdm(train_dl): # thanks to our wrapper, we can intuitively iterate over our data!
        print("here")
        opt.zero_grad()
        print("here2")
        preds = model(x)
        print("here3")
        loss = loss_func(preds, y)
        print("here4")
        loss.backward()
        print("here5")
        opt.step()
        print("here6")
        
        #running_loss += loss.data[0] * x.size(0)
        print("next epoc")
    epoch_loss = running_loss / len(trn)
    
    # calculate the validation loss for this epoch
    val_loss = 0.0
    model.eval() # turn on evaluation mode
    for x, y in valid_dl:
        preds = model(x)
        loss = loss_func(preds, y)
        val_loss += loss.data[0] * x.size(0)

    val_loss /= len(vld)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))


  0%|          | 0/2083 [00:00<?, ?it/s][A

here
here2
here3
here4
here5



  0%|          | 1/2083 [01:37<56:15:18, 97.27s/it][A

here6
next epoc
here
here2
here3
here4
here5



  0%|          | 2/2083 [02:16<46:08:58, 79.84s/it][A

here6
next epoc
here
here2
here3
here4
here5



  0%|          | 3/2083 [03:01<40:04:21, 69.36s/it][A

here6
next epoc
here
here2
here3
here4
here5



  0%|          | 4/2083 [03:45<35:43:12, 61.85s/it][A

here6
next epoc
here
here2
here3
here4
here5



  0%|          | 5/2083 [05:09<39:34:10, 68.55s/it][A

here6
next epoc
here
here2
here3
here4
here5



  0%|          | 6/2083 [06:40<43:23:56, 75.22s/it][A

here6
next epoc
here
here2
here3
here4
here5



  0%|          | 7/2083 [07:26<38:14:58, 66.33s/it][A

here6
next epoc
here
here2
here3
here4
here5



  0%|          | 8/2083 [09:05<43:57:39, 76.27s/it][A

here6
next epoc
here
here2
here3
here4
here5



  0%|          | 9/2083 [11:14<53:04:04, 92.11s/it][A

here6
next epoc
here
here2
here3
here4


In [None]:
set(train["genre"])