In [1]:
import pandas as pd
from torchtext.data import TabularDataset
from torchtext.data import Iterator
from torchtext.data import BucketIterator
from torch import nn
from ProjDataset import ProjDataset
from tqdm import tqdm
from torch.utils.data import DataLoader
import torch, pickle, random, os
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam
import torch.optim as optim
from torch import nn
from GRUNetwork import RNN_GRU
import torch, spacy

In [2]:
from torchtext.data import Field, BucketIterator

In [52]:
# Read data/dataframes/pop.csv into a DataFrame for ad-hoc inspection.
# The same file is used as the training split in the TabularDataset cell below.
d = pd.read_csv('data/dataframes/pop.csv')

In [57]:
# Read data/dataframes/training.csv into a DataFrame for ad-hoc inspection.
# NOTE(review): `f` is not referenced by any other cell visible in this notebook.
f = pd.read_csv('data/dataframes/training.csv')

In [94]:
# Field describing the lyric text column: spaCy tokenisation, lower-casing,
# vocabulary lookup, and include_lengths=True so every batch carries a
# (token_ids, lengths) pair instead of a bare tensor.
TEXT = Field(
    sequential=True,
    use_vocab=True,
    tokenize='spacy',
    tokenizer_language='en',
    lower=True,
    include_lengths=True,
)

# Field describing the genre label: one value per example with no vocabulary
# lookup, so the CSV column is expected to be numeric already.
LABEL = Field(sequential=False, use_vocab=False)

In [237]:
# (column name, Field) pairs handed to TabularDataset; the names become the
# attributes ("lyrics", "genre") read from every example/batch later on.
lyric_datafield = [("lyrics", TEXT), ("genre", LABEL)]

In [239]:



# Build the train/test datasets from the CSV files under data/dataframes/.
# Columns are mapped to the (name, Field) pairs in `lyric_datafield` and the
# header row is skipped.
#
# The training split is bound to `trn` because every later cell
# (`next(iter(trn))`, `TEXT.build_vocab(trn, ...)`, `BucketIterator.splits`)
# refers to `trn`. The previous binding to `train` only worked through stale
# kernel state and raised NameError on Restart & Run All.
# NOTE(review): the training file is pop.csv although training.csv is also
# loaded above -- confirm this is the intended training set.
trn, tst = TabularDataset.splits('data/dataframes/',
                                 train='pop.csv',
                                 test='testing.csv',
                                 format='csv',
                                 fields=lyric_datafield,
                                 skip_header=True)
# Keep the old name available for any cell that still refers to it.
train = trn

In [202]:
# Grab the first item produced by iterating `trn`, for manual inspection.
# NOTE(review): `trn` is the dataset, not a batch iterator, so this is
# presumably a single example rather than a batch -- confirm.
batch = next(iter(trn))

In [235]:
# Read data/dataframes/testing.csv for ad-hoc inspection.
# This rebinds `d`, replacing the pop.csv DataFrame loaded earlier.
d = pd.read_csv('data/dataframes/testing.csv')

In [204]:
# Build TEXT's vocabulary from the training dataset (at most 100000 entries)
# and attach the "glove.6B.50d" vectors. These vectors are later copied into
# the model's embedding via TEXT.vocab.vectors, so their width has to agree
# with `embedding_dim` (50) defined below.
TEXT.build_vocab(trn, max_size=100000, vectors="glove.6B.50d")

In [324]:
# Bucketed iterators over the two datasets: training batches hold 50 examples,
# test batches hold a single example. Examples are ordered by lyric length
# (also inside each batch) and the iterators stop after one pass (repeat=False).
traindl, testdl = BucketIterator.splits(
    datasets=(trn, tst),
    batch_sizes=(50, 1),
    sort_key=lambda example: len(example.lyrics),
    sort_within_batch=True,
    repeat=False,
    device=None,
)
            

In [206]:
# Hyper-parameters passed to ConcatPoolingGRUAdaptive below.
# Number of entries in the vocabulary built by TEXT.build_vocab.
vocab_size = len(TEXT.vocab)
# Embedding width; the "glove.6B.50d" vectors are copied into an embedding of this size.
embedding_dim = 50
# GRU hidden state size.
n_hidden =64
# Size of the model's output layer (number of classes scored per example).
n_out = 5

In [207]:
class BatchGenerator:
    """Adapter that turns an iterable of batch objects into ``(X, y)`` pairs.

    Args:
        dl: any sized iterable of batch objects (here: a torchtext iterator).
        x_field: attribute name holding the model input on each batch.
        y_field: attribute name holding the target on each batch.
    """

    def __init__(self, dl, x_field, y_field):
        self.dl = dl
        self.x_field = x_field
        self.y_field = y_field

    def __len__(self):
        """Number of batches in the wrapped iterable."""
        return len(self.dl)

    def __iter__(self):
        """Yield ``(input, target)`` for every batch of the wrapped iterable."""
        for minibatch in self.dl:
            inputs = getattr(minibatch, self.x_field)
            targets = getattr(minibatch, self.y_field)
            yield inputs, targets


In [326]:
# Wrap both torchtext iterators so each step yields (batch.lyrics, batch.genre).
# Because TEXT uses include_lengths=True, the first element is itself a
# (token_ids, lengths) pair -- the training/evaluation loops unpack it that way.
train_batch_it = BatchGenerator(traindl, 'lyrics', 'genre')
tst_batch_it = BatchGenerator(testdl, 'lyrics', 'genre')

In [210]:
# Target device for the model and tensors: the CUDA device with index 1.
# The index must be written without a leading zero; 'cuda:01' is rejected as
# an invalid device string by current PyTorch releases.
device = 'cuda:1'

In [201]:
from torch import nn
import torch, spacy



class RNN_GRU(nn.Module):
    """Embedding -> (stacked) GRU -> linear layer over all time steps.

    The linear layer consumes the GRU outputs of every time step flattened
    together, so each input sequence must contain exactly ``seq_len`` tokens.

    NOTE(review): a class with the same name is also imported from
    ``GRUNetwork`` at the top of the notebook; this definition shadows it.

    Args:
        vocab_size: number of rows of the embedding table.
        seq_len: number of tokens per input sequence.
        input_size: embedding width, which is also the GRU input size.
        hidden_size: size of the GRU hidden state.
        num_layers: number of stacked GRU layers.
        output_size: number of units of the final linear layer.
        device: device the sub-modules and the initial hidden state live on.
        dropout: dropout value forwarded to ``nn.GRU``.
        pretrained_vectors: tensor copied into the embedding weights when
            ``pretrained`` is true; must match ``(vocab_size, input_size)``.
        pretrained: when true, load ``pretrained_vectors`` and freeze the
            embedding.

    Raises:
        ValueError: if ``pretrained`` is true but no vectors were supplied.
    """

    def __init__(self, vocab_size, seq_len, input_size,
                 hidden_size, num_layers, output_size,
                 device, dropout=0.0, pretrained_vectors=None,
                 pretrained=False):
        # The defaults used to be the *types* ``torch.Tensor`` and ``bool``
        # (annotations written as default values); they are now real values.
        super().__init__()
        self.num_layers = num_layers
        self.seq_len = seq_len
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.device = device
        self.pretrained = pretrained
        self.pretrained_vectors = pretrained_vectors
        self.emb = nn.Embedding(vocab_size, input_size).to(device)
        if pretrained:
            if pretrained_vectors is None:
                raise ValueError(
                    "pretrained=True requires pretrained_vectors to be provided")
            self.emb.weight.data.copy_(pretrained_vectors)
            # Keep the loaded vectors fixed during training.
            self.emb.weight.requires_grad = False

        self.gru = nn.GRU(input_size, hidden_size,
                          num_layers=self.num_layers, batch_first=True,
                          dropout=dropout).to(device)
        self.fc = nn.Linear(hidden_size * seq_len, output_size).to(device)

    def forward(self, sequence, hidden_layer, device):
        """Run one batch through the network.

        Args:
            sequence: token-id tensor indexed as (batch, time step), matching
                the ``batch_first=True`` GRU.
            hidden_layer: initial hidden state, e.g. from ``init_hidden``.
            device: device the embedded input and the result are moved to.

        Returns:
            ``(output, hidden_layer)``: the linear layer's scores per sample
            and the GRU's final hidden state.
        """
        output = self.emb(sequence).to(device)
        hidden_layer = hidden_layer.to(self.device)
        output, hidden_layer = self.gru(output, hidden_layer)
        # Flatten (batch, time, hidden) into (batch, time * hidden) so the
        # linear layer sees every time step of a sample at once.
        output = output.contiguous().view(output.size(0), -1)
        output = self.fc(output).to(device)

        return output, hidden_layer

    def init_hidden(self, batch_size):
        """Return an all-zero hidden state of shape (num_layers, batch_size, hidden_size)."""
        return torch.zeros(self.num_layers, batch_size, self.hidden_size,
                           dtype=torch.float, device=self.device)



In [211]:
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence
import torch.nn.functional as F

class ConcatPoolingGRUAdaptive(nn.Module):
    """GRU text classifier with concatenated average/max pooling.

    Pipeline: frozen pretrained embedding -> single-layer GRU over the packed
    sequences -> average pool and max pool over time, concatenated -> linear
    layer -> log-softmax.

    The module runs on whatever device its parameters live on (taken from the
    embedding weights), so it works after ``.to(...)`` on any device and does
    not depend on a notebook-level ``device`` variable.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: embedding width.
        n_hidden: size of the GRU hidden state.
        n_out: number of output classes.
        pretrained_vec: tensor copied into the embedding weights; must match
            ``(vocab_size, embedding_dim)``.
    """

    def __init__(self, vocab_size, embedding_dim, n_hidden, n_out, pretrained_vec):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.n_hidden = n_hidden
        self.n_out = n_out

        self.emb = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.emb.weight.data.copy_(pretrained_vec)  # load pretrained vectors
        self.emb.weight.requires_grad = False  # make embedding non trainable
        self.gru = nn.GRU(self.embedding_dim, self.n_hidden)
        # Twice n_hidden because the average and max pools are concatenated.
        self.out = nn.Linear(self.n_hidden * 2, self.n_out)

    def forward(self, seq, lengths):
        """Score one batch.

        Args:
            seq: token-id tensor indexed as (time step, batch); the batch size
                is read from dimension 1.
            lengths: per-sample sequence lengths (tensor or list); they do not
                need to be sorted.

        Returns:
            Tensor of shape (batch, n_out) holding log-probabilities.
        """
        bs = seq.size(1)
        self.h = self.init_hidden(bs)
        seq = seq.to(self.emb.weight.device)
        # pack_padded_sequence expects the lengths on the CPU.
        if torch.is_tensor(lengths):
            lengths = lengths.cpu()

        # Embedding lookup keeps the (time, batch) layout and appends the
        # embedding dimension, which is what the GRU expects here.
        embs = self.emb(seq)
        embs = pack_padded_sequence(embs, lengths, enforce_sorted=False)

        gru_out, self.h = self.gru(embs, self.h)

        gru_out, _ = pad_packed_sequence(gru_out)

        # (time, batch, hidden) -> (batch, hidden, time) so pooling runs over time.
        # NOTE(review): padded time steps are included in both pools, so
        # shorter samples in a batch get a diluted average -- confirm this is
        # acceptable before relying on the scores.
        pooled_input = gru_out.permute(1, 2, 0)
        avg_pool = F.adaptive_avg_pool1d(pooled_input, 1).view(bs, -1)
        max_pool = F.adaptive_max_pool1d(pooled_input, 1).view(bs, -1)
        outp = self.out(torch.cat([avg_pool, max_pool], dim=1))
        return F.log_softmax(outp, dim=-1)

    def init_hidden(self, batch_size):
        """Return an all-zero hidden state of shape (1, batch_size, n_hidden) on the model's device."""
        return torch.zeros((1, batch_size, self.n_hidden),
                           device=self.emb.weight.device)
        

In [212]:
# Instantiate the pooled-GRU classifier with the vectors attached to TEXT's
# vocabulary as its embedding weights, then move it to the chosen device.
pre_model = ConcatPoolingGRUAdaptive(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    n_hidden=n_hidden,
    n_out=n_out,
    pretrained_vec=TEXT.vocab.vectors,
)
pre_model = pre_model.to(device)

In [213]:
from tqdm import tqdm_notebook

In [332]:
# Train `pre_model` for `nr_of_epochs` passes over `train_batch_it` and print
# the mean batch loss after every epoch.
nr_of_epochs = 20
pre_model = pre_model.to(device)
pre_model.train()
# pre_model already returns log-probabilities (log_softmax); CrossEntropyLoss
# applies log_softmax once more, which leaves log-probabilities unchanged.
criterion = nn.CrossEntropyLoss()
optimizer = Adam(pre_model.parameters(), lr=0.001)
avg_loss = 0

for epoch_nr in tqdm_notebook(range(1, nr_of_epochs + 1)):
    epoch_loss = []
    skipped_batches = 0

    # Each item is ((token_ids, lengths), labels) because TEXT was built with
    # include_lengths=True.
    for (x, lengths), y in tqdm_notebook(train_batch_it):
        y = y.to(device)
        optimizer.zero_grad()

        try:
            output = pre_model(x, lengths)
            loss = criterion(output, y.long())
            loss.backward()
            optimizer.step()
        except RuntimeError as err:
            # Best effort: a failing batch is skipped, but no longer silently.
            skipped_batches += 1
            if skipped_batches == 1:
                print("Skipping batch after RuntimeError: %s" % err)
            continue

        epoch_loss.append(loss.item())

    # Average once per epoch; NaN makes an epoch without any usable batch
    # visible instead of repeating the previous epoch's value.
    avg_loss = sum(epoch_loss) / len(epoch_loss) if epoch_loss else float('nan')
    print("Average loss at epoch %d: %.7f" % (epoch_nr, avg_loss))
    if skipped_batches:
        print("Skipped %d batch(es) at epoch %d" % (skipped_batches, epoch_nr))
    
        

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Average loss at epoch 1: 0.8290459


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Average loss at epoch 2: 0.8178085


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Average loss at epoch 3: 0.8070127


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Average loss at epoch 4: 0.7949754


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Average loss at epoch 5: 0.7844466


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Average loss at epoch 6: 0.7722690


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Average loss at epoch 7: 0.7607746


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Average loss at epoch 8: 0.7527369


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Average loss at epoch 9: 0.7385443


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Average loss at epoch 10: 0.7304832


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Average loss at epoch 11: 0.7187075


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Average loss at epoch 12: 0.7073429


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Average loss at epoch 13: 0.6982067


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Average loss at epoch 14: 0.6857069


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Average loss at epoch 15: 0.6782147


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Average loss at epoch 16: 0.6659472


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Average loss at epoch 17: 0.6535603


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Average loss at epoch 18: 0.6473791


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Average loss at epoch 19: 0.6337539


HBox(children=(IntProgress(value=0, max=500), HTML(value='')))


Average loss at epoch 20: 0.6274186



In [336]:
# Evaluate `pre_model` on `tst_batch_it` and report the accuracy in percent.
pre_model.eval()
correct = 0
count = 0
skipped_batches = 0

# No gradients are needed while scoring.
with torch.no_grad():
    # Each item is ((token_ids, lengths), labels).
    for (x, lengths), y in tqdm_notebook(tst_batch_it):
        try:
            predictions = pre_model(x, lengths)
        except RuntimeError as err:
            # Best effort: a failing batch is skipped, but reported.
            skipped_batches += 1
            if skipped_batches == 1:
                print('Skipping test batch after RuntimeError: {}'.format(err))
            continue

        # Compare every prediction with its own label. The previous code
        # compared all of them with y[0], which is only right for batch size 1.
        predicted = predictions.argmax(dim=1).cpu()
        targets = y.long().view(-1).cpu()
        correct += (predicted == targets).sum().item()
        count += targets.numel()

if skipped_batches:
    print('Skipped {} test batch(es)'.format(skipped_batches))

# Guard against an empty or fully skipped test set.
accuracy = (correct / count) * 100 if count else float('nan')

print('Model Accuracy: {}'.format(accuracy))

HBox(children=(IntProgress(value=0, max=250), HTML(value='')))


Model Accuracy: 60.8


In [334]:
# Show the current value of `count`, the number of test examples scored by the evaluation loop.
count

250

In [None]:
# Second experiment: a 2-layer RNN_GRU with randomly initialised embeddings.
# With pretrained=False the vectors from TEXT's vocabulary are passed but not
# loaded into the embedding.
# The device index is written without a leading zero; 'cuda:01' is rejected
# as an invalid device string by current PyTorch releases.
model = RNN_GRU(vocab_size=len(TEXT.vocab), seq_len=300, input_size=300,
                hidden_size=128, num_layers=2, output_size=5,
                device='cuda:1', dropout=0.01,
                pretrained_vectors=TEXT.vocab.vectors, pretrained=False)

In [None]:
# Settings for the RNN_GRU experiment.
# The device index is written without a leading zero; 'cuda:01' is rejected
# as an invalid device string by current PyTorch releases.
device = 'cuda:1'
batch_size = 200

In [None]:
from tqdm import tqdm_notebook

In [None]:


# Train `model` (the RNN_GRU instance) for `nr_of_epochs` passes over
# `train_iter` and print the mean batch loss after every epoch.
# NOTE(review): `train_iter` is not defined in any visible cell of this
# notebook; it has to exist in the kernel before this cell can run.
nr_of_epochs = 10
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)
model = model.to(device)
model.train()
avg_loss = 0
count = 0

for epoch_nr in tqdm_notebook(range(1, nr_of_epochs + 1)):
    epoch_loss = []
    h = None
    count = 0
    for example in tqdm_notebook(train_iter):
        count += 1
        x = example.lyrics.to(device)
        y = example.genre.to(device)

        # Create the hidden state lazily and rebuild it whenever the batch
        # size changes (e.g. a smaller final batch). It used to be fixed at
        # 200 samples, which crashes on any other batch size.
        # Assumes x is indexed (batch, time), as RNN_GRU's batch_first GRU expects.
        if h is None or h.size(1) != x.size(0):
            h = model.init_hidden(x.size(0))

        optimizer.zero_grad()
        # Cut the graph so backward() does not reach into earlier batches.
        h = h.detach()
        out, h = model(x, h, device)
        loss = criterion(out, y.long())
        loss.backward()
        optimizer.step()
        epoch_loss.append(loss.item())

    # Average once per epoch instead of after every step.
    avg_loss = sum(epoch_loss) / len(epoch_loss) if epoch_loss else float('nan')
    print("Average loss at epoch %d: %.7f" % (epoch_nr, avg_loss))
    

In [None]:
# Show `count`; after the training cell above it holds the number of batches seen in the last epoch.
count

In [None]:
# Show `accuracy`, the percentage computed by the evaluation cell for `pre_model`.
accuracy