# Installations, Imports, Drive mount and Device selection

In [1]:
!pip install tensorboard-plugin-customizable-plots
!pip install pretty_midi
!pip install tensorboard

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorboard-plugin-customizable-plots
  Downloading tensorboard_plugin_customizable_plots-0.1.9-py3-none-any.whl (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorboard-plugin-customizable-plots
Successfully installed tensorboard-plugin-customizable-plots-0.1.9
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pretty_midi
  Downloading pretty_midi-0.2.9.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mido>=1.1.16
  Downloading mido-1.2.10-py2.py3-none-any.whl (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.1/51.1 KB[0m

In [2]:
import os
import pandas as pd
import numpy as np
import pretty_midi
import re
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms as T
import gensim
import gensim.downloader
import random
import heapq
from operator import itemgetter
from torch.nn.modules import dropout
import matplotlib.pyplot as plt
import time
from torch.utils.tensorboard import SummaryWriter

In [8]:
# Mounting Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


# Data Reading

In [9]:
# Paths

path_to_assingment_folder = '/content/drive/MyDrive/DeepLearningAssingment3'
midi_folder_name = 'midi_files'
test_set_csv_name = 'lyrics_test_set.csv'
train_set_csv_name = 'lyrics_train_set.csv'
path_to_train = os.path.join(path_to_assingment_folder, train_set_csv_name)
path_to_test = os.path.join(path_to_assingment_folder, test_set_csv_name)
path_to_midi_dir = os.path.join(path_to_assingment_folder, midi_folder_name)

In [10]:
# Fixing column names and reading train and test data

col_names = ['artist', 'song', 'lyrics']
train_set_csv = pd.read_csv(path_to_train, names = col_names+["a","b","c","d"])[col_names]
test_set_csv = pd.read_csv(path_to_test, names = col_names)

In [11]:
# Helper functions for data reading

def clean_midi_name(file_name):
  '''
  This function get a file name and clean it
  '''
  cleaned_name = file_name.replace('_-_',' ')
  cleaned_name = cleaned_name.replace('_',' ')
  cleaned_name = cleaned_name.replace('.mid','')
  return cleaned_name.lower()

def generate_lyrics_midi_lists(df, path_to_midi_dir):
  '''
  DESCRIPTION: this function generates pairs of lyrics and midi files.
  PARAMETERS:
  :param df(pandas DataFrame): the data we're working with.
  :param path_to_midi_dir(str): the path for the location of the midi files.
  :return: list of tuples, each tuple (lyrics, midifile)
  '''
  midi_file_names = [f for f in os.listdir(path_to_midi_dir) if os.path.isfile(os.path.join(path_to_midi_dir, f))]
  midi_file_names_clean = [clean_midi_name(file_name) for file_name in midi_file_names]

  mapping = {}
  df['file_name'] = df['artist'] + ' ' + df['song']
  lyrics_l = []
  midis = []
  for file_num in range(len(midi_file_names)):
    mapping[midi_file_names_clean[file_num]] = midi_file_names[file_num]
  for ind, row in df.iterrows():
    file_name = " ".join(row['file_name'].split())
    lyrics = row['lyrics']
    try: 
      midi_file_name = mapping[file_name]
    except:
      continue
    midi_file_path = os.path.join(path_to_midi_dir, midi_file_name)
    try:
      midi_file = pretty_midi.PrettyMIDI(midi_file_path)
      lyrics_l.append(lyrics)
      midis.append(midi_file)
    except: 
      continue
  return lyrics_l, midis

In [12]:
# Generating lyrics and matching midi lists from train and test dfs
try:
  midis_train = pd.read_pickle(os.path.join(path_to_assingment_folder ,"midis_train"))
  lyrics_train = pd.read_pickle(os.path.join(path_to_assingment_folder ,"lyrics_train"))
except:
  lyrics_train, midis_train = generate_lyrics_midi_lists(train_set_csv, path_to_midi_dir)
  pd.to_pickle(midis_train ,filepath_or_buffer=os.path.join(path_to_assingment_folder , "midis_train"))
  pd.to_pickle(lyrics_train ,filepath_or_buffer=os.path.join(path_to_assingment_folder , "lyrics_train"))
lyrics_test, midis_test = generate_lyrics_midi_lists(test_set_csv, path_to_midi_dir)

# Preprocess 

In [13]:
# Helper functions for preprocessing

def song_cleaner(song):
  '''
  DESCRIPTION: this function get a song lyrics and preproccess it to be in the right format.
  PARAMETERS:
  :param song(string): the lyrics of the song.
  :return: song(string): clean song.
  '''
  song = song.lower()
  song = song.replace("#", "").replace("_", " ").replace(",", "").replace("'", "").replace(".", "").replace("?", "").replace(":", "").replace("!", "").replace(";", "").replace("-", " ").replace("-", " ").replace("(", "").replace(")", "").replace("`", "").replace("[", "").replace("]", "").replace("{","").replace("}","")
  return song


def get_word2vec():
  '''
  DESCRIPTION: this function importing pre trained word2vec
  :return: word2vec.
  '''

  try:
    word2vec = pd.read_pickle(os.path.join(path_to_assingment_folder ,"word2vec-google-news-300"))
  except:
    word2vec = gensim.downloader.load('word2vec-google-news-300')
    pd.to_pickle(word2vec ,filepath_or_buffer=os.path.join(path_to_assingment_folder , "word2vec-google-news-300"))
  return word2vec
  
def get_clean_lyrics_list_of_list(lyrics):
  '''
  DESCRIPTION: this function tokenize the lyrics of each song
  PARAMETERS:
  :param lyrics([[str],[str],[str]]) : each inner list is a song lyrics string
  :return: song(string): tokenized list of lists
  '''

  clean_lyrics_list = []
  for song in lyrics:
      clean_lyrics_list.append(song_cleaner(song))
  clean_lyrics_list_of_list = [song.split() for song in clean_lyrics_list]
  return clean_lyrics_list_of_list


def get_embedding_dict_and_embedded_lyrics(clean_lyrics_list_of_lists):
  '''
  DESCRIPTION: this function is the fundemantal of our embedding layer it take the list of lists
                and creating an embedding dict embedded songs and words that was not found in word2vec
                but do exist in the lyrics dataset.
  PARAMETERS:
  :param clean_lyrics_list_of_lists: [[lyrics],[lyrics],[lyrics],[lyrics]]
  :return: embedded_dict (key=word,value=embedded vector), embedded_songs [[embedded vector for each word a song],[embedded vector for each word a song]]
            words_not_known_to_word2vec which is self explanetory.
  '''

  try:
    embedding_dict = pd.read_pickle(os.path.join(path_to_assingment_folder ,"embedding_dict"))
    embedded_songs = pd.read_pickle(os.path.join(path_to_assingment_folder ,"embedded_lyrics"))
    words_not_known_to_word2vec = pd.read_pickle(os.path.join(path_to_assingment_folder ,"words_not_known_to_word2vec"))
  except:
    word2vec = get_word2vec()
    embedding_dict = {}
    embedded_songs = []
    words_not_known_to_word2vec=[]

    for song in clean_lyrics_list_of_lists:
      embedded_song = []
      for word in song:
        if word in embedding_dict.keys():
          embedded_song.append(embedding_dict[word])
          continue
        else:
          try:
            embedded_word = word2vec.get_vector(word)
            embedding_dict[word] = embedded_word
            embedded_song.append(embedded_word)
          except:
            words_not_known_to_word2vec.append(word)
            continue
      embedded_songs.append(embedded_song)

    pd.to_pickle(embedding_dict ,filepath_or_buffer=os.path.join(path_to_assingment_folder , "embedding_dict"))
    pd.to_pickle(embedded_songs ,filepath_or_buffer=os.path.join(path_to_assingment_folder , "embedded_lyrics"))
    pd.to_pickle(words_not_known_to_word2vec ,filepath_or_buffer=os.path.join(path_to_assingment_folder , "words_not_known_to_word2vec"))
    
  return embedding_dict, embedded_songs, words_not_known_to_word2vec

def extract_features_from_midi(midi_files):
  '''
  DESCRIPTION: this function get list of midi files (one for eats song) and generates a list of features for each song.
  PARAMETERS:
  :param midi_files: list if midi files [midi_file,midi_file,...]
  :return: list of midi features [[features for midi file #1],[features for midi file #2],...]
  '''
  midi_features = []
  for midi_file in midi_files:
    midi_features.append(extract_vector(midi_file))
  return midi_features

def midi_vector_naive(midi):
  tempo = np.array([midi.estimate_tempo()])
  chroma = np.array(midi.get_chroma().mean(-1))
  piano_roll = np.array(midi.get_piano_roll().mean(-1))
  num_of_instrument = np.array([len(midi.instruments)])
  vector_list = [tempo,chroma,piano_roll,num_of_instrument]
  midi_vector = np.concatenate(vector_list)
  return np.float32(midi_vector)


def extract_vector(midi_data):
    # Load the MIDI file

    
    # Initialize an empty list to store the vector
    eps = 1e-10
    vector = []
    
    # Append the length of the time signature list
    vector.append(len(midi_data.time_signature_changes))
    
    # Initialize an empty list to store the mean of the active rows and the average of the indices
    programs_stats = [0]*100
    # Iterate over each instrument in the MIDI file
    for instrument in midi_data.instruments:
        if instrument.program == 0 or instrument.program > 99:
          continue
        
        # Extract the piano roll for the instrument
        piano_roll = np.array(instrument.get_piano_roll())
        # Get the indices of the 7 rows with the most values
        most_active_rows = np.argsort(np.sum(piano_roll, axis=1))[-7:]

        # Get the mean of the 7 most active rows
        mean_active_rows = np.array(piano_roll[most_active_rows].mean(-1))

        # Get the mean of the indices of the 7 most active rows
        mean_active_rows_idx = np.array(most_active_rows).mean(-1)
        programs_stats[instrument.program] = weight(mean_active_rows.mean(),mean_active_rows_idx)

    programs_stats = [(value/(sum(programs_stats)+eps)) for value in programs_stats]
    programs_stats[0] = len(midi_data.time_signature_changes)
    # append the numpy to the vector
    vector = np.array(programs_stats,dtype=np.float32)

    return vector


def weight(x, y ,alpha=0.8):
  return (1-alpha)*x + alpha*y



EMBEDDING_SIZE = 300
PAD_VALUE = 0

# PADDING - not used at the moment
def add_padding(song, required_size):
  '''
  DESCRIPTION: this function adds padding in case we give a whole song as input to the network.
  PARAMETERS:
  :param song: song to add padding to
  :param required_size: target size
  :return: original song with padding equal to PAD_VALUE at the required size
  '''
  song_size = len(song)
  pad_size = required_size - song_size
  pad = [[PAD_VALUE for i in range(EMBEDDING_SIZE)] for i in range(pad_size)]
  return song + pad

# PADDING - not used at the moment
def remove_padding(x):
  '''
  DESCRIPTION: this function remove padding for a single batch
  PARAMETERS:
  :param x: batch
  :return: x without unnesesary padding
  '''
  pad = torch.tensor([0 for i in range(300)]).cuda()
  max_len = 0
  for seq in x:
    for i, word in enumerate(seq):
      if torch.all(pad.eq(word)):
        if i + 1 > max_len:
            max_len = i + 1
        break
  if max_len == 0:
    max_len = x.shape[1]
  return x[:,:max_len,:]

def get_test_tokens(clean_lyrics_list_of_lists):
  '''
  DESCRIPTION: this function return the test tokens
  PARAMETERS:
  :param clean_lyrics_list_of_lists: list of the song lyrics
  :return: list containing the first word in each song aka test token
  '''
  test_tokens = []
  for song in clean_lyrics_list_of_lists:
    test_tokens.append(song[0])
  return test_tokens


In [14]:
# Clean the lyrics
clean_lyrics_list_of_lists_train = get_clean_lyrics_list_of_list(lyrics_train)
clean_lyrics_list_of_lists_test = get_clean_lyrics_list_of_list(lyrics_test)

# Generate embedding
embedding_dict, embedded_lyrics_train, words_not_known_to_word2vec = get_embedding_dict_and_embedded_lyrics(clean_lyrics_list_of_lists_train)

# Extract first word in test lyrcis as test token
test_tokens = get_test_tokens(clean_lyrics_list_of_lists_test)

# Extract midi features
try:
  midi_features_train = pd.read_pickle(os.path.join(path_to_assingment_folder ,"midi_features_train_2"))
  midi_features_test = pd.read_pickle(os.path.join(path_to_assingment_folder ,"midi_features_test_2"))
except:
  midi_features_train = extract_features_from_midi(midis_train)
  midi_features_test =  extract_features_from_midi(midis_test)
  pd.to_pickle(midi_features_train ,filepath_or_buffer=os.path.join(path_to_assingment_folder , "midi_features_train_2"))
  pd.to_pickle(midi_features_test ,filepath_or_buffer=os.path.join(path_to_assingment_folder , "midi_features_test_2"))


# Defining the Dataset and Model

In [15]:
# Dataset helper function

def get_sequences_sliding_window(embedded_lyrics, window_size, stride):
  '''
  DESCRIPTION: this function applies sliding window on song lyrics
  PARAMETERS:
  :param embedded_lyrics: song lyrics to generate sequences from
  :param window_size: which window size to use when generating sequences of words
  :return: list of sequences of according the given window size
  '''
  sequences = []
  for song in embedded_lyrics:
    sequences += [song[i:i+window_size] for i in range(0, len(song)-window_size+1, stride)]

  return sequences

def combine_word_embedding_with_midi_features(embedded_lyrics, midi_features):
  '''
  DESCRIPTION: this function combine the word embeddings with the midi features
  PARAMETERS:
  :param embedded_lyrics: embedded song lyrics
  :param midi_features: list of midi_features where each object contains the features for the whole song
  :return: embedded song lyrics concated with the midi_features
  '''
  embedded_lyrics_midi = []
  for i, song in enumerate(embedded_lyrics):
    song_with_midi = []
    for word in song:
      song_with_midi.append(np.concatenate((word, midi_features[i]), axis=None))
    embedded_lyrics_midi.append(song_with_midi)
  return embedded_lyrics_midi

# Define the dataset class 

class LyricsDataset(Dataset):
  def __init__(self, embedded_lyrics, sequence_len, window_size, stride , midi_features):
    
    # Combine word embeddings with midi features
    embedded_lyrics_midi = combine_word_embedding_with_midi_features(embedded_lyrics, midi_features)

    # Sequencing with sliding window
    sequences = get_sequences_sliding_window(embedded_lyrics_midi, window_size=window_size, stride=stride)

    self.data = torch.tensor(sequences)

  def __len__(self):
    return len(self.data)
  
  def __getitem__(self, index):
    return self.data[index]

# Define the LSTM model 

class LSTM(nn.Module):
  def __init__(self, vocab_size, input_size, embedding_dim, hidden_dim, num_layers, batch_size):
    super(LSTM, self).__init__()
    self.properties = {'num_layers' : num_layers,
                      'batch_size' : batch_size,
                      'hidden_dim' : hidden_dim}
    self.dropout = nn.Dropout(p=0.4)
    self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True, dropout=0.4)
    self.fc = nn.Linear(hidden_dim, embedding_dim)
  
  def forward(self, x, h=None, c=None):
    if h is None:
      h = torch.zeros(self.properties['num_layers'],
                      self.properties['batch_size'],
                      self.properties['hidden_dim']).to(device)
    if c is None:
      c = torch.zeros(self.properties['num_layers'],
                      self.properties['batch_size'],
                      self.properties['hidden_dim']).to(device)
    hidden = (h, c)

    # Pass the embedded data through the LSTM layers
    x, hidden = self.lstm(x, hidden)
    x = self.dropout(x)
    # Pass the output of the LSTM layers through the output layer
    logits = self.fc(x)

    return logits, hidden


In [16]:
# Defining our dataset object
try:
  lyrics_dataset = torch.load(os.path.join(path_to_assingment_folder, 'lyrics_dataset_2'))
except:
  lyrics_dataset = LyricsDataset(embedded_lyrics = embedded_lyrics_train, sequence_len=5, window_size=16, stride=1 , midi_features=midi_features_train)
  torch.save(lyrics_dataset, os.path.join(path_to_assingment_folder, 'lyrics_dataset_2'))

In [None]:
clean_lyrics_list_of_lists_train = []
embedding_dict, embedded_lyrics_train, words_not_known_to_word2vec = get_embedding_dict_and_embedded_lyrics(clean_lyrics_list_of_lists_train)

# Training and Predicting functions and helpers

In [18]:
# Helper Functions for Train and Predict 

def most_similar_n(output, top_n, last_word):
  '''
  DESCRIPTION: this function return the top n similar words to the our put from the vocab via cosine similarity
  PARAMETERS:
  :param output: output of the model - vector of embedding size
  :param top_n: number of top n words to return
  :return: dictionary of top n similar words
  '''
  distance_measure = nn.CosineSimilarity(dim = 0)
  similarity_dict = {}
  for word, embedding in embedding_dict.items():
      similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()
  most_similar = dict(heapq.nlargest(top_n, similarity_dict.items(), key=itemgetter(1)))

  # Dont predict the same word twice in a row
  try:
    most_similar.pop(last_word)
  except:
    pass

  return most_similar

def min_max_normalization(similarity_dict):
  '''
  DESCRIPTION: this function applies minmax normalization
  PARAMETERS:
  :param similarity_dict: dictionary containign values to normalize
  :return: orignial dictionary with same keys, values are the normalized values
  '''
  max_val = max(similarity_dict.values())
  min_val = min(similarity_dict.values())
  for key in similarity_dict.keys():
    similarity_dict[key] = (similarity_dict[key] - min_val) / (max_val - min_val)
  return similarity_dict

def choose_word(normalized_distance_dict):
  '''
  DESCRIPTION: this function choses the next word out of the top n similar words in the dict
  PARAMETERS:
  :param probability_dict: dictionary containig the top n similar words and thier normalized distance
  :return: the word chosen 
  '''

  best_candidate = max(normalized_distance_dict.keys(), key=lambda x: normalized_distance_dict[x])
  # Choose line seperator if its the best candidate - emiprically worked the best
  if best_candidate == '&':
    return '&'
  # Get a list of the keys in the dictionary
  keys = list(normalized_distance_dict.keys())
  # Get a list of the values in the dictionary
  values = list(normalized_distance_dict.values())
  # Normalize the values so they add up to 1
  values = [x/sum(values) for x in values]
  # Choose a random value between 0 and 1
  r = random.random()
  # Find the word that corresponds to the random value
  for i, v in enumerate(values):
      r -= v
      if r <= 0:
          return keys[i]

# Implementation of train and predict functions

def train(model, dataloader_train, dataloader_validation, optimizer, criterion, num_epochs):
  '''
  DESCRIPTION: this function trains the model
  '''
  model.train()
  
  total_loss = 0
  losses_train = []
  losses_validation = []
  epoch_times = []
  for epoch in range(num_epochs):
    print('='*50)
    print('Epoch Number: ',epoch)
    start = time.time()
    epoch_losses_train = []
    epoch_losses_validation = []
    for i, batch in enumerate(dataloader_train):
        batch = batch.to(device)
        x = batch[...,:-1, :].to(device)
        y = batch[...,1:, :].to(device)
        output,_ = model(x)
        word_loss = criterion(output.transpose(-1, -2), y[:, :, :300].transpose(-1, -2))
        sequence_loss = sum(word_loss)
        batch_loss = sum(sequence_loss)
        
        optimizer.zero_grad()
        
        # Backward pass
        batch_loss.backward()
        
        # Update the weights
        optimizer.step()

        epoch_losses_train.append(batch_loss)
    with torch.no_grad():
      for batch in dataloader_validation:
        x = batch[...,:-1, :].to(device)
        y = batch[...,1:, :].to(device)
        output,_ = model(x)
        word_loss = criterion(output.transpose(-1, -2), y[:, :, :300].transpose(-1, -2))
        sequence_loss = sum(word_loss)
        batch_loss = sum(sequence_loss)
        epoch_losses_validation.append(batch_loss)

    end = time.time()
    epoch_time = end-start
    avg_loss_train = sum(epoch_losses_train)/len(epoch_losses_train)
    print("Average Loss Train: " + str(avg_loss_train)) 
    avg_loss_validation = sum(epoch_losses_validation)/len(epoch_losses_validation)
    print("Average Loss Validation: " + str(avg_loss_validation)) 
    print("Epoch time: " + str(epoch_time))
    # Stopping criterion - if change in avg loss is less than 1, empirically afterwards we saw overfit in the results
    if epoch > 5:
      if avg_loss_validation > losses_validation[-1]*0.99:
        return losses_train, losses_validation, epoch_times

    losses_train.append(avg_loss_train)
    losses_validation.append(avg_loss_validation)
    epoch_times.append(epoch_time)

  return losses_train, losses_validation, epoch_times

def predict(model, first_word, device, song_size, midi_features):
  '''
  DESCRIPTION: this function generate a song accordig to given word and midi file
  PARAMETERS:
  :param model: trained model to use
  :param first word: first word to predict with
  :param device: device to calculate on - cuda, cpu 
  :param song_size: size of the song to be generated - number of words
  :return: the word chosen 
  '''
  model.eval()

  # A list to keep all the words the model returned
  words = [first_word]

  # initializing the first hidden state (usually to zeros)
  h = torch.zeros(model.properties['num_layers'],
                  1,
                  model.properties['hidden_dim']).to(device)

  c = torch.zeros(model.properties['num_layers'], 
                  1,
                  model.properties['hidden_dim']).to(device)

  #x = torch.tensor([[embedding_dict[first_word]]]).to(device)
  x = torch.tensor([[np.concatenate((embedding_dict[first_word], midi_features), axis=None)]]).to(device) 

  # iterating for the amount of tokens we want to genereate
  next_word = ""
  for i in range(0, song_size):

      # predict the next token
      # y_pred is the output of the linear layer which was fed the hidden state
      output, (h, c) = model(x, h, c)

      # y is shape (1, sequence length, embedding dim)
      # so we only want the last token to decide what is the next
      # so y[0] is shape (sequence length, embedding dim)
      # and y[0][-1] is shape (embedding dim, ) and is specifically the last token
      last_word = output[0][-1]
      similarity_dict = most_similar_n(last_word, top_n = 5, last_word = next_word)
      probability_dict = min_max_normalization(similarity_dict)
      next_word = choose_word(probability_dict)
      words.append(next_word)
      x = torch.tensor([[np.concatenate((embedding_dict[next_word], midi_features), axis=None)]]).to(device)
      #x = torch.tensor([[embedding_dict[next_word] + midi_features]]).to(device) remove comment to use midi features

  return words

# Training

In [None]:
# Define model, objective function, batch size, optimizer.

vocab_size = len(embedding_dict)
embedding_dim = 300 
hidden_dim = 512  # Hyper Parameter
num_layers = 1 # Hyper Parameter
batch_size = 20
input_size = len(lyrics_dataset[0][0])
model = LSTM(vocab_size=vocab_size,input_size = input_size, embedding_dim=embedding_dim, hidden_dim=hidden_dim, num_layers=num_layers, batch_size= batch_size).to(device)


# splitting train data into train and validation.

train_size = int(0.9 * len(lyrics_dataset))
validation_size = len(lyrics_dataset) - train_size
train_dataset, validation_dataset = random_split(lyrics_dataset, [train_size, validation_size])




In [None]:
# Train

num_epochs = 20
dataloader_train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)
dataloader_validation = DataLoader(validation_dataset, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion_cs = nn.CosineSimilarity(dim=1)
criterion = lambda x, y: 1- criterion_cs(x, y)
losses_train, losses_validation, epoch_times = train(model=model, dataloader_train=dataloader_train, dataloader_validation=dataloader_validation, optimizer=optimizer, criterion=criterion, num_epochs=num_epochs)

Epoch Number:  0
Average Loss Train: tensor(139.9747, device='cuda:0', grad_fn=<DivBackward0>)
Average Loss Validation: tensor(126.8950, device='cuda:0')
Epoch time: 31.869155645370483
Epoch Number:  1
Average Loss Train: tensor(119.5864, device='cuda:0', grad_fn=<DivBackward0>)
Average Loss Validation: tensor(114.7349, device='cuda:0')
Epoch time: 30.806077480316162
Epoch Number:  2
Average Loss Train: tensor(110.3448, device='cuda:0', grad_fn=<DivBackward0>)
Average Loss Validation: tensor(108.5517, device='cuda:0')
Epoch time: 30.701199769973755
Epoch Number:  3
Average Loss Train: tensor(104.9108, device='cuda:0', grad_fn=<DivBackward0>)
Average Loss Validation: tensor(104.2102, device='cuda:0')
Epoch time: 30.779974222183228
Epoch Number:  4
Average Loss Train: tensor(101.1212, device='cuda:0', grad_fn=<DivBackward0>)
Average Loss Validation: tensor(101.2739, device='cuda:0')
Epoch time: 30.684524297714233
Epoch Number:  5
Average Loss Train: tensor(98.2250, device='cuda:0', grad_

In [None]:
print("training time total: ",sum(epoch_times), 'seconds')

training time total:  339.32761693000793 seconds


In [None]:
# tensorboard generation
writer = SummaryWriter(log_dir="/content/midi_expert")
for i in range(len(losses_train)):
    writer.add_scalars('losses', {'train':losses_train[i],
                                    'validation':losses_validation[i]}
                                    , i)

In [39]:
# writer.close()
%load_ext tensorboard

In [None]:
# midi expert
%tensorboard --logdir=/content/midi_expert

In [None]:
# midi naive
%tensorboard --logdir=/content/midi_naive

In [None]:
# only words
%tensorboard --logdir=/content/runs

# Predicting and Reults

In [19]:
def sing_motherfucker(words):
  joined_words = ' '.join(words)
  sentences = joined_words.split('&')
  for sentence in sentences:
    print(sentence)

In [20]:
vocab_size = len(embedding_dict)
embedding_dim = 300
hidden_dim = 512  # Hyper Parameter
num_layers = 1 # Hyper Parameter
batch_size = 64
input_size = len(lyrics_dataset[0][0])
model = LSTM(vocab_size=vocab_size,input_size=input_size , embedding_dim=embedding_dim, hidden_dim=hidden_dim, num_layers=num_layers, batch_size= batch_size).to(device)
model.load_state_dict(torch.load(os.path.join(path_to_assingment_folder, 'model_12ep_final_midi_expert')))



<All keys matched successfully>

In [None]:
print("Model : Naive Midi")
print("Word : time")
print("Song : Eternal Flame")
words = predict(model, 'time', device, song_size=100, midi_features= midi_features_test[0])
sing_motherfucker(words)

Model : Naive Midi
Word : time
Song : Eternal Flame


  similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()


time 
 anyway just not do 
 do me anyway really do 
 anyway just do me 
 just do you think do you 
 anyway do you want me 
 anyway just do 
 anyway do you do you want you 
 anyway do just do 
 just do you want me do 
 just do you do you do 
 anyway do you do you do 
 just do you want you do 
 just do you know do you 
 do me do you do me 
 anyway you know you do 
 do you want do really


In [None]:
print("Model : Naive Midi")
print("Word : time")
print("Song : Honesty")
words = predict(model, 'time', device, song_size=100, midi_features= midi_features_test[1])
sing_motherfucker(words)

Model : Naive Midi
Word : time
Song : Honesty


  similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()


time 
 anyway just do you 
 anyway just do you 
 just do you do you do 
 anyway do you do me 
 just do you know 
 anyway just do you 
 so anyway you do me 
 just do you want you do 
 just do you think do you 
 just do you want you do 
 just do you want do you want you know 
 anyway you know you think do me 
 you want you do me 
 just do you want you do 
 just want you want you 
 you do


In [None]:
print("Model : Naive Midi")
print("Word : time")
print("Song : Loveful")
words = predict(model, 'time', device, song_size=100, midi_features= midi_features_test[2])
sing_motherfucker(words)

In [None]:
print("Model : Naive Midi")
print("Word : time")
print("Song : Barbie Girl")
words = predict(model, 'time', device, song_size=100, midi_features= midi_features_test[3])
sing_motherfucker(words)

In [None]:
print("Model : Naive Midi")
print("Word : time")
print("Song : All the small things")
words = predict(model, 'time', device, song_size=100, midi_features= midi_features_test[4])
sing_motherfucker(words)

In [None]:
# 5

In [21]:
print("Model : Expert Midi")
print("Word : time")
print("Song : Eternal Flame")
words = predict(model, 'time', device, song_size=100, midi_features= midi_features_test[0])
sing_motherfucker(words)

Model : Expert Midi
Word : time
Song : Eternal Flame


  x = torch.tensor([[np.concatenate((embedding_dict[first_word], midi_features), axis=None)]]).to(device)
  similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()


time 
 i dont think it just go in really do you know 
 do hey do what you do i want 
 i got my way ive been there is 
 i dont want never do 
 do yourself do really do thats what we 
 cant anyway it in my mind can do your love 
 i came here just in myself 
 i just want be that you are 
 your love should i do 
 i do love do it 
 know you can it go off 
 anyway just the this just be so you can


In [22]:
print("Model : Expert Midi")
print("Word : time")
print("Song : Honesty")
words = predict(model, 'time', device, song_size=100, midi_features= midi_features_test[1])
sing_motherfucker(words)

Model : Expert Midi
Word : time
Song : Honesty


  similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()


time 
 i do you do 
 anyway what cant tell is 
 what we can know why be do 
 you know that i do 
 i know you know 
 hey oh 
 oh hey 
 oh hey 
 just think your love 
 so anyways do you know you do 
 i do know what dont do me thats gotta say 
 he wanted get in his life 
 just so yeah 
 im oh girl 
 i know i know 
 but i do you know you got 
 i do it just do the just do


In [23]:
print("Model : Expert Midi")
print("Word : time")
print("Song : Loveful")
words = predict(model, 'time', device, song_size=100, midi_features= midi_features_test[2])
sing_motherfucker(words)

Model : Expert Midi
Word : time
Song : Loveful


  similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()


time 
 i do know i think 
 i know your love whoah 
 im gonna get on haha 
 cause im the just do 
 do anyway do cant tell me 
 dont think know youre know yourself 
 anyway so dont do the shit do you think 
 i know thats just im gonna get down 
 hey dont wanna get just boy 
 thats gonna be good mama 
 hey cuz gonna know just gonna do 
 do me do dont do my love do 
 oh hey oh 
 oh yeah 
 oh ya know what dont


In [24]:
print("Model : Expert Midi")
print("Word : time")
print("Song : Barbie Girl")
words = predict(model, 'time', device, song_size=100, midi_features= midi_features_test[3])
sing_motherfucker(words)

Model : Expert Midi
Word : time
Song : Barbie Girl


  similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()


time 
 just do it just want do 
 know it do 
 do you do it really 
 so do your just do really think 
 really know it just so not 
 anyway i think 
 not do hey anyway 
 thats just the really hey 
 i dont want you do 
 hey oh thats just really 
 what thats do 
 you want it just so do 
 oh haha oh 
 oh i want your really want 
 just do it just do it 
 really want it all do you do it just 
 go


In [25]:
print("Model : Expert Midi")
print("Word : time")
print("Song : All the small things")
words = predict(model, 'time', device, song_size=100, midi_features= midi_features_test[4])
sing_motherfucker(words)

Model : Expert Midi
Word : time
Song : All the small things


  similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()


time 
 thats how i was 
 just got just do not that you should get me 
 just the do just do just anyway do your just want go it 
 go it go just get just up just you 
 just want it just it just do it all just really think that we can just come just then just want be 
 so do just the really do you think you know 
 what i want do 
 thats really thats what just really do 
 just one just do it all out really know it just do


In [None]:
# 10

In [None]:
print("Model : Naive Midi")
print("Word : Party")
print("Song : Eternal Flame")
words = predict(model, 'party', device, song_size=100, midi_features= midi_features_test[0])
sing_motherfucker(words)

In [None]:
print("Model : Naive Midi")
print("Word : Party")
print("Song : Honesty")
words = predict(model, 'party', device, song_size=100, midi_features= midi_features_test[1])
sing_motherfucker(words)

In [None]:
print("Model : Naive Midi")
print("Word : Party")
print("Song : Loveful")
words = predict(model, 'party', device, song_size=100, midi_features= midi_features_test[2])
sing_motherfucker(words)

In [None]:
print("Model : Naive Midi")
print("Word : Party")
print("Song : Barbie Girl")
words = predict(model, 'party', device, song_size=100, midi_features= midi_features_test[3])
sing_motherfucker(words)

In [None]:
print("Model : Naive Midi")
print("Word : Party")
print("Song :  All The Small Things")
words = predict(model, 'party', device, song_size=100, midi_features= midi_features_test[4])
sing_motherfucker(words)

In [None]:
# 15

In [26]:
print("Model : Expert Midi")
print("Word : Party")
print("Song : Eternal Flame")
words = predict(model, 'party', device, song_size=100, midi_features= midi_features_test[0])
sing_motherfucker(words)

Model : Expert Midi
Word : Party
Song : Eternal Flame


  similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()


party 
 just really just really do 
 what you do im not anyway 
 do thats just what we do 
 alright just gonna be gotta get just 
 this gotta be just one just you 
 we can do anyway 
 they do me do dont know you want it cant go just 
 but ya gotta get the just get that dont be anyway 
 you do really want know how do it 
 thats what i do 
 i think love where the kid go out 
 go going just go out 
 hey oh yeah oh


In [27]:
print("Model : Expert Midi")
print("Word : Party")
print("Song : Honesty")
words = predict(model, 'party', device, song_size=100, midi_features= midi_features_test[1])
sing_motherfucker(words)

Model : Expert Midi
Word : Party
Song : Honesty


  similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()


party 
 we go on now 
 if i do yourself do i do you do 
 i know you want know i want yourself 
 so just do 
 just what you not know what when ive get you 
 anyway it so 
 so i go 
 just do 
 just what i do 
 i know do what i do 
 i think you want do 
 i know you think that just not just do 
 so anyway the day thats hey it 
 anyway hey 
 hey oh hahaha yeah 
 yeah hey 
 so anyway


In [28]:
print("Model : Expert Midi")
print("Word : Party")
print("Song : Loveful")
words = predict(model, 'party', device, song_size=100, midi_features= midi_features_test[2])
sing_motherfucker(words)

Model : Expert Midi
Word : Party
Song : Loveful


  similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()


party just 
 do you tell me how 
 my love is just like 
 do your love is really like 
 do you know your soul love 
 i really do 
 go it just go out just when i get out 
 thats gotta do my love 
 just like the shit wanna just hey haha 
 her sweety seems just do 
 just do hey oh hey oh cuz ya 
 anyway hey dont know anyway hey 
 ohhh my oh hey thats hey i gotta 
 anyway hey thats hey anyway 
 dont think anyway you know


In [29]:
print("Model : Expert Midi")
print("Word : Party")
print("Song : Barbie Girl")
words = predict(model, 'party', device, song_size=100, midi_features= midi_features_test[3])
sing_motherfucker(words)

Model : Expert Midi
Word : Party
Song : Barbie Girl


  similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()


party 
 maybe just do you think yeah 
 anyways hey yeah 
 yeah hey 
 so i can 
 know thats the so really do that gotta do it 
 but you cant do it 
 it can know it just not do 
 just do you think yeah 
 but i want do it 
 just want just really do what want do 
 do you do it really 
 if your just do you do 
 oh but hey yeah hey oh uh 
 anyway just do 
 know what do it do 
 so just want it


In [30]:
print("Model : Expert Midi")
print("Word : Party")
print("Song : All The Small Things")
words = predict(model, 'party', device, song_size=100, midi_features= midi_features_test[4])
sing_motherfucker(words)

Model : Expert Midi
Word : Party
Song : All The Small Things


  similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()


party so 
 but thats just this just what do that know 
 the just me do it all the way 
 so do just the just go on the it 
 hey oh hey oh hey oh 
 oh hey yeah 
 just really do it really do it just do just the really your just do you know 
 it could be do it can make 
 anyway just do your just what i do 
 hey anyway just wanna go just really want do 
 its just what do it do 
 it seems that we would be


In [None]:
# 20

In [32]:
print("Model : Expert Midi")
print("Word : Car")
print("Song : Eternal Flame")
words = predict(model, 'car', device, song_size=100, midi_features= midi_features_test[0])
sing_motherfucker(words)

Model : Expert Midi
Word : Car
Song : Eternal Flame


  similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()


car 
 anyway so do something 
 i want yourself for just i know you think thats really do 
 do that yourself wanna do 
 anyway not do 
 do yourself just do i know 
 anyway really want think that we want 
 we need you anyway 
 i do want you do 
 anyway hey i wanna love you hey mama 
 just so gotta get anyway 
 that you cant want the day that we get 
 i love you 
 not that really wanna be really want 
 come just go in loves just cant be


In [33]:
print("Model : Expert Midi")
print("Word : Car")
print("Song : Honesty")
words = predict(model, 'car', device, song_size=100, midi_features= midi_features_test[1])
sing_motherfucker(words)

Model : Expert Midi
Word : Car
Song : Honesty


  similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()


car 
 just do you know you do 
 thats how ive had you 
 just so your daddy 
 do you know what should i do 
 dont you think i got really do yourself 
 but when i go just anyway 
 anyway i want know i know yourself do just really do 
 it comes just really 
 if i love you so 
 not so not so anyway 
 so dont you know you do 
 thats what i know 
 so anyway be 
 if your eyes can see what they are 
 anyway thats just


In [34]:
print("Model : Expert Midi")
print("Word : Car")
print("Song : Loveful")
words = predict(model, 'car', device, song_size=100, midi_features= midi_features_test[2])
sing_motherfucker(words)

Model : Expert Midi
Word : Car
Song : Loveful


  similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()


car 
 just get just hey 
 hey oh hey oh ya oh 
 oh hey oh 
 oh hey oh 
 oh yeah oh 
 ya think just gotta do just like 
 just really wanna dont wanna anyway 
 i just want get back on the way hey 
 dont wanna know youre going back 
 anyway come on hey yeah gotta do just do 
 just do just wanna go down like all hey 
 just really wanna you wanna do 
 do you want come down 
 but cuz you do 
 hey yeah i know its


In [35]:
print("Model : Expert Midi")
print("Word : Car")
print("Song : Barbie Girl")
words = predict(model, 'car', device, song_size=100, midi_features= midi_features_test[3])
sing_motherfucker(words)

Model : Expert Midi
Word : Car
Song : Barbie Girl


  similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()


car 
 its anyway just what just it 
 its really know if i do 
 i know you know that you want it 
 but if yourself anyway do it 
 anyway you do 
 do me know what you know 
 thats really do it really 
 what do your do anyway know 
 do just do it just do it 
 anyway 
 so i can do it just 
 do 
 so you should not it go 
 but i can do something do 
 do me really want it just go 
 so just do it


In [36]:
print("Model : Expert Midi")
print("Word : Car")
print("Song : All The Small Things")
words = predict(model, 'car', device, song_size=100, midi_features= midi_features_test[4])
sing_motherfucker(words)

Model : Expert Midi
Word : Car
Song : All The Small Things


  similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()


car 
 it is anyway hey im gotta get just really just gotta do it 
 but the just what you know 
 what can know is do 
 your boy seems just you just hey 
 just come on just it just do it just really do 
 just do we anyway not you 
 anyway dont do my really what you know 
 anyway just do it all just really do just the maybe 
 hey really want do all just really want just do it 
 if somebody think just what do it do 
 do it all


In [None]:
words = predict(model, 'car', device, song_size=100, midi_features=midi_features_test[1])
sing_motherfucker(words)

  similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()


dear 
 thats just what hey anyway you 
 but it really think it just no so 
 no you gotta know 
 but it do not it 
 but your hands just got guess you 
 do you know 
 what can i do 
 just do your kinda crazy 
 really do know what its like crazy when 
 so that you got me do you want know 
 if its love could get 
 it really 
 but i know that i be hey 
 if you wanna are just hey really want really do 
 just gotta


In [None]:
#word2vec doesnt know hiya and barbie, inference in girl instead
words = predict(model, 'girl', device, song_size=100, midi_features=midi_features_test[2])
sing_motherfucker(words)

  similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()


girl 
 hey oh hey oh yeah 
 just do it just do it just do you want come down 
 let me know we let just do 
 really want do 
 but i know that i cant want go 
 im gotta go here 
 just do it anyway you want 
 just do it just anyway 
 on that night 
 the one so sweet love me for 
 my love for you hey 
 if you do it you want it 
 never you want something never do can know what loved you know 
 i thought


In [None]:
words = predict(model, 'all', device, song_size=100, midi_features=midi_features_test[3])
sing_motherfucker(words)

  similarity_dict[word] = distance_measure(torch.tensor(output).to(device), torch.tensor(embedding).to(device)).item()


all just do 
 then really know what your kid seems seein is up 
 oh you need me so i want you 
 hey maybe thats what it is 
 because it you know 
 if i can even just me 
 anyway dont do it 
 when you want get it 
 but dont you think that it becomes so bad 
 we not can know really wont come me 
 but the just you just for my life 
 oh hey oh hey oh 
 oh uh oh yeah hey yeah 
 yeah hey oh yeah 
 the


In [None]:
# Saving Models after train
torch.save(model.state_dict(), os.path.join(path_to_assingment_folder, 'model_12ep_final_midi_expert')) 

In [None]:
pd.to_pickle(losses_train ,filepath_or_buffer=os.path.join(path_to_assingment_folder , "losses_train_midi_expert"))
pd.to_pickle(losses_validation ,filepath_or_buffer=os.path.join(path_to_assingment_folder , "losses_validation_midi_expert"))
pd.to_pickle(epoch_times ,filepath_or_buffer=os.path.join(path_to_assingment_folder , "epoch_times_midi_expert"))

# Experiments

In [None]:

# # Define the LSTM model bidirect

# class LSTM(nn.Module):
#   def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, batch_size):
#     super(LSTM, self).__init__()
#     self.properties = {'num_layers' : num_layers,
#                       'batch_size' : batch_size,
#                       'hidden_dim' : hidden_dim}
#     # Define the embedding layer to convert words into numerical representations
#     #self.embedding = nn.Embedding(vocab_size, embedding_dim)
#     # Define the LSTM layers
#     self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True, dropout=0.6, bidirectional=True)
#     # Define the output layer
#     #self.fc = nn.Linear(vocab_size, embedding_dim)
#     self.fc = nn.Linear(hidden_dim*2, embedding_dim)
  
#   def forward(self, x, h=None, c=None):

#     if h is None:
#       h = torch.zeros(self.properties['num_layers']*2,
#                       self.properties['batch_size'],
#                       self.properties['hidden_dim']).to(device)#.double().to(device)
#     if c is None:
#       c = torch.zeros(self.properties['num_layers']*2,
#                       self.properties['batch_size'],
#                       self.properties['hidden_dim']).to(device)#.double().to(device)
    
#     hidden = (h, c)#.to(device))
#     # Pass the input data through the embedding layer
#     #x = self.embedding(x)
#     # Pass the embedded data through the LSTM layers
#     x, hidden = self.lstm(x, hidden)
#     # Pass the output of the LSTM layers through the output layer
#     logits = self.fc(x)
#     return logits, hidden

In [None]:
# Define the dataset class for your lyrics data

class LyricsDataset(Dataset):
  def __init__(self, embedded_lyrics, sequence_len, window_size):
    
    # Sequencing with sliding window
    sequences = get_sequences_sliding_window(embedded_lyrics, window_size=window_size)
    # Padding & Sequencing
    # max_len = len(max(embedded_lyrics, key=len))
    # sequences = []
    # for ind, song in enumerate(embedded_lyrics):
        # num_sequences = len(song)//sequence_len
        # for sequence in range(num_sequences):
        #     start = sequence*sequence_len
        #     end = (sequence+1)*sequence_len
        #     sequences.append(song[start:end])
        # embedded_lyrics[ind] = add_padding(song, max_len)


    #embedded_lyrics_np = np.array([np.array(xi) for xi in embedded_lyrics])
    #self.data = torch.from_numpy(embedded_lyrics_np).double()
    self.data = torch.tensor(sequences)

  def __len__(self):
    return len(self.data)
  
  def __getitem__(self, index):
    return self.data[index]


In [None]:
# Define the LSTM model 

class LSTM(nn.Module):
  def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, batch_size):
    super(LSTM, self).__init__()
    self.properties = {'num_layers' : num_layers,
                      'batch_size' : batch_size,
                      'hidden_dim' : hidden_dim}
    # Define the embedding layer to convert words into numerical representations
    #self.embedding = nn.Embedding(vocab_size, embedding_dim)
    # Define the LSTM layers
    self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=num_layers, batch_first=True, dropout=0.6)
    # Define the output layer
    #self.fc = nn.Linear(vocab_size, embedding_dim)
    self.fc = nn.Linear(hidden_dim, embedding_dim)
  
  def forward(self, x, h=None, c=None):

    if h is None:
      h = torch.zeros(self.properties['num_layers'],
                      self.properties['batch_size'],
                      self.properties['hidden_dim']).to(device)#.double().to(device)
    if c is None:
      c = torch.zeros(self.properties['num_layers'],
                      self.properties['batch_size'],
                      self.properties['hidden_dim']).to(device)#.double().to(device)
    
    hidden = (h, c)#.to(device))
    # Pass the input data through the embedding layer
    #x = self.embedding(x)
    # Pass the embedded data through the LSTM layers
    x, hidden = self.lstm(x, hidden)
    # Pass the output of the LSTM layers through the output layer
    logits = self.fc(x)
    return logits, hidden