In [91]:
import torch
from torch.autograd import Variable
import torch.nn as nn 
import torch.nn.functional as F
import os 
import pandas as pd
import re 
import numpy as np
from typing import List, Dict 
import dotenv 

In [92]:
# Report whether CUDA can see a GPU. Without one, the rest of the notebook
# takes too long to be worth running.
print(
    "Yes, GPU was found and recognized!"
    if torch.cuda.is_available()
    else "No, something went wrong. Make sure to install nvidia driver. :("
)

Yes, GPU was found and recognized!


First Step: Load Each Artist's Songs Into Their Own DataFrame

In [93]:
# Every sub-directory of "data" is treated as one artist; plain files are ignored.
artists: set = set(filter(lambda entry: os.path.isdir(f"data/{entry}"), os.listdir("data")))
# Per-artist results keyed by artist name; populated by the loading loop below.
data_frames: Dict[str, pd.DataFrame] = dict()
# Running concatenation of every lyric line read so far, across all artists.
combined_song_strings: str = str()

def artist_songs_to_data_frame(artist: str) -> tuple:
    """Load every ``edited_*`` lyric file found in ``data/<artist>``.

    Files are processed in sorted filename order so that the resulting rows
    (and everything derived from them) are reproducible across machines;
    ``os.listdir`` alone gives no ordering guarantee.

    Side effect: the raw text of every song read is appended to the
    module-level ``combined_song_strings``.

    Args:
        artist: Name of the artist's sub-directory inside ``data``.

    Returns:
        A 2-tuple ``(frame, vocabulary)`` where ``frame`` is a DataFrame with
        the columns ``Song_Name`` (file name minus its last four characters),
        ``Song_String`` (the raw file content) and ``Lyrics_List`` (the
        space-separated tokens of every non-blank line), and ``vocabulary`` is
        the sorted list of distinct tokens across all of the artist's songs.
    """
    global combined_song_strings

    directory: str = f"data/{artist}"
    df_rows: List[tuple] = []
    artist_vocab: set = set()

    for song_path in sorted(os.listdir(directory)):
        # Only pre-processed files are used; a file named exactly "edited_" is skipped too.
        if len(song_path) <= 7 or not song_path.startswith("edited_"):
            continue

        # Drop the last four characters (".txt"-style extension).
        song_name: str = song_path[:-4]

        # The context manager closes the handle even if reading fails part-way.
        with open(f"{directory}/{song_path}") as song_file:
            song_lines: List[str] = song_file.readlines()

        lyrics_list: List[str] = []
        for line in song_lines:
            stripped: str = line.strip()
            if stripped:
                lyrics_list += stripped.split(" ")

        song_stringified: str = "".join(song_lines)
        combined_song_strings += song_stringified

        artist_vocab.update(lyrics_list)
        df_rows.append((song_name, song_stringified, lyrics_list))

    return pd.DataFrame(df_rows, columns=["Song_Name", "Song_String", "Lyrics_List"]), sorted(artist_vocab)

# Load each artist once and keep the (DataFrame, vocabulary) pair in memory.
# The original plan was to persist these as CSV files, but there was not enough time.
for artist in artists:
    artist_tuple: tuple = artist_songs_to_data_frame(artist)
    data_frames.update({artist: artist_tuple})

In [94]:
class Artist_LSTM_Model(nn.Module):
    """Word-level language model: embedding -> stacked LSTM -> linear projection.

    Args:
        corpus: Vocabulary size, i.e. the number of distinct tokens. It is used
            as the embedding table size and as the width of the output layer.
        hidden_dimensionality: Size of the LSTM hidden state.
        embedding_dimensionality: Size of each token embedding.
        dropout: Dropout probability applied between stacked LSTM layers.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, corpus: int, hidden_dimensionality: int, embedding_dimensionality: int, dropout: float, num_layers: int):
        super().__init__()
        # NOTE(review): `linear_module` is never used by forward(); it duplicates
        # `forward_function`. It is kept (and created first, as before) so the
        # attribute set and parameter initialisation order stay unchanged.
        self.linear_module: nn.Linear = nn.Linear(hidden_dimensionality, corpus)
        self.hidden_dimensionality: int = hidden_dimensionality
        self.embeddings: nn.Embedding = nn.Embedding(corpus, embedding_dimensionality)
        self.forward_function: nn.Linear = nn.Linear(hidden_dimensionality, corpus)
        self.lstm: nn.LSTM = nn.LSTM(embedding_dimensionality, hidden_dimensionality, num_layers=num_layers, dropout=dropout)

    def forward(self, seq_in):
        """Return unnormalised next-token scores for each sequence in the batch.

        Args:
            seq_in: 2-D integer tensor of token indices. The callers in this
                notebook pass it as (batch, window); it is transposed here
                because the LSTM is not constructed with ``batch_first``.

        Returns:
            The output layer applied to the LSTM output at the last time step,
            one row of vocabulary-sized scores per input sequence.
        """
        embedded = self.embeddings(seq_in.t())
        lstm_output, _ = self.lstm(embedded)
        # Only the final time step is used to predict the next token.
        return self.forward_function(lstm_output[-1])

In [95]:
def create_artist_model(artist_song_strings: pd.Series, window_size: int, steps: int, epochs: int, create_new_model=True) -> tuple:
    """Build the word vocabulary for an artist and optionally train a model on it.

    The songs are joined with newlines, lower-cased and split into tokens,
    where every run of non-whitespace characters and every newline is a token.
    Training instances are sliding windows of ``window_size`` tokens whose
    target is the token that follows the window.

    Args:
        artist_song_strings: pandas Series of raw song texts.
        window_size: Number of tokens in each training window.
        steps: Stride between the starts of consecutive windows.
        epochs: Number of passes over the training windows.
        create_new_model: When False no model is built or trained and the
            first element of the result is None.

    Returns:
        ``(model_or_None, integer_converter, char_converter)`` where
        ``integer_converter`` maps index -> token and ``char_converter`` maps
        token -> index (despite the names, the units are words, not characters).
    """
    corpus_text: str = artist_song_strings.str.cat(sep='\n').lower()
    # Pad newlines with spaces so each one becomes a token of its own.
    lines: list = re.findall(r'\S+|\n', re.sub("\n", " \n ", corpus_text))
    words: list = sorted(set(lines))

    char_converter: dict = {word: index for index, word in enumerate(words)}
    integer_converter: dict = {index: word for index, word in enumerate(words)}

    def get_data(sentences: list, next_chars: list) -> tuple:
        """Encode the token windows and their targets as integer index arrays."""
        x: np.ndarray = np.zeros((len(sentences), window_size), dtype=np.int64)
        y: np.ndarray = np.zeros(len(sentences), dtype=np.int64)

        for i, sentence in enumerate(sentences):
            for t, token in enumerate(sentence):
                x[i, t] = char_converter[token]
            y[i] = char_converter[next_chars[i]]

        return x, y

    # Annotated as nn.Module so that building only the vocabulary does not
    # require the model class to be resolvable.
    def create_model() -> nn.Module:
        """Train a fresh Artist_LSTM_Model on the sliding windows of this corpus."""
        # Fall back to the CPU instead of crashing when no GPU is present.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        sentences: list = []
        next_chars: list = []
        for i in range(0, len(lines) - window_size, steps):
            sentences.append(lines[i: i + window_size])
            next_chars.append(lines[i + window_size])

        inputs_array, targets_array = get_data(sentences, next_chars)

        train = torch.utils.data.TensorDataset(
            torch.tensor(inputs_array, dtype=torch.long).to(device),
            torch.tensor(targets_array, dtype=torch.long).to(device)
        )
        train_loader = torch.utils.data.DataLoader(train, batch_size=256)

        model = Artist_LSTM_Model(len(words), 256, 256, .5, 10).to(device)

        # `torch.optim.Adamaz` does not exist and the keyword is `betas`, not `beta`.
        optimizer = torch.optim.Adamax(model.parameters(), lr=0.07, betas=(.9, .99))
        loss_function = nn.CrossEntropyLoss()

        for epoch in range(epochs):
            model.train()
            for inputs, targets in train_loader:
                optimizer.zero_grad()
                # Without a loss and a backward pass the optimizer step is a no-op.
                loss = loss_function(model(inputs), targets)
                loss.backward()
                optimizer.step()

        # Hand the model back in inference mode so dropout is off for generation.
        model.eval()
        return model

    trained_model = create_model() if create_new_model else None
    return trained_model, integer_converter, char_converter
            

In [96]:
def generate_lyrics(seed_lyrics: list, model: nn.Module, integer_convertor: dict, char_convertor: dict, num_of_chars: int, window_size: int, variance: float) -> str:
    """Generate lyrics by repeatedly sampling the next word from ``model``.

    Args:
        seed_lyrics: Starting words; every word must be a key of
            ``char_convertor`` and there must be at most ``window_size`` of them.
        model: Module mapping a (1, window_size) tensor of word indices to a
            (1, vocabulary) tensor of scores.
        integer_convertor: Mapping index -> word.
        char_convertor: Mapping word -> index.
        num_of_chars: Number of words to generate (words, despite the name).
        window_size: Number of most recent words fed to the model.
        variance: Sampling temperature; values below 1 sharpen the
            distribution, values above 1 flatten it. Must be positive.

    Returns:
        The seed words followed by the generated words, joined by single spaces.

    Raises:
        ValueError: If ``variance`` is not positive.
    """
    if variance <= 0:
        raise ValueError("variance must be positive")

    def calc_next_word(current_predictions, variance: float) -> int:
        """Sample one word index from the temperature-scaled distribution."""
        # log(0) is an expected -inf here (zero-probability words stay at zero).
        with np.errstate(divide="ignore"):
            scaled: np.ndarray = np.log(np.asarray(current_predictions).astype('float64')) / variance
        # Subtracting the maximum leaves the distribution unchanged but keeps
        # exp() from underflowing everything to zero at low temperatures.
        weights: np.ndarray = np.exp(scaled - np.max(scaled))

        return int(np.argmax(np.random.multinomial(1, weights / np.sum(weights), 1)))

    # Run on whatever device the model lives on instead of assuming CUDA.
    first_parameter = next(model.parameters(), None)
    device = first_parameter.device if first_parameter is not None else torch.device("cpu")

    generated_lyrics: str = " ".join(seed_lyrics)
    window: list = list(seed_lyrics)

    # Generate in inference mode (dropout off, no autograd bookkeeping) and
    # restore the caller's train/eval mode afterwards.
    was_training: bool = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(num_of_chars):
                x: np.ndarray = np.zeros((1, window_size), dtype=np.int64)

                for t, word in enumerate(window):
                    x[0, t] = char_convertor[word]

                x_in = torch.from_numpy(x).to(device)
                pred: np.ndarray = F.softmax(model(x_in), dim=1)[0].cpu().numpy()

                next_word: str = integer_convertor[calc_next_word(pred, variance)]
                # Slide the window: drop the oldest word, append the new one.
                window = window[1:] + [next_word]

                generated_lyrics += " " + next_word
    finally:
        model.train(was_training)

    return generated_lyrics

In [97]:
def check_seed_lyrics(seed_lyrics: list, artist_words: list, window_size: int) -> bool:
    """Check that a seed can be used to start generation for an artist.

    A seed is valid when it has exactly ``window_size`` entries and every entry
    is either the newline token or one of ``artist_words``. Each problem found
    is printed.

    Args:
        seed_lyrics: Candidate seed words.
        artist_words: Words known for the artist.
        window_size: Required number of seed words.

    Returns:
        True when the seed is valid, False otherwise.
    """
    if len(seed_lyrics) != window_size:
        print("Seed lyric size is not the same as the window size")
        return False

    # A set makes each lookup O(1); the newline token is always accepted.
    known_words: set = set(artist_words)
    known_words.add("\n")

    unknown_words: list = [word for word in seed_lyrics if word not in known_words]
    for word in unknown_words:
        print(word, "was not in artist word list")

    return not unknown_words

# Seed words per artist. Each seed must contain exactly `window_size` entries.
# NOTE: this rebinds `artists`, which earlier held the set of data directories.
artists: dict = {
    "Eminem": ["shady", "\n", "i", "am", "back", "\n", "eminem", "wasnt", "the", "best" ], 
    "Kanye West": ["start", "\n", "like", "im", "back", "at", "the", "start", "\n", "here"], 
    "Tame Impala": ["my", "name", "is", "impala", "like", "the", "car", "but", "im", "that"],
    "The Maine": ["from", "maine", "and", "thats", "my", "name", "\n", "i", "dont", "eat"],
    "Pink Floyd": ["car", "\n", "i", "own", "a", "bar", "but", "not", "so", "far"],
    "The Story So Far": ["far", "i", "have", "a", "car", "with", "the", "story", "so", "far"]
}

window_size: int = 10
steps: int = 1
epochs: int = 100
num_of_chars: int = 200  # number of words to generate per artist
variance: int = 1  # sampling temperature

models_directory: str = "./lstm_models"
# Make sure the cache directory exists so a fresh checkout does not crash.
os.makedirs(models_directory, exist_ok=True)

# The context manager closes the output file even if generation fails part-way.
with open("lstm_generated_lyrics.txt", "w") as generated_lyrics_file:
    for artist, seed_lyrics in artists.items():
        if artist not in data_frames:
            print("Could not generate lyrics for", artist, "no data was loaded for this artist")
            continue

        if not check_seed_lyrics(seed_lyrics, data_frames[artist][1], window_size):
            print("Could not generate lyrics for", artist, "fix errors above")
            continue

        # Train only when THIS artist has no saved model. The check used to look
        # for "Eminem.pth" regardless of which artist was being processed.
        model_path: str = f"{models_directory}/{artist}.pth"
        create_new_model: bool = not os.path.isfile(model_path)
        data: tuple = create_artist_model(data_frames[artist][0]["Song_String"], window_size, steps, epochs, create_new_model)

        model = data[0]

        if model is None:
            # NOTE(review): this unpickles a whole model object. Only load files
            # you created yourself. Recent PyTorch releases may also need
            # `weights_only=False` here; confirm against the installed version.
            model = torch.load(model_path)
        else:
            torch.save(model, model_path)

        generated_lyrics_file.write(generate_lyrics(seed_lyrics, model, data[1], data[2], num_of_chars, window_size, variance) + "\n\n")