The lyric generator using Markov Chains.

In [None]:
!pip install PyGithub

# Package Imports
import random
import pandas as pd 
import numpy as np
from urllib.request import urlopen # The default requests package
import requests # For making GitHub requests
from pathlib import Path # The Path class

# For the more advanced requests
import base64
import os
import sys
sys.path.append("./PyGithub");
from github import Github
from getpass import getpass



In [None]:
# Recursively Import the Data (AUTOMATIC)

def _decode_and_write(file__, path_):
    """
    Save a GitHub content file locally and parse it as comma-separated rows.

    file__: object exposing `decoded_content` (UTF-8 encoded bytes)
    path_: str, local path the decoded text is written to
    output: numpy array, one row per line after the header line,
            each line split on ','
    """
    text = file__.decoded_content.decode('utf-8')
    # Drop the byte-order mark only when one is really there; slicing off the
    # first character unconditionally would eat data from BOM-less files.
    if text.startswith('\ufeff'):
        text = text[1:]

    with open(path_, 'w') as writefile:
        writefile.write(text)

    # The first line is treated as the header row and skipped.
    rows = [line.split(',') for line in text.splitlines()[1:]]
    return pd.DataFrame(rows).to_numpy()


def import_github(path_name="AllLyrics.txt"):
    """
    Download the cleaned lyrics and the helper files from the project repo.

    Prompts for a GitHub personal access token and for the branch to read
    ("master" for a yes-like answer, otherwise "PROTOTYPE"). Every .txt file
    under RapLyrics/CLEAN is concatenated into `path_name`, but only when
    that file is missing or empty. Every file under RapLyrics/Other is then
    written to the working directory under its own file name.

    path_name: str, local file receiving the combined lyrics
    output: None (calls sys.exit() when the repository cannot be found)
    """
    g = Github(getpass("Enter your PAT key "))  # Enter your PAT Key.
    main_branch_bool = input("Main Branch: Yes or No? ")
    yes_synonyms = ["yes", "y", "yh", "1", "true"]
    branch = "master" if main_branch_bool.lower() in yes_synonyms else "PROTOTYPE"

    # Look the project up among the authenticated user's repositories.
    r_proj_clone = None
    for repo in g.get_user().get_repos():
        if repo.name == "ai-group-project-Team-JMJM":
            r_proj_clone = repo
            break
    if r_proj_clone is None:
        print("ai-group-project-Team-JMJM not found")
        sys.exit()

    print("Importing Github cleaned text files...")
    contents = r_proj_clone.get_contents("RapLyrics/CLEAN", ref=branch)
    # Only the .txt files hold lyrics.
    RAP_DATA = [
        file_.decoded_content.decode("utf-8")
        for file_ in contents
        if str(file_.path).endswith('.txt')
    ]

    # Only (re)write the combined file when it is missing or empty.
    target = Path(path_name)
    needs_write = not target.is_file() or target.stat().st_size == 0

    if needs_write:
        try:
            # Open once so every lyric ends up in the file; reopening in 'w'
            # mode per lyric would keep only the last one.
            with open(path_name, 'w') as writefile:
                for lyric in RAP_DATA:
                    writefile.write(lyric)
        except OSError:
            print("Error, file moved/deleted during write")
        else:
            print("{} is now up to date!".format(path_name))
    else:
        print("{} is already up to date!".format(path_name))

    contents = r_proj_clone.get_contents("RapLyrics/Other", ref=branch)
    for counter, file_ in enumerate(contents):
        # Save each file under its bare name, whatever its extension length.
        local_name = Path(str(file_.path)).name
        print("Writing file {} {}".format(counter, local_name))

        data = file_.decoded_content.decode('utf-8')
        # Strip the byte-order mark only when present.
        if data.startswith('\ufeff'):
            data = data[1:]

        # Mode 'w' truncates an existing file, so no separate clearing step.
        with open(local_name, 'w') as writefile:
            writefile.write(data)
    print("All files now up to date!")


def update_github(write_bool=False, path_name="AllLyrics.txt"):
    """
    Clean the raw lyrics and commit the cleaned versions back to the repo.

    Prompts for a GitHub personal access token and for the branch to use
    ("master" for a yes-like answer, otherwise "PROTOTYPE"). The csv files in
    RapLyrics/Other are saved locally; censors.csv and capitals.csv provide
    (search, replacement) pairs that are applied to every .txt file in
    RapLyrics/UNCLEAN. Each cleaned lyric is committed to
    RapLyrics/CLEAN/<name>CL.txt, updating the file when it already exists
    and creating it otherwise.

    write_bool: bool, also write all cleaned lyrics to `path_name` when True
    path_name: str, local file used when `write_bool` is True
    output: None (calls sys.exit() when the repository or a required csv
            file cannot be found)
    """
    g = Github(getpass("Enter your PAT key "))  # Enter your PAT Key.
    main_branch_bool = input("Main Branch: Yes or No? ")
    yes_synonyms = ["yes", "y", "yh", "1", "true"]
    branch = "master" if main_branch_bool.lower() in yes_synonyms else "PROTOTYPE"

    # Look the project up among the authenticated user's repositories.
    r_proj_clone = None
    for repo in g.get_user().get_repos():
        if repo.name == "ai-group-project-Team-JMJM":
            r_proj_clone = repo
            break

    if r_proj_clone is None:
        print("ai-group-project-Team-JMJM not found")
        sys.exit()

    print("Importing editing csv files...")

    censors = None
    capitals = None
    contents = r_proj_clone.get_contents("RapLyrics/Other", ref=branch)
    for counter, file_ in enumerate(contents):
        path = str(file_.path)
        name = Path(path).name
        print("Writing file {} {}".format(counter, name))
        # NOTE(review): the csv is saved under its repo-relative path, so that
        # directory must already exist locally; import_github saves under the
        # bare file name instead - confirm which one is intended.
        table = _decode_and_write(file_, path)
        if name.lower() == "censors.csv":
            censors = table
        elif name.lower() == "capitals.csv":
            capitals = table
    print("All editing csv files are up to date!")

    # Refuse to continue (and upload uncleaned lyrics) without both tables.
    if censors is None or capitals is None:
        print("censors.csv and/or capitals.csv not found in RapLyrics/Other")
        sys.exit()

    print("Importing Github uncleaned text files...")
    contents = r_proj_clone.get_contents("RapLyrics/UNCLEAN", ref=branch)

    RAP_DATA = []
    rap_lyric_names = []

    for file_ in contents:
        path = str(file_.path)
        # Only choose the .txt files. Name and lyric are appended together so
        # the two lists always stay index-aligned.
        if not path.endswith('.txt'):
            continue

        # Title = file name up to its first dot, without the 'UC' marker.
        name = Path(path).name.split('.', 1)[0]
        if name.endswith('UC'):
            name = name[:-2]

        lyric = file_.decoded_content.decode("utf-8")
        # Remove the \ufeff at the beginning, only when it is there.
        if lyric.startswith('\ufeff'):
            lyric = lyric[1:]

        # Censor the profanities first, then apply the capitalisation table.
        for row in censors:
            lyric = lyric.replace(str(row[0]), str(row[1]))
        for row in capitals:
            lyric = lyric.replace(str(row[0]), str(row[1]))

        rap_lyric_names.append(name)
        RAP_DATA.append(lyric)

    contents = r_proj_clone.get_contents("RapLyrics/CLEAN", ref=branch)
    cleaned_names = set()
    for counter, file_ in enumerate(contents):
        path = str(file_.path)
        print("File {} ".format(counter + 1) + path)
        # Only choose the .txt files
        if not path.endswith('.txt'):
            continue
        name = Path(path).name.split('.', 1)[0]
        if name.endswith('CL'):
            name = name[:-2]
        cleaned_names.add(name)

    # ALL OF THE EDITING IS DONE IN THE 'PROTOTYPE BRANCH' to avoid overwriting import changes
    # A cleaned lyric whose name is already in cleaned_names updates the
    # existing file; any other name is created as a new file.
    print("Commiting files to github...")
    commit_message = "This was uploaded automatically via pipeline"
    for new_name, lyric in zip(rap_lyric_names, RAP_DATA):
        remote_path = "RapLyrics/CLEAN/{}CL.txt".format(new_name)
        if new_name in cleaned_names:
            # update_file needs the sha of the blob being replaced.
            duplicate = r_proj_clone.get_contents(remote_path, ref=branch)
            r_proj_clone.update_file(remote_path, commit_message, lyric, duplicate.sha, branch=branch)
        else:
            r_proj_clone.create_file(remote_path, commit_message, lyric, branch=branch)

    if write_bool:
        print("Writing text file to: {}".format(path_name))
        with open(path_name, 'w') as writefile:
            writefile.writelines(RAP_DATA)


In [None]:
# Import vocabulary. Enter your own GitHub PAT at the prompt; never paste the token into the notebook.
import_github()

Enter your PAT key ··········
Main Branch: Yes or No? yes
Importing Github cleaned text files...
AllLyrics.txt is already up to date!
Writing file 0 capitals.csv
All files now up to date!
Writing file 1 censors.csv
All files now up to date!
Writing file 2 censors2.csv
All files now up to date!


In [None]:
# Our rap function
def rap():
    """
    Interactively generate a rap from AllLyrics.txt with a Markov chain.

    Prompts for the number of lines and for the size of the candidate pool.
    Each candidate line is a random walk of 6 to 12 words over a chain that
    maps every word to the words following it in the lyrics. Words come from
    splitting on single spaces, so consecutive spaces yield empty words.
    Lines are then drawn from the pool: every second drawn line is the
    candidate whose trailing vowels best match the previous line, the others
    are picked at random.

    output: list of str, the opening line followed by `nlines` further lines
    raises: ValueError if the lyrics contain fewer than two words
    """
    nlines = int(input('How many lines would you like the rap to be? (number not word)'))
    ngeneratelines = int(input('How many lines should we generate to choose from? (number not word, higher = better quality but takes longer)'))

    # Read vocabulary
    with open("AllLyrics.txt", "r") as lyrics_file:
        Text = lyrics_file.read().lower()
    # Newlines become spaces; punctuation and byte-order marks are dropped.
    strip_table = str.maketrans({
        "\n": " ", ".": None, "!": None, "?": None, ",": None,
        "\ufeff": None, "'": None, ")": None, "(": None,
    })
    Vocabulary = ''.join(ch for ch in Text if not ch.isdigit()).translate(strip_table).split(' ')

    # Build the chain once: word -> list of the words that follow it.
    chain = {}
    for key, word in zip(Vocabulary, Vocabulary[1:]):
        chain.setdefault(key, []).append(word)
    if not chain:
        raise ValueError("AllLyrics.txt needs at least two words to build the Markov chain")
    start_words = list(chain)

    # Generate text
    def line_generator():
        count = random.randint(6, 12)
        word = random.choice(start_words)
        words = [word.capitalize()]
        while len(words) < count:
            if word in chain:
                word = random.choice(chain[word])
            else:
                # The final word of the lyrics may have no successor;
                # restart the walk from a random word instead of failing.
                word = random.choice(start_words)
            words.append(word)
        return ' '.join(words)

    # Rhyme Functions
    def reverse_syllable_extract(text):
        # Lower-case vowels of the text, last one first.
        return [ch for ch in reversed(text) if ch in 'aeiou']

    def rev_syllable_stop_count(text1, text2):
        # Number of trailing vowels the two texts share before they differ.
        counter = 0
        for a, b in zip(reverse_syllable_extract(text1), reverse_syllable_extract(text2)):
            if a != b:
                break
            counter += 1
        return counter

    def next_line_stop_count(start_line, lines):
        # First candidate with the highest shared-vowel count.
        return max(lines, key=lambda candidate: rev_syllable_stop_count(start_line, candidate))

    # Generator
    start_line = line_generator()
    # Every drawn line is removed from the pool, so it needs at least
    # `nlines` candidates to avoid running dry.
    pool_size = max(ngeneratelines, nlines)
    all_other_lines = [line_generator() for _ in range(pool_size)]
    verses = [start_line]

    for i in range(nlines):
        if i % 2 == 1:
            next_line = next_line_stop_count(verses[-1], all_other_lines)
        else:
            next_line = random.choice(all_other_lines)
        all_other_lines.remove(next_line)
        verses.append(next_line)
    return verses

In [None]:
# Interactive run: rap() prompts for the number of lines and the candidate pool size
rap()

How many lines would you like the rap to be? (number not word)20
How many lines should we generate to choose from? (number not word, higher = better quality but takes longer)200


['Chest he fell to real i brought the air his chest he',
 'Alright alright  alright  and snatch your',
 'Shooting like that sit on the curl back  could',
 'Fly but i came out  and pat your back and snatch',
 'That sit on my ash tray get back  and pat',
 'Slack i turn around and your back and then i dont know',
 'Mack your sack before i dont know',
 'Slapped a bedtime story  alright  we can go',
 'On the ride down my ash tray get a og',
 'Aint poop the one eight seven kidnaps and',
 'Muthafrickin nine millimeter for nuthin at all',
 'Caught up in town my thang way cut throat',
 'Hat the downlow oh once upon a',
 'Yet strapped a spot up in this ninja im a stick',
 'Town my lap  we claiming everything ninja im that sit',
 'Claiming everything ninja even dimes and this muthafricka cuz i had to',
 'Aint poop the game ninja who',
 'Down my lighter  to the little kids',
 'Shooting like stainless steel and put the little kids',
 'Care peck he fell to stay on this muthafricka cuz i brought',
 '

In [None]:
# Second interactive run of rap(), prompting for the same two numbers
rap()

How many lines would you like the rap to be? (number not word)20
How many lines should we generate to choose from? (number not word, higher = better quality but takes longer)200


['Once upon a trip up in his vision gettin',
 'Kidnaps and go get my lap  okay check it out',
 'Couldnt do him cause please believe it out',
 'Cause hes like me we can go see',
 'Neck shooting like me lets go get em',
 ' alright alright alright  and the one eight seven kidnaps and',
 'Curl back  to the one eight seven kidnaps and',
 'Ride on a stick ninja and',
 'Nine millimeter for heater  come here sit back and',
 'Eatin on some traffic behind some traffic behind some hood',
 'Steak eatin on the downlow oh',
 'Blurry but you better yet strapped',
 'The black poker sack before i turn around and go see take',
 'B***** aint no get my ninja',
 'Aint no stranger to get my ninja',
 'Cool with no stranger to danger aint poop the black poker sack',
 'Shots on the black poker sack',
 'That big rap name ninja disrepectin',
 'B***** aint no get em frick',
 'What time it it was wearing slack i came out with no',
 'Set no stranger to mack your grip or']

In [None]:
# Test version for the classifiers: unlike rap(), every line has exactly 10 words (matching the generators' training sets) and the interactive prompts are removed

# Our rap function
def markov_rap(nlines, ngeneratelines):
    """
    Generate a rap from AllLyrics.txt with a Markov chain (non-interactive).

    Each candidate line is a random walk of exactly 10 words over a chain
    that maps every word to the words following it in the lyrics. Words come
    from splitting on single spaces, so consecutive spaces yield empty words.
    Lines are then drawn from the pool: every second drawn line is the
    candidate whose trailing vowels best match the previous line, the others
    are picked at random.

    nlines: int, number of lines drawn after the opening line
    ngeneratelines: int, number of candidate lines to generate
    output: list of str, the opening line followed by `nlines` further lines
    raises: ValueError if the lyrics contain fewer than two words
    """
    # Read vocabulary
    with open("AllLyrics.txt", "r") as lyrics_file:
        Text = lyrics_file.read().lower()
    # Newlines become spaces; punctuation and byte-order marks are dropped.
    strip_table = str.maketrans({
        "\n": " ", ".": None, "!": None, "?": None, ",": None,
        "\ufeff": None, "'": None, ")": None, "(": None,
    })
    Vocabulary = ''.join(ch for ch in Text if not ch.isdigit()).translate(strip_table).split(' ')

    # Build the chain once: word -> list of the words that follow it.
    chain = {}
    for key, word in zip(Vocabulary, Vocabulary[1:]):
        chain.setdefault(key, []).append(word)
    if not chain:
        raise ValueError("AllLyrics.txt needs at least two words to build the Markov chain")
    start_words = list(chain)

    # Generate text
    def line_generator():
        count = 10
        word = random.choice(start_words)
        words = [word.capitalize()]
        while len(words) < count:
            if word in chain:
                word = random.choice(chain[word])
            else:
                # The final word of the lyrics may have no successor;
                # restart the walk from a random word instead of failing.
                word = random.choice(start_words)
            words.append(word)
        return ' '.join(words)

    # Rhyme Functions
    def reverse_syllable_extract(text):
        # Lower-case vowels of the text, last one first.
        return [ch for ch in reversed(text) if ch in 'aeiou']

    def rev_syllable_stop_count(text1, text2):
        # Number of trailing vowels the two texts share before they differ.
        counter = 0
        for a, b in zip(reverse_syllable_extract(text1), reverse_syllable_extract(text2)):
            if a != b:
                break
            counter += 1
        return counter

    def next_line_stop_count(start_line, lines):
        # First candidate with the highest shared-vowel count.
        return max(lines, key=lambda candidate: rev_syllable_stop_count(start_line, candidate))

    # Generator
    start_line = line_generator()
    # Every drawn line is removed from the pool, so it needs at least
    # `nlines` candidates to avoid running dry.
    pool_size = max(ngeneratelines, nlines)
    all_other_lines = [line_generator() for _ in range(pool_size)]
    verses = [start_line]

    for i in range(nlines):
        if i % 2 == 1:
            next_line = next_line_stop_count(verses[-1], all_other_lines)
        else:
            next_line = random.choice(all_other_lines)
        all_other_lines.remove(next_line)
        verses.append(next_line)
    return verses

In [None]:
# Generate the text we will test in the classifier: nlines=400, drawn from ngeneratelines=4000 candidate lines
markov_rap(400,4000)

['Come here sit on a spot up out with em',
 'Put the game ninja cause please believe it was way',
 'Brought the golf hat the one eight seven kidnaps and',
 'Guns  yall a read yall a gangsta bedtime story',
 'You better have you need a gangsta bedtime story by',
 'Ever slapped a gangsta bedtime story  great scotts its',
 'Here sit on a kit kat  great scotts its',
 'Bedtime story by saying goodnight  yall get em frick',
 'Upon a trip up in town my ninja even if',
 'Do my ninja caught up on a trip up to',
 'All the downlow oh once upon a trip up on',
 'Do nothing to get a gangsta bedtime story  yall',
 'Grease strikes you need a gangsta bedtime story  and',
 'To stay on some traffic behind some traffic behind some',
 'Can ride on some traffic behind some traffic behind some',
 'What time it was way to him cause please believe',
 'Couldnt do my lap  to him cause please believe',
 'S-n double o-p fa sho i had to real i',
 'Gotta end this muthafricka cuz i had to real i',
 'In the women stay