In [11]:
# Markov with Syllable Counter
# Same rhyme ranker as MikesVersion1
# Changelog:
# - Rap now censored after it has been generated, not before
# - Github importer now displays completion message at correct time

In [12]:
#@title Import Statements
!pip install PyGithub
!pip install pyphen

# Package Imports
import random
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
from urllib.request import urlopen # The default requests package
import requests # For making GitHub requests
from pprint import pprint # For pretty printing
from pathlib import Path # The Path class
import pyphen

# For the more advanced requests
import base64
import os
import sys
sys.path.append("./PyGithub");
from github import Github
from getpass import getpass



In [13]:
#@title Function Definitions
# Recursively Import the Data (AUTOMATIC)

def _decode_and_write(file__, path_):
    data = file__.decoded_content
    data = data.decode('utf-8')[1:]
    with open(path_, 'w') as writefile:
        writefile.write(data) 
    data = data.splitlines()
    data_rows = []
    for count, word in enumerate(data):
        if count>0:
            data_rows.append(word.split(','))
    data = pd.DataFrame(data_rows)
    data = data.to_numpy()
    return data


def import_github(path_name="AllLyrics.txt"):
    """
    Function for importing the github file
    path_name: str
    output: None
    """
    g = Github(getpass("Enter your PAT key ")) # Enter your PAT Key.
    username = "MikeMNelhams"
    main_branch_bool = input("Main Branch: Yes or No? ")
    yes_synonyms = ["yes", "y", "yh", "1", "true"]
    if main_branch_bool.lower() in yes_synonyms: 
        branch = "master" 
    else: 
        branch = "PROTOTYPE"

    user = g.get_user(username)
    r_proj_clone = 0
    for repo in g.get_user().get_repos():
        if repo.name == "ai-group-project-Team-JMJM":
            r_proj_clone = repo
            break
        # To see all the available attributes and methods
        print(dir(repo))
    if not r_proj_clone:
        print("ai-group-project-Team-JMJM not found")
        sys.exit()
    print("Importing Github raw text files...")
    contents = r_proj_clone.get_contents("RapLyrics/UNCLEAN", ref=branch)
    RAP_DATA = []
    for file_ in contents:
        path = file_.path
        path = str(path) 
        # Only choose the .txt files
        if path[-4:] == '.txt':
            # Append the Lyrics
            RAP_DATA.append(file_.decoded_content.decode("utf-8")) 
    
    temp_path = Path(path_name)
    if temp_path.is_file(): 
        if os.stat(path_name).st_size == 0:
            write_bool2 = True
        else: 
            write_bool2 = False
    else: 
        write_bool2 = True
    
    if write_bool2: 
        for lyric in RAP_DATA: 
            try:
                with open(path_name, 'w') as writefile: 
                    writefile.write(lyric)
            except: 
                print("Error, file moved/deleted during write")
        print("{} is now up to date!".format(path_name))
    else: 
        print("{} is already up to date!".format(path_name))
    
    contents = r_proj_clone.get_contents("RapLyrics/Other", ref=branch)
    for counter, file_ in enumerate(contents):
        path = file_.path
        path = str(path) 

        title_start = path.find('Other')
        title_len = path[title_start:].find('.')
        path = path[title_start + 6:title_start + title_len + 4]

        print("Writing file {} {}".format(counter, path))
        temp_path = Path(path)
        if temp_path.is_file():
            with open(path,'w'): pass # Cheeky way to clear the file if it exists
        
        # Split the long string into a list of lines, then split by words, then put into a csv, then to numpy array 
        data = file_.decoded_content
        data = data.decode('utf-8')[1:]

        with open(path, 'w') as writefile:
            writefile.write(data) 
    print("All files now up to date!")


def update_github(write_bool=False, path_name="AllLyrics.txt"):
    """
    Function for updating the github file, by cleaning the lyrics, optional write to txt file. 
    write_bool: bool
    path_name: str
    output: None
    """
    g = Github(getpass("Enter your PAT key ")) # Enter your PAT Key.
    username = "MikeMNelhams"
    main_branch_bool = input("Main Branch: Yes or No? ")
    yes_synonyms = ["yes", "y", "yh", "1", "true"]
    if main_branch_bool.lower() in yes_synonyms: 
        branch = "master" 
    else: 
        branch = "PROTOTYPE"

    user = g.get_user(username)
    r_proj_clone = 0
    for repo in g.get_user().get_repos():
        if repo.name == "ai-group-project-Team-JMJM":
            r_proj_clone = repo
            break
        # To see all the available attributes and methods
        print(dir(repo))
    
    if not r_proj_clone:
        print("ai-group-project-Team-JMJM not found")
        sys.exit()

    print("Importing editing csv files...")

    # Split the long string into a list of lines, then split by words, then put into a csv, then to numpy arr
    contents = r_proj_clone.get_contents("RapLyrics/Other", ref=branch)
    for counter, file_ in enumerate(contents):
        path = file_.path 
        path = str(path)
        title_start = path.find('Other')
        title_len = path[title_start:].find('.')
        name = path[title_start + 6:title_start + title_len + 4]
        print("Writing file {} {}".format(counter, name))
        if name.lower() == "censors2.csv":
            censors = _decode_and_write(file_, path)
        elif name.lower() == "capitals.csv":
            capitals = _decode_and_write(file_, path)
        else: 
            _decode_and_write(file_, path)
    print("All editing csv files are up to date!")

    print("Importing Github uncleaned text files...")
    contents = r_proj_clone.get_contents("RapLyrics/UNCLEAN", ref=branch)

    RAP_DATA = []
    rap_lyric_names = []

    for file_ in contents:
        path = file_.path
        path = str(path) 
        # Only choose the .txt files
        if path[-4:] == '.txt':
            # Append the name
            title_start = path.find('UNCLEAN')
            title_len = path[title_start:].find('.')
            name = path[title_start + 8:title_start + title_len]
            if name[-2:] == 'UC':
                name = name[:-2]
            rap_lyric_names.append(name) 

        # Append the Lyrics
        RAP_DATA.append(file_.decoded_content.decode("utf-8")) 
        
    # Remove the \ufeff at the beginning O(n)
    for count, lyric in enumerate(RAP_DATA): 
        RAP_DATA[count] = lyric[1:]

    # Censor the profanities O(n*m + n*m2) m > m2 xor m2 > m
    for count in range(len(RAP_DATA)): 
        for i in range(len(censors[0:])):
            RAP_DATA[count] = RAP_DATA[count].replace(str(censors[i, 0]), str(censors[i, 1]))
        for i in range(len(capitals[0:])):
            RAP_DATA[count] = RAP_DATA[count].replace(str(capitals[i, 0]), str(capitals[i, 1]))

    contents = r_proj_clone.get_contents("RapLyrics/CLEAN", ref=branch)
    cleaned_names = []
    for counter, file_ in enumerate(contents):
        path = file_.path
        path = str(path) 
        print("File {} ".format(counter + 1) + path)
        # Only choose the .txt files
        if path[-4:] == '.txt':
            # Append the name
            title_start = path.find('CLEAN')
            title_len = path[title_start:].find('.')
        name = path[title_start + 6:title_start + title_len]
        if name[-2:] == 'CL':
            name = name[:-2]
        cleaned_names.append(name) 

    # ALL OF THE EDITING IS DONE IN THE 'PROTOTYPE BRANCH' to avoid overwriting import changes
    # If the (now cleaned) rap_lyrics name is new (not in cleaned_names), then we want to create that as a new file 
    # If the (now cleaned) rap_lyrics name is NOT new (not in cleaned_names), then we want to update the file
    # print(rap_lyric_names)
    # print(cleaned_names)
    print("Committing files to github...")
    for counter, new_name in enumerate(rap_lyric_names): 
        if new_name in cleaned_names: 
            duplicate = r_proj_clone.get_contents("RapLyrics/CLEAN/{}CL.txt".format(new_name), ref=branch)
            r_proj_clone.update_file("RapLyrics/CLEAN/{}CL.txt".format(new_name), "This was uploaded automatically via pipeline", RAP_DATA[counter], duplicate.sha, branch=branch)
        else:
            r_proj_clone.create_file("RapLyrics/CLEAN/{}CL.txt".format(new_name), "This was uploaded automatically via pipeline", RAP_DATA[counter], branch=branch)

    if write_bool: 
        print("Writing text file to: {}".format(path_name))
        with open(path_name, 'w') as writefile:
            for lyric in RAP_DATA:
                writefile.write(lyric)


In [14]:
# Import all of Mike's lyrics.
import_github()

Enter your PAT key ··········
Main Branch: Yes or No? y
Importing Github raw text files...
AllLyrics.txt is already up to date!
Writing file 0 capitals.csv
Writing file 1 censors.csv
Writing file 2 censors2.csv
All files now up to date!


In [54]:
dic = pyphen.Pyphen(lang='en_EN') # set pyphen dictionary to English

def rap():
  nlines = int(input('How many lines would you like the rap to be? '))
  ngeneratelines = int(input('How many lines should be generated to choose from? '))
  count = int(input("How many syllables per line? "))

  censors = []

  # Extract all of Mike's lyrics. 
  Text = open("AllLyrics.txt", "r").read()
  Vocabulary = ''.join([i for i in Text if not i.isdigit()]).replace("\n", " ").split(' ')


  # Generate text
  def line_generator(Vocabulary):
    index = 1
    chain = {}
    # count = 16 # https://colemizestudios.com/rap-lyrics-syllables/, apparently rappers usually use semiquavers
    linecount = 0
    n = 0
    
    for word in Vocabulary[index:]:
        key = Vocabulary[index-1]
        if key in chain:
            chain[key].append(word)
        else:
            chain[key] = [word]
        index += 1
        
    word1 = random.choice(list(chain.keys()))
    line = word1.capitalize()
    wordsyllables = dic.inserted(word1)
    wordcount = len(wordsyllables.split('-'))
    linecount += wordcount

    while linecount < count:
      n += 1
      word2 = random.choice(chain[word1])
      wordsyllables = dic.inserted(word2)
      wordcount = len(wordsyllables.split('-'))
      linecount += wordcount
      # print(n)
      if linecount > count: # don't include word if it makes line go over syllable count
        linecount -= wordcount
      else:
        word1 = word2
        line += ' ' + word2.lower()
      if n > 99: # if not finding a word with right number of syllables, stop trying and print an error
        line += ' ERROR FINDING CORRECT SYLLABLE WORD'
        linecount = count
    return line


  # Rhyme Functions
  def reverse_syllable_extract(text):
    sy_form = []
    characters = [char for char in text]
    sylls = ['a', 'e', 'i', 'o', 'u', 'y']
    for x in characters:
        if x in sylls:
            sy_form.append(x)
    sy_form.reverse()
    return sy_form


  def rev_syllable_stop_count(text1, text2):
    count = True 
    i = 0
    counter = 0
    syll1 = reverse_syllable_extract(text1)
    syll2 = reverse_syllable_extract(text2)
    while count == True:
        if i < min(len(syll1), len(syll2)) and syll1[i] == syll2[i]:
            counter += 1
            i += 1
        else:
            count = False
    return counter


  def next_line_stop_count(start_line, lines):
    sy_lines = []
    for i in lines:
        sy_lines.append(rev_syllable_stop_count(start_line, i))
    choice = sy_lines[0]
    count = 0
    for i in range(len(sy_lines)):
        if sy_lines[i] > choice:
            choice = sy_lines[i]
    return lines[sy_lines.index(choice)]


  start_line = line_generator(Vocabulary)
  done = False
  while not done:
    if 'ERROR FINDING CORRECT SYLLABLE WORD' in start_line:
      start_line = line_generator(Vocabulary)
    else:
      done = True
  
  all_other_lines = [line_generator(Vocabulary) for i in range(ngeneratelines)]
  rap = [start_line]

  for n, line in enumerate(all_other_lines):
    done = False
    while not done:
      if 'ERROR FINDING CORRECT SYLLABLE WORD' in line:
        line = line_generator(Vocabulary)
        all_other_lines[n] = line
      else:
        done = True
    
  for i in range (nlines):
    if i % 2 == 1:
      next_line = next_line_stop_count(rap[len(rap) - 1], all_other_lines)
    else:
      next_line = random.choice(all_other_lines)
    all_other_lines.remove(next_line)
    rap.append(next_line)
  

  def __decode_and_write(file__, path_):
    data = file__.decoded_content
    data = data.decode('utf-8')[1:]
    data = data.splitlines()
    data_rows = []
    for count, word in enumerate(data):
        if count>0:
            data_rows.append(word.split(','))
    data = pd.DataFrame(data_rows)
    data = data.to_numpy()
    return data
  

  g = Github(getpass("Enter your PAT key ")) # Enter your PAT Key.
  username = "MikeMNelhams"
  main_branch_bool = input("Main Branch: Yes or No? ")
  yes_synonyms = ["yes", "y", "yh", "1", "true"]
  if main_branch_bool.lower() in yes_synonyms: 
      branch = "master" 
  else: 
      branch = "PROTOTYPE"
  user = g.get_user(username)
  r_proj_clone = 0
  for repo in g.get_user().get_repos():
      if repo.name == "ai-group-project-Team-JMJM":
          r_proj_clone = repo
          break
      # To see all the available attributes and methods
      print(dir(repo))
    
  if not r_proj_clone:
      print("ai-group-project-Team-JMJM not found")
      sys.exit()

  # Split the long string into a list of lines, then split by words, then put into a csv, then to numpy arr
  contents = r_proj_clone.get_contents("RapLyrics/Other", ref=branch)
  for file_ in contents:
    path = file_.path
    if str(path) == "RapLyrics/Other/censors2.csv":
        censors = __decode_and_write(file_, path)
    else: 
        pass

  for n, line in enumerate(rap):
  
    # Censor the rap
    line = line.split()
    for count in range(len(line)):
      for i in range(len(censors[0:])):
          line[count] = line[count].replace(str(censors[i, 0]), str(censors[i, 1]))

    rap[n] = ' '.join(line)

  return rap

In [55]:
rap()

How many lines would you like the rap to be? 10
How many lines should be generated to choose from? 500
How many syllables per line? 16
Enter your PAT key ··········
Main Branch: Yes or No? y


['Gay in between the basic tvs in showers with the all the truth big poppa',
 'Ginger ale the notorious woo notorious big howl now tell me hmmm',
 'Left the boss of rap killin wives and platinum chanel cologne then',
 'Swallow man you see big i had to give you hear me die for some',
 'Silk robes time to show me roam and dont stop i was forced to do when',
 'Phone bring back to turn green or do turn that blow d**ks scream bam bam bam',
 'Benzs givin ends to push cmon if they hit the strong who ya all',
 'Buy me i act funny how you ninjas i was a part of frickin up',
 'Hesitation in bridgeport new york for ninjaz brainless guns i turn',
 'Pelvis you go ninjaz wanna creep up louder yea though i only ninja aint had',
 'Deal go on youre dead wrong the two glock sixteen shots man what im a']

In [56]:
# Now takes longer to load due to iterating through words until finding word with the right number of syllables
# Also takes longer due to replacing lines which contain errors
# Requires you to re-enter PATKEY and main branch stuff, which is slightly annoying