#### Convert all the rap lyrics text files into one JSON file

In [9]:
import os
import csv 
import json
import pandas as pd

In [10]:
def write_lyrics_json(lyrics_dir: str, json_path: str) -> None:
    '''
        Read every lyrics text file in a directory and write its paragraphs
        to a JSON Lines file: one JSON object per line, of the form
        {"author": <name derived from the filename>, "content": <paragraph>}.

        The author name is the filename without its "_lyrics.txt" suffix
        (or without its extension when that suffix is absent), with dashes
        and spaces turned into underscores and every underscore-separated
        word capitalised.

        - Input: lyrics_dir: path to the lyrics directory
                json_path: path to the json file to write to
        - Output: None
    '''
    suffix = '_lyrics.txt'
    # characters kept in the output; everything else is dropped
    safe_chars = frozenset(
        'abcdefghijklmnopqrstuvwxyz'
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 '
        '(),.;:?"\'\n'
    )

    with open(json_path, 'w', encoding='utf-8') as out_file:
        # os.listdir returns entries in arbitrary order; sort them so the
        # generated file is reproducible between runs
        for filename in sorted(os.listdir(lyrics_dir)):
            path = os.path.join(lyrics_dir, filename)
            # sub-directories cannot be opened as text files
            if not os.path.isfile(path):
                continue

            # only strip the suffix when it is really there, instead of
            # blindly chopping the last 11 characters
            if filename.endswith(suffix):
                name = filename[:-len(suffix)]
            else:
                name = os.path.splitext(filename)[0]
            # convert dash and space to underscore
            name = name.replace('-', '_').replace(' ', '_')
            # capitalise each word; word[:1] (not word[0]) tolerates the
            # empty words produced by consecutive separators such as "a - b"
            name = '_'.join(
                word[:1].upper() + word[1:] for word in name.split('_')
            )

            with open(path, 'r', encoding='utf-8') as lyrics_file:
                text = lyrics_file.read()

            # paragraphs are separated by a blank line
            for paragraph in text.split('\n\n'):
                # remove trailing and leading spaces and newlines
                paragraph = paragraph.strip(' \n')
                # remove all characters that are not in safe_chars
                paragraph = ''.join(c for c in paragraph if c in safe_chars)
                # skip paragraphs with fewer than two characters left
                if len(paragraph) < 2:
                    continue
                record = {'author': name, 'content': paragraph}
                out_file.write(json.dumps(record) + '\n')

                    
                



In [11]:
# Directory holding the lyrics text files, and the JSON file to generate.
lyrics_path = "./data/junk/lyrics"
json_path = "./data/lyrics.json"
write_lyrics_json(lyrics_dir=lyrics_path, json_path=json_path)

In [12]:
def from_csv_to_json(csv_path: str, json_path: str) -> None:
    '''
        Read the poems csv file and write every paragraph of every poem to
        a JSON Lines file: one JSON object per line, of the form
        {"author": <normalised author name>, "content": <paragraph>}.

        Rows whose 'author' or 'content' cell is not text (for example an
        empty cell, which pandas reads as NaN) are skipped.

        - Input: csv_path: path to the csv file; it must have the columns
                'author' and 'content'
                json_path: path to the json file
        - Output: None
    '''
    # characters kept in the output; everything else is dropped
    safe_chars = frozenset(
        'abcdefghijklmnopqrstuvwxyz'
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 '
        '(),.;:?"\'\n'
    )
    # (old, new) pairs that remove stray spaces around punctuation;
    # they are applied one after another in exactly this order
    spacing_fixes = (
        (' ,', ','), (' .', '.'), (' ;', ';'),
        (' :', ':'), (' ?', '?'), (' !', '!'),
        (" '", "'"), ("' ", "'"),
        (' - ', '-'), (' -', '-'), ('- ', '-'),
        (' (', '('), ('( ', '('),
        (' )', ')'), (') ', ')'),
        (' / ', '/'), (' /', '/'), ('/ ', '/'),
    )
    roman_chars = frozenset('IVX')

    def is_roman_heading(line):
        # True for a line that holds nothing but a roman numeral built
        # from I, V and X (any case), optionally followed by dots
        token = line.strip().rstrip('.').upper()
        return bool(token) and roman_chars.issuperset(token)

    # read the csv file
    df = pd.read_csv(csv_path)
    with open(json_path, 'w', encoding='utf-8') as out_file:
        for author, content in zip(df['author'], df['content']):
            # empty cells come back as NaN (a float), which has no str methods
            if not isinstance(author, str) or not isinstance(content, str):
                continue

            # keep the name without the title, lowercased and trimmed
            author = author.split(',')[0].lower().strip(' ')
            # convert dash and space to underscore
            author = author.replace('-', '_').replace(' ', '_')
            # capitalise each word; word[:1] (not word[0]) tolerates the
            # empty words produced by consecutive separators
            author = '_'.join(
                word[:1].upper() + word[1:] for word in author.split('_')
            )
            # remove dots from the author name
            author = author.replace('.', '')

            # remove \r from the content
            content = content.replace('\r', '')
            # paragraphs are separated by a blank line
            for paragraph in content.split('\n\n'):
                # remove extra spaces around punctuation
                for old, new in spacing_fixes:
                    paragraph = paragraph.replace(old, new)
                # replace & with and
                paragraph = paragraph.replace('&', 'and')
                # drop roman-numeral heading lines; filtering single
                # characters instead would delete every i, v and x from
                # ordinary words ("give" -> "ge")
                paragraph = '\n'.join(
                    line for line in paragraph.split('\n')
                    if not is_roman_heading(line)
                )
                # remove numbers
                paragraph = ''.join(c for c in paragraph if not c.isdigit())
                # remove trailing and leading spaces and newlines
                paragraph = paragraph.strip(' \n')
                # collapse every run of whitespace (newlines included)
                # into a single space
                paragraph = ' '.join(paragraph.split())
                # remove all characters that are not in safe_chars
                paragraph = ''.join(c for c in paragraph if c in safe_chars)
                # skip paragraphs with fewer than two characters left
                if len(paragraph) < 2:
                    continue
                record = {'author': author, 'content': paragraph}
                out_file.write(json.dumps(record) + '\n')



In [13]:
# Poetry csv to read, and the JSON file to generate from it.
csv_path = "./data/junk/poetry/poetry.csv"
json_path = "./data/poems.json"
from_csv_to_json(csv_path=csv_path, json_path=json_path)