First, we install the lyricsgenius package, a Python client for the Genius API:

In [1]:
!pip install lyricsgenius



We import the libraries and set a path for our input file of artists:

In [1]:
from lyricsgenius import Genius
import json
import csv
import multiprocessing
import queue
import logging

# OS agnostic
import os 
# Path template for the artist CSV pages; the %s placeholder is filled in
# with the page number when the files are opened in get_artists.
CSV_PATH = os.path.join(os.path.curdir, 'artists', '10000-MTV-Music-Artists-page-%s.csv')

Now we load the artist dataset. It consists of the top 10,000 artists listed on MTV, stored as paged CSV files:

We set up a lyricsgenius token, and use the API to pull the lyrics data for each artist in the dataset:

In [None]:
# Genius setup
# SECURITY: an API token should not be committed to source. Supply it through
# the GENIUS_ACCESS_TOKEN environment variable instead. The literal below is
# kept only as a fallback so existing runs keep working; it should be revoked
# and removed.
token = os.environ.get("GENIUS_ACCESS_TOKEN") or "EBufquOcw_ts4Y4V7yiddUNyUakTdqCpnMZhiI3XtAScWOntEom8Hj4T87gAV_cA"
genius = Genius(token, retries=2)

# Client options: quiet output, strip section headers from the lyrics,
# skip results that are not songs, and exclude remix / live versions.
genius.verbose = False
genius.remove_section_headers = True
genius.skip_non_songs = True
genius.excluded_terms = ["(Remix)", "(Live)"]

# Multiprocessing: two worker processes per reported CPU
process_number = multiprocessing.cpu_count() * 2
# NOTE: DEBUG level makes multiprocessing log every proxy/worker event to stderr
multiprocessing.log_to_stderr().setLevel(logging.DEBUG)

# Data management
# Manager-backed list so rows appended by worker processes are visible here
final_ = multiprocessing.Manager().list()

# Artists that already have saved rows; get_artists skips these
checked_artists = set()

# Pull out artists
def get_artists(queue, pages=range(1, 5)):
    """
    Put every artist that is not in checked_artists onto the given queue.

    queue: object with a put() method (a queue.Queue or a manager queue).
           NOTE: this parameter name shadows the imported queue module
           inside the function; it is kept so existing callers still work.
    pages: page numbers substituted into CSV_PATH (defaults to pages 1-4)
    """
    for page in pages:
        path = CSV_PATH % str(page)
        with open(path, encoding="UTF-8", newline="") as csvfile:
            top_artists = csv.reader(csvfile)

            # Skip header; the default keeps an empty file from raising StopIteration
            next(top_artists, None)
            for row in top_artists:
                # csv.reader yields an empty list for blank lines
                if not row:
                    continue
                artist = row[0]
                # Skip this artist if we already found the data
                if artist not in checked_artists:
                    queue.put(artist)
                      


# File management
def write_to_csv(data, columns=("artist", "song", "data"), file_name="song_data.csv"):
    """
    Write the rows to ./data/<file_name>, replacing any existing file.

    data: list of dictionaries {artist, song, data}
    columns: header names (and dictionary keys) in output order
    file_name: name of the CSV file inside the ./data directory
    """
    data_dir = os.path.join(os.path.curdir, 'data')
    # Create the output directory on first use instead of failing in open()
    os.makedirs(data_dir, exist_ok=True)
    csv_path = os.path.join(data_dir, file_name)

    # newline='' is what the csv module expects (avoids blank lines between
    # rows on Windows); explicit UTF-8 so non-ASCII names are written
    # regardless of the platform default encoding.
    with open(csv_path, 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=list(columns))

        # writing headers (field names)
        writer.writeheader()

        # writing data rows
        writer.writerows(data)
        

def read_csv(file_name="song_lyrics.csv"):
    """
    Load previously saved rows from ./data/<file_name> so a run can resume.

    Each row is appended to the shared final_ list, and its artist is added
    to checked_artists so get_artists skips it. A missing file is not an
    error (nothing has been saved yet).

    file_name: name of the CSV file inside the ./data directory.
               NOTE(review): this default differs from write_to_csv's default
               ("song_data.csv"); confirm which file is meant to be resumed.
    """
    csv_path = os.path.join(os.path.curdir, 'data', file_name)

    try:
        with open(csv_path, mode='r', encoding='utf-8', newline='') as file:
            for entry in csv.DictReader(file):
                # Record the artist in the module-level set; previously the
                # names went into a local set that was then discarded.
                checked_artists.add(entry["artist"])
                final_.append(entry)

    except FileNotFoundError:
        # No earlier output to resume from
        pass
    
# Processing
def clean_data(data):
    """Return data with every newline and every comma replaced by a space."""
    without_newlines = " ".join(data.split("\n"))
    return " ".join(without_newlines.split(","))

def process_artist(artist):
    """
    Call artist.to_dict() and return an empty string.

    The dictionary is not used; this function always returns "".
    """
    artist.to_dict()
    return ""

def process_song(song):
    """Return the song's lyrics after passing them through clean_data."""
    return clean_data(song.lyrics)
                           
def build_entry(artist, song, data, columns=("artist", "song", "data")):
    """
    Build one output row as a dictionary.

    artist, song, data: values for the row, in that order
    columns: keys paired positionally with (artist, song, data)

    Returns a dictionary, e.g. {"artist": ..., "song": ..., "data": ...}.
    Previously the entry was built (wrapped in a list) but never returned,
    so callers received None.
    """
    return dict(zip(columns, (artist, song, data)))
    

# Run genius search
def search_genius(args):
    """
    Worker loop: take artists off a queue, look each one up on Genius and
    append one row per song to the shared final_ list.

    args: tuple (artist_queue, num), packed into one argument so the function
          can be used with Pool.map. num only labels the log lines.

    The loop ends when None is taken off the queue. Errors are handled per
    artist so a single failed lookup does not stop the worker.
    """
    artist_queue, num = args
    print("[{num}] Starting\n".format(num=num), end='')

    while True:
        artist = artist_queue.get()
        if artist is None:
            print("[{num}] Done\n".format(num=num), end='')
            return

        name = artist.strip()
        print("[{num}] Searching {artist}\n".format(num=num, artist=name), end='')

        try:
            # Pull data for artist from genius
            genius_artist = genius.search_artist(artist, per_page=50, get_full_info=False)
            if genius_artist is None:
                print("[{num}] {artist} not found\n".format(num=num, artist=name), end='')
                continue

            process_artist(genius_artist)

            print("[{num}] {artist} number of songs: {song_num}\n".format(
                num=num, artist=name, song_num=len(genius_artist.songs)), end='')

            for song in genius_artist.songs:
                song_data = process_song(song)

                # Add to final list. Store the song title rather than the
                # Song object, and the artist exactly as read from the queue
                # so it matches the value get_artists compares against.
                final_.append(build_entry(artist, song.title, song_data))

        except Exception as e:
            print("[{num}] Something went wrong with {artist}: {error}\n".format(
                num=num, artist=name, error=e), end='')
    
    
def run(multi_core=False):
    """
    Load earlier results, search Genius for every queued artist, and write
    all collected rows to CSV.

    multi_core: when True, process_number worker processes share one manager
                queue; otherwise the search runs in the current process.

    The rows in final_ are written out in every case, including after a
    KeyboardInterrupt.
    """
    # Load in any previous data
    print("Reading previous")
    read_csv()

    pool = None
    try:
        if multi_core:
            print("Multiprocessing with {process_number} processes".format(process_number=process_number))

            artist_queue = multiprocessing.Manager().Queue()
            get_artists(artist_queue)

            # One None sentinel per worker so each search_genius loop ends
            for _ in range(process_number):
                artist_queue.put(None)

            print(artist_queue.qsize())
            # creating processes
            pool = multiprocessing.Pool(process_number)
            args = [(artist_queue, x) for x in range(process_number)]
            pool.map(search_genius, args)
            # join() is only valid after close() or terminate()
            pool.close()
            pool.join()

        else:
            print("Running single core")
            artist_queue = queue.Queue()
            get_artists(artist_queue)
            artist_queue.put(None)
            print(artist_queue.qsize())
            # search_genius unpacks its argument as (artist_queue, num)
            search_genius((artist_queue, 1))

    except KeyboardInterrupt:
        if pool:
            pool.close()
            pool.terminate()
            pool.join()
        print("KeyboardInterrupt: Writing results")

    finally:
        write_to_csv(list(final_))


# Guard the entry point: with the "spawn" start method each worker re-imports
# the main module, and an unguarded call would start the whole run again.
if __name__ == "__main__":
    run(multi_core=True)




[INFO/SyncManager-1] child process calling self.run()
[INFO/SyncManager-1] created temp directory /var/folders/jc/85vy06cn6b13gcm4rcvz9_fw0000gq/T/pymp-2c1gh2ix
[INFO/SyncManager-1] manager serving at '/var/folders/jc/85vy06cn6b13gcm4rcvz9_fw0000gq/T/pymp-2c1gh2ix/listener-w0xzvj21'
[DEBUG/MainProcess] requesting creation of a shared 'list' object
[DEBUG/SyncManager-1] 'list' callable returned object with id '109298fc8'
[DEBUG/MainProcess] INCREF '109298fc8'


Reading previous
Multiprocessing with 32 processes


[DEBUG/SyncManager-2] INCREF '109298fc8'
[INFO/SyncManager-2] child process calling self.run()
[INFO/SyncManager-2] created temp directory /var/folders/jc/85vy06cn6b13gcm4rcvz9_fw0000gq/T/pymp-021lnn9z
[DEBUG/MainProcess] requesting creation of a shared 'Queue' object
[INFO/SyncManager-2] manager serving at '/var/folders/jc/85vy06cn6b13gcm4rcvz9_fw0000gq/T/pymp-021lnn9z/listener-aptz2jcy'
[DEBUG/SyncManager-2] 'Queue' callable returned object with id '1092c8828'
[DEBUG/MainProcess] INCREF '1092c8828'
[DEBUG/MainProcess] thread 'MainThread' does not own a connection
[DEBUG/MainProcess] making connection to manager
[DEBUG/SyncManager-2] starting server thread to service 'MainProcess'
[DEBUG/MainProcess] created semlock with handle 64
[DEBUG/MainProcess] created semlock with handle 65
[DEBUG/MainProcess] created semlock with handle 68
[DEBUG/MainProcess] created semlock with handle 69


8376


[DEBUG/MainProcess] added worker
[DEBUG/MainProcess] added worker
[DEBUG/ForkPoolWorker-3] INCREF '109298fc8'[DEBUG/MainProcess] added worker

[DEBUG/ForkPoolWorker-4] INCREF '109298fc8'[DEBUG/MainProcess] added worker
[DEBUG/ForkPoolWorker-3] INCREF '1092c8828'

[DEBUG/MainProcess] added worker
[INFO/ForkPoolWorker-3] child process calling self.run()[DEBUG/ForkPoolWorker-4] INCREF '1092c8828'[DEBUG/ForkPoolWorker-5] INCREF '109298fc8'[DEBUG/MainProcess] added worker



[DEBUG/MainProcess] added worker
[INFO/ForkPoolWorker-4] child process calling self.run()
[DEBUG/ForkPoolWorker-5] INCREF '1092c8828'[DEBUG/MainProcess] added worker
[DEBUG/ForkPoolWorker-6] INCREF '109298fc8'

[INFO/ForkPoolWorker-5] child process calling self.run()[DEBUG/MainProcess] added worker

[DEBUG/ForkPoolWorker-6] INCREF '1092c8828'[DEBUG/ForkPoolWorker-7] INCREF '109298fc8'[DEBUG/MainProcess] added worker

[INFO/ForkPoolWorker-6] child process calling self.run()
[DEBUG/MainProcess] added worker

[DEBUG/ForkPo

[0] Starting



[DEBUG/ForkPoolWorker-21] INCREF '1092c8828'

[1] Starting


[DEBUG/ForkPoolWorker-23] INCREF '1092c8828'
[DEBUG/ForkPoolWorker-22] INCREF '1092c8828'
[DEBUG/ForkPoolWorker-25] INCREF '1092c8828'
[DEBUG/ForkPoolWorker-24] INCREF '1092c8828'

[2] Starting


[DEBUG/ForkPoolWorker-27] worker exiting after 0 tasks

[3] Starting


[DEBUG/ForkPoolWorker-31] INCREF '1092c8828'


[5] Starting



[DEBUG/ForkPoolWorker-29] worker exiting after 0 tasks


[4] Starting


[DEBUG/ForkPoolWorker-34] INCREF '1092c8828'[DEBUG/ForkPoolWorker-28] worker exiting after 0 tasks
[DEBUG/ForkPoolWorker-30] worker exiting after 0 tasks

[6] Starting


[DEBUG/ForkPoolWorker-32] worker exiting after 0 tasks[DEBUG/ForkPoolWorker-26] worker exiting after 0 tasks


[DEBUG/ForkPoolWorker-3] thread 'MainThread' does not own a connection

[7] Starting




[DEBUG/ForkPoolWorker-4] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-33] INCREF '1092c8828'

[9] Starting






[12] Starting
[8] Starting




[DEBUG/ForkPoolWorker-5] thread 'MainThread' does not own a connection


[11] Starting


[DEBUG/ForkPoolWorker-6] thread 'MainThread' does not own a connection

[10] Starting


[DEBUG/ForkPoolWorker-8] thread 'MainThread' does not own a connection


[13] Starting


[DEBUG/ForkPoolWorker-7] thread 'MainThread' does not own a connection




[14] Starting


[DEBUG/ForkPoolWorker-9] thread 'MainThread' does not own a connection



[16] Starting
[15] Starting





[17] Starting


[DEBUG/ForkPoolWorker-10] thread 'MainThread' does not own a connection

[20] Starting
[18] Starting


[DEBUG/ForkPoolWorker-12] thread 'MainThread' does not own a connection



[19] Starting


[DEBUG/ForkPoolWorker-15] thread 'MainThread' does not own a connection

[22] Starting


[INFO/ForkPoolWorker-27] process shutting down[DEBUG/ForkPoolWorker-11] thread 'MainThread' does not own a connection

[21] Starting





[28] Starting



[DEBUG/ForkPoolWorker-13] thread 'MainThread' does not own a connection
[DEBUG/ForkPoolWorker-14] thread 'MainThread' does not own a connection[INFO/ForkPoolWorker-29] process shutting down[DEBUG/ForkPoolWorker-16] thread 'MainThread' does not own a connection[INFO/ForkPoolWorker-28] process shutting down
[INFO/ForkPoolWorker-34] child process calling self.run()[INFO/ForkPoolWorker-30] process shutting down
[DEBUG/ForkPoolWorker-17] thread 'MainThread' does not own a connection[INFO/ForkPoolWorker-32] process shutting down[INFO/ForkPoolWorker-26] process shutting down[DEBUG/ForkPoolWorker-19] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-18] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-3] making connection to manager[DEBUG/ForkPoolWorker-20] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-21] thread 'MainThread' does not own a connection
[DEBUG/ForkPoolWorker-23] thread 'MainThread' does not own a connection[DEBUG/ForkPoolW

[30] Starting


[DEBUG/ForkPoolWorker-22] thread 'MainThread' does not own a connection
[DEBUG/ForkPoolWorker-25] thread 'MainThread' does not own a connection

[DEBUG/ForkPoolWorker-24] thread 'MainThread' does not own a connection[DEBUG/ForkPoolWorker-5] making connection to manager
[DEBUG/ForkPoolWorker-8] making connection to manager[DEBUG/ForkPoolWorker-6] making connection to manager[DEBUG/ForkPoolWorker-31] thread 'MainThread' does not own a connection



[DEBUG/ForkPoolWorker-7] making connection to manager



[DEBUG/ForkPoolWorker-9] making connection to manager




[DEBUG/ForkPoolWorker-10] making connection to manager


[DEBUG/ForkPoolWorker-12] making connection to manager[DEBUG/ForkPoolWorker-33] thread 'MainThread' does not own a connection

[DEBUG/ForkPoolWorker-15] making connection to manager
[DEBUG/ForkPoolWorker-11] making connection to manager[DEBUG/ForkPoolWorker-27] running all "atexit" finalizers with priority >= 0

[DEBUG/ForkPoolWorker-13] making connection to manager
[DEBUG/F

[31] Starting
[0] Searching Adele


[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-4'






[DEBUG/ForkPoolWorker-34] thread 'MainThread' does not own a connection

[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-5'[DEBUG/ForkPoolWorker-27] DECREF '109298fc8'

[1] Searching Joey + Rory


[DEBUG/ForkPoolWorker-29] DECREF '109298fc8'
[DEBUG/ForkPoolWorker-34] making connection to manager[DEBUG/ForkPoolWorker-28] DECREF '109298fc8'
[DEBUG/ForkPoolWorker-32] DECREF '109298fc8'[DEBUG/ForkPoolWorker-30] DECREF '109298fc8'[DEBUG/ForkPoolWorker-26] DECREF '109298fc8'




[2] Searching Draaco Aventura


[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-8'

[DEBUG/ForkPoolWorker-27] running the remaining "atexit" finalizers
[DEBUG/ForkPoolWorker-29] running the remaining "atexit" finalizers

[DEBUG/ForkPoolWorker-28] running the remaining "atexit" finalizers[DEBUG/ForkPoolWorker-30] running the remaining "atexit" finalizers
[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-6'[DEBUG/ForkPoolWorker-32] running the remaining "atexit" finalizers[DEBUG/ForkPoolWorker-26] running the remaining "atexit" finalizers

[5] Searching Justin Bieber


[INFO/ForkPoolWorker-29] process exiting with exitcode 0
[INFO/ForkPoolWorker-27] process exiting with exitcode 0




[INFO/ForkPoolWorker-30] process exiting with exitcode 0
[INFO/ForkPoolWorker-28] process exiting with exitcode 0

[3] Searching Peer van Mladen


[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-7'[INFO/ForkPoolWorker-32] process exiting with exitcode 0[INFO/ForkPoolWorker-26] process exiting with exitcode 0




[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-9'

[4] Searching Chris Janson





[6] Searching One Direction


[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-10'
[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-12'

[7] Searching Drake



[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-15'

[9] Searching Carrie Underwood



[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-13'

[12] Searching Ed Sheeran



[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-11'

[10] Searching Taylor Swift





[8] Searching SayWeCanFly


[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-14'
[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-17'

[11] Searching Selena Gomez



[DEBUG/MainProcess] cleaning up worker 29
[DEBUG/MainProcess] cleaning up worker 27


[14] Searching Chris Brown


[DEBUG/MainProcess] cleaning up worker 26
[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-19'[DEBUG/MainProcess] cleaning up worker 25

[DEBUG/MainProcess] cleaning up worker 24
[DEBUG/MainProcess] cleaning up worker 23


[16] Searching Nicki Minaj


[DEBUG/MainProcess] cleaning up worker 12
[DEBUG/MainProcess] cleaning up worker 10
[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-18'[DEBUG/MainProcess] cleaning up worker 9
[DEBUG/MainProcess] cleaning up worker 8

[DEBUG/MainProcess] cleaning up worker 7
[DEBUG/MainProcess] cleaning up worker 6
[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-16'[DEBUG/MainProcess] cleaning up worker 5
[DEBUG/MainProcess] cleaning up worker 4


[15] Searching Fifth Harmony


[DEBUG/MainProcess] cleaning up worker 3
[DEBUG/MainProcess] cleaning up worker 2

[DEBUG/MainProcess] cleaning up worker 1
[DEBUG/MainProcess] cleaning up worker 0
[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-20'

[13] Searching Thomas Rhett


[DEBUG/MainProcess] added worker

[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-23'

[17] Searching Eminem



[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-21'[DEBUG/MainProcess] added worker


[20] Searching Beyoncé





[18] Searching Meghan Trainor


[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-25'[DEBUG/MainProcess] added worker

[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-22'[DEBUG/MainProcess] added worker


[22] Searching Twenty One Pilots



[DEBUG/ForkPoolWorker-35] INCREF '109298fc8'[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-31'

[19] Searching Fetty Wap



[DEBUG/MainProcess] added worker

[DEBUG/ForkPoolWorker-35] INCREF '1092c8828'

[28] Searching 5 Seconds Of Summer


[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-24'
[DEBUG/ForkPoolWorker-36] INCREF '109298fc8'[DEBUG/MainProcess] added worker

[INFO/ForkPoolWorker-35] child process calling self.run()
[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-33'


[21] Searching Jason Aldean



[DEBUG/MainProcess] added worker
[DEBUG/ForkPoolWorker-36] INCREF '1092c8828'
[DEBUG/ForkPoolWorker-37] INCREF '109298fc8'[DEBUG/SyncManager-2] starting server thread to service 'ForkPoolWorker-34'

[30] Searching Luke Bryan


[INFO/ForkPoolWorker-36] child process calling self.run()


[DEBUG/MainProcess] added worker


[31] Searching Blake Shelton


[DEBUG/SyncManager-2] got EOF -- exiting thread serving 'ForkPoolWorker-3'[DEBUG/ForkPoolWorker-37] INCREF '1092c8828'
[DEBUG/ForkPoolWorker-38] INCREF '109298fc8'[DEBUG/MainProcess] added worker
[DEBUG/SyncManager-2] got EOF -- exiting thread serving 'ForkPoolWorker-4'


[INFO/ForkPoolWorker-37] child process calling self.run()[DEBUG/SyncManager-2] got EOF -- exiting thread serving 'ForkPoolWorker-5'[DEBUG/ForkPoolWorker-38] INCREF '1092c8828'

[DEBUG/SyncManager-2] got EOF -- exiting thread serving 'ForkPoolWorker-8'
[DEBUG/MainProcess] added worker
[DEBUG/ForkPoolWorker-39] INCREF '109298fc8'
[INFO/ForkPoolWorker-38] child process calling self.run()
[DEBUG/SyncManager-2] got EOF -- exiting thread serving 'ForkPoolWorker-6'
[DEBUG/MainProcess] added worker

[DEBUG/ForkPoolWorker-39] INCREF '1092c8828'[DEBUG/SyncManager-2] got EOF -- exiting thread serving 'ForkPoolWorker-7'
[DEBUG/ForkPoolWorker-40] INCREF '109298fc8'[INFO/ForkPoolWorker-39] child process calling self.run()


[DEBUG/