Using [this](https://www.promptcloud.com/blog/scraping-song-lyrics-using-python-from-genius/) as a reference

In [17]:
#copying grequests module

from functools import partial
import traceback
try:
    import gevent
    from gevent import monkey as curious_george
    from gevent.pool import Pool
except ImportError:
    raise RuntimeError('Gevent is required for grequests.')

# Monkey-patch.
curious_george.patch_all(thread=False, select=False)

from requests import Session


__all__ = (
    'map', 'imap',
    'get', 'options', 'head', 'post', 'put', 'patch', 'delete', 'request'
)


class AsyncRequest(object):
    """ Asynchronous request.

    Accept same parameters as ``Session.request`` and some additional:

    :param session: Session which will do request
    :param callback: Callback called on response.
                     Same as passing ``hooks={'response': callback}``
    """
    def __init__(self, method, url, **kwargs):
        #: Request method
        self.method = method
        #: URL to request
        self.url = url
        #: Associated ``Session``
        self.session = kwargs.pop('session', None)
        if self.session is None:
            self.session = Session()

        callback = kwargs.pop('callback', None)
        if callback:
            kwargs['hooks'] = {'response': callback}

        #: The rest arguments for ``Session.request``
        self.kwargs = kwargs
        #: Resulting ``Response``
        self.response = None

    def send(self, **kwargs):
        """
        Prepares request based on parameter passed to constructor and optional ``kwargs```.
        Then sends request and saves response to :attr:`response`

        :returns: ``Response``
        """
        merged_kwargs = {}
        merged_kwargs.update(self.kwargs)
        merged_kwargs.update(kwargs)
        try:
            self.response = self.session.request(self.method,
                                                self.url, **merged_kwargs)
        except Exception as e:
            self.exception = e
            self.traceback = traceback.format_exc()
        return self


def send(r, pool=None, stream=False):
    """Sends the request object using the specified pool. If a pool isn't
    specified this method blocks. Pools are useful because you can specify size
    and can hence limit concurrency."""
    if pool is not None:
        return pool.spawn(r.send, stream=stream)

    return gevent.spawn(r.send, stream=stream)


# Shortcuts for creating AsyncRequest with appropriate HTTP method
get = partial(AsyncRequest, 'GET')
options = partial(AsyncRequest, 'OPTIONS')
head = partial(AsyncRequest, 'HEAD')
post = partial(AsyncRequest, 'POST')
put = partial(AsyncRequest, 'PUT')
patch = partial(AsyncRequest, 'PATCH')
delete = partial(AsyncRequest, 'DELETE')

# synonym
def request(method, url, **kwargs):
    return AsyncRequest(method, url, **kwargs)


def map(requests, stream=False, size=None, exception_handler=None, gtimeout=None):
    """Concurrently converts a list of Requests to Responses.

    :param requests: a collection of Request objects.
    :param stream: If True, the content will not be downloaded immediately.
    :param size: Specifies the number of requests to make at a time. If None, no throttling occurs.
    :param exception_handler: Callback function, called when exception occured. Params: Request, Exception
    :param gtimeout: Gevent joinall timeout in seconds. (Note: unrelated to requests timeout)
    """

    requests = list(requests)

    pool = Pool(size) if size else None
    jobs = [send(r, pool, stream=stream) for r in requests]
    gevent.joinall(jobs, timeout=gtimeout)

    ret = []

    for request in requests:
        if request.response is not None:
            ret.append(request.response)
        elif exception_handler and hasattr(request, 'exception'):
            ret.append(exception_handler(request, request.exception))
        else:
            ret.append(None)

    return ret


def imap(requests, stream=False, size=2, exception_handler=None):
    """Concurrently converts a generator object of Requests to
    a generator of Responses.

    :param requests: a generator of Request objects.
    :param stream: If True, the content will not be downloaded immediately.
    :param size: Specifies the number of requests to make at a time. default is 2
    :param exception_handler: Callback function, called when exception occurred. Params: Request, Exception
    """

    pool = Pool(size)

    def send(r):
        return r.send(stream=stream)

    for request in pool.imap_unordered(send, requests):
        if request.response is not None:
            yield request.response
        elif exception_handler:
            ex_result = exception_handler(request, request.exception)
            if ex_result is not None:
                yield ex_result

    pool.join()


In [18]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

page = requests.get('https://genius.com/Lana-del-rey-looking-for-america-lyrics')
content = page.content
soup = BeautifulSoup(content, 'html.parser')

lyrics = []
for div in soup.findAll('div', attrs = {'class': 'lyrics'}):
    lyrics.append(div.text.strip().split("\n"))
    
lyrics

[['[Verse 1]',
  'Took a trip to San Francisco',
  'All\u200a our friends said we would jive',
  "Didn't\u200a work, so I left for Fresno",
  'It was quite a scenic drive',
  'Pulled over to watch the children in the park',
  'We\u200a used to only worry for them after dark',
  '',
  '[Chorus]',
  'I’m\u200a still looking for my own version of America',
  'One without the gun, where the flag can freely fly',
  'No\u200a bombs in the sky, only fireworks when you and I collide',
  "It's just a dream I had in mind",
  "It's just a dream I had in mind",
  'It’s just a dream I had in mind',
  '',
  '[Verse 2]',
  'I\u200a flew back to New York City',
  'Missed that Hudson River line',
  'Took a train up to Lake Placid',
  "That's another place and time, where",
  'I used to go to drive-ins and listen to the blues',
  'So many things that I think twice about before I do now',
  '',
  '[Chorus]',
  "I'm still looking for my own version of America",
  'One without the gun, where the flag can f

In [19]:
df = pd.read_pickle('./data/df_v2.pkl')
print(df.shape)
df.head()

(215383, 17)


Unnamed: 0,id,songid,artist,track,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0,5PS5dpaLogPzYU9hWiWyZb,Karkkiautomaatti,Tanssi vaan,0.487,0.678,9.0,-7.78,1.0,0.0495,0.0013,0.0551,0.0846,0.87,149.94,157307.0,4.0
1,1,41RpZW2lxAdnqDd2nMBzLQ,Hudson Mohawke,No One Could Ever,0.662,0.823,4.0,-1.711,0.0,0.0662,4.5e-05,0.952,0.343,0.621,177.745,138960.0,4.0
2,2,2poHURuOfVNbzZdivAwtOH,Der Mystic,Tangle Of Aspens,0.43,0.96,5.0,-7.741,0.0,0.0431,0.000725,0.925,0.123,0.225,140.001,514290.0,4.0
3,3,1jg9hZnReygpBvV2axGuPy,Sasha / Turbulence,We Have Got Love,0.663,0.677,3.0,-4.117,0.0,0.0755,0.245,0.0,0.35,0.879,152.085,212413.0,4.0
4,4,3GsS8jzoixpCnp4jDWCEvb,Kris Kross,2 Da Beat Ch'yall,0.859,0.741,11.0,-12.329,0.0,0.271,0.00276,0.0,0.325,0.529,98.082,221200.0,4.0


In [20]:
df.tail(10)

Unnamed: 0,id,songid,artist,track,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
215373,120816,4MpVEowUgu1ECfv1mopU4b,Acumen Nation,No Arms No Legs,0.493,0.981,5.0,-3.968,0.0,0.0934,0.000241,0.00939,0.412,0.257,105.024,241427.0,4.0
215374,120817,4Bc0uYCco6F581DmnS4WU2,Humble Pie,As Safe As Yesterday Is,0.299,0.626,2.0,-13.3,1.0,0.0638,0.00329,0.00435,0.0271,0.508,148.787,366653.0,4.0
215375,120818,7wOCIeCfftf5er9e1yQtJK,John Mellencamp,Just Like You,0.652,0.87,9.0,-4.016,0.0,0.0352,0.0128,0.00387,0.147,0.868,131.739,242667.0,4.0
215376,120819,52mdBm9Rtwo1aJlbLnebtt,Busdriver,The Troglodyte Wins,0.72,0.924,6.0,-3.626,0.0,0.323,0.0565,0.0,0.0902,0.923,81.531,202107.0,4.0
215377,120820,7swU6WtOprmirsEpqGHUXX,Rachael Sage,Marmelade,0.598,0.539,8.0,-11.817,1.0,0.137,0.199,0.0,0.109,0.525,160.812,298853.0,4.0
215378,120821,3A0DxMXoPSKVHndF31QVcH,Ninja Academy,Your Kung Fu Sucks,0.442,0.864,0.0,-8.831,1.0,0.0365,0.00507,0.872,0.461,0.736,115.658,86120.0,4.0
215379,120822,4WchAK5q3I2vXP83aNDY9c,Liz Phair,Perfect World,0.607,0.239,9.0,-11.891,1.0,0.0306,0.694,0.00165,0.112,0.415,113.247,135400.0,4.0
215380,120823,6uIOoA63P2A9Lp4H4AlE4A,Boukman Eksperyans,Kouman Sa Ta Ye,0.715,0.738,7.0,-10.676,1.0,0.0485,0.104,0.424,0.104,0.883,103.018,266467.0,3.0
215381,120824,5V5hNZlkMVxVV7kSAPERjd,Eric B & Rakim,No Omega,0.786,0.842,7.0,-7.494,1.0,0.14,0.0161,0.0643,0.123,0.605,116.553,322027.0,4.0
215382,120825,0yqJqtmdWivm9u6ujtzc5V,Supreme Majesty,After Midnight,0.123,0.946,0.0,-4.643,1.0,0.0921,1.8e-05,1.3e-05,0.0559,0.376,175.506,221107.0,4.0


In [21]:
df.head(20)

Unnamed: 0,id,songid,artist,track,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,0,5PS5dpaLogPzYU9hWiWyZb,Karkkiautomaatti,Tanssi vaan,0.487,0.678,9.0,-7.78,1.0,0.0495,0.0013,0.0551,0.0846,0.87,149.94,157307.0,4.0
1,1,41RpZW2lxAdnqDd2nMBzLQ,Hudson Mohawke,No One Could Ever,0.662,0.823,4.0,-1.711,0.0,0.0662,4.5e-05,0.952,0.343,0.621,177.745,138960.0,4.0
2,2,2poHURuOfVNbzZdivAwtOH,Der Mystic,Tangle Of Aspens,0.43,0.96,5.0,-7.741,0.0,0.0431,0.000725,0.925,0.123,0.225,140.001,514290.0,4.0
3,3,1jg9hZnReygpBvV2axGuPy,Sasha / Turbulence,We Have Got Love,0.663,0.677,3.0,-4.117,0.0,0.0755,0.245,0.0,0.35,0.879,152.085,212413.0,4.0
4,4,3GsS8jzoixpCnp4jDWCEvb,Kris Kross,2 Da Beat Ch'yall,0.859,0.741,11.0,-12.329,0.0,0.271,0.00276,0.0,0.325,0.529,98.082,221200.0,4.0
5,5,0kq75szR7uDEYrZkT2c4Ry,Jorge Negrete,El hijo del pueblo,0.361,0.483,7.0,-6.875,1.0,0.0287,0.348,3e-06,0.177,0.682,94.538,173573.0,4.0
6,6,2HyFpkX9J7vv3OZNDaraHZ,Tiger Lou,Pilots,0.533,0.302,10.0,-10.308,1.0,0.0307,0.34,0.179,0.111,0.294,134.959,318467.0,3.0
7,7,7DHl34zUQues23s4iHo2bc,Waldemar Bastos,N Gana,0.684,0.276,2.0,-14.881,1.0,0.0389,0.861,0.000141,0.126,0.817,110.968,273200.0,4.0
8,8,2djsaJG387f1ahzqWOiRbD,Lena Philipsson,006,0.675,0.77,9.0,-8.802,0.0,0.031,0.00157,0.00393,0.458,0.87,122.323,262333.0,4.0
9,9,5SM86TB7dU5n9Y23wLgcBY,Shawn Colvin,(Looking For) The Heart Of Saturday,0.582,0.0846,11.0,-13.661,1.0,0.0354,0.847,0.0,0.696,0.396,99.394,214907.0,4.0


In [22]:
artist = df.loc[9, 'artist']
title = df.loc[9, 'track']
if '(' in title:
    title = title.replace('(', '')
if ')' in title:
    title = title.replace(')', '')
if '&' in title:
    title = title.replace('&', 'and')
if ' / ' in title:
    title = title.replace(' / ', ' ')
title = title.replace(' ', '-')
    
print(artist)
print(title)

Shawn Colvin
Looking-For-The-Heart-Of-Saturday


In [23]:
artist = artist.replace(' ', '-')
artist

'Shawn-Colvin'

In [24]:
artist + '-' + title

'Shawn-Colvin-Looking-For-The-Heart-Of-Saturday'

In [25]:
artist = df.loc[9, 'artist']
title = df.loc[9, 'track']
string = artist + '-' + title + '-lyrics'
if '(' in string:
    string = string.replace('(', '')
if ')' in string:
    string = string.replace(')', '')
if '&' in string:
    string = string.replace('&', 'and')
if ' / ' in string:
    string = string.replace(' / ', ' ')
string = string.replace(' ', '-')
string

'Shawn-Colvin-Looking-For-The-Heart-Of-Saturday-lyrics'

In [26]:
song_lyrics = []

for i in range(5):

    artist = df.loc[i, 'artist']
    title = df.loc[i, 'track']
    string = artist + '-' + title + '-Lyrics'
    if '(' in string:
        string = string.replace('(', '')
    if ')' in string:
        string = string.replace(')', '')
    if '&' in string:
        string = string.replace('&', 'and')
    if ' / ' in string:
        string = string.replace(' / ', ' ')
    if "'" in string:
        string = string.replace("'", '')
    string = string.replace(' ', '-')
    page = requests.get(f'https://genius.com/{string}')
    content = page.content
    soup = BeautifulSoup(content, 'html.parser')

    lyrics = []
    for div in soup.findAll('div', attrs = {'class': 'lyrics'}):
        lyrics.append(div.text.strip().split("\n"))
    
    song_lyrics.append(lyrics)
    
song_lyrics[:5]

[[],
 [['(instrumental)']],
 [],
 [],
 [['(Sample)',
   "Break 'em off somethin'",
   '',
   '(Intro: Daddy Mac)',
   "Once again it's on!",
   'Kris Kross, Totally Krossed Out',
   'And with no doubt, we came here to',
   '',
   '(Hook: Mac Daddy and Daddy Mac)',
   "Ah, Rock rock y'all (Rock y'all)",
   "2 Da Beat Ch'yall (Beat Ch'yall)",
   "Boom, yeah it don't stop!",
   "Rock rock y'all (Rock y'all)",
   "2 Da Beat Ch'yall (Beat Ch'yall)",
   "Boom, yeah it don't stop!",
   '',
   '(Verse 1: Mac Daddy)',
   'And then they said we would pop, but we just pop the dough, they wish that they had a "Jump"',
   'That they could pop, move your set, move your neck and lay in your trunk',
   "(Rough!) if it ain't (Tough!), nigga it ain't me",
   "My lyrics'll make D-E-A-D A-B-C, now who's a wanna-be?",
   'Definitely not this little nappy-headed kid from Atlanta',
   'Just scrap more and then we drop a can, uh',
   'With a lyrical thrust as I bust these rough rhymes',
   "Whoever dissed the

In [27]:
string

'Kris-Kross-2-Da-Beat-Chyall-Lyrics'

In [28]:
artist = df.loc[4, 'artist']
title = df.loc[4, 'track']
string = artist + '-' + title + '-Lyrics'
if '(' in string:
    string = string.replace('(', '')
if ')' in string:
    string = string.replace(')', '')
if '&' in string:
    string = string.replace('&', 'and')
if ' / ' in string:
    string = string.replace(' / ', ' ')
if "'" in string:
    string = string.replace("'", '')
string = string.replace(' ', '-')
page = requests.get(f'https://genius.com/{string}')
content = page.content
soup = BeautifulSoup(content, 'html.parser')

lyrics = []
for div in soup.findAll('div', attrs = {'class': 'lyrics'}):
    lyrics.append(div.text.strip().split("\n"))
    
song_lyrics.append(lyrics)

lyrics

[['(Sample)',
  "Break 'em off somethin'",
  '',
  '(Intro: Daddy Mac)',
  "Once again it's on!",
  'Kris Kross, Totally Krossed Out',
  'And with no doubt, we came here to',
  '',
  '(Hook: Mac Daddy and Daddy Mac)',
  "Ah, Rock rock y'all (Rock y'all)",
  "2 Da Beat Ch'yall (Beat Ch'yall)",
  "Boom, yeah it don't stop!",
  "Rock rock y'all (Rock y'all)",
  "2 Da Beat Ch'yall (Beat Ch'yall)",
  "Boom, yeah it don't stop!",
  '',
  '(Verse 1: Mac Daddy)',
  'And then they said we would pop, but we just pop the dough, they wish that they had a "Jump"',
  'That they could pop, move your set, move your neck and lay in your trunk',
  "(Rough!) if it ain't (Tough!), nigga it ain't me",
  "My lyrics'll make D-E-A-D A-B-C, now who's a wanna-be?",
  'Definitely not this little nappy-headed kid from Atlanta',
  'Just scrap more and then we drop a can, uh',
  'With a lyrical thrust as I bust these rough rhymes',
  "Whoever dissed the Kris can kiss where the sun don't shine",
  "(Uhh!) The double

In [29]:
df.isnull().sum()

id                  0
songid              0
artist              0
track               1
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
duration_ms         0
time_signature      0
dtype: int64

In [30]:
df2 = df.fillna(' ')

links = []

for i in range(len(df)):
    artist = df2.loc[i, 'artist']
    title = df2.loc[i, 'track']
    string = artist + '-' + title + '-Lyrics'
    if '(' in string:
        string = string.replace('(', '')
    if ')' in string:
        string = string.replace(')', '')
    if '&' in string:
        string = string.replace('&', 'and')
    if ' / ' in string:
        string = string.replace(' / ', ' ')
    if "'" in string:
        string = string.replace("'", '')
    string = string.replace(' ', '-')
    url = f'https://www.genius.com/{string}'
    links.append(url)
    
links[:5]

['https://www.genius.com/Karkkiautomaatti-Tanssi-vaan-Lyrics',
 'https://www.genius.com/Hudson-Mohawke-No-One-Could-Ever-Lyrics',
 'https://www.genius.com/Der-Mystic-Tangle-Of-Aspens-Lyrics',
 'https://www.genius.com/Sasha-Turbulence-We-Have-Got-Love-Lyrics',
 'https://www.genius.com/Kris-Kross-2-Da-Beat-Chyall-Lyrics']

In [32]:
from multiprocessing.dummy import Pool
from itertools import chain
class Lyrics:
    '''
    Multi-theaded Indeed Job Listings Crawler
    Usage:
    listings = IndeedJobListings('Data Scientist', 'Seattle, WA')
    descriptions = listings.get_descriptions(pages=10)
    '''
    def __init__(self, link, threads=12):
        self.threads = threads
        self.query_url = link
        self.session = HTMLSession()
    def _get_lyrics_text(self, url):
        page = requests.get(url)
        content = page.content
        soup = BeautifulSoup(content, 'html.parser')
        results = soup.findAll('div', attrs={'class': 'lyrics'})
        lyrics_text = []
        for div in results:
            lyrics_text.append(div.text.strip().split("\n"))
        return lyrics_text
    def get_lyrics(self, pages=1):
        list_urls = [self.query_url] + [f'{self.query_url}&start={x*10}'
                                        for x in range(1, pages)]
        p = Pool(self.threads)
        post_urls = chain(*p.map(self._get_lyrics_text, list_urls))
        lyrics = p.map(self._get_lyrics_text, post_urls)
        return descriptions
# listings = IndeedJobListings('Data Scientist', 'Seattle, WA')
# descriptions = listings.get_descriptions(pages=1)

In [33]:
links[:5]

['https://www.genius.com/Karkkiautomaatti-Tanssi-vaan-Lyrics',
 'https://www.genius.com/Hudson-Mohawke-No-One-Could-Ever-Lyrics',
 'https://www.genius.com/Der-Mystic-Tangle-Of-Aspens-Lyrics',
 'https://www.genius.com/Sasha-Turbulence-We-Have-Got-Love-Lyrics',
 'https://www.genius.com/Kris-Kross-2-Da-Beat-Chyall-Lyrics']

In [34]:
reqs = (get(link) for link in links)
resp = imap(reqs, Pool(10))

In [36]:
# Run this cell

song_lyrics = []

for r in resp:
    
    try:
        soup = BeautifulSoup(r.text, 'lxml')
        results = soup.findAll('div', attrs={'class': 'lyrics'})
        lyrics = []
        for div in results:
            lyrics.append(div.text.strip().split("\n"))

        song_lyrics.append(lyrics)
    except:
        print("whacky!!!")
    
song_lyrics[:5]

FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?

In [None]:
reqs = ()

In [None]:
page = requests.get(f'https://genius.com/{string}')
    content = page.content
    soup = BeautifulSoup(content, 'html.parser')

    lyrics = []
    for div in soup.findAll('div', attrs = {'class': 'lyrics'}):
        lyrics.append(div.text.strip().split("\n"))
    
    song_lyrics.append(lyrics)
    
    stop = timeit.default_timer()
    
    if (i/len(df)*100) < 5:
        expected_time = 'Calculating...'
        
    else:
        time_perc = timeit.default_timer()
        expected_time = np.round(((time_perc-start) / (i/len(df6))) / 60, 2)
        
    print('Current progress:', np.round(i/len(df) * 100, 2), '%')
    print('Current run time:', np.round((stop - start) / 60, 2), 'minutes')
    print('Expected run time:', expected_time, 'minutes')
    
song_lyrics[:5]