In [1]:
import pandas as pd 
import requests
from bs4 import BeautifulSoup, Tag
import re
from multiprocessing.dummy import Pool as ThreadPool 

In [2]:
df = pd.read_csv('dataframe.csv')

In [3]:
# Keep a single row per artist_track so that every song is scraped only once
df_no_dupl = df.drop_duplicates(subset="artist_track", keep="first")

In [4]:
print(df.shape)

(257717, 5)


In [5]:
print(df_no_dupl.shape)

(180961, 5)


In [6]:
def get_text_with_br(tag, result=''):
    """Flatten a BeautifulSoup tag into text while keeping ``<br>`` tags as markup.

    Walks ``tag.contents`` depth-first. String nodes are appended as they are,
    ``<br>`` tags are appended in their serialized form (``str(node)``) and
    every other tag is descended into.

    Args:
        tag: Element whose ``contents`` are traversed.
        result: Text produced so far; the flattened text is appended to it.

    Returns:
        ``result`` followed by the flattened text of ``tag``.
    """
    pieces = [result]
    for node in tag.contents:
        if not isinstance(node, Tag):
            # Plain string node: keep it verbatim.
            pieces.append(node)
        elif node.name == 'br':
            # Keep line breaks as markup so the caller can decide how to map them.
            pieces.append(str(node))
        else:
            # Any other tag: flatten its own contents.
            pieces.append(get_text_with_br(node))
    return ''.join(pieces)

def scrape_info(artist_track, timeout=30):
    """Scrape album, release date, lyrics and comments for one Genius song.

    Args:
        artist_track: Genius URL slug; it is appended to ``https://genius.com/``.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A 4-tuple ``(album, date, lyrics, comments)``.

        * ``album``, ``date`` and ``lyrics`` are strings, each ``"NA"`` when the
          corresponding element is not found. All three are ``"NA"`` when the
          song page cannot be fetched or does not answer with status 200.
        * ``comments`` is a list of
          ``(author, created_at, body_dom_children, artist_track)`` tuples for
          the non-anonymous comments, or ``[]`` when comments are unavailable.
    """
    # Editor boilerplate that ends up in the text of the lyrics container.
    lyrics_boilerplate = "EmbedShare URLCopyEmbedCopyCancelHow to Format Lyrics:Type out all lyrics, even if it’s a chorus that’s repeated throughout the songThe Section Header button breaks up song sections. Highlight the text then click the linkUse Bold and Italics only to distinguish between different singers in the same verse.E.g. “Verse 1: Kanye West, Jay-Z, Both”Capitalize each lineTo move an annotation to different lyrics in the song, use the [...] menu to switch to referent editing mode"

    print(artist_track)
    url = "https://genius.com/" + artist_track
    try:
        # A timeout keeps one stalled connection from blocking a worker forever.
        page = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        return "NA", "NA", "NA", []

    # Check if the genius page exists
    if page.status_code != 200:
        return "NA", "NA", "NA", []

    html = BeautifulSoup(page.text, 'html.parser')  # Take page's html

    # Get album
    album_link = html.find('a', {"href": "#primary-album"})
    album = album_link.get_text() if album_link is not None else "NA"

    # Get date: the last metadata section mentioning "Release" wins
    date = "NA"
    for element in html.find_all('div', {"class": "HeaderMetadata__Section-sc-1p42fnf-7"}):
        section_text = element.get_text()
        if "Release" in section_text:
            date = section_text.replace("Release Date", "")

    # Get lyrics
    lyrics_div = html.find('div', {"id": "annotation-portal-target"})
    if lyrics_div is None:
        result = "NA"
    else:
        result = get_text_with_br(lyrics_div).replace('<br/>', '\n')
        result = result.replace(lyrics_boilerplate, "")

    # Get comments
    # The song id is only needed for the comments endpoint. A page without the
    # meta tag still yields album/date/lyrics instead of raising.
    meta = html.find('meta', {"property": "twitter:app:url:iphone"})
    song_id = ''
    if meta is not None:
        song_id = (meta.get('content') or '').replace('genius://songs/', '')
    if not song_id:
        return album, date, result, []

    url = "https://genius.com/api/songs/" + song_id + "/comments"
    try:
        comments_page = requests.get(url, timeout=timeout).json()
        comments = comments_page['response']['comments']
    except (requests.exceptions.RequestException, ValueError, KeyError, TypeError):
        # Network failure, non-JSON body, or a payload without response.comments.
        return album, date, result, []

    comment_list = []
    for comment in comments:
        try:
            if comment['anonymous_author'] is not None:
                continue  # only comments from named authors are kept
            comment_list.append((
                comment['author']['name'],
                comment['created_at'],
                comment['body']['dom']['children'],
                artist_track,
            ))
        except (KeyError, TypeError):
            # Skip a malformed comment rather than losing the whole song.
            continue

    return album, date, result, comment_list

In [7]:
# Module-level accumulators filled by scrape(), one entry per processed song.
songs_info = []     # (album, date, lyrics, artist_track) tuples
comments_info = []  # one list of comment tuples per song (possibly empty)


def scrape(artist_track):
    """Scrape one song and record the outcome in the module-level lists.

    Appends ``(album, date, lyrics, artist_track)`` to ``songs_info`` and the
    song's comment list to ``comments_info``. The two appends are separate
    operations, so when this runs from several threads the positions in the
    two lists are not guaranteed to line up.

    Args:
        artist_track: Slug passed through to ``scrape_info``.
    """
    album_name, release_date, lyrics_text, song_comments = scrape_info(artist_track)
    song_row = (album_name, release_date, lyrics_text, artist_track)
    songs_info.append(song_row)
    comments_info.append(song_comments)

In [None]:
# Pool of threads used to scrape all the songs faster

p = ThreadPool(50)
try:
    # map() blocks until every song has been handed to scrape(); scrape()
    # returns nothing, so the list returned by map() is not kept.
    p.map(scrape, df_no_dupl.artist_track)
finally:
    # Always release the worker threads, even when a worker raised or the run
    # was interrupted; otherwise the threads would be left alive.
    p.terminate()
    p.join()
    

# MONGODB

In [9]:
# Collect the artist_track slug (4th field) of every scraped song

artists_tracks = [song[3] for song in songs_info]

In [22]:
# Keep only the rows whose artist_track is among the scraped songs
was_scraped = df['artist_track'].isin(artists_tracks)
df = df[was_scraped]

In [23]:
print(df.shape)

(257717, 5)


In [30]:
# Build the dataframe holding the scraped song info (the data destined for MongoDB)
song_columns = ['album', 'date', 'lyrics', 'artist_track']
df_songs_info = pd.DataFrame(data=songs_info, columns=song_columns)

In [31]:
# Attach the scraped album/date/lyrics to the original rows, matching on artist_track
df = df.merge(df_songs_info, on='artist_track')

In [32]:
df.drop_duplicates(subset=['artist_track']).head(30)

Unnamed: 0,user_id,artist,track,playlist,artist_track,album,date,lyrics
0,9cc0cfd4d7d7885102480dd99e7a90d6,Crosby Stills Nash,Helplessly Hoping,HARD ROCK 2010,Crosby-Stills-Nash-Helplessly-Hoping-lyrics,,,
1,07f0fc3be95dcd878966b1f9572ff670,C418,Chris,C418,C418-Chris-lyrics,Minecraft - Volume Alpha,"March 4, 2011",This song is an instrumental
2,07f0fc3be95dcd878966b1f9572ff670,C418,Équinoxe,C418,C418-Équinoxe-lyrics,,,
3,07f0fc3be95dcd878966b1f9572ff670,Pretty Lights,I Know the Truth,Chill out,Pretty-Lights-I-Know-the-Truth-lyrics,,,I'm too blind\nI'm too blind.....to see the li...
4,07f0fc3be95dcd878966b1f9572ff670,Ludwig van Beethoven,Sonata No 23 In F Minor Op 57 Appassionata Ass...,Classique,Ludwig-van-Beethoven-Sonata-No-23-In-F-Minor-O...,,,
5,07f0fc3be95dcd878966b1f9572ff670,Ratatat,Kennedy,Electro,Ratatat-Kennedy-lyrics,Classics,"August 22, 2006",This song is an instrumental
7,07f0fc3be95dcd878966b1f9572ff670,The Prodigy,Take Me To The Hospital,Electro,The-Prodigy-Take-Me-To-The-Hospital-lyrics,Invaders Must Die,"August 31, 2009",[Intro]\nTake me to the...\nTake me to the...\...
10,07f0fc3be95dcd878966b1f9572ff670,Skrillex,Bangarang feat Sirah,Soirée,Skrillex-Bangarang-feat-Sirah-lyrics,,,
24,07f0fc3be95dcd878966b1f9572ff670,Shaka Ponk,Lets Bang,Soirée,Shaka-Ponk-Lets-Bang-lyrics,The Geeks And The Jerkin’ Sock,"June 6, 2011",Let's Bang\nHey Sugar Hey\n\n[Couplet 1]\nSatu...
26,07f0fc3be95dcd878966b1f9572ff670,Dschinghis Khan,Moskau Long Version,Soirée,Dschinghis-Khan-Moskau-Long-Version-lyrics,7 Leben,1979,Moskau\nFremd und geheimnisvoll\nTürme aus rot...


In [74]:
df.to_csv('playlist_info.csv',index = False) 

# NEO4J DB

In [38]:
print(comments_info[:200])

[[], [], [], [], [], [], [], [], [], [], [], [], [], [], [('DeeCee', 1403638235, [{'tag': 'p', 'children': ['We should embrace the contractions here! The “It is"s don’t fit with how the song is actually sung; Jacoby is clearly using "it’s”. And to me it sounds like he’s singing “I got” and “I’ve gotta” rather than “I have got” and “I have got to.”']}, '', {'tag': 'p', 'children': ['Arguably nitpicky, yep. :)']}], 'Papa-Roach-To-Be-Loved-lyrics'), ('CySorcerorD', 1494570401, [{'tag': 'p', 'children': ['replace the A with an I to correct the “A love deprevation” to “I love deprevation”']}], 'Papa-Roach-To-Be-Loved-lyrics'), ('guesg', 1620154841, [{'tag': 'p', 'children': ['I love that this song was the theme song for WWE raw from 2006- 2009.']}], 'Papa-Roach-To-Be-Loved-lyrics'), ('SpaceGhostBenoit', 1626561019, [{'tag': 'p', 'children': ['stayed up every school night as a kid just to watch cena edge and DX']}], 'Papa-Roach-To-Be-Loved-lyrics')], [], [('Meandmylife', 1408078858, [{'tag':

In [39]:
# Drop the songs that have no comments (empty lists)
comment_list = list(filter(None, comments_info))

In [40]:
# Flatten the per-song comment lists into a single list of comment tuples
comments_list_flat = []
for song_comments in comment_list:
    comments_list_flat.extend(song_comments)

In [70]:
def _comment_children_to_text(children):
    """Flatten the children of one comment DOM element into plain text.

    String children are appended followed by a single space. Element children
    (dicts) that carry their own ``children`` are flattened recursively so the
    text they wrap is kept; elements without children become a newline.
    """
    parts = []
    for child in children:
        if isinstance(child, dict):
            nested = child.get('children')
            # Previously every nested element became '\n', which dropped the
            # text wrapped by the element. Only childless elements do now.
            parts.append(_comment_children_to_text(nested) if nested else '\n')
        else:
            parts.append(child + ' ')
    return ''.join(parts)


new_comment_list = []

for cm in comments_list_flat:
    # cm is (author, created_at, body_dom_children, artist_track); only the
    # top-level 'p' elements of the body contribute text.
    paragraphs = [
        _comment_children_to_text(tag['children'])
        for tag in cm[2]
        if isinstance(tag, dict) and tag.get('tag') == 'p' and 'children' in tag
    ]
    new_comment_list.append((cm[0], cm[1], ''.join(paragraphs), cm[3]))

In [72]:
# One row per comment: author, creation timestamp, flattened text and song slug
comment_columns = ['author', 'date', 'comment', 'artist_track']
df_comment_info = pd.DataFrame(data=new_comment_list, columns=comment_columns)

In [75]:
df_comment_info.to_csv('comments_info.csv',index = False)