# Scraping the internet for songs

In [142]:
# import libraries
from bs4 import BeautifulSoup
import pandas as pd
import requests

### Scraping the billboard hot 100

In [143]:
# define url to be scraped
url = 'https://www.billboard.com/charts/hot-100'
url

'https://www.billboard.com/charts/hot-100'

In [144]:
#get html of the url via get
response = requests.get(url)

In [145]:
# check if that worked
response

<Response [200]>

In [146]:
# put the response in a soup, use html parser
soup = BeautifulSoup(response.content, 'html.parser')

In [147]:
#find all song titles with html class
chart_list = soup.findAll("span", {"class": "chart-element__information__song"})

In [148]:
songs_list = []

for i in chart_list:
#      print (i.text)
    songs_list.append(i.text)

In [149]:
songs_list

['Save Your Tears',
 'Leave The Door Open',
 'Peaches',
 'Rapstar',
 'Levitating',
 'Kiss Me More',
 'Montero (Call Me By Your Name)',
 'Astronaut In The Ocean',
 'Up',
 'Drivers License',
 'Blinding Lights',
 'Deja Vu',
 'Beat Box',
 'Calling My Phone',
 'On Me',
 'Best Friend',
 'Heartbreak Anniversary',
 'Beautiful Mistakes',
 'What You Know Bout Love',
 'Mood',
 "My Ex's Best Friend",
 'Back In Blood',
 'Without You',
 'The Good Ones',
 'Go Crazy',
 'You Broke Me First.',
 '34+35',
 'Hell Of A View',
 'Forever After All',
 "What's Next",
 'Time Today',
 'Therefore I Am',
 'Dakiti',
 'Positions',
 'Shottas (Lala)',
 'Solid',
 'Track Star',
 'Ski',
 'Wants And Needs',
 "You're Mines Still",
 "We're Good",
 'Heat Waves',
 'Made For You',
 'Good Days',
 'For The Night',
 "Breaking Up Was Easy In The 90's",
 'No More Parties',
 'Goosebumps',
 'Telepatia',
 'Tombstone',
 'Streets',
 'Go!',
 'Hard For The Next',
 'If Pain Was A Person',
 "What's Your Country Song",
 'Just The Way',
 'Hold

In [150]:
artist_list = soup.findAll("span", {"class": "chart-element__information__artist"})
artists = []
for artist in artist_list:
    artists.append(artist.text)
artists

['The Weeknd & Ariana Grande',
 'Silk Sonic (Bruno Mars & Anderson .Paak)',
 'Justin Bieber Featuring Daniel Caesar & Giveon',
 'Polo G',
 'Dua Lipa Featuring DaBaby',
 'Doja Cat Featuring SZA',
 'Lil Nas X',
 'Masked Wolf',
 'Cardi B',
 'Olivia Rodrigo',
 'The Weeknd',
 'Olivia Rodrigo',
 'SpotemGottem Featuring Pooh Shiesty Or DaBaby',
 'Lil Tjay Featuring 6LACK',
 'Lil Baby',
 'Saweetie Featuring Doja Cat',
 'Giveon',
 'Maroon 5 Featuring Megan Thee Stallion',
 'Pop Smoke',
 '24kGoldn Featuring iann dior',
 'Machine Gun Kelly X blackbear',
 'Pooh Shiesty Featuring Lil Durk',
 'The Kid LAROI',
 'Gabby Barrett',
 'Chris Brown & Young Thug',
 'Tate McRae',
 'Ariana Grande',
 'Eric Church',
 'Luke Combs',
 'Drake',
 'Moneybagg Yo',
 'Billie Eilish',
 'Bad Bunny & Jhay Cortez',
 'Ariana Grande',
 'Moneybagg Yo',
 'Young Thug & Gunna Featuring Drake',
 'Mooski',
 'Young Thug & Gunna',
 'Drake Featuring Lil Baby',
 'Yung Bleu Featuring Drake',
 'Dua Lipa',
 'Glass Animals',
 'Jake Owen',
 

In [151]:
# putting the gathered songs and artists in a pandas dataframe
hot100=pd.DataFrame({'title': songs_list, 
                'artist': artists})
hot100

Unnamed: 0,title,artist
0,Save Your Tears,The Weeknd & Ariana Grande
1,Leave The Door Open,Silk Sonic (Bruno Mars & Anderson .Paak)
2,Peaches,Justin Bieber Featuring Daniel Caesar & Giveon
3,Rapstar,Polo G
4,Levitating,Dua Lipa Featuring DaBaby
...,...,...
95,4 Da Gang,42 Dugg & Roddy Ricch
96,Blame It On You,Jason Aldean
97,Wasted On You,Morgan Wallen
98,Way Less Sad,AJR


In [152]:
# I will now scraoe the top 500 songs of all time from the rolling stone magazine
#define empty dataframe
greatest = pd.DataFrame()
#the list of songs on that website has 5 pages, so i need to loop over 5 pages
for i in range(5):
    url = "https://musicbrainz.org/series/b3484a66-a4de-444d-93d3-c99a73656905?page=" + str(i) # loop over 5 pages
    table = pd.read_html(url) # read page content into pandas dataframe using read_html function
    greatest = greatest.append(table) #my dataframe with the parsed website
#check the dataframe
greatest

Unnamed: 0,0,1,#,Artist,ISRCs,Length,Name,Rating
0,,,1.0,Bob Dylan,,6:11,Like a Rolling Stone,
1,,,,The Beatles,,4:10,Strawberry Fields Forever (original stereo stu...,
2,,,2.0,The Rolling Stones,USA176610240,3:05,(I Can’t Get No) Satisfaction (live: Royal Alb...,
3,,,3.0,John Lennon,GBAYE0000028GBAYE1000769GBAYE7100010,3:04,Imagine (original studio mix),4.40
4,,,4.0,Marvin Gaye,USMO10000523USMO17100041USMO19290028,3:53,What’s Going On (original LP version),4.40
...,...,...,...,...,...,...,...,...
96,,,396.0,Big Star,USC4R0919655USFI87200246,2:35,Thirteen,4.50
97,,,397.0,Blue Öyster Cult,USSM17600875,3:49,(Don’t Fear) The Reaper (single edit),5.00
98,,,398.0,Lynyrd Skynyrd,AUUM70801787USLIC0601863USMC17446153USUMG0000661,4:44,Sweet Home Alabama,4.55
99,,,399.0,Metallica,GBF089190013USEE10001992,5:32,Enter Sandman,4.40


In [153]:
# the dataframe needs cleaning. I will drop the unnecessary columns
greatest=greatest.drop([0, 1, '#', 'ISRCs','Rating','Length'], axis=1)

In [154]:
# drop all rowas that contain NaN values
greatest= greatest.dropna()
greatest

Unnamed: 0,Artist,Name
0,Bob Dylan,Like a Rolling Stone
1,The Beatles,Strawberry Fields Forever (original stereo stu...
2,The Rolling Stones,(I Can’t Get No) Satisfaction (live: Royal Alb...
3,John Lennon,Imagine (original studio mix)
4,Marvin Gaye,What’s Going On (original LP version)
...,...,...
95,The Shangri‐Las,Remember (Walkin’ in the Sand)
96,Big Star,Thirteen
97,Blue Öyster Cult,(Don’t Fear) The Reaper (single edit)
98,Lynyrd Skynyrd,Sweet Home Alabama


In [155]:
# scrape top 100 all time from top40weekly
#geet urls
url = 'http://top40weekly.com/top-100-songs-of-all-time/'
url

'http://top40weekly.com/top-100-songs-of-all-time/'

In [156]:
#get html code
response = requests.get(url)

In [157]:
# put it in a soup
soup = BeautifulSoup(response.content, 'html.parser')

In [158]:
#select all titles from the website
titles1=soup.select('div.x-text.song-title > p')

In [159]:
# extract the titles into a list
titles = []
for elem in titles1:
    elem = elem.text
    titles.append(elem)
titles

['The Twist',
 'Smooth',
 'Mack the Knife',
 'Uptown Funk!',
 'How Do I Live',
 'Party Rock Anthem',
 'I Gotta Feeling',
 'Macarena (Bayside Boys Mix)',
 'Shape of You',
 'Physical',
 'You Light Up My Life',
 'Hey Jude',
 'Closer',
 'We Belong Together',
 'Un-Break My Heart',
 'Yeah!',
 'Bette Davis Eyes',
 'Endless Love',
 'Tonight’s the Night (Gonna Be Alright)',
 'You Were Meant for Me/Foolish Games',
 '(Everything I Do) I Do It for You',
 'I’ll Make Love to You',
 'The Theme From “A Summer Place”',
 'Le Freak',
 'How Deep Is Your Love',
 'Eye of the Tiger',
 'We Found Love',
 'Low',
 'Just Want to Be Your Everything',
 'Too Close',
 'Every Breath You Take',
 'Somebody That I Used to Know',
 'Despacito',
 'Flashdance… What a Feeling',
 'Rolling in the Deep',
 'Tossin’ and Turnin’',
 'The Battle of New Orleans',
 'One Sweet Day',
 'Truly Madly Deeply',
 'Silly Love Songs',
 'Let’s Get It On',
 'Night Fever',
 'Another One Bites the Dust',
 'Say Say Say',
 'How You Remind Me',
 'Tie a

In [160]:
# select all artist
artiststemp=soup.select('div.x-text.artist-name > p > a')

In [161]:
# single out artists into a list
artists = []
for elem in artiststemp:
    elem = elem.text
    artists.append(elem)
artists

['Chubby Checker',
 'Santana Feat. Rob Thomas',
 'Bobby Darin',
 'Mark Ronson Feat. Bruno Mars',
 'Leann Rimes',
 'LMFAO Feat. Lauren Bennett & GoonRock',
 'The Black Eyed Peas',
 'Los Del Rio',
 'Ed Sheeran',
 'Olivia Newton-John',
 'Debby Boone',
 'The Beatles',
 'The Chainsmokers Feat. Halsey',
 'Mariah Carey',
 'Toni Braxton',
 'Usher Feat. Lil Jon & Ludacris',
 'Kim Carnes',
 'Diana Ross & Lionel Richie',
 'Rod Stewart',
 'Jewel',
 'Bryan Adams',
 'Boyz II Men',
 'Percy Faith & His Orchestra',
 'Chic',
 'Bee Gees',
 'Survivor',
 'Rihanna Feat. Calvin Harris',
 'Flo Rida Feat. T-Pain',
 'Andy Gibb',
 'Next',
 'The Police',
 'Gotye Feat. Kimbra',
 'Luis Fonsi & Daddy Yankee Feat. Justin Bieber',
 'Irene Cara',
 'Adele',
 'Bobby Lewis',
 'Johnny Horton',
 'Mariah Carey & Boyz II Men',
 'Savage Garden',
 'Wings',
 'Marvin Gaye',
 'Bee Gees',
 'Queen',
 'Paul McCartney & Michael Jackson',
 'Nickelback',
 'Dawn Feat. Tony Orlando',
 'Tommy Edwards',
 'The Beatles',
 'Andy Gibb',
 'Carly

In [162]:
#create dataframe with the songs
alltime=pd.DataFrame({'title': titles, 
        'artist': artists})


In [163]:
alltime

Unnamed: 0,title,artist
0,The Twist,Chubby Checker
1,Smooth,Santana Feat. Rob Thomas
2,Mack the Knife,Bobby Darin
3,Uptown Funk!,Mark Ronson Feat. Bruno Mars
4,How Do I Live,Leann Rimes
...,...,...
95,All Night Long (All Night),Lionel Richie
96,Nothing Compares 2 U,Sinead O’Connor
97,I Swear,All-4-One
98,Family Affair,Mary J. Blige


In [164]:
#rename columns so that they are all uniform
greatest = greatest.rename(columns={"Artist": "artist", "Name": "title"})
greatest

Unnamed: 0,artist,title
0,Bob Dylan,Like a Rolling Stone
1,The Beatles,Strawberry Fields Forever (original stereo stu...
2,The Rolling Stones,(I Can’t Get No) Satisfaction (live: Royal Alb...
3,John Lennon,Imagine (original studio mix)
4,Marvin Gaye,What’s Going On (original LP version)
...,...,...
95,The Shangri‐Las,Remember (Walkin’ in the Sand)
96,Big Star,Thirteen
97,Blue Öyster Cult,(Don’t Fear) The Reaper (single edit)
98,Lynyrd Skynyrd,Sweet Home Alabama


In [165]:
hot100

Unnamed: 0,title,artist
0,Save Your Tears,The Weeknd & Ariana Grande
1,Leave The Door Open,Silk Sonic (Bruno Mars & Anderson .Paak)
2,Peaches,Justin Bieber Featuring Daniel Caesar & Giveon
3,Rapstar,Polo G
4,Levitating,Dua Lipa Featuring DaBaby
...,...,...
95,4 Da Gang,42 Dugg & Roddy Ricch
96,Blame It On You,Jason Aldean
97,Wasted On You,Morgan Wallen
98,Way Less Sad,AJR


In [166]:
#standardize all dataframes:
hot100["title"] = hot100["title"].str.lower()
hot100["artist"] = hot100["artist"].str.lower()
greatest["title"] = greatest["title"].str.lower()
greatest["artist"] = greatest["artist"].str.lower()
alltime["title"] = alltime["title"].str.lower()
alltime["artist"] = alltime["artist"].str.lower()


In [167]:
# export all dataframes into seperate csv's

In [169]:
hot100.to_csv('hot100.csv' ,index=False)
alltime.to_csv('alltime.csv',index=False)
greatest.to_csv('greatest.csv',index=False)