In [1]:
# %pip install beautifulsoup4  (uncomment and run once if the library is not installed)
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:

# Address of the Billboard Hot 100 chart page that the rest of the notebook scrapes
url = "https://www.billboard.com/charts/hot-100/"

In [3]:

# Download the chart page with requests.get. The timeout (in seconds) keeps this
# cell from hanging forever when the server never answers: without it, requests
# waits indefinitely.
request_charts = requests.get(url, timeout=10)
# Print the HTTP status code of the response (200 means the request succeeded)
print("request_charts:", request_charts.status_code)

request_charts: 200


In [4]:
# The .content attribute holds the raw response body as bytes.
# The whole page is very large, so show only the first 100 bytes to confirm
# that we really received the HTML source of the page.
request_charts.content[:100]

b'<!DOCTYPE html>\n<!--[if IE 6]>\n<html id="ie6" lang="en-US">\n<![endif]-->\n<!--[if IE 7]>\n<html id="ie'

In [5]:
# Parse the downloaded bytes with Python's built-in 'html.parser' backend so the
# page can be navigated as a tree of tags; the result is a BeautifulSoup object.
soup = BeautifulSoup(markup=request_charts.content, features='html.parser')
# prettify() returns the document as an indented string (the indentation is not
# always perfect). Print only the first 3000 characters to keep the output short.
print(soup.prettify()[:3000])

<!DOCTYPE html>
<!--[if IE 6]>
<html id="ie6" lang="en-US">
<![endif]-->
<!--[if IE 7]>
<html id="ie7" lang="en-US">
<![endif]-->
<!--[if IE 8]>
<html id="ie8" lang="en-US">
<![endif]-->
<!--[if !(IE 6) | !(IE 7) | !(IE 8) ]><!-->
<html lang="en-US">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <meta content="#ffffff" name="theme-color"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport">
   <!-- Add to home screen for iOS -->
   <meta content="black-translucent" name="apple-mobile-web-app-status-bar-style"/>
   <link href="https://www.billboard.com/wp-content/themes/vip/pmc-billboard-2021/assets/app/icons/apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180"/>
   <!-- Tile icons for Windows -->
   <meta content="https://www.billboard.com/wp-content/themes/vip/pmc-billboard-2021/assets/app/browserconfig.xml" name="msapplication-config"/>
   <meta content="https://www.billboard.com/wp

In [6]:
# Basic tree navigation: soup.title is the page's <title> tag;
# get_text(strip=True) returns its text with surrounding whitespace removed
soup.title.get_text(strip=True)

'Billboard Hot 100 – Billboard'

In [7]:
# Alternative: .string gives the text contained in the <title> tag
soup.title.string

'Billboard Hot 100 – Billboard'

In [8]:
# .name is the name of the tag itself (here 'title'), not its text
soup.title.name

'title'

In [9]:
# Attribute access returns only the FIRST <p> tag in the document
# (use find_all("p") to get every paragraph)
soup.p

<p class="c-tagline a-font-primary-s lrv-u-padding-b-1">THE WEEK’S MOST POPULAR CURRENT SONGS ACROSS ALL GENRES, RANKED BY STREAMING ACTIVITY DATA BY ONLINE MUSIC SOURCES TRACKED BY LUMINATE, RADIO AIRPLAY AUDIENCE IMPRESSIONS AS MEASURED BY LUMINATE AND SALES DATA AS COMPILED BY LUMINATE.</p>

In [10]:
# Find every <li> (list item) tag and show only the first 10 of them
soup.find_all("li")[:10]

[<li class="o-nav__list-item lrv-u-align-items-center lrv-u-flex">
 <a class="c-link lrv-a-unstyle-link lrv-a-unstyle-link lrv-u-color-brand-accent-blue:hover lrv-a-hover-effect lrv-u-whitespace-nowrap lrv-u-color-grey-lightest" href="/charts">
 	Charts</a>
 </li>,
 <li class="o-nav__list-item lrv-u-align-items-center lrv-u-flex">
 <a class="c-link lrv-a-unstyle-link lrv-a-unstyle-link lrv-u-color-brand-accent-blue:hover lrv-a-hover-effect lrv-u-whitespace-nowrap lrv-u-color-grey-lightest" href="https://www.billboard.com/c/music/">
 	Music</a>
 </li>,
 <li class="o-nav__list-item lrv-u-align-items-center lrv-u-flex">
 <a class="c-link lrv-a-unstyle-link lrv-a-unstyle-link lrv-u-color-brand-accent-blue:hover lrv-a-hover-effect lrv-u-whitespace-nowrap lrv-u-color-grey-lightest" href="https://www.billboard.com/c/culture/">
 	Culture</a>
 </li>,
 <li class="o-nav__list-item lrv-u-align-items-center lrv-u-flex">
 <a class="c-link lrv-a-unstyle-link lrv-a-unstyle-link lrv-u-color-brand-accen

In [11]:
# Collect the whitespace-stripped text of every <p> tag on the page,
# then preview at most the first 100 entries
ps = [paragraph.get_text(strip=True) for paragraph in soup.find_all("p")]
ps[:100]

['THE WEEK’S MOST POPULAR CURRENT SONGS ACROSS ALL GENRES, RANKED BY STREAMING ACTIVITY DATA BY ONLINE MUSIC SOURCES TRACKED BY LUMINATE, RADIO AIRPLAY AUDIENCE IMPRESSIONS AS MEASURED BY LUMINATE AND SALES DATA AS COMPILED BY LUMINATE.',
 'Lizzo',
 'Last week',
 'Weeks at no. 1',
 'Weeks on chart',
 'B.Slatktin, E.B.Frederic, L.Price, M.McLaren, M.Jefferson, R.Larkins, S.Hague, T.M.Thomas',
 'Ricky Reed, B.Slatkin',
 'Nice Life/Atlantic',
 'Week of August 6, 2022',
 'click to see more',
 'B.Slatktin, E.B.Frederic, L.Price, M.McLaren, M.Jefferson, R.Larkins, S.Hague, T.M.Thomas',
 'Ricky Reed, B.Slatkin',
 'Nice Life/Atlantic',
 'H.Styles, T.E.P.Hull, T.Johnson',
 'Kid Harpoon, T.Johnson',
 'Erskine/Columbia',
 'K.Bush',
 'K.Bush',
 'Fish People/Noble And Brite/Rhino/Warner',
 'Biggest gain in airplay',
 'Gains In Performance',
 'J.T.Harlow, D.Ford, J.Velazquez, R.Chahayed, Charlie Handsome, J.L.Harris, N.J.Pabon, C.B.Bridges, J.Jones, M.Raheem, E.Williams, S.Duhamel, W.Adams',
 'Charl

In [13]:
# Song titles: every <h3> tag whose class attribute is exactly this string
soup.find_all('h3', class_='c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 lrv-u-font-size-18@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max a-truncate-ellipsis u-max-width-330 u-max-width-230@tablet-only')

[<h3 class="c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 lrv-u-font-size-18@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max a-truncate-ellipsis u-max-width-330 u-max-width-230@tablet-only" id="title-of-a-story">
 
 	
 	
 		
 					As It Was		
 	
 </h3>,
 <h3 class="c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 lrv-u-font-size-18@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max a-truncate-ellipsis u-max-width-330 u-max-width-230@tablet-only" id="title-of-a-story">
 
 	
 	
 		
 					Running Up That Hill (A Deal With God)		
 	
 </h3>,
 <h3 class="c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 lrv-u-font-size-18@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max a-truncate-ellipsis u-max-width-330 u-max-width-230@tablet-only" id="title-of-a-story">
 
 	
 	
 		
 					First Class		
 	
 </h3>,
 <h3 class="c-title a-no-trucate a-font-primary-bold-s u-letter-sp

In [14]:
# Print the raw text of each matching <h3>; the text still carries the
# newlines and tabs that surround the title in the page source
for heading in soup.find_all('h3', class_='c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 lrv-u-font-size-18@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max a-truncate-ellipsis u-max-width-330 u-max-width-230@tablet-only'):
    print(heading.get_text())



	
	
		
					As It Was		
	



	
	
		
					Running Up That Hill (A Deal With God)		
	



	
	
		
					First Class		
	



	
	
		
					Wait For U		
	



	
	
		
					Break My Soul		
	



	
	
		
					Late Night Talking		
	



	
	
		
					Me Porto Bonito		
	



	
	
		
					I Like You (A Happier Song)		
	



	
	
		
					Heat Waves		
	



	
	
		
					Bad Habit		
	



	
	
		
					Sunroof		
	



	
	
		
					Big Energy		
	



	
	
		
					Wasted On You		
	



	
	
		
					Jimmy Cooks		
	



	
	
		
					Titi Me Pregunto		
	



	
	
		
					The Kind Of Love We Make		
	



	
	
		
					Stay		
	



	
	
		
					Numb Little Bug		
	



	
	
		
					Ghost		
	



	
	
		
					Glimpse Of Us		
	



	
	
		
					You Proof		
	



	
	
		
					Get Into It (Yuh)		
	



	
	
		
					I Ain't Worried		
	



	
	
		
					She Had Me At Heads Carolina		
	



	
	
		
					Moscow Mule		
	



	
	
		
					Shivers		
	



	
	
		
					Vegas		
	



	
	
		
					Cold Heart (PNAU Remix)		
	



	
	
		
					Like I Love Country Music		
	





In [15]:
# Build the list of song titles from the <h3> tags that carry the title classes.
# str.strip() removes the leading and trailing whitespace (newlines, tabs, spaces)
# around each title.
songs = [
    heading.get_text().strip()
    for heading in soup.find_all('h3', class_='c-title a-no-trucate a-font-primary-bold-s u-letter-spacing-0021 lrv-u-font-size-18@tablet lrv-u-font-size-16 u-line-height-125 u-line-height-normal@mobile-max a-truncate-ellipsis u-max-width-330 u-max-width-230@tablet-only')
]
# Display the collected titles
songs

['As It Was',
 'Running Up That Hill (A Deal With God)',
 'First Class',
 'Wait For U',
 'Break My Soul',
 'Late Night Talking',
 'Me Porto Bonito',
 'I Like You (A Happier Song)',
 'Heat Waves',
 'Bad Habit',
 'Sunroof',
 'Big Energy',
 'Wasted On You',
 'Jimmy Cooks',
 'Titi Me Pregunto',
 'The Kind Of Love We Make',
 'Stay',
 'Numb Little Bug',
 'Ghost',
 'Glimpse Of Us',
 'You Proof',
 'Get Into It (Yuh)',
 "I Ain't Worried",
 'She Had Me At Heads Carolina',
 'Moscow Mule',
 'Shivers',
 'Vegas',
 'Cold Heart (PNAU Remix)',
 'Like I Love Country Music',
 'In A Minute',
 'Damn Strait',
 'Rock And A Hard Place',
 'Provenza',
 'Boyfriend',
 'Sticky',
 'Fall In Love',
 'Enemy',
 'Super Gremlin',
 'Last Night Lonely',
 'Something In The Orange',
 'Thats What I Want',
 'Woman',
 'Efecto',
 'Hot Shit',
 'Take My Name',
 'Left And Right',
 '5 Foot 9',
 'Sweetest Pie',
 'Thousand Miles',
 'What Happened To Virgil',
 'TV',
 'Sleazy Flow',
 'Party',
 'Pressurelicious',
 'Son Of A Sinner',
 'Co

In [16]:
# Artists follow the same idea as the titles: they sit in <span> tags whose
# class attribute is exactly this string
soup.find_all('span', class_='c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only')

[<span class="c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only">
 	
 	Harry Styles
 </span>,
 <span class="c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only">
 	
 	Kate Bush
 </span>,
 <span class="c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only">
 	
 	Jack Harlow
 </span>,
 <span class="c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only">
 	
 	Fut

In [17]:
# Print the raw text of each matching <span>; the surrounding newlines and
# tabs from the page source are still present
for label in soup.find_all('span', class_='c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only'):
    print(label.get_text())


	
	Harry Styles


	
	Kate Bush


	
	Jack Harlow


	
	Future Featuring Drake & Tems


	
	Beyonce


	
	Harry Styles


	
	Bad Bunny & Chencho Corleone


	
	Post Malone Featuring Doja Cat


	
	Glass Animals


	
	Steve Lacy


	
	Nicky Youre & dazy


	
	Latto


	
	Morgan Wallen


	
	Drake Featuring 21 Savage


	
	Bad Bunny


	
	Luke Combs


	
	The Kid LAROI & Justin Bieber


	
	Em Beihold


	
	Justin Bieber


	
	Joji


	
	Morgan Wallen


	
	Doja Cat


	
	OneRepublic


	
	Cole Swindell


	
	Bad Bunny


	
	Ed Sheeran


	
	Doja Cat


	
	Elton John & Dua Lipa


	
	Kane Brown


	
	Lil Baby


	
	Scotty McCreery


	
	Bailey Zimmerman


	
	Karol G


	
	Dove Cameron


	
	Drake


	
	Bailey Zimmerman


	
	Imagine Dragons X JID


	
	Kodak Black


	
	Jon Pardi


	
	Zach Bryan


	
	Lil Nas X


	
	Doja Cat


	
	Bad Bunny


	
	Cardi B, Ye & Lil Durk


	
	Parmalee


	
	Charlie Puth Featuring Jung Kook


	
	Tyler Hubbard


	
	Megan Thee Stallion & Dua Lipa


	
	The Kid LAROI


	
	Lil Durk Featuring Gunna


	

In [18]:
# Build the list of artist names from the <span> tags that carry the artist classes,
# stripping the leading and trailing whitespace around each name
artists = [
    label.get_text().strip()
    for label in soup.find_all('span', class_='c-label a-no-trucate a-font-primary-s lrv-u-font-size-14@mobile-max u-line-height-normal@mobile-max u-letter-spacing-0021 lrv-u-display-block a-truncate-ellipsis-2line u-max-width-330 u-max-width-230@tablet-only')
]
# Display the collected artists
artists

['Harry Styles',
 'Kate Bush',
 'Jack Harlow',
 'Future Featuring Drake & Tems',
 'Beyonce',
 'Harry Styles',
 'Bad Bunny & Chencho Corleone',
 'Post Malone Featuring Doja Cat',
 'Glass Animals',
 'Steve Lacy',
 'Nicky Youre & dazy',
 'Latto',
 'Morgan Wallen',
 'Drake Featuring 21 Savage',
 'Bad Bunny',
 'Luke Combs',
 'The Kid LAROI & Justin Bieber',
 'Em Beihold',
 'Justin Bieber',
 'Joji',
 'Morgan Wallen',
 'Doja Cat',
 'OneRepublic',
 'Cole Swindell',
 'Bad Bunny',
 'Ed Sheeran',
 'Doja Cat',
 'Elton John & Dua Lipa',
 'Kane Brown',
 'Lil Baby',
 'Scotty McCreery',
 'Bailey Zimmerman',
 'Karol G',
 'Dove Cameron',
 'Drake',
 'Bailey Zimmerman',
 'Imagine Dragons X JID',
 'Kodak Black',
 'Jon Pardi',
 'Zach Bryan',
 'Lil Nas X',
 'Doja Cat',
 'Bad Bunny',
 'Cardi B, Ye & Lil Durk',
 'Parmalee',
 'Charlie Puth Featuring Jung Kook',
 'Tyler Hubbard',
 'Megan Thee Stallion & Dua Lipa',
 'The Kid LAROI',
 'Lil Durk Featuring Gunna',
 'Billie Eilish',
 'SleazyWorld Go Featuring Lil Bab

In [19]:
# Create a dataframe pairing each scraped song title with its artist.
# The two lists were scraped independently and are paired purely by position,
# so fail early with a clear message if they do not have the same length.
if len(songs) != len(artists):
    raise ValueError(
        f"Scraped {len(songs)} song titles but {len(artists)} artists; "
        "the two lists cannot be paired row by row."
    )
# NOTE(review): the scraped lists appear to hold fewer than 100 entries, and the
# first paragraph text found on the page ('Lizzo') is not among the first scraped
# artists. The top chart entry may use different CSS classes than the ones matched
# above; verify against the page.
top100 = pd.DataFrame({"songs": songs, "artists": artists})
top100

Unnamed: 0,songs,artists
0,As It Was,Harry Styles
1,Running Up That Hill (A Deal With God),Kate Bush
2,First Class,Jack Harlow
3,Wait For U,Future Featuring Drake & Tems
4,Break My Soul,Beyonce
...,...,...
94,Te Felicito,Shakira & Rauw Alejandro
95,Are You Entertained,Russ & Ed Sheeran
96,"Bzrp Music Sessions, Vol. 52",Bizarrap & Quevedo
97,Right On,Lil Baby
