# Lab | Web Scraping Single Page (GNOD part 1)

In [1]:
from bs4 import BeautifulSoup

In [2]:
import requests
import pandas as pd

In [3]:
# 2. find url and store it in a variable
url = "https://www.popvortex.com/music/charts/top-100-songs.php"

In [4]:
# 3. download html with a get request
response = requests.get(url)

In [5]:
# Check if the request was successful (status code 200)
response.status_code # 200 status code means OK!

200

In [6]:
# 4.1. parse html (create the 'soup')
soup = BeautifulSoup(response.content, "html.parser")

In [7]:
# 4.2. check that the html code looks like it should
# soup

In [8]:
song_elements = soup.find_all('div',class_ ='chart-content')

In [9]:
for i, element in enumerate(song_elements):
        title = element.find('cite', class_='title').text.strip()
        artist = element.find('em', class_='artist').text.strip()
        genre = element.find('ul').find('li').text.strip()
      
        print(f"{i + 1}. {title} - {artist} - {genre}")

1. TEXAS HOLD 'EM - Beyoncé - Genre: Country
2. Beautiful Messes - Hillary Scott & The Scott Family - Genre: Country
3. Lose Control - Teddy Swims - Genre: Pop
4. Beautiful Things - Benson Boone - Genre: Pop
5. Flowers - Miley Cyrus - Genre: Pop
6. TEXAS HOLD 'EM - Beyoncé - Genre: Country
7. Lovin On Me - Jack Harlow - Genre: Hip-Hop / Rap
8. Turn the Lights Back On - Billy Joel - Genre: Pop
9. Don't Let the Old Man In - Toby Keith - Genre: Country
10. Selfish - Justin Timberlake - Genre: Pop
11. I Remember Everything (feat. Kacey Musgraves) - Zach Bryan - Genre: Country
12. Made For Me - Muni Long - Genre: R&B / Soul
13. Fast Car - Luke Combs - Genre: Country
14. Houdini - Dua Lipa - Genre: Pop
15. Live Like You Were Dying - Tim McGraw - Genre: Country
16. Where the Wild Things Are - Luke Combs - Genre: Country
17. Lil Boo Thang - Paul Russell - Genre: Pop
18. Training Season - Dua Lipa - New Release
19. Save Me - Jelly Roll - Genre: Rap
20. Fast Car - Tracy Chapman - Genre: Singer/S

In [10]:
titles = []
artists = []
genres = []
for i, element in enumerate(song_elements):
        title = element.find('cite', class_='title').text.strip()
        titles.append(title)
        artist = element.find('em', class_='artist').text.strip()
        artists.append(artist)
        genre = element.find('ul').find('li').text.strip()
        genres.append(genre)
        print(f"{title} - {artist} - {genre}")
        #data.append('title', 'artist', 'genre'])


TEXAS HOLD 'EM - Beyoncé - Genre: Country
Beautiful Messes - Hillary Scott & The Scott Family - Genre: Country
Lose Control - Teddy Swims - Genre: Pop
Beautiful Things - Benson Boone - Genre: Pop
Flowers - Miley Cyrus - Genre: Pop
TEXAS HOLD 'EM - Beyoncé - Genre: Country
Lovin On Me - Jack Harlow - Genre: Hip-Hop / Rap
Turn the Lights Back On - Billy Joel - Genre: Pop
Don't Let the Old Man In - Toby Keith - Genre: Country
Selfish - Justin Timberlake - Genre: Pop
I Remember Everything (feat. Kacey Musgraves) - Zach Bryan - Genre: Country
Made For Me - Muni Long - Genre: R&B / Soul
Fast Car - Luke Combs - Genre: Country
Houdini - Dua Lipa - Genre: Pop
Live Like You Were Dying - Tim McGraw - Genre: Country
Where the Wild Things Are - Luke Combs - Genre: Country
Lil Boo Thang - Paul Russell - Genre: Pop
Training Season - Dua Lipa - New Release
Save Me - Jelly Roll - Genre: Rap
Fast Car - Tracy Chapman - Genre: Singer/Songwriter
16 CARRIAGES - Beyoncé - Genre: Country
Until I Found You - S

In [11]:
presis_df = pd.DataFrame({"title":titles,
                          "artist":artists,
                          "genre": genres})
presis_df

Unnamed: 0,title,artist,genre
0,TEXAS HOLD 'EM,Beyoncé,Genre: Country
1,Beautiful Messes,Hillary Scott & The Scott Family,Genre: Country
2,Lose Control,Teddy Swims,Genre: Pop
3,Beautiful Things,Benson Boone,Genre: Pop
4,Flowers,Miley Cyrus,Genre: Pop
...,...,...,...
95,Leave a Light On,Papa Roach,Genre: Hard Rock
96,...Ready For It?,Taylor Swift,Genre: Pop
97,The Painter,Cody Johnson,Genre: Country
98,Scared To Start,Michael Marcagi,Genre: Alternative


In [12]:
import re
from datetime import datetime

if not song_elements:
    print("No song elements found. Please check if the website structure has changed.")
    exit()

data = []
for i, element in enumerate(song_elements):
    title = element.find('cite', class_='title').text.strip()
    artist = element.find('em', class_='artist').text.strip()

    # Extracting genre
    genre = "Genre not found"
    ul_tag = element.find('ul')
    if ul_tag:
        for li_tag in ul_tag.find_all('li'):
            if 'Genre' in li_tag.text:
                genre = li_tag.text.strip().replace('Genre:', '')
                break
                
    # Extracting release date using regular expression
    release_date_text = ""
    release_date_match = re.search(r'Release Date: (\w+ \d{1,2}, \d{4})', element.get_text())
    if release_date_match:
        release_date_text = release_date_match.group(1)
        
        # Convert release date to date format
        release_date = datetime.strptime(release_date_text, "%B %d, %Y").strftime("%d/%m/%Y")

    data.append([title, artist, genre, release_date])

df = pd.DataFrame(data, columns=['title', 'artist', 'genre', 'release_date'])
display(df)

Unnamed: 0,title,artist,genre,release_date
0,TEXAS HOLD 'EM,Beyoncé,Country,11/02/2024
1,Beautiful Messes,Hillary Scott & The Scott Family,Country,08/07/2016
2,Lose Control,Teddy Swims,Pop,23/06/2023
3,Beautiful Things,Benson Boone,Pop,19/01/2024
4,Flowers,Miley Cyrus,Pop,12/01/2023
...,...,...,...,...
95,Leave a Light On,Papa Roach,Hard Rock,08/04/2022
96,...Ready For It?,Taylor Swift,Pop,01/01/2017
97,The Painter,Cody Johnson,Country,11/08/2023
98,Scared To Start,Michael Marcagi,Alternative,12/01/2024


# Lab | Web Scraping Single Page (GNOD part 2)¶

In [13]:
import random

In [14]:
# Recommend function for a random song if the input song is found in the DataFrame
def recommend_song(song_title):
    # Check if the song is in the DataFrame (case insensitive)
    if df['title'].str.lower().str.contains(song_title.lower()).any():
        # Get a random index that is not the index of the input song
        random_index = df[df['title'].str.lower() != song_title.lower()].sample().index[0]
        # Get the random song and artist
        random_song = df.loc[random_index, 'title']
        random_artist = df.loc[random_index, 'artist']
        return f"Thank you for the input and now we recommend \"{random_song}\" by {random_artist}"
    else:
        return "Thank you for the input but we currently have no recommendation for this song."

# Loop to recommend songs for 5 times
for i in range(5):
    input_song = input("Enter a song title: ")
    recommendation = recommend_song(input_song)
    print(recommendation)

Enter a song title: flowers
Thank you for the input and now we recommend "Scared To Start" by Michael Marcagi
Enter a song title: Flowers
Thank you for the input and now we recommend "Let's Go" by Key Glock
Enter a song title: lose controls
Thank you for the input but we currently have no recommendation for this song.
Enter a song title: unwritten
Thank you for the input and now we recommend "Scared To Start" by Michael Marcagi
Enter a song title: the painter
Thank you for the input and now we recommend "Sorrys & Ferraris" by Polo G


# Lab | Web Scraping Single Page (GNOD part 3)

In [15]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [16]:
secrets_file = open("spotify.txt","r")
string = secrets_file.read()

In [17]:
secrets_dict={}
for line in string.split('\n'):
    if len(line) > 0:
        #print(line.split(':'))
        secrets_dict[line.split(':')[0]]=line.split(':')[1].strip()

In [18]:
# Enter to Spotify

#Initialize SpotiPy with user credentials
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(client_id=secrets_dict['clientid'],
                                                           client_secret=secrets_dict['clientsecret']))

In [19]:
# we will use our 'first paid music' playlist as an example:
playlist = sp.user_playlist_tracks("spotify", "1TM3rECdFTc0R0tgiIf0oW")

In [20]:
# this one is biiiig!
playlist["total"] 

4690

In [21]:
# playlist['items'] contains the tracks on the playlist
# playlist['items']

In [22]:
# we could use the url to the next page which is provided...
playlist['next']

'https://api.spotify.com/v1/playlists/1TM3rECdFTc0R0tgiIf0oW/tracks?offset=100&limit=100&additional_types=track'

In [23]:
from random import randint
from time import sleep

def get_playlist_tracks(playlist_id):
    results = sp.user_playlist_tracks("spotify",playlist_id)
    tracks = results['items']
    while results['next']!=None:
        results = sp.next(results)
        tracks = tracks + results['items']
        sleep(randint(1,3000)/1000) # respectful nap
    return tracks

In [24]:
all_tracks = get_playlist_tracks("1TM3rECdFTc0R0tgiIf0oW")
len(all_tracks)

4690

In [25]:
import pandas as pd
from pandas import json_normalize

In [26]:
tracks2 = json_normalize(all_tracks)

In [27]:
tracks2

Unnamed: 0,added_at,is_local,primary_color,added_by.external_urls.spotify,added_by.href,added_by.id,added_by.type,added_by.uri,track.album.album_type,track.album.artists,...,track.id,track.is_local,track.name,track.popularity,track.preview_url,track.track,track.track_number,track.type,track.uri,video_thumbnail.url
0,2022-03-12T12:22:16Z,False,,https://open.spotify.com/user/ben.woody.cheval,https://api.spotify.com/v1/users/ben.woody.cheval,ben.woody.cheval,user,spotify:user:ben.woody.cheval,album,[{'external_urls': {'spotify': 'https://open.s...,...,3l9eg9RtisizG12a1D6nZl,False,Pretty Pimpin,0,,True,1,track,spotify:track:3l9eg9RtisizG12a1D6nZl,
1,2022-03-12T12:22:16Z,False,,https://open.spotify.com/user/ben.woody.cheval,https://api.spotify.com/v1/users/ben.woody.cheval,ben.woody.cheval,user,spotify:user:ben.woody.cheval,album,[{'external_urls': {'spotify': 'https://open.s...,...,50M7nY1oQuNHecs0ahWAtI,False,I Need My Girl,68,https://p.scdn.co/mp3-preview/d566beee419ec326...,True,10,track,spotify:track:50M7nY1oQuNHecs0ahWAtI,
2,2022-03-12T12:22:16Z,False,,https://open.spotify.com/user/ben.woody.cheval,https://api.spotify.com/v1/users/ben.woody.cheval,ben.woody.cheval,user,spotify:user:ben.woody.cheval,album,[{'external_urls': {'spotify': 'https://open.s...,...,2TTAPkrGQQOZkaUTUlt21Q,False,Strange,48,https://p.scdn.co/mp3-preview/8942abcf1bc1fe32...,True,4,track,spotify:track:2TTAPkrGQQOZkaUTUlt21Q,
3,2022-03-12T12:22:16Z,False,,https://open.spotify.com/user/ben.woody.cheval,https://api.spotify.com/v1/users/ben.woody.cheval,ben.woody.cheval,user,spotify:user:ben.woody.cheval,album,[{'external_urls': {'spotify': 'https://open.s...,...,7BvcpEdO7PUDGGSER1S9LA,False,Todeswalzer,36,https://p.scdn.co/mp3-preview/3cf49f28fa44b6bd...,True,1,track,spotify:track:7BvcpEdO7PUDGGSER1S9LA,
4,2022-03-12T12:22:16Z,False,,https://open.spotify.com/user/ben.woody.cheval,https://api.spotify.com/v1/users/ben.woody.cheval,ben.woody.cheval,user,spotify:user:ben.woody.cheval,album,[{'external_urls': {'spotify': 'https://open.s...,...,0VjIjW4GlUZAMYd2vXMi3b,False,Blinding Lights,93,,True,9,track,spotify:track:0VjIjW4GlUZAMYd2vXMi3b,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4685,2024-01-16T11:15:02Z,False,,https://open.spotify.com/user/ben.woody.cheval,https://api.spotify.com/v1/users/ben.woody.cheval,ben.woody.cheval,user,spotify:user:ben.woody.cheval,album,[{'external_urls': {'spotify': 'https://open.s...,...,6ucR4KfvsBFWCMVFDvyKKl,False,Apologize,75,,True,16,track,spotify:track:6ucR4KfvsBFWCMVFDvyKKl,
4686,2024-01-16T11:17:23Z,False,,https://open.spotify.com/user/ben.woody.cheval,https://api.spotify.com/v1/users/ben.woody.cheval,ben.woody.cheval,user,spotify:user:ben.woody.cheval,album,[{'external_urls': {'spotify': 'https://open.s...,...,241LK9hqfXJyWpQ7oClQSN,False,Goddamn Lonely Love - Remastered,48,https://p.scdn.co/mp3-preview/bc56f38acfadbd63...,True,17,track,spotify:track:241LK9hqfXJyWpQ7oClQSN,
4687,2024-02-02T20:25:33Z,False,,https://open.spotify.com/user/ben.woody.cheval,https://api.spotify.com/v1/users/ben.woody.cheval,ben.woody.cheval,user,spotify:user:ben.woody.cheval,album,[{'external_urls': {'spotify': 'https://open.s...,...,6hF9etLMoADWLsoui5ejGW,False,Graveless yet Dead,19,https://p.scdn.co/mp3-preview/ee6e043add10f082...,True,1,track,spotify:track:6hF9etLMoADWLsoui5ejGW,
4688,2024-02-15T21:15:28Z,False,,https://open.spotify.com/user/ben.woody.cheval,https://api.spotify.com/v1/users/ben.woody.cheval,ben.woody.cheval,user,spotify:user:ben.woody.cheval,album,[{'external_urls': {'spotify': 'https://open.s...,...,3vkQ5DAB1qQMYO4Mr9zJN6,False,Gimme! Gimme! Gimme! (A Man After Midnight),84,,True,13,track,spotify:track:3vkQ5DAB1qQMYO4Mr9zJN6,


In [28]:
artists_df2 = pd.DataFrame(columns=['href', 'id', 'name', 'type', 'uri', 'external_urls.spotify','song_id', 'song_name', 'popularity' ])
for i in tracks2.index:
    artists_for_song = json_normalize(tracks2.iloc[i]['track.artists'])
    artists_for_song['song_id']    = tracks2.iloc[i]['track.id']         # we want to keep song_id, it is the sae for all artists
    artists_for_song['song_name']  = tracks2.iloc[i]['track.name']       # we want to keep song_name, it is the sae for all artists
    artists_for_song['popularity'] = tracks2.iloc[i]['track.popularity'] # same for popularity   
    artists_df2 = pd.concat([artists_df2, artists_for_song], axis=0)

In [29]:
artists_df2.head()

Unnamed: 0,href,id,name,type,uri,external_urls.spotify,song_id,song_name,popularity
0,https://api.spotify.com/v1/artists/5gspAQIAH8n...,5gspAQIAH8nJUrMYgXjCJ2,Kurt Vile,artist,spotify:artist:5gspAQIAH8nJUrMYgXjCJ2,https://open.spotify.com/artist/5gspAQIAH8nJUr...,3l9eg9RtisizG12a1D6nZl,Pretty Pimpin,0
0,https://api.spotify.com/v1/artists/2cCUtGK9sDU...,2cCUtGK9sDU2EoElnk0GNB,The National,artist,spotify:artist:2cCUtGK9sDU2EoElnk0GNB,https://open.spotify.com/artist/2cCUtGK9sDU2Eo...,50M7nY1oQuNHecs0ahWAtI,I Need My Girl,68
0,https://api.spotify.com/v1/artists/6guTJsgPymD...,6guTJsgPymDUVfqDJyz5UG,Galaxie 500,artist,spotify:artist:6guTJsgPymDUVfqDJyz5UG,https://open.spotify.com/artist/6guTJsgPymDUVf...,2TTAPkrGQQOZkaUTUlt21Q,Strange,48
0,https://api.spotify.com/v1/artists/2ytfu1MWsf7...,2ytfu1MWsf763hCBQmaQr6,Windir,artist,spotify:artist:2ytfu1MWsf763hCBQmaQr6,https://open.spotify.com/artist/2ytfu1MWsf763h...,7BvcpEdO7PUDGGSER1S9LA,Todeswalzer,36
0,https://api.spotify.com/v1/artists/1Xyo4u8uXC1...,1Xyo4u8uXC1ZmMpatF05PJ,The Weeknd,artist,spotify:artist:1Xyo4u8uXC1ZmMpatF05PJ,https://open.spotify.com/artist/1Xyo4u8uXC1ZmM...,0VjIjW4GlUZAMYd2vXMi3b,Blinding Lights,93


In [30]:
df_final2 = artists_df2[['song_name', 'name', 'song_id', 'popularity']]
df_final2

Unnamed: 0,song_name,name,song_id,popularity
0,Pretty Pimpin,Kurt Vile,3l9eg9RtisizG12a1D6nZl,0
0,I Need My Girl,The National,50M7nY1oQuNHecs0ahWAtI,68
0,Strange,Galaxie 500,2TTAPkrGQQOZkaUTUlt21Q,48
0,Todeswalzer,Windir,7BvcpEdO7PUDGGSER1S9LA,36
0,Blinding Lights,The Weeknd,0VjIjW4GlUZAMYd2vXMi3b,93
...,...,...,...,...
1,Apologize,OneRepublic,6ucR4KfvsBFWCMVFDvyKKl,75
0,Goddamn Lonely Love - Remastered,Drive-By Truckers,241LK9hqfXJyWpQ7oClQSN,48
0,Graveless yet Dead,Convocation,6hF9etLMoADWLsoui5ejGW,19
0,Gimme! Gimme! Gimme! (A Man After Midnight),ABBA,3vkQ5DAB1qQMYO4Mr9zJN6,84
