# Lab | Web Scraping Single Page (GNOD part 1)

In [2]:
from bs4 import BeautifulSoup

In [3]:
import requests
import pandas as pd

In [4]:
# 2. find url and store it in a variable
url = "https://www.popvortex.com/music/charts/top-100-songs.php"

In [5]:
# 3. download html with a get request
response = requests.get(url)

In [6]:
# Check if the request was successful (status code 200)
response.status_code # 200 status code means OK!

200

In [7]:
# 4.1. parse html (create the 'soup')
soup = BeautifulSoup(response.content, "html.parser")

In [8]:
# 4.2. check that the html code looks like it should
# soup

In [9]:
song_elements = soup.find_all('div',class_ ='chart-content')

In [10]:
for i, element in enumerate(song_elements):
        title = element.find('cite', class_='title').text.strip()
        artist = element.find('em', class_='artist').text.strip()
        genre = element.find('ul').find('li').text.strip()
      
        print(f"{i + 1}. {title} - {artist} - {genre}")

1. TEXAS HOLD 'EM - Beyoncé - Genre: Country
2. Lose Control - Teddy Swims - Genre: Pop
3. Beautiful Things - Benson Boone - Genre: Pop
4. Flowers - Miley Cyrus - Genre: Pop
5. Turn the Lights Back On - Billy Joel - Genre: Pop
6. Don't Let the Old Man In - Toby Keith - Genre: Country
7. Lovin On Me - Jack Harlow - Genre: Hip-Hop / Rap
8. Selfish - Justin Timberlake - Genre: Pop
9. Sorrys & Ferraris - Polo G - New Release
10. TEXAS HOLD 'EM - Beyoncé - Genre: Country
11. I Remember Everything (feat. Kacey Musgraves) - Zach Bryan - Genre: Country
12. Made For Me - Muni Long - Genre: R&B / Soul
13. Houdini - Dua Lipa - Genre: Pop
14. Training Season - Dua Lipa - New Release
15. Fast Car - Luke Combs - Genre: Country
16. Fast Car - Tracy Chapman - Genre: Singer/Songwriter
17. Live Like You Were Dying - Tim McGraw - Genre: Country
18. Save Me - Jelly Roll - Genre: Rap
19. Let's Go - Key Glock - Genre: Hip-Hop / Rap
20. Yeah! (feat. Lil Jon & Ludacris) - USHER - Genre: R&B / Soul
21. Lil Boo

In [11]:
titles = []
artists = []
genres = []
for i, element in enumerate(song_elements):
        title = element.find('cite', class_='title').text.strip()
        titles.append(title)
        artist = element.find('em', class_='artist').text.strip()
        artists.append(artist)
        genre = element.find('ul').find('li').text.strip()
        genres.append(genre)
        print(f"{title} - {artist} - {genre}")
        #data.append('title', 'artist', 'genre'])


TEXAS HOLD 'EM - Beyoncé - Genre: Country
Lose Control - Teddy Swims - Genre: Pop
Beautiful Things - Benson Boone - Genre: Pop
Flowers - Miley Cyrus - Genre: Pop
Turn the Lights Back On - Billy Joel - Genre: Pop
Don't Let the Old Man In - Toby Keith - Genre: Country
Lovin On Me - Jack Harlow - Genre: Hip-Hop / Rap
Selfish - Justin Timberlake - Genre: Pop
Sorrys & Ferraris - Polo G - New Release
TEXAS HOLD 'EM - Beyoncé - Genre: Country
I Remember Everything (feat. Kacey Musgraves) - Zach Bryan - Genre: Country
Made For Me - Muni Long - Genre: R&B / Soul
Houdini - Dua Lipa - Genre: Pop
Training Season - Dua Lipa - New Release
Fast Car - Luke Combs - Genre: Country
Fast Car - Tracy Chapman - Genre: Singer/Songwriter
Live Like You Were Dying - Tim McGraw - Genre: Country
Save Me - Jelly Roll - Genre: Rap
Let's Go - Key Glock - Genre: Hip-Hop / Rap
Yeah! (feat. Lil Jon & Ludacris) - USHER - Genre: R&B / Soul
Lil Boo Thang - Paul Russell - Genre: Pop
Cruel Summer - Taylor Swift - Genre: Pop

In [13]:
presis_df = pd.DataFrame({"title":titles,
                          "artist":artists,
                          "genre": genres})
presis_df

Unnamed: 0,title,artist,genre
0,TEXAS HOLD 'EM,Beyoncé,Genre: Country
1,Lose Control,Teddy Swims,Genre: Pop
2,Beautiful Things,Benson Boone,Genre: Pop
3,Flowers,Miley Cyrus,Genre: Pop
4,Turn the Lights Back On,Billy Joel,Genre: Pop
...,...,...,...
95,Tell It To My Heart,Cash Cash & Taylor Dayne,Genre: Dance
96,"We Takin' Over (feat. Akon, T.I., Rick Ross, F...",DJ Khaled,Genre: Hip-Hop / Rap
97,Used To Be Young,Miley Cyrus,Genre: Pop
98,Heart Like a Truck,Lainey Wilson,Genre: Country


In [16]:
import re
from datetime import datetime

if not song_elements:
    print("No song elements found. Please check if the website structure has changed.")
    exit()

data = []
for i, element in enumerate(song_elements):
    title = element.find('cite', class_='title').text.strip()
    artist = element.find('em', class_='artist').text.strip()

    # Extracting genre
    genre = "Genre not found"
    ul_tag = element.find('ul')
    if ul_tag:
        for li_tag in ul_tag.find_all('li'):
            if 'Genre' in li_tag.text:
                genre = li_tag.text.strip().replace('Genre:', '')
                break
                
    # Extracting release date using regular expression
    release_date_text = ""
    release_date_match = re.search(r'Release Date: (\w+ \d{1,2}, \d{4})', element.get_text())
    if release_date_match:
        release_date_text = release_date_match.group(1)
        
        # Convert release date to date format
        release_date = datetime.strptime(release_date_text, "%B %d, %Y").strftime("%d/%m/%Y")

    data.append([title, artist, genre, release_date])

df = pd.DataFrame(data, columns=['title', 'artist', 'genre', 'release_date'])
display(df)

Unnamed: 0,title,artist,genre,release_date
0,TEXAS HOLD 'EM,Beyoncé,Country,11/02/2024
1,Lose Control,Teddy Swims,Pop,23/06/2023
2,Beautiful Things,Benson Boone,Pop,19/01/2024
3,Flowers,Miley Cyrus,Pop,12/01/2023
4,Turn the Lights Back On,Billy Joel,Pop,01/02/2024
...,...,...,...,...
95,Tell It To My Heart,Cash Cash & Taylor Dayne,Dance,15/09/2023
96,"We Takin' Over (feat. Akon, T.I., Rick Ross, F...",DJ Khaled,Hip-Hop / Rap,27/03/2007
97,Used To Be Young,Miley Cyrus,Pop,25/08/2023
98,Heart Like a Truck,Lainey Wilson,Country,20/05/2022


# Lab | Web Scraping Single Page (GNOD part 2)¶

In [22]:
import random

In [26]:
# Function to recommend a random song if the input song is found in the DataFrame
def recommend_song(song_title):
    # Check if the song is in the DataFrame (case insensitive)
    if df['title'].str.lower().str.contains(song_title.lower()).any():
        # Get a random index that is not the index of the input song
        random_index = df[df['title'].str.lower() != song_title.lower()].sample().index[0]
        # Get the random song and artist
        random_song = df.loc[random_index, 'title']
        random_artist = df.loc[random_index, 'artist']
        return f"Thank you for the input and now we recommend \"{random_song}\" by {random_artist}"
    else:
        return "Thank you for the input but we currently have no recommendation for this song."

# Loop to recommend songs for 5 times
for i in range(5):
    input_song = input("Enter a song title: ")
    recommendation = recommend_song(input_song)
    print(recommendation)

Enter a song title: flowers
Thank you for the input and now we recommend "Good Day" by Forrest Frank
Enter a song title: Flowers
Thank you for the input and now we recommend "Fast Car" by Tracy Chapman
Enter a song title: flower
Thank you for the input and now we recommend "Dance You Outta My Head" by Cat Janice
Enter a song title: greedy
Thank you for the input and now we recommend "EASY" by LE SSERAFIM
Enter a song title: no greedy
Thank you for the input but we currently have no recommendation for this song.
