# 1 - Data Collection

To develop this project, we will use two datasets:
A. *Hot100*: the top 100 songs on the [Billboard Hot 100 chart](https://www.billboard.com/charts/hot-100), scraped with BeautifulSoup

B. *MillionSongSubset*: a 10,000-song subset of the well-known [MillionSongDataset](http://millionsongdataset.com)


## Import Libraries

In [3]:
import os
import pandas as pd
import requests
import glob

import hdf5_getters

from bs4 import BeautifulSoup

## 1. Hot100 Dataset

In [4]:
# Function to scrape data from the web

def scrape_hot100(
    url: str = "https://www.billboard.com/charts/hot-100",
    timeout: float = 10.0,
) -> pd.DataFrame:
    '''

    This function downloads a Billboard chart page and returns a DataFrame
    including the song title and artist of every entry found in it.

    Input:
    url: address of the chart page to scrape (defaults to the Hot 100 chart).
    timeout: seconds to wait for the server before giving up.

    Output:
    Pandas DataFrame with the columns "title" and "artist", one row per
    entry, in the order the entries appear in the page.

    Raises:
    requests.RequestException if the page cannot be downloaded (connection
    problem, timeout, or an HTTP error status).
    ValueError if the page yields a different number of titles and artists.

    '''

    # Request the url data; without a timeout a stalled server would block
    # the call forever
    response = requests.get(url, timeout=timeout)

    # Stop here on 4xx/5xx answers instead of parsing an error page
    response.raise_for_status()

    # Create soup
    soup = BeautifulSoup(response.text, "html.parser")

    # Define select strings
    title_select = 'div.chart-results-list .c-title.a-truncate-ellipsis'
    # NOTE(review): "a-no-trucate" looks misspelled, but it is presumably the
    # class name used by the page itself -- confirm against the live markup
    # before "correcting" it.
    artist_select = 'div.chart-results-list .c-label.a-no-trucate'

    # Parse song titles and artists
    titles = [tag.get_text().strip() for tag in soup.select(title_select)]
    artists = [tag.get_text().strip() for tag in soup.select(artist_select)]

    # The two lists are paired by position, so they must have the same length
    if len(titles) != len(artists):
        raise ValueError(
            f"Scraped {len(titles)} titles but {len(artists)} artists from "
            f"{url}; the page layout may have changed."
        )

    # Create DataFrame
    return pd.DataFrame({"title": titles, "artist": artists})


In [None]:
# Scrape the Hot 100 songs into a DataFrame
hot100_df = scrape_hot100()

# Write the raw result to a semicolon-separated CSV file, without the row index
hot100_df.to_csv('../data/1_hot100_raw.csv', sep=";", index=False)

# Preview the first 5 rows
hot100_df.head(5)

Unnamed: 0,title,artist
0,Vampire,Olivia Rodrigo
1,Paint The Town Red,Doja Cat
2,I Remember Everything,Zach Bryan Featuring Kacey Musgraves
3,Fast Car,Luke Combs
4,Cruel Summer,Taylor Swift


## 2. MillionSongSubset

In [6]:
# Define function to get all song titles and artists from MillionSongSubset folder (previously downloaded)

def get_million_song_subset(basedir, ext='.h5', *, decode=False):
    '''

    This function returns all song titles and artists from the
    MillionSongSubset folder.

    The folder is walked recursively. Every entry whose name matches
    '*' + ext is opened with hdf5_getters, and its title and artist name
    are read.

    Input:
    basedir: root folder of the (previously downloaded) MillionSongSubset.
    ext: ending of the file names to read. It is appended to a '*' wildcard,
        so both '.h5' and 'h5' match '*.h5' files.
    decode: keyword-only. When True, values returned as bytes are decoded as
        UTF-8 text. The default (False) returns them exactly as
        hdf5_getters provides them.

    Output:
    Tuple (titles, artists) with two lists of equal length; position i of
    both lists refers to the same file.

    Raises:
    UnicodeDecodeError if decode is True and a value is not valid UTF-8.

    '''

    titles = []
    artists = []
    for root, _dirs, _files in os.walk(basedir):
        # Escape the folder part so characters such as '[' or '?' in a path
        # are matched literally instead of being read as wildcards
        pattern = os.path.join(glob.escape(root), '*' + ext)
        for path in glob.glob(pattern):
            h5 = hdf5_getters.open_h5_file_read(path)
            try:
                title = hdf5_getters.get_title(h5)
                artist = hdf5_getters.get_artist_name(h5)
            finally:
                # Release the file handle even when reading a field fails
                h5.close()
            if decode:
                if isinstance(title, bytes):
                    title = title.decode('utf-8')
                if isinstance(artist, bytes):
                    artist = artist.decode('utf-8')
            titles.append(title)
            artists.append(artist)
    return titles, artists


In [7]:
# Walk the MillionSongSubset folder and gather every title and artist
mss_titles, mss_artists = get_million_song_subset(
    basedir='../Data/MillionSongSubset',
    ext='h5',
)

In [10]:
# Combine the collected titles and artists into a two-column DataFrame
mss_df = pd.DataFrame(data={'title': mss_titles, 'artist': mss_artists})

# Write the raw table to a semicolon-separated CSV file, without the row index
mss_df.to_csv('../data/2_million_song_subset_raw.csv', sep=";", index=False)

# Preview the first 5 rows
mss_df.head(5)

Unnamed: 0,title,artist
0,b'Je Sais Que La Terre Est Plate',b'Rapha\xc3\xabl'
1,b'On Efface',b'Julie Zenatti'
2,b'Howells Delight',b'The Baltimore Consort'
3,b'Martha Served',b'I Hate Sally'
4,b'Zip-A-Dee-Doo-Dah (Song of the South)',b'Orlando Pops Orchestra'
