In [4]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from bs4 import BeautifulSoup
import pandas as pd
import time
import collections
from time import sleep
from functools import wraps

from IPython.display import clear_output

import requests

In [5]:
def timeit(func):
    """Decorator that prints how long each call to ``func`` took.

    The wrapped function's return value is passed through unchanged.
    The timing line is printed only when ``func`` returns normally; if it
    raises, the exception propagates and nothing is printed.
    """
    @wraps(func)
    def timed_call(*args, **kwargs):
        started = time.perf_counter()
        outcome = func(*args, **kwargs)
        elapsed = time.perf_counter() - started
        # Only positional arguments are echoed in the report line.
        print(f'Function {func.__name__} {args} Took {elapsed:.4f} seconds')
        return outcome
    return timed_call

# Deezer API

In [6]:
# Deezer search endpoint. get_track_genres and get_track_additional_infos
# send their GET requests here with the "q" (query) and "limit" parameters.
base_url = "https://api.deezer.com/search"

In [7]:
def get_album_genre(album_id):
    """Return the genre names Deezer lists for the album ``album_id``.

    Fetches ``https://api.deezer.com/album/{album_id}`` and collects the
    ``name`` of every entry under ``genres -> data``.

    Raises ``KeyError`` when the JSON payload has no ``genres``/``data``
    entry; both callers in this notebook catch it.
    NOTE(review): presumably an API error payload also ends up here as a
    KeyError - confirm against the Deezer API.
    """
    payload = requests.get(f"https://api.deezer.com/album/{album_id}").json()
    return [entry['name'] for entry in payload['genres']['data']]

In [28]:
def get_track_genres(title, artist):
    """Return the genres of the album of the first Deezer hit for a track.

    The search query is the title and the artist joined by a tab character,
    and only the first result is requested (``limit=1``).

    Returns an empty list when the lookup raises ``IndexError`` or
    ``KeyError`` (no result, or an expected key missing from either the
    search payload or the album payload).
    """
    search_params = {"q": f"{title}\t{artist}", "limit": 1}
    payload = requests.get(base_url, params=search_params).json()

    try:
        first_hit = payload['data'][0]
        return get_album_genre(first_hit['album']['id'])
    except (IndexError, KeyError):
        return []

In [75]:
def get_track_additional_infos(title, artist):
    """Look a track up on Deezer and return its explicit flag, duration and genres.

    The search query is the title and the artist joined by a tab character,
    and only the first result is requested (``limit=1``).

    Returns a dict with three keys:
      * ``explicit_lyrics`` - 1 or 0, from the truthiness of the hit's field
      * ``duration``        - the hit's ``duration`` converted with ``int``
      * ``genres``          - list of genre names of the hit's album

    When the lookup raises ``IndexError`` or ``KeyError`` the dict is
    ``{'explicit_lyrics': None, 'duration': None, 'genres': []}``.
    NOTE(review): a payload without a ``data`` key is treated exactly like
    "no match" - confirm that this is intended for API error responses.
    """
    search_params = {"q": f"{title}\t{artist}", "limit": 1}
    payload = requests.get(base_url, params=search_params).json()

    try:
        first_hit = payload['data'][0]
        # Values are evaluated top to bottom; any KeyError discards the
        # partial result and falls through to the placeholder dict.
        return {
            'explicit_lyrics': int(bool(first_hit['explicit_lyrics'])),
            'duration': int(first_hit['duration']),
            'genres': get_album_genre(first_hit['album']['id']),
        }
    except (IndexError, KeyError):
        return {'explicit_lyrics': None, 'duration': None, 'genres': []}

# HOT100

In [9]:
# Absolute, machine-specific locations of the chromedriver executable.
# chromedriver_path is the one handed to ChromeService further down.
# NOTE(review): these paths only exist on the author's Windows machines;
# adjust them (or make them configurable) before running elsewhere.
chromedriver_path = r"C:/Users/Aymeric Leboucher/OneDrive - De Vinci/ESILV/A5/Webscrapping/TP2/chromedriver.exe"
# Alternative location; not referenced by the visible cells of this notebook.
chromedriver_path_erget = r"C:/Users/AymericLEBOUCHER/OneDrive - Groupe ERGET/Ressources/ESILV/Webscrapping/chromedriver_win32/chromedriver.exe"

In [10]:
def get_page_source(link, driver):
    """Navigate ``driver`` to ``link`` and return the page source it then exposes."""
    driver.get(link)
    return driver.page_source

In [11]:
def page_parser(page):
    """Parse ``page`` (HTML text) with the lxml parser and return the chart container.

    The container is the first ``div`` whose ``class`` attribute matches the
    class string below; the result of ``find`` is returned as is.
    """
    chart_container_class = "chart-results-list // lrv-u-padding-t-150 lrv-u-padding-t-050@mobile-max"
    soup = BeautifulSoup(page, "lxml")
    return soup.find("div", attrs={"class": chart_container_class})

In [12]:
# <h3> captions that label credits rather than song titles.
to_remove_from_song_titles = [
    'Imprint/Promotion Label:',
    'Songwriter(s):',
    'Producer(s):',
]

def extract_song_titles(table):
    """Return the song titles found in the ``<h3>`` elements of ``table``.

    The first two ``<h3>`` elements are skipped, every remaining text is
    stripped, and texts listed in ``to_remove_from_song_titles`` are dropped.
    """
    headings = table.find_all("h3")
    titles = []
    for heading in headings[2:]:
        text = heading.text.strip()
        if text in to_remove_from_song_titles:
            continue
        titles.append(text)
    return titles

In [46]:

def extract_artists(table):
    """Return the artist names found in the ``<span>`` elements of ``table``.

    Every span text is stripped; a text is kept unless it is one of the
    known non-artist captions below or consists only of digits.
    """
    non_artist_captions = {
        "",
        'Share Chart on Twitter',
        'Share Chart on Embed',
        'This Week',
        'Award',
        'Last Week',
        'Peak Pos.',
        'Wks on Chart',
        'Twitter',
        'Share Chart on Copy Link',
        'Copy Link',
        'Share Chart on Facebook',
        'Facebook',
        'Embed',
        'RIAA Certification:',
        'Diamond',
        "Platinum",
        'Platinum x2',
        'Platinum x3',
        'Platinum x4',
        'Platinum x5',
        'Platinum x6',
        'Platinum x7',
        'Platinum x8',
        'Platinum x9',
        'RE-\nENTRY',
        'Gold',
        'NEW',
        "-",
    }

    kept = []
    for span in table.find_all("span"):
        text = span.text.strip()
        # Captions and purely numeric cells are not artist names.
        if text in non_artist_captions or text.isdigit():
            continue
        kept.append(text)
    return kept

In [14]:
def convert_to_top_100(songlist, artists):
    """Combine titles and artists into a two-column DataFrame.

    Both inputs are turned into Series and placed side by side, so row *i*
    holds ``songlist[i]`` in ``title`` and ``artists[i]`` in ``artist``.
    """
    title_column = pd.Series(songlist)
    artist_column = pd.Series(artists)
    chart = pd.concat([title_column, artist_column], axis=1)
    return chart.rename(columns={0: "title", 1: "artist"})

In [63]:
def additional_infos(row):
    """Fill one chart row with the genres, duration and explicit flag from Deezer.

    Intended for ``DataFrame.apply(additional_infos, axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        A chart row providing the ``title`` and ``artist`` labels.

    Returns
    -------
    pandas.Series
        The same row with ``genres``, ``duration`` and ``explicit_content``
        set from ``get_track_additional_infos``.
    """
    track_infos = get_track_additional_infos(row["title"], row["artist"])
    # Item assignment instead of attribute assignment: `row.x = value` only
    # writes data when label `x` already exists, otherwise pandas sets a plain
    # Python attribute and the value is lost. Item access also cannot collide
    # with Series attribute or method names.
    row["genres"] = track_infos["genres"]
    row["duration"] = track_infos["duration"]
    row["explicit_content"] = track_infos["explicit_lyrics"]
    return row

In [50]:
def genres(row):
    """Fill the ``genres`` entry of one chart row from Deezer.

    Intended for ``DataFrame.apply(genres, axis=1)``.

    Parameters
    ----------
    row : pandas.Series
        A chart row providing the ``title`` and ``artist`` labels.

    Returns
    -------
    pandas.Series
        The same row with ``genres`` set to the list returned by
        ``get_track_genres``.
    """
    # Item assignment instead of attribute assignment: `row.genres = value`
    # only writes data when the label already exists, otherwise pandas sets a
    # plain Python attribute and the value is lost.
    row["genres"] = get_track_genres(row["title"], row["artist"])
    return row

In [79]:
# Chrome options used by get_top_100 for every browser it starts.
options = webdriver.ChromeOptions()
options.add_argument('-headless')

# Driver service pointing at the chromedriver executable; it is the default
# value of get_top_100's `service` parameter.
service = ChromeService(executable_path=chromedriver_path)

@timeit
def get_top_100(year, month, day, service=service, verbose=0, additional=False):
    """Scrape the Billboard Hot 100 of a given date and enrich it through Deezer.

    Parameters
    ----------
    year, month, day : int or str
        Chart date. ``month`` and ``day`` are left-padded with a zero to two
        characters before being inserted into the chart URL.
    service : ChromeService
        Driver service used to start Chrome; defaults to the module-level one.
    verbose : int
        When equal to 1, the genre frequency table is returned as well.
    additional : bool
        When True, ``duration`` and ``explicit_content`` are filled in
        addition to ``genres`` (via ``additional_infos``); otherwise only
        ``genres`` is filled (via ``genres``).

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``artist``, ``genres`` (plus ``duration`` and
        ``explicit_content`` when ``additional`` is True).
    tuple of (pandas.DataFrame, dict)
        Only when ``verbose == 1``: the frame and a dict mapping each genre
        to its number of occurrences, ordered by descending count.
    """
    link = (
        "https://www.billboard.com/charts/hot-100/"
        f"{year}-{str(month).zfill(2)}-{str(day).zfill(2)}/"
    )

    driver = webdriver.Chrome(service=service, options=options)
    try:
        src = get_page_source(link, driver)
    finally:
        # Always shut the browser down; otherwise every call leaves a Chrome
        # and chromedriver process running.
        driver.quit()

    table = page_parser(src)
    songlist = extract_song_titles(table)
    artists = extract_artists(table)
    top_100 = convert_to_top_100(songlist, artists)

    # The enrichment helpers fill existing columns, so create them first.
    top_100['genres'] = None
    if additional:
        top_100['duration'] = None
        top_100['explicit_content'] = None
        top_100 = top_100.apply(additional_infos, axis=1)
    else:
        top_100 = top_100.apply(genres, axis=1)

    if verbose == 1:
        # Flatten the per-track genre lists; unlike `.sum()` this also works
        # on an empty chart (where the sum would be the integer 0).
        freq = collections.Counter(
            genre for track_genres in top_100['genres'] for genre in track_genres
        )
        return top_100, dict(freq.most_common())
    return top_100

In [None]:
# Example run: chart of 1989-02-01 with duration, explicit flag and genres.
# verbose is left at 0, so the call returns only the DataFrame, which the
# notebook displays as the cell output.
get_top_100(1989, 2, 1, additional=True)

# HOT100 Generator with Genres

In [None]:
# Scrape the chart of the 1st of every month from 1978 through 2021 and write
# one semicolon-separated CSV per month.
for annee in range(1978, 2022):
    for month in range(1, 13):
        top_100, top_style = get_top_100(annee, month, 1, verbose=1, additional=True)
        # top_style is ordered by descending count, so its first key is the
        # dominant genre. It is empty when no track of the chart got a genre;
        # print None in that case instead of aborting the whole run.
        print("Top style: ", next(iter(top_style), None))
        top_100.to_csv(f"C:/Users/AymericLEBOUCHER/OneDrive - Groupe ERGET/Ressources/ESILV/Webscrapping/month_top_100/{annee}-{month}.csv", sep=";")