# Imports

In [26]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromeService
from bs4 import BeautifulSoup
import pandas as pd
import time
import collections
from time import sleep
from functools import wraps

from tqdm import tqdm
tqdm.pandas()

from IPython.display import clear_output

import requests

In [4]:
def timeit(func):
    """Decorator that reports how long each call to ``func`` takes.

    After the wrapped function returns, one line is printed containing the
    function name, its positional arguments and the elapsed time measured
    with ``time.perf_counter``. Keyword arguments are forwarded to ``func``
    but do not appear in the printed line. The return value of ``func`` is
    passed through unchanged; nothing is printed if ``func`` raises.
    """
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        started = time.perf_counter()
        outcome = func(*args, **kwargs)
        elapsed = time.perf_counter() - started
        print(f'Function {func.__name__} {args} Took {elapsed:.4f} seconds')
        return outcome
    return timeit_wrapper

# Deezer API

In [5]:
# Deezer search endpoint. deezer_query() and get_track_genres() send their
# GET requests (with "q" and "limit" parameters) to this URL.
base_url = "https://api.deezer.com/search"

In [94]:
def deezer_query(title, artist):
    """Search Deezer for a track and return the decoded JSON response.

    The search text is the title and the artist joined by a tab character,
    and the request asks for at most one result (``limit=1``).
    """
    # "\t" is the tab character that separates the two search terms.
    search_text = f"{title}\t{artist}"
    reply = requests.get(base_url, params={"q": search_text, "limit": 1})
    return reply.json()

In [6]:
def get_album_genre(album_id):
    """Return the list of genre names Deezer reports for album ``album_id``.

    The names are read from ``response['genres']['data'][i]['name']``; a
    response without those keys lets the lookup error (``KeyError`` for a
    dict payload) propagate to the caller.
    """
    reply = requests.get(f"https://api.deezer.com/album/{album_id}")
    album = reply.json()
    return [entry['name'] for entry in album['genres']['data']]

In [7]:
def get_track_genres(title, artist):
    """Return the album genres of the first Deezer search hit for a song.

    The search text is the title and the artist joined by a tab character and
    only one result is requested. An empty list is returned when the search
    has no usable hit or when the album lookup raises ``IndexError`` or
    ``KeyError``.
    """
    # "\t" is the tab character that separates the two search terms.
    search_text = f"{title}\t{artist}"
    reply = requests.get(base_url, params={"q": search_text, "limit": 1})
    payload = reply.json()

    try:
        first_hit = payload['data'][0]
        # The album lookup stays inside the try so its lookup errors also
        # fall back to an empty genre list.
        return get_album_genre(first_hit['album']['id'])
    except (IndexError, KeyError):
        return []

In [95]:
def get_track_additional_infos(title, artist, genre=True):
    """Look a song up on Deezer and return its explicit flag, duration and genres.

    Parameters
    ----------
    title, artist
        Search terms passed to ``deezer_query``.
    genre : bool, default True
        When true, the genres of the track's album are fetched with
        ``get_album_genre`` and stored under ``'genres'``.

    Returns
    -------
    dict
        ``'explicit_lyrics'`` (1 or 0) and ``'duration'`` (int) taken from the
        first search hit, plus ``'genres'`` (list of names) when ``genre`` is
        true. When the search yields no usable hit the dict is
        ``{'explicit_lyrics': None, 'duration': None, 'genres': []}``.

    A failing album lookup no longer throws away the track-level fields:
    the duration and explicit flag are kept and ``'genres'`` falls back to
    an empty list.
    """
    data = deezer_query(title, artist)
    try:
        track = data['data'][0]
        track_infos = {
            'explicit_lyrics': int(bool(track['explicit_lyrics'])),
            'duration': int(track['duration']),
        }
    except (IndexError, KeyError):
        # No hit (or a hit without the expected fields): report everything as missing.
        return {'explicit_lyrics': None, 'duration': None, 'genres': []}

    if genre:
        # Kept separate from the block above so that a missing album or genre
        # entry only empties the genres instead of discarding the fields
        # that were already found on the track.
        try:
            track_infos['genres'] = get_album_genre(track['album']['id'])
        except (IndexError, KeyError):
            track_infos['genres'] = []

    return track_infos

# HOT100

In [9]:
# Absolute, machine-specific locations of the ChromeDriver executable.
# chromedriver_path is the one passed to the Selenium ChromeService below;
# chromedriver_path_erget is not referenced in the visible part of this notebook.
# NOTE(review): hardcoded per-user paths are not portable — consider a configurable location.
chromedriver_path = r"C:/Users/Aymeric Leboucher/OneDrive - De Vinci/ESILV/A5/Webscrapping/TP2/chromedriver.exe"
chromedriver_path_erget = r"C:/Users/AymericLEBOUCHER/OneDrive - Groupe ERGET/Ressources/ESILV/Webscrapping/chromedriver_win32/chromedriver.exe"

In [10]:
def get_page_source(link, driver):
    """Navigate ``driver`` to ``link`` and return the page source it exposes."""
    driver.get(link)
    return driver.page_source

In [11]:
def page_parser(page):
    """Parse chart HTML with the lxml parser and return the results container.

    The container is the first ``div`` found for the chart-results class
    string below; whatever ``BeautifulSoup.find`` returns is handed back
    unchanged.
    """
    container_attrs = {"class": "chart-results-list // lrv-u-padding-t-150 lrv-u-padding-t-050@mobile-max"}
    soup = BeautifulSoup(page, "lxml")
    return soup.find("div", attrs=container_attrs)

In [12]:
# <h3> texts inside the chart container that are credit labels, not song titles.
to_remove_from_song_titles = [
    'Imprint/Promotion Label:',
    'Songwriter(s):',
    'Producer(s):',
]

def extract_song_titles(table):
    """Return the song titles found in the chart container ``table``.

    Every ``h3`` element is collected, the first two are skipped, the text of
    each remaining one is stripped, and texts listed in
    ``to_remove_from_song_titles`` are dropped. Order is preserved.
    """
    headings = table.find_all("h3")
    titles = []
    for heading in headings[2:]:
        text = heading.text.strip()
        if text not in to_remove_from_song_titles:
            titles.append(text)
    return titles

In [13]:

def extract_artists(table):
    """Return the artist names found in the chart container ``table``.

    The stripped text of every ``span`` element is kept, in document order,
    unless it is one of the known non-artist texts listed below or consists
    only of digits.
    """
    # Span texts that are not artist names (sharing links, column headers,
    # certification badges, chart-movement markers, empty spans).
    to_remove_from_artists = [
        "",
        'Share Chart on Twitter',
        'Share Chart on Embed',
        'This Week',
        'Award',
        'Last Week',
        'Peak Pos.',
        'Wks on Chart',
        'Twitter',
        'Share Chart on Copy Link',
        'Copy Link',
        'Share Chart on Facebook',
        'Facebook',
        'Embed',
        'RIAA Certification:',
        'Diamond',
        "Platinum",
        'Platinum x2',
        'Platinum x3',
        'Platinum x4',
        'Platinum x5',
        'Platinum x6',
        'Platinum x7',
        'Platinum x8',
        'Platinum x9',
        'RE-\nENTRY',
        'Gold',
        'NEW',
        "-",
    ]

    kept = []
    for span in table.find_all("span"):
        text = span.text.strip()
        if text in to_remove_from_artists or text.isdigit():
            continue
        kept.append(text)
    return kept

In [14]:
def convert_to_top_100(songlist, artists):
    """Build a two-column DataFrame (``title``, ``artist``) from two lists.

    The lists are paired by position; when their lengths differ the shorter
    column is padded with missing values by the column-wise concatenation.
    """
    title_column = pd.Series(songlist)
    artist_column = pd.Series(artists)
    paired = pd.concat([title_column, artist_column], axis=1)
    return paired.rename(columns={0: "title", 1: "artist"})

In [15]:
def additional_infos(row):
    """Fill a chart row with Deezer genres, duration and explicit flag.

    The track is looked up with ``get_track_additional_infos(row.title,
    row.artist)`` and the result is stored in ``row.genres``,
    ``row.duration`` and ``row.explicit_content``. The same row is returned
    so the function can be used with ``DataFrame.apply(..., axis=1)``.
    """
    fetched = get_track_additional_infos(row.title, row.artist)
    # (row attribute, key in the lookup result): the "explicit_lyrics" value
    # is stored in the "explicit_content" column.
    targets = (
        ("genres", "genres"),
        ("duration", 'duration'),
        ("explicit_content", 'explicit_lyrics'),
    )
    for attribute, key in targets:
        setattr(row, attribute, fetched[key])
    return row

In [16]:
def genres(row):
    """Store the Deezer genres for the row's title and artist in ``row.genres`` and return the row."""
    looked_up = get_track_genres(row.title, row.artist)
    row.genres = looked_up
    return row

In [17]:
# Shared Selenium configuration: the ChromeDriver service and headless Chrome
# options used by get_top_100().
service = ChromeService(executable_path=chromedriver_path)
options = webdriver.ChromeOptions()
options.add_argument('-headless')

@timeit
def get_top_100(year, month, day, service=service, verbose=0, additional=False):
    """Scrape the Billboard Hot 100 chart of a given date and enrich it via Deezer.

    Parameters
    ----------
    year, month, day
        Date of the chart; month and day are zero-padded to two digits when
        building the URL.
    service
        Selenium Chrome service used to start the browser. Defaults to the
        module-level ``service`` object as it exists when this function is
        defined.
    verbose : int, default 0
        When equal to 1, a genre frequency dict is returned as well.
    additional : bool, default False
        When true, ``duration`` and ``explicit_content`` columns are added and
        filled (together with ``genres``) by ``additional_infos``; otherwise
        only ``genres`` is filled, by ``genres``.

    Returns
    -------
    pandas.DataFrame, or (pandas.DataFrame, dict) when ``verbose == 1``
        The chart with ``title``, ``artist`` and ``genres`` columns (plus the
        additional ones when requested). The dict maps each genre to its
        number of occurrences, sorted by decreasing count.
    """
    link = f"https://www.billboard.com/charts/hot-100/{year}-{str(month).zfill(2)}-{str(day).zfill(2)}/"

    driver = webdriver.Chrome(service=service, options=options)
    try:
        src = get_page_source(link, driver)
    finally:
        # The browser is only needed to fetch the page. Quit it even when the
        # fetch fails, otherwise every call leaves a Chrome process behind.
        driver.quit()

    table = page_parser(src)
    songlist = extract_song_titles(table)
    artists = extract_artists(table)
    top_100 = convert_to_top_100(songlist, artists)

    # Columns must exist before apply(): the row helpers assign to them.
    top_100['genres'] = None
    if additional:
        top_100['duration'] = None
        top_100['explicit_content'] = None
        top_100 = top_100.apply(additional_infos, axis=1)
    else:
        top_100 = top_100.apply(genres, axis=1)

    if verbose == 1:
        # Summing the per-row genre lists concatenates them into one list.
        freq = collections.Counter(top_100.genres.sum())
        return top_100, dict(sorted(freq.items(), key=lambda x: x[1], reverse=True))
    return top_100

# HOT100 Generator with Genres

In [24]:
# Scrape the chart dated the 1st of each month from September to December 1958
# (with the additional Deezer columns) and write one ';'-separated CSV per month.
for annee in range(1958, 1959):
    for month in range(9, 13):
        csv_path = f"C:/Users/AymericLEBOUCHER/OneDrive - Groupe ERGET/Ressources/ESILV/Webscrapping/month_top_100/{annee}-{month}.csv"
        top_100 = get_top_100(annee, month, 1, additional=True)
        top_100.to_csv(csv_path, sep=";")

Function get_top_100 (1958, 9, 1) Took 56.1519 seconds
Function get_top_100 (1958, 10, 1) Took 47.6068 seconds
Function get_top_100 (1958, 11, 1) Took 48.0370 seconds
Function get_top_100 (1958, 12, 1) Took 45.9378 seconds


# Completion with additional infos

In [74]:
def add_additional_infos(row):
    """Fill ``row.duration`` and ``row.explicit_content``, using ``songs_data`` as a cache.

    Songs are identified by the ``(title, artist)`` tuple. When that key is
    already present in the global ``songs_data`` table, the values of its
    first matching row are copied. Otherwise Deezer is queried (without
    genres) and a copy of the completed row, tagged with its key under
    ``"id"``, is appended to ``songs_data``. The row is returned in both cases.
    """
    global songs_data
    song_key = (row.title, row.artist)

    if song_key not in list(songs_data.id):
        # Unknown song: query Deezer, then remember the result for later rows.
        infos = get_track_additional_infos(row.title, row.artist, genre=False)
        row.duration = infos["duration"]
        row.explicit_content = infos["explicit_lyrics"]
        song_data = row.copy()
        song_data["id"] = song_key
        songs_data.loc[len(songs_data)] = song_data
        return row

    # Known song: reuse the first cached row that carries this key.
    cached = songs_data[songs_data.id == song_key].iloc[0]
    row.duration = cached.duration
    row.explicit_content = cached.explicit_content
    return row

In [78]:
# Re-open every monthly chart from 1960 to 1971, fill in duration and explicit
# flag, drop the first column of the resulting frame and overwrite the same CSV.
for annee in tqdm(range(1960, 1972)):
    for month in range(1, 13):
        # The file is read from and written back to the same location.
        csv_path = f"C:/Users/AymericLEBOUCHER/OneDrive - Groupe ERGET/Ressources/ESILV/Webscrapping/month_top_100/{annee}-{month}.csv"
        top_100 = pd.read_csv(csv_path, sep=";")
        top_100['duration'] = None
        top_100['explicit_content'] = None
        top_100 = top_100.apply(add_additional_infos, axis=1)
        del top_100[top_100.columns[0]]
        top_100.to_csv(csv_path, sep=";")

100%|██████████| 12/12 [24:38<00:00, 123.20s/it]


# Songs Dataset

In [65]:
# Empty table of distinct songs. add_additional_infos() and enrich_songs_data()
# access it through `global songs_data`, look songs up by the (title, artist)
# tuple stored in "id", and append new rows to it.
# NOTE(review): this cell appears below cells that already use songs_data (its
# execution count, 65, is lower than theirs) — on a fresh top-to-bottom run it
# has to be executed before them.
songs_data = pd.DataFrame(columns=["id", "title", "artist", "genres", "duration", "explicit_content"])

In [None]:
def enrich_songs_data(row):
    """Append ``row`` to the global ``songs_data`` table unless its song is already there.

    The row is tagged with its ``(title, artist)`` tuple under ``"id"``; it is
    appended only when that key is not yet among ``songs_data.id``. Nothing is
    returned.
    """
    global songs_data
    song_key = (row.title, row.artist)
    row["id"] = song_key
    already_known = song_key in list(songs_data.id)
    if not already_known:
        songs_data.loc[len(songs_data)] = row

# Feed every monthly chart from 1972 to 2021 into songs_data, one row at a time.
for annee in tqdm(range(1972, 2022)):
    for month in range(1, 13):
        csv_path = f"C:/Users/AymericLEBOUCHER/OneDrive - Groupe ERGET/Ressources/ESILV/Webscrapping/month_top_100/{annee}-{month}.csv"
        top_100 = pd.read_csv(csv_path, sep=";")
        top_100.apply(enrich_songs_data, axis=1)

In [129]:
def search_additional_infos(row):
    """Retry the Deezer lookup for a row using a shortened title and artist.

    Both the title and the artist are cut down to their first two
    whitespace-separated words when they contain more than two; values
    without a ``split`` method (for example missing values) are passed to
    the lookup unchanged. The result of ``get_track_additional_infos`` is
    stored in ``row.genres``, ``row.duration`` and ``row.explicit_content``
    and the row is returned.

    The title is now derived from ``row.title``. It was previously
    initialised from ``row.artist``, so titles of one or two words were
    searched as the artist name.
    """
    def _first_two_words(value):
        # Keep at most the first two words; leave short or non-string values untouched.
        try:
            words = value.split()
        except AttributeError:
            return value
        if len(words) > 2:
            return " ".join(words[:2])
        return value

    artist = _first_two_words(row.artist)
    title = _first_two_words(row.title)

    track_infos = get_track_additional_infos(title, artist)
    row.genres = track_infos["genres"]
    row.duration = track_infos['duration']
    row.explicit_content = track_infos['explicit_lyrics']
    return row


In [None]:
# Keep only the rows whose duration is still missing, then retry each of them
# with search_additional_infos (progress_apply shows a tqdm progress bar).
# NOTE(review): unfound_songs is not defined anywhere in the visible notebook —
# this cell depends on kernel state; confirm where it is created.
unfound_songs = unfound_songs[unfound_songs.duration.isnull()]
unfound_songs = unfound_songs.progress_apply(search_additional_infos, axis=1)

In [130]:
# Display the songs whose duration is still missing.
unfound_songs[unfound_songs.duration.isnull()]

Unnamed: 0,id,title,artist,genres,duration,explicit_content
77,"(Little Brass Band, David Seville)",Little Brass Band,David Seville,[],,
94,"(Old MacDonald, The Chargers)",Old MacDonald,The Chargers,[],,
195,"(Intermission Riff, Bernie Lowe Orchestra)",Intermission Riff,Bernie Lowe Orchestra,[],,
205,"(The Teen Commandments, Paul Anka-Geo. Hamilto...",The Teen Commandments,Paul Anka-Geo. Hamilton IV-Johnny Nash,[],,
209,"(A House, A Car And A Wedding Ring, Mike Preston)","A House, A Car And A Wedding Ring",Mike Preston,[],,
...,...,...,...,...,...,...
28555,"(Absolutely Right, Five Man Electrical Band)",Absolutely Right,Five Man Electrical Band,[],,
28573,"(Gimme Some Lovin'-Pt. 1, Traffic, Etc.)",Gimme Some Lovin'-Pt. 1,"Traffic, Etc.",[],,
28586,"(If It's Alright With You, Rose Colored Glass)",If It's Alright With You,Rose Colored Glass,[],,
28587,"(I Bet He Don't Love You (Like I Love You), Th...",I Bet He Don't Love You (Like I Love You),The Intruders,[],,


In [127]:
# Manual check: query Deezer with a hand-shortened title and artist for
# "Gimme Some Lovin'-Pt. 1" by "Traffic, Etc.", one of the songs listed as unmatched.
deezer_query("Gimme Some", "Traffic")

{'data': [{'id': 2536195,
   'readable': True,
   'title': "Gimme Some Lovin' (Live Version)",
   'title_short': "Gimme Some Lovin'",
   'title_version': '(Live Version)',
   'link': 'https://www.deezer.com/track/2536195',
   'duration': 542,
   'rank': 198171,
   'explicit_lyrics': False,
   'explicit_content_lyrics': 0,
   'explicit_content_cover': 2,
   'preview': 'https://cdns-preview-a.dzcdn.net/stream/c-a93dbb615b0077d38d946fc5bd751392-8.mp3',
   'md5_image': 'e8b53d6b92888a36e500d6ba2d7c731e',
   'artist': {'id': 6126,
    'name': 'Traffic',
    'link': 'https://www.deezer.com/artist/6126',
    'picture': 'https://api.deezer.com/artist/6126/image',
    'picture_small': 'https://e-cdns-images.dzcdn.net/images/artist/ba8c92b33603ca638c69fbb0fdb7cf08/56x56-000000-80-0-0.jpg',
    'picture_medium': 'https://e-cdns-images.dzcdn.net/images/artist/ba8c92b33603ca638c69fbb0fdb7cf08/250x250-000000-80-0-0.jpg',
    'picture_big': 'https://e-cdns-images.dzcdn.net/images/artist/ba8c92b33603c

In [90]:
# Save the accumulated songs table as a ';'-separated CSV.
# NOTE(review): hardcoded absolute user path — not portable to another machine.
songs_data.to_csv(f"C:/Users/AymericLEBOUCHER/OneDrive - Groupe ERGET/Ressources/ESILV/Webscrapping/songs_data.csv", sep=";")