In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import unidecode

## Getting the list of Billboard Hot 100 songs from Wikipedia

In [None]:
def scrape(year):
    """Scrape rank, title and artist from a Billboard Year-End Hot 100 Wikipedia page.

    Only the first table carrying the ``wikitable`` class is read. Its first
    row is skipped and any row with fewer than three ``<td>`` cells is ignored.

    Args:
        year: Year interpolated into the Wikipedia article URL.

    Returns:
        A list of dicts with the keys ``'Rank'``, ``'Title'`` and ``'Artist'``
        (whitespace-stripped cell text). Empty when no matching table is found.
    """
    url = f"https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{year}"
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    table = soup.find('table', {'class': 'wikitable'})
    if not table:
        return []

    records = []
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        # Rows without a third cell carry no artist of their own and are skipped here.
        if len(cells) < 3:
            continue
        rank, title, artist = (cell.text.strip() for cell in cells[:3])
        records.append({'Rank': rank, 'Title': title, 'Artist': artist})
    return records

columns = ['Rank', 'Title', 'Artist']

# Collect one frame per year and concatenate once at the end. Growing a frame
# with pd.concat inside the loop is quadratic, and concatenating onto an empty
# frame relies on deprecated behaviour that can leave 'Year' as object dtype.
yearly_frames = []
for year in range(2018, 2024):
    year_data = scrape(year)
    year_df = pd.DataFrame(year_data, columns=columns)
    year_df['Year'] = year
    yearly_frames.append(year_df)

billboard = pd.concat(yearly_frames, ignore_index=True)

In [None]:
def scrape_no_artist(year):
    """Scrape rank and title (no artist) from a Billboard Year-End Hot 100 Wikipedia page.

    Only the first table carrying the ``wikitable`` class is read. Its first
    row is skipped and rows with fewer than two ``<td>`` cells are ignored.

    Args:
        year: Year interpolated into the Wikipedia article URL; it is also
            stored in every returned record.

    Returns:
        A list of dicts with the keys ``'Year'``, ``'Rank'`` and ``'Title'``.
        Empty when the page has no matching table.
    """
    url = f"https://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_{year}"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', {'class': 'wikitable'})

    data = []
    # soup.find returns None when no table matches; return an empty result
    # (as scrape() does) instead of failing on None.find_all.
    if table is None:
        return data

    for row in table.find_all('tr')[1:]:
        columns = row.find_all('td')
        if len(columns) >= 2:
            rank = columns[0].text.strip()
            title = columns[1].text.strip()
            data.append({'Year': year, 'Rank': rank, 'Title': title})
    return data

# Gather the rank/title records of every year into one flat list, then build
# the frame from it in a single step.
all_data = []
for year in range(2018, 2024):
    year_data = scrape_no_artist(year)
    all_data += year_data

no_artists = pd.DataFrame(all_data)

In [None]:
# Left-join so every rank/title row survives; rows that scrape() skipped
# (fewer than three cells) come back with a missing Artist.
Billboard100 = pd.merge(no_artists, billboard, on=['Year', 'Rank'], how='left')

# Carry the previous artist forward into those gaps. Assign the result back:
# fillna(method=..., inplace=True) on a selected column is deprecated and does
# not update the frame under copy-on-write.
Billboard100['Artist'] = Billboard100['Artist'].ffill()

# Keep the title from the left frame; the 'Artist_x' mapping is retained from
# the original code and is a no-op when no such column exists.
Billboard100 = (
    Billboard100
    .drop(columns=['Title_y'])
    .rename(columns={'Title_x': 'Title', 'Artist_x': 'Artist'})
)

## Cleaning the list

In [None]:
def clean_song(x):
    """Normalise a title or artist string for building a lyrics URL.

    Hyphens and apostrophes become spaces, every remaining character that is
    neither a word character nor whitespace is removed, and the result is
    transliterated with ``unidecode``.

    Args:
        x: The raw string.

    Returns:
        The cleaned string (case is left unchanged).
    """
    spaced = x.translate(str.maketrans({"-": " ", "'": " "}))
    without_punctuation = re.sub(r'[^\w\s]', '', spaced)
    return unidecode.unidecode(without_punctuation)

# Spell out ampersands before cleaning so they are not simply stripped as
# punctuation. regex=False keeps this a literal replacement on every pandas
# version.
Billboard100['Artist'] = Billboard100['Artist'].str.replace('&', 'and', regex=False)

# Series.apply replaces the deprecated DataFrame.applymap for the element-wise
# cleaning of the two source columns.
Billboard100['song_clean'] = Billboard100['Title'].apply(clean_song)
Billboard100['artist_clean'] = Billboard100['Artist'].apply(clean_song)

In [None]:
# Lower-case both lookup columns, then shorten "featuring" in the artist column.
for column in ('song_clean', 'artist_clean'):
    Billboard100[column] = Billboard100[column].apply(str.lower)
Billboard100['artist_clean'] = Billboard100['artist_clean'].str.replace('featuring', 'feat')

## Using the list to obtain the lyrics from SongLyrics, with Lyrics Translate as a fallback for missed songs

In [None]:
def get_lyrics(url):
    """Download a lyrics page and return the text of its lyrics paragraph.

    Args:
        url: Address of the page to fetch.

    Returns:
        The stripped text of the first ``<p class="songLyricsV14">`` element,
        with newlines between its text nodes. Returns the sentinel
        ``"Lyrics not found"`` when that element is absent, or
        ``"Failed to get lyrics"`` when the response status is not 200.
    """
    response = requests.get(url)
    if response.status_code != 200:
        return "Failed to get lyrics"

    page = BeautifulSoup(response.text, 'html.parser')
    lyrics_tag = page.find('p', class_='songLyricsV14')
    if not lyrics_tag:
        return "Lyrics not found"

    return lyrics_tag.get_text("\n").strip()

In [None]:
def fetch_lyrics(artist, song):
    """Fetch the lyrics of ``song`` by ``artist`` from songlyrics.com.

    Args:
        artist: Cleaned artist name; words may be separated by whitespace.
        song: Cleaned song title; words may be separated by whitespace.

    Returns:
        Whatever ``get_lyrics`` returns for the constructed URL: the lyrics
        text or one of its failure sentinel strings.
    """
    # Join words with single hyphens so the path contains no raw spaces
    # (cleaning can also leave runs of spaces), matching the hyphenated slugs
    # that generate_url builds for the fallback site.
    artist_slug = "-".join(artist.split())
    song_slug = "-".join(song.split())
    search_url = f"https://www.songlyrics.com/{artist_slug}/{song_slug}-lyrics/"
    return get_lyrics(search_url)

In [None]:
# Start with an empty lyrics column and fill it one song at a time, so rows
# already fetched keep their value if a later request fails.
Billboard100['Lyrics'] = ""

lookup_rows = zip(
    Billboard100.index,
    Billboard100['artist_clean'],
    Billboard100['song_clean'],
)
for index, artist_clean, song_clean in lookup_rows:
    Billboard100.at[index, 'Lyrics'] = fetch_lyrics(artist_clean, song_clean)

In [None]:
# Replace missing lyrics with '', remove bracketed annotations that sit on a
# single line, and flatten newlines into spaces.
Billboard100['Lyrics'] = (
    Billboard100['Lyrics']
    .fillna('')
    .apply(lambda text: re.sub(r'\[.*?\]', '', str(text)).replace('\n', ' '))
)

In [None]:
# Rows whose lyrics lookup returned one of get_lyrics' failure sentinels.
failure_markers = ["Failed to get lyrics", "Lyrics not found"]

# .copy() makes missed_songs an independent frame: later cells assign new
# columns to it, which on a bare slice triggers SettingWithCopyWarning.
missed_songs = Billboard100[Billboard100['Lyrics'].isin(failure_markers)].copy()
Billboard100 = Billboard100.drop(missed_songs.index)

In [None]:
def extract_artist(artist):
    """Return the first credited artist of a credit string, lower-cased.

    The string is cut at the first comma, `` and `` or `` featuring ``
    (matched case-insensitively); the part before it is stripped of
    surrounding whitespace.

    Args:
        artist: Full artist credit.

    Returns:
        The leading artist name in lower case.
    """
    separator = re.compile(r',| and | featuring ', flags=re.IGNORECASE)
    # Only the text before the first separator is needed.
    lead_artist = separator.split(artist, maxsplit=1)[0]
    return lead_artist.strip().lower()

# Reduce each credit to its lead artist, then drop "the " and full stops.
# regex=False is required for '.': on pandas versions where str.replace
# defaults to regex=True, a bare '.' matches every character and would blank
# the whole name.
missed_songs['artist_clean'] = (
    missed_songs['Artist']
    .apply(extract_artist)
    .str.replace('the ', '', regex=False)
    .str.replace('.', '', regex=False)
)

In [None]:
# Hand-written corrections; Series.replace only swaps values that match a key exactly.
# NOTE(review): a later cell recomputes song_clean from Title — confirm the
# song corrections below still take effect when the notebook runs top to bottom.
song_corrections = {'bood': "boo'd", 'lemon': "lemons"}
artist_corrections = {'tones': "tones and i", 'bts': "bts bangtan boys"}

missed_songs['song_clean'] = missed_songs['song_clean'].replace(song_corrections)
missed_songs['artist_clean'] = missed_songs['artist_clean'].replace(artist_corrections)

In [None]:
def clean_song(x):
    """Reduce a song title to lower-case keywords for the fallback lyrics lookup.

    Hyphens become spaces, apostrophes are deleted, every other character that
    is neither a word character nor whitespace is removed, the text is
    transliterated with ``unidecode``, and a fixed set of filler words is
    dropped. This definition shares its name with an earlier ``clean_song`` in
    this file and replaces it once this cell has run.

    Args:
        x: The raw title.

    Returns:
        The remaining words joined by single spaces, in lower case.
    """
    unwanted_words = {'the', 'like', 'a', 'with', 'in', 'for', 'up', 'to', 'at',
                      'on', 'that', 'from', 'of', 'but', 'as', 'before', 'is', 'by'}
    text = re.sub(r'[^\w\s]', '', x.replace("-", " ").replace("'", ""))
    text = unidecode.unidecode(text)
    kept_words = filter(lambda word: word.lower() not in unwanted_words, text.split())
    return ' '.join(kept_words).lower()

# Rebuild the lookup titles with the stricter clean_song defined in this cell.
missed_songs['song_clean'] = missed_songs['Title'].apply(clean_song)

# Recomputing song_clean overwrites any manual title corrections made before
# this point, so apply them to the fresh values here. 'bood' is what this
# cleaner produces for "Boo'd Up" (apostrophe deleted, "up" filtered out).
missed_songs['song_clean'] = missed_songs['song_clean'].replace(
    {'bood': "boo'd", 'lemon': "lemons"}
)

In [None]:
def scrape_lyrics(url):
    """Download a page and return the text of its lyrics container.

    Args:
        url: Address of the page to fetch.

    Returns:
        The stripped text of the first ``<div class="ltf">`` element, with
        newlines between its text nodes, or ``None`` when the response status
        is not 200 or the element is missing.
    """
    response = requests.get(url)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, "lxml")
    lyrics_element = soup.find("div", class_="ltf")
    if not lyrics_element:
        return None

    return lyrics_element.get_text(separator="\n").strip()

def generate_url(artist, song):
    """Build the lyricstranslate.com lyrics URL for an artist/song pair.

    Every space in either argument becomes a hyphen and both are lower-cased
    before being placed in the URL.

    Args:
        artist: Artist name.
        song: Song title.

    Returns:
        The URL string ``https://lyricstranslate.com/en/<artist>-<song>-lyrics.html``.
    """
    def hyphenate(text):
        # Splitting on a single space keeps one hyphen per original space.
        return "-".join(text.split(" ")).lower()

    artist, song = map(hyphenate, (artist, song))
    return f"https://lyricstranslate.com/en/{artist}-{song}-lyrics.html"


In [None]:
def _lyrics_for_row(row):
    """Look up one missed song on the fallback site using its cleaned artist and title."""
    url = generate_url(row['artist_clean'], row['song_clean'])
    return scrape_lyrics(url)

missed_songs['Lyrics'] = missed_songs.apply(_lyrics_for_row, axis=1)

In [None]:
# Replace missing lyrics with '', remove bracketed annotations that sit on a
# single line, and flatten newlines into spaces.
missed_songs['Lyrics'] = (
    missed_songs['Lyrics']
    .fillna('')
    .apply(lambda text: re.sub(r'\[.*?\]', '', str(text)).replace('\n', ' '))
)

In [None]:
# NOTE(review): no visible cell writes 'missed_songs.csv'; it is presumably a
# manually curated export of the frame built above — confirm its origin.
# When the file exists it replaces the in-memory frame, exactly as before.
# When it is absent (e.g. a fresh Restart & Run All), keep the scraped frame
# instead of stopping with FileNotFoundError.
try:
    missed_songs = pd.read_csv('missed_songs.csv')
except FileNotFoundError:
    print("missed_songs.csv not found; keeping the missed_songs frame scraped above")

In [None]:
# The cleaned lookup columns were only needed to build URLs; remove them from
# both frames before combining.
helper_columns = ['song_clean', 'artist_clean']
Billboard100 = Billboard100.drop(helper_columns, axis=1)
missed_songs = missed_songs.drop(helper_columns, axis=1)

In [None]:
# Stack the songs found on the first pass on top of the recovered ones and
# renumber the rows from zero.
frames_to_stack = [Billboard100, missed_songs]
appended_df = pd.concat(frames_to_stack, ignore_index=True)

In [None]:
# Rank is scraped as text, so a plain sort orders it lexicographically
# ("1", "10", "100", "11", ...) and can fail outright if the frame mixes text
# and integer ranks. Sort on numeric keys instead; the stored values are left
# untouched and ranks that are not numbers sort last within their year.
appended_df = appended_df.sort_values(
    by=['Year', 'Rank'],
    ascending=[True, True],
    key=lambda column: pd.to_numeric(column, errors='coerce'),
)
appended_df['Lyrics'] = appended_df['Lyrics'].str.lower()

def clean_text(text):
    """Remove leading and trailing double quotes from ``text`` and lower-case it.

    Args:
        text: The string to tidy.

    Returns:
        The unquoted, lower-cased string; quotes inside the text are kept.
    """
    unquoted = text.strip('"')
    return unquoted.lower()

# Tidy every title: surrounding double quotes removed, text lower-cased.
appended_df['Title'] = appended_df['Title'].map(clean_text)

In [None]:
# Write the combined dataset to disk without the positional index column.
output_path = 'Billboard100.csv'
appended_df.to_csv(output_path, index=False)