In [1]:
# 1. import libraries
import requests # to download html code
from bs4 import BeautifulSoup # to navigate through the html code
import pandas as pd
import numpy as np
import re

In [2]:
# We use the Kaggle dataset
# https://www.kaggle.com/datasets/saurabhshahane/music-dataset-1950-to-2019?resource=download
# which contains over 28k songs released between 1950 and 2019.
#
# The path uses forward slashes only: a backslash followed by a letter such as
# "\W" is an invalid escape sequence in a regular Python string literal and
# triggers a warning, while forward slashes are accepted on Windows as well.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere after pointing it at a local copy of the CSV.

songs = pd.read_csv("C:/Users/e.sansebastian/OneDrive - EXPONDO/Ironhack/Weeks/Week_6/Day_1/Afternoon/lab-not-hot-songs/tcc_ceds_music.csv")
songs.head()

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.13711,sadness,1.0
1,4,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.64754,0.954819,2e-06,0.325021,0.26324,world/life,1.0
2,6,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.00277,0.00277,0.00277,...,0.00277,0.225422,0.456298,0.585288,0.840361,0.0,0.351814,0.139112,music,1.0
3,10,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.77535,0.743736,romantic,1.0
4,12,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.00135,0.00135,0.417772,...,0.0688,0.00135,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.0


In [3]:
# Check the shape to see how many records (rows) and columns the dataset has.
songs.shape

(28372, 31)

In [14]:
# Keep only the artist and track-name columns of the songs df and rename them
# to "artist" and "title" so they line up with the columns of the hot_songs df.

not_hot = (
    songs
    .loc[:, ["artist_name", "track_name"]]
    .rename(columns={"artist_name": "artist", "track_name": "title"})
)

# Show a preview of the reduced frame followed by its (rows, columns) shape.
display(not_hot.head(), not_hot.shape)

Unnamed: 0,artist,title
0,mukesh,mohabbat bhi jhoothi
1,frankie laine,i believe
2,johnnie ray,cry
3,pérez prado,patricia
4,giorgos papadopoulos,apopse eida oneiro


(28372, 2)

In [15]:
# Keep a random sample of only 3000 songs so the data is lighter and faster to process.
# random_state=1 seeds the sampling, so the same input yields the same rows.
# NOTE(review): not_hot is overwritten with its own sample, so re-running this
# cell on its own re-samples the already reduced frame instead of the full one.
not_hot = not_hot.sample(n=3000, random_state=1)
not_hot.shape

(3000, 2)

In [20]:
# Load the hot_songs df that was created in the previous lab and preview it.
# NOTE(review): this is an absolute, machine-specific path; point it at a local
# copy of hot_100_billboard.csv to run the notebook on another machine.

hot_songs= pd.read_csv("C:/Users/e.sansebastian/OneDrive - EXPONDO/Ironhack/Weeks/Week_6/Day_1/Morning/lab-web-scraping-single-page/hot_100_billboard.csv")
hot_songs.head()

Unnamed: 0,artist,title
0,brenda lee,rockin' around the christmas tree
1,mariah carey,all i want for christmas is you
2,bobby helms,jingle bell rock
3,wham!,last christmas
4,burl ives,a holly jolly christmas


In [59]:
# Look for potential duplicates inside the hot_songs df: count the rows per
# title and list the most frequent titles first. Any count above 1 is a title
# that appears more than once.

hot_songs.groupby("title").size().sort_values(ascending=False)

title
it's beginning to look a lot like christmas             2
jingle bells                                            2
(there's no place like) home for the holidays (1954)    1
santa claus is comin' to town                           1
santa baby                                              1
                                                       ..
have yourself a merry little christmas                  1
harley quinn                                            1
happy xmas (war is over)                                1
happy holiday / the holiday season                      1
you're losing me (from the vault)                       1
Length: 98, dtype: int64

In [35]:
# Concatenate not_hot and hot_songs into one df so repeated titles can be
# inspected before applying a function that removes them.
# ignore_index=True discards both original indexes and numbers the rows 0..n-1.
checking_df = pd.concat([not_hot, hot_songs], ignore_index=True)
checking_df.head()

Unnamed: 0,artist,title
0,glenn miller,the little man who wasn't there
1,misfits,american psycho
2,elliott smith,somebody that i used to know
3,june carter cash,juke box blues
4,"emerson, lake & palmer","karn evil 9 1st impression, pt. 1"


In [36]:
# Check the (rows, columns) shape of the concatenated df.
checking_df.shape

(3100, 2)

In [67]:
# Before writing the de-duplication function, count how many times each title
# occurs in the concatenated table, most frequent first.
# NOTE(review): the counts come from the concatenated frame, so a title that is
# repeated inside a single source df is counted as well, not only titles that
# occur in both dfs.

grouped_songs = (
    checking_df.groupby("title")
    .size()
    .sort_values(ascending=False)
    .reset_index(name="occurrences")
)
display(grouped_songs)

# Titles that occur more than once, the number of rows they account for, and
# the number of such titles (total_records counts titles, not rows).
duplicated_songs = grouped_songs.loc[grouped_songs["occurrences"].gt(1)]
duplicated_songs_sum = duplicated_songs["occurrences"].sum()
total_records = len(duplicated_songs)

print(f"Sum of duplicates of songs present in both df: {duplicated_songs_sum} from a total of {total_records} records")

Unnamed: 0,title,occurrences
0,change,3
1,a fine romance,3
2,home,3
3,promises,3
4,runaway,3
...,...,...
2994,holy cow,1
2995,holy mount zion,1
2996,homage,1
2997,home (feat. michael bublé),1


Sum of duplicates of songs present in both df: 191 from a total of 90 records


In [84]:
# We create a function to remove the duplicates on the "not_hot" df

def remove_duplicates(df1: pd.DataFrame, df2: pd.DataFrame, col1: str, col2: str) -> pd.DataFrame:
    '''
    Return the rows of df1 whose col1 value does not occur in col2 of df2.

    Neither input frame is modified.

    Input:
    df1: df we want to remove rows from when their col1 value is found in df2
    df2: df providing the values to exclude
    col1: column of df1 to compare
    col2: column of df2 holding the values to exclude

    Output:
    A new df with the surviving rows of df1 and a fresh 0..n-1 index.

    Raises:
    KeyError if col1 is not a column of df1 or col2 is not a column of df2.
    '''
    # Values from df2 that must not survive in df1; a set keeps each value once.
    values_to_exclude = set(df2[col2])

    # Boolean indexing already builds a new frame and df2 is only read, so no
    # defensive copies of the inputs are needed.
    keep_mask = ~df1[col1].isin(values_to_exclude)

    return df1[keep_mask].reset_index(drop=True)

In [85]:
# Drop from not_hot every row whose "artist" value also appears in the
# "artist" column of hot_songs.
# NOTE(review): matching on "artist" removes every song by an artist present in
# hot_songs, not only the titles listed there - confirm this is intended.
not_hot_songs = remove_duplicates(not_hot,hot_songs,"artist","artist")

In [86]:
# Preview the first rows of the filtered df.
not_hot_songs.head()

Unnamed: 0,artist,title
0,glenn miller,the little man who wasn't there
1,misfits,american psycho
2,elliott smith,somebody that i used to know
3,june carter cash,juke box blues
4,"emerson, lake & palmer","karn evil 9 1st impression, pt. 1"


In [88]:
# Save the filtered df as not_hot_songs.csv (relative path, so it lands in the
# current working directory); index=False leaves the row index out of the file.
not_hot_songs.to_csv('not_hot_songs.csv', index=False)