In [136]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import matplotlib.ticker as ticker
import seaborn as sns

In [137]:
# Read both tabular sources in one pass; the tuple order fixes which path
# is bound to which name (artists first, then tracks).
artists_dataset, tracks_dataset = map(
    pd.read_csv,
    ('dataset/tabular/artists.csv', 'dataset/tabular/tracks.csv'),
)

In [138]:
# Show the tracks dataframe as loaded (the bare last expression is rendered by the notebook)
tracks_dataset

Unnamed: 0,id,name,disc_number,duration_ms,explicit,popularity,track_number,artists,album_type,album_name,...,features_duration_ms,time_signature,start_of_fade_out,tempo_confidence,time_signature_confidence,key_confidence,mode_confidence,n_beats,n_bars,genre
0,4rjA5kJJWbwU1prXCvg6Fk,Grey,1,290479,False,52,5,Kölsch,album,1989,...,290479,3,275.90530,0.020,1.000,0.500,0.525,705.0,234.0,minimal-techno
1,6xzpUzzIquIyUzTLbbgSdI,Thrown,1,539229,False,45,5,Kiasmos,album,Kiasmos,...,539229,4,517.28253,0.678,0.326,0.410,0.589,1074.0,269.0,minimal-techno
2,56tXgHlSHCfgmGhwVXNizc,Routine,1,264200,False,39,5,Joris Delacroix,album,Night Visions,...,264200,3,259.04180,0.847,0.457,0.904,0.760,516.0,171.0,minimal-techno
3,0s3wIBczp6TdSJ2y8cveJl,Confronted - Anfisa Letyago Stranger Remix,1,387413,False,0,6,Pan-Pot;Anfisa Letyago,single,Confronted Remixes,...,387414,4,387.41360,0.882,0.498,0.512,0.503,826.0,207.0,minimal-techno
4,4PSbDDd1LRYMhqPXvza6I2,Jupiter Sunrise,1,248956,False,0,1,Kollektiv Turmstrasse,compilation,10 Years Diynamic,...,248957,4,239.63574,0.882,1.000,0.474,0.459,493.0,123.0,minimal-techno
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109542,0wiDjWz3U1WfwXSrWHXe2b,I'm Good (Blue) - REAPER Extended Remix,1,209655,True,23,2,David Guetta;Bebe Rexha;REAPER,single,I'm Good (Blue) [REAPER Remix],...,209655,4,200.10376,0.729,1.000,0.088,0.182,585.0,145.0,edm
109543,46n9OJc7LOIVdj8t2l5WA5,End Of The Night,1,205724,True,56,1,Danny Avila,single,End Of The Night,...,205724,4,200.98611,0.712,1.000,0.395,0.535,339.0,84.0,edm
109544,127uq83uGFapbddqiMUKky,Sexy Bitch (feat. Akon),1,195853,True,80,3,David Guetta;Akon,album,One More Love,...,195853,4,189.48643,0.814,0.990,0.097,0.330,417.0,106.0,edm
109545,0ZdUHFxifUJNqo7G4aJzoF,Savannah,1,208698,False,60,1,Diviners;Philly K.,single,Savannah,...,208698,4,197.41605,0.650,1.000,0.545,0.604,359.0,89.0,edm


# Artists Dataset

Dropping rows with NaN values and duplicated rows from the artists dataset

In [139]:
# Clean the artists table in one chain: first discard every row that has a
# missing value, then discard rows that repeat an earlier row exactly.
artists_dataset = artists_dataset.dropna().drop_duplicates()

# Tracks Dataset

## Aggregate all rows sharing the same 'id' into a single row, collecting their genres in a list. For each 'id', only the row with the highest popularity is kept.


In [140]:
# Collect the genres of every track id into a list. dict.fromkeys drops
# repeated genres while preserving first-seen order, so each list holds
# unique values only (a plain agg(list) would keep repeats).
# NOTE: this cell expects 'genre' to still hold the raw scalar values read
# from the CSV; re-run the loading cell before re-running this one.
merged_df = tracks_dataset.groupby('id')['genre'].agg(
    lambda genres: list(dict.fromkeys(genres))
)
# Attach the per-id genre list to every row. Because both sides carry a
# 'genre' column, the original one becomes 'genre_x' and the list 'genre_y'.
df_merged = pd.merge(tracks_dataset, merged_df, on='id', how='left')

# Find the indices of the rows with the highest popularity within each group (ID)
indices_to_keep = df_merged.groupby('id')['popularity'].idxmax()
# Keep only those rows, drop the original per-row genre and expose the
# aggregated list under the name 'genre'. Chaining (instead of an inplace
# rename) avoids mutating an object obtained from a .loc selection.
tracks_dataset = (
    df_merged.loc[indices_to_keep]
    .drop(columns=['genre_x'])
    .rename(columns={'genre_y': 'genre'})
)
# Display the resulting dataframe
tracks_dataset

Unnamed: 0,id,name,disc_number,duration_ms,explicit,popularity,track_number,artists,album_type,album_name,...,features_duration_ms,time_signature,start_of_fade_out,tempo_confidence,time_signature_confidence,key_confidence,mode_confidence,n_beats,n_bars,genre
43417,0000vdREvCVMxbQTkS888c,Lolly,1,160725,True,35,1,Rill,single,Lolly,...,160726,4,154.11664,0.578,1.000,0.685,0.583,276.0,67.0,[german]
93608,000CC8EParg64OmTxVnZ0p,It's All Coming Back To Me Now (Glee Cast Vers...,1,322933,False,49,10,Glee Cast,album,Glee Love Songs,...,322933,4,313.21976,0.030,0.659,0.712,0.717,937.0,238.0,[club]
61659,000Iz0K615UepwSJ5z2RE5,Böxig Leise - Pig&Dan Remix,1,515360,False,0,5,Paul Kalkbrenner;Pig&Dan,album,X,...,515360,4,474.23273,0.832,0.996,0.052,0.322,1025.0,257.0,[minimal-techno]
423,000RDCYioLteXcutOjeweY,Teeje Week,1,190203,False,62,1,Jordan Sandhu;Bunty Bains,single,Teeje Week,...,190203,4,183.53633,0.103,1.000,0.339,0.473,501.0,124.0,[hip-hop]
31,000qpdoc97IMTBvF8gwcpy,Tief,1,331240,False,20,9,Paul Kalkbrenner,album,Zeit,...,331240,4,321.92145,0.622,1.000,0.011,0.160,699.0,175.0,[minimal-techno]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62633,7zxHiMmVLt4LGWpOMqOpUh,"Aethu Kari Raavilum - From ""Bangalore Days""",1,325156,False,64,1,Haricharan;Gopi Sundar,album,Bangalore Days,...,325156,4,315.87845,0.736,1.000,0.504,0.547,641.0,162.0,[pop-film]
106924,7zxpdh3EqMq2JCkOI0EqcG,"Two Worlds (From ""Tarzan"")",1,109573,False,31,3,Piano Genie,single,Disney Favourites,...,109574,4,105.35474,0.206,0.993,0.863,1.000,151.0,38.0,[disney]
58393,7zyYmIdjqqiX6kLryb7QBx,以後別做朋友,1,260573,False,64,2,Eric Chou,album,學著愛,...,260573,4,235.45615,0.270,0.822,0.395,0.533,539.0,135.0,[mandopop]
36699,7zybSU9tFO9HNlwmGF7stc,Sunset Drive,1,234300,False,60,5,Stereoclip,album,Echoes,...,234300,4,228.68753,0.958,0.895,0.493,0.233,485.0,121.0,[electronic]


## Handling songs with duplicated names

In [141]:
# Look for tracks that share their 'name' with at least one other track.
# keep=False flags every occurrence of a repeated name, not only the
# second and later ones.
has_repeated_name = tracks_dataset.duplicated(subset='name', keep=False)
duplicated_rows = tracks_dataset.loc[has_repeated_name]
duplicated_rows

Unnamed: 0,id,name,disc_number,duration_ms,explicit,popularity,track_number,artists,album_type,album_name,...,features_duration_ms,time_signature,start_of_fade_out,tempo_confidence,time_signature_confidence,key_confidence,mode_confidence,n_beats,n_bars,genre
93608,000CC8EParg64OmTxVnZ0p,It's All Coming Back To Me Now (Glee Cast Vers...,1,322933,False,49,10,Glee Cast,album,Glee Love Songs,...,322933,4,313.21976,0.030,0.659,0.712,0.717,937.0,238.0,[club]
23666,001APMDOl3qtx1526T11n1,Better,1,176320,False,0,36,Pink Sweat$;KIRBY,compilation,New RnB,...,176321,4,167.67130,0.049,0.945,0.245,0.571,411.0,102.0,"[soul, chill]"
3080,003lo4y8gOylAqDs2scLx2,Addiction,1,177166,True,18,7,Dither,compilation,Dominator - We Will Prevail,...,177167,4,172.80290,0.673,0.878,0.473,0.510,527.0,130.0,[idm]
14603,003vvx7Niy0yvhvHt4a68B,Mr. Brightside,1,222973,False,88,2,The Killers,album,Hot Fuss,...,222973,4,213.19402,0.887,1.000,0.656,0.718,551.0,137.0,"[alternative, alt-rock, rock]"
108267,006c9li2Mybyg5vm6doEfO,Finish Line,1,249696,False,17,14,Logistics;Zara Kershaw,album,Electric Sun,...,249697,4,235.52580,0.738,1.000,0.499,0.590,689.0,172.0,[drum-and-bass]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13100,7zrxUrilLtTk4XnT5G7zF8,Frosty The Snowman,1,131733,False,0,7,Ella Fitzgerald,compilation,All I Want For Christmas Is You,...,131733,4,128.99265,0.179,0.716,0.681,0.517,165.0,41.0,"[jazz, blues]"
106723,7zsw78LtXUD7JfEwH64HK2,Poor Unfortunate Souls,1,291693,False,64,8,Pat Carroll;Disney,album,The Little Mermaid Special Edition,...,291693,4,281.34457,0.164,0.165,0.681,0.585,344.0,85.0,[disney]
70008,7zubR9uYAWjb5KPZTMm85e,Ley Seca,1,263666,False,0,22,Jhayco;Anuel AA,compilation,El perreo es el futuro,...,263667,4,257.06810,0.659,0.870,0.478,0.676,457.0,114.0,"[reggae, reggaeton, latin, latino]"
101803,7zw9dCl3DhKyakaZFfjY5k,I Wonder Why,1,324640,False,15,7,Paul Johnson,album,Feel The Music,...,324640,4,312.16907,0.939,1.000,0.164,0.320,650.0,162.0,[chicago-house]


In [142]:
# For every ('name', 'artists') pair keep a single row, the most popular
# one: sorting by descending popularity puts that row first, and
# drop_duplicates retains the first occurrence it meets.
tracks_dataset = (
    tracks_dataset
    .sort_values(by='popularity', ascending=False)
    .drop_duplicates(subset=['name', 'artists'])
)
# Show the de-duplicated dataframe
tracks_dataset

Unnamed: 0,id,name,disc_number,duration_ms,explicit,popularity,track_number,artists,album_type,album_name,...,features_duration_ms,time_signature,start_of_fade_out,tempo_confidence,time_signature_confidence,key_confidence,mode_confidence,n_beats,n_bars,genre
13984,45OX2jjEw1l7lOFJfDP9fv,MONEY,1,168227,False,95,2,LISA,single,LALISA,...,168228,4,162.20880,0.789,0.989,0.043,0.253,381.0,95.0,"[k-pop, pop]"
5996,5XeFesFbtLpXzIVDNQP22n,I Wanna Be Yours,1,183956,False,94,12,Arctic Monkeys,album,AM,...,183956,4,174.09161,0.097,1.000,0.545,0.533,202.0,50.0,"[indie, garage, rock]"
83090,2bRKxuH1o7pTmb1y4GfdEc,Clean White Noise - Loopable with no fade,1,90228,False,94,1,White Noise Baby Sleep;White Noise for Babies,album,Best White Noise for Baby Sleep - Loopable wit...,...,90228,0,90.22821,0.000,0.000,0.000,0.116,0.0,0.0,[sleep]
62967,4uUG5RXrOk84mYEfFvj3cK,I'm Good (Blue),1,175238,True,94,1,David Guetta;Bebe Rexha,single,I'm Good (Blue),...,175238,4,163.43945,0.808,0.789,0.898,0.773,368.0,91.0,[pop]
62863,0WtM2NBVQNNJLh6scP13H8,Calm Down (with Selena Gomez),1,239317,False,93,1,Rema;Selena Gomez,single,Calm Down (with Selena Gomez),...,239318,4,231.09079,0.492,1.000,0.593,0.597,424.0,105.0,[pop]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95106,1nSYUQngGS5foiagiilbYU,Singles You Up,1,182693,False,0,3,Jordan Davis,compilation,Mientras hago aromaterapia,...,182693,4,177.22050,0.137,1.000,0.439,0.533,296.0,74.0,[country]
43751,1nU1tlW9ccptgRLfgq05I3,"Schlagzeile groß, Hirn zu klein",1,255093,False,0,26,Frei.Wild,album,Feinde deiner Feinde (Gold Edition),...,255093,4,247.60017,0.120,0.762,0.558,0.509,818.0,205.0,[german]
90963,72ZHywsIxzF1ZkNlMyhsqt,"Ding! Dong! The Witch Is Dead (From ""The Wizar...",1,179035,False,0,4,The Countdown Kids,album,Happy Halloween! (Spooky Favorites for Kids),...,179036,4,175.07846,0.826,0.959,0.715,0.532,352.0,87.0,[children]
16533,5Wt0gIt7RrXz5mJj5pslLC,Zenitsu Theme V2 (Thunder Clap and Flash!),1,137948,False,0,5,Samuel Kim,single,Demon Slayer: Epic Collection,...,137949,4,132.38857,0.545,1.000,0.791,0.703,263.0,67.0,[anime]


In [143]:
# List the distinct values taken by 'album_release_date_precision'
# (returned in order of first appearance in the column).
tracks_dataset['album_release_date_precision'].unique()

array(['day', 'year', 'month'], dtype=object)

In [144]:
# Count how many rows have each distinct value of 'album_release_date_precision'
# (value_counts lists the most frequent value first).
tracks_dataset['album_release_date_precision'].value_counts()

album_release_date_precision
day      76274
year      4912
month       39
Name: count, dtype: int64

## Dropping columns track_number, disc_number, album_type, album_total_tracks 

In [145]:
# Remove the four listed columns from the tracks dataframe.
columns_to_drop = ['track_number', 'disc_number', 'album_type', 'album_total_tracks']
tracks_dataset = tracks_dataset.drop(columns=columns_to_drop)

## Creating 3 new columns: 'release_year', 'release_month', 'release_day' from the 'release_date' column

In [147]:
# 