In [2]:
import pandas as pd
import numpy as np

In [12]:
artists_dataset = pd.read_csv('dataset/tabular/artists.csv')
tracks_dataset = pd.read_csv('dataset/tabular/tracks.csv')

In [13]:
tracks_dataset

Unnamed: 0,id,name,disc_number,duration_ms,explicit,popularity,track_number,artists,album_type,album_name,...,features_duration_ms,time_signature,start_of_fade_out,tempo_confidence,time_signature_confidence,key_confidence,mode_confidence,n_beats,n_bars,genre
0,4rjA5kJJWbwU1prXCvg6Fk,Grey,1,290479,False,52,5,Kölsch,album,1989,...,290479,3,275.90530,0.020,1.000,0.500,0.525,705.0,234.0,minimal-techno
1,6xzpUzzIquIyUzTLbbgSdI,Thrown,1,539229,False,45,5,Kiasmos,album,Kiasmos,...,539229,4,517.28253,0.678,0.326,0.410,0.589,1074.0,269.0,minimal-techno
2,56tXgHlSHCfgmGhwVXNizc,Routine,1,264200,False,39,5,Joris Delacroix,album,Night Visions,...,264200,3,259.04180,0.847,0.457,0.904,0.760,516.0,171.0,minimal-techno
3,0s3wIBczp6TdSJ2y8cveJl,Confronted - Anfisa Letyago Stranger Remix,1,387413,False,0,6,Pan-Pot;Anfisa Letyago,single,Confronted Remixes,...,387414,4,387.41360,0.882,0.498,0.512,0.503,826.0,207.0,minimal-techno
4,4PSbDDd1LRYMhqPXvza6I2,Jupiter Sunrise,1,248956,False,0,1,Kollektiv Turmstrasse,compilation,10 Years Diynamic,...,248957,4,239.63574,0.882,1.000,0.474,0.459,493.0,123.0,minimal-techno
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109542,0wiDjWz3U1WfwXSrWHXe2b,I'm Good (Blue) - REAPER Extended Remix,1,209655,True,23,2,David Guetta;Bebe Rexha;REAPER,single,I'm Good (Blue) [REAPER Remix],...,209655,4,200.10376,0.729,1.000,0.088,0.182,585.0,145.0,edm
109543,46n9OJc7LOIVdj8t2l5WA5,End Of The Night,1,205724,True,56,1,Danny Avila,single,End Of The Night,...,205724,4,200.98611,0.712,1.000,0.395,0.535,339.0,84.0,edm
109544,127uq83uGFapbddqiMUKky,Sexy Bitch (feat. Akon),1,195853,True,80,3,David Guetta;Akon,album,One More Love,...,195853,4,189.48643,0.814,0.990,0.097,0.330,417.0,106.0,edm
109545,0ZdUHFxifUJNqo7G4aJzoF,Savannah,1,208698,False,60,1,Diviners;Philly K.,single,Savannah,...,208698,4,197.41605,0.650,1.000,0.545,0.604,359.0,89.0,edm


In [14]:
artists_dataset

Unnamed: 0,id,name,popularity,followers,genres
0,19slOlozrbxkEIMD8L3Qsv,Fast Eddie,28.0,9904.0,"['acid house', 'chicago house', 'chicago rap',..."
1,4BIamAD25vwYldaOWTEsXd,Joe Smooth,38.0,13047.0,"['chicago house', 'classic house']"
2,0B9P7RXrukgIdmutz9XMVN,"Farley ""Jackmaster"" Funk",21.0,11933.0,"['acid house', 'chicago house', 'classic house']"
3,09xC3MewWz48F1OpYckXTZ,Jomanda,20.0,4724.0,"['chicago house', 'garage house']"
4,0RBnTX5xoVa1bDYt9Qbies,Floorplan,37.0,42515.0,"['chicago house', 'deep house', 'float house',..."
...,...,...,...,...,...
30136,2KjxvxgJvbwweNVRMSuIRG,Hoang,49.0,22134.0,[]
30137,6FPDULwgllPquFdqdzj5gi,Robin Hustin,45.0,14198.0,"['dutch edm', 'gaming edm']"
30138,7CSAJPH9eLCOvPc9jn1I6e,TobiMorrow,44.0,2339.0,['sky room']
30139,22lnnGKlaDxk8sfzCNRJuA,Diviners,47.0,74955.0,['gaming edm']


# Data Exploration and Cleaning

## Artists Dataset

In [16]:
artists_dataset.describe()

Unnamed: 0,popularity,followers
count,30140.0,30140.0
mean,36.659788,416866.3
std,17.229059,2484219.0
min,0.0,0.0
25%,24.0,1472.0
50%,37.0,15814.5
75%,49.0,118639.2
max,100.0,114163500.0


#### Checking and handling missing values

In [17]:
# Check for missing values
artists_dataset.isnull().sum()

id            1
name          2
popularity    1
followers     1
genres        1
dtype: int64

In [18]:
# print the rows with missing values
artists_dataset[artists_dataset.isnull().any(axis=1)]

Unnamed: 0,id,name,popularity,followers,genres
11872,,,,,
21223,4oPYazJJ1o4rWBrTw9lm40,,47.0,35655.0,[]


In [19]:
# Drop the rows with missing values
artists_dataset = artists_dataset.dropna()

A way to join the datasets could be to merge the datasets on the 'name' column in the 'artists' dataset and the 'artists' column in the 'tracks' dataset. However, the 'artists' column in the 'tracks' dataset contains multiple artists separated by a semicolon. To join the datasets on the 'artists' column, we need to extract the primary artist's name from the 'artists' column in the 'tracks' dataset. We can then merge the datasets on the primary artist's name.

In [9]:
# Extract primary artist's name from the 'artists' column in the 'tracks' dataset
# Assuming the primary artist is the first listed in the 'artists' column
tracks_dataset['primary_artist'] = tracks_dataset['artists'].apply(lambda x: x.split(';')[0])

# Merge the datasets on artist name
merged_df = pd.merge(tracks_dataset, artists_dataset, how='left', left_on='primary_artist', right_on='name', suffixes=('_track', '_artist'))

# Drop the duplicated 'name' column and any other unnecessary columns from the merge
merged_df = merged_df.drop(columns=['name_artist'])

merged_df

Unnamed: 0,id_track,name_track,disc_number,duration_ms,explicit,popularity_track,track_number,artists,album_type,album_name,...,key_confidence,mode_confidence,n_beats,n_bars,genre,primary_artist,id_artist,popularity_artist,followers,genres
0,4rjA5kJJWbwU1prXCvg6Fk,Grey,1,290479,False,52,5,Kölsch,album,1989,...,0.500,0.525,705.0,234.0,minimal-techno,Kölsch,2D9Oe8R9UhbMvFAsMJpXj0,51.0,218818.0,"['danish electronic', 'danish techno', 'deep e..."
1,6xzpUzzIquIyUzTLbbgSdI,Thrown,1,539229,False,45,5,Kiasmos,album,Kiasmos,...,0.410,0.589,1074.0,269.0,minimal-techno,Kiasmos,6X8lhZ7YaRUBlOsOYimlyD,47.0,231613.0,"['electronica', 'icelandic electronic']"
2,56tXgHlSHCfgmGhwVXNizc,Routine,1,264200,False,39,5,Joris Delacroix,album,Night Visions,...,0.904,0.760,516.0,171.0,minimal-techno,Joris Delacroix,3HRRzIZNQFus3xlUx2xKy1,44.0,100828.0,"['deep euro house', 'minimal melodic techno', ..."
3,0s3wIBczp6TdSJ2y8cveJl,Confronted - Anfisa Letyago Stranger Remix,1,387413,False,0,6,Pan-Pot;Anfisa Letyago,single,Confronted Remixes,...,0.512,0.503,826.0,207.0,minimal-techno,Pan-Pot,6OQOvP7RAdmAKVXXQqD0Se,36.0,209027.0,"['german techno', 'minimal techno', 'raw techno']"
4,4PSbDDd1LRYMhqPXvza6I2,Jupiter Sunrise,1,248956,False,0,1,Kollektiv Turmstrasse,compilation,10 Years Diynamic,...,0.474,0.459,493.0,123.0,minimal-techno,Kollektiv Turmstrasse,1oXiuCd5F0DcnmXH5KaM6N,45.0,245839.0,"['german techno', 'hamburg electronic', 'minim..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110721,46n9OJc7LOIVdj8t2l5WA5,End Of The Night,1,205724,True,56,1,Danny Avila,single,End Of The Night,...,0.395,0.535,339.0,84.0,edm,Danny Avila,5y3G1B8cpCTaoq0uDgjwzH,47.0,46184.0,[]
110722,127uq83uGFapbddqiMUKky,Sexy Bitch (feat. Akon),1,195853,True,80,3,David Guetta;Akon,album,One More Love,...,0.097,0.330,417.0,106.0,edm,David Guetta,1Cs0zKBU1kc0i8ypK3B9ai,86.0,26445596.0,"['big room', 'dance pop', 'edm', 'pop', 'pop d..."
110723,0ZdUHFxifUJNqo7G4aJzoF,Savannah,1,208698,False,60,1,Diviners;Philly K.,single,Savannah,...,0.545,0.604,359.0,89.0,edm,Diviners,22lnnGKlaDxk8sfzCNRJuA,47.0,74955.0,['gaming edm']
110724,0M4HcGtxIWVVH4rSNp6XhQ,Dejalo,1,202826,False,57,5,King,album,Champagne Talk,...,0.847,0.710,304.0,75.0,hip-hop,King,5NHm4TU5Twz7owibYxJfFU,71.0,5284466.0,"['desi hip hop', 'desi pop', 'hindi hip hop']"


However, this approach isn't perfect because the streams of the songs are distributed among all the artists as said by [spotify it's self](https://community.spotify.com/t5/Content-Questions/Two-main-artists-on-a-song/td-p/5826073). 

Then we can adopt a more inclusive approach and join for each artist linked in a track. This however, introduces complexity, as it requires expanding the tracks dataset to create a row for each artist-track combination before joining with the 'artists' dataset.


In [8]:
# Create a new DataFrame to hold the expanded artist-track combinations
expanded_tracks_df = tracks_dataset.assign(artists=tracks_dataset['artists'].str.split(';')).explode('artists')

# Strip leading and trailing spaces from the exploded 'artists' column to ensure clean matching
expanded_tracks_df['artists'] = expanded_tracks_df['artists'].str.strip()

# Merge the expanded tracks dataset with the artists dataset on artist name
expanded_merged_df = pd.merge(expanded_tracks_df, artists_dataset, how='left', left_on='artists', right_on='name', suffixes=('_track', '_artist'))

# Preview the newly merged dataset
expanded_merged_df

Unnamed: 0,id_track,name_track,disc_number,duration_ms,explicit,popularity_track,track_number,artists,album_type,album_name,...,mode_confidence,n_beats,n_bars,genre,primary_artist,id_artist,name_artist,popularity_artist,followers,genres
0,4rjA5kJJWbwU1prXCvg6Fk,Grey,1,290479,False,52,5,Kölsch,album,1989,...,0.525,705.0,234.0,minimal-techno,Kölsch,2D9Oe8R9UhbMvFAsMJpXj0,Kölsch,51.0,218818.0,"['danish electronic', 'danish techno', 'deep e..."
1,6xzpUzzIquIyUzTLbbgSdI,Thrown,1,539229,False,45,5,Kiasmos,album,Kiasmos,...,0.589,1074.0,269.0,minimal-techno,Kiasmos,6X8lhZ7YaRUBlOsOYimlyD,Kiasmos,47.0,231613.0,"['electronica', 'icelandic electronic']"
2,56tXgHlSHCfgmGhwVXNizc,Routine,1,264200,False,39,5,Joris Delacroix,album,Night Visions,...,0.760,516.0,171.0,minimal-techno,Joris Delacroix,3HRRzIZNQFus3xlUx2xKy1,Joris Delacroix,44.0,100828.0,"['deep euro house', 'minimal melodic techno', ..."
3,0s3wIBczp6TdSJ2y8cveJl,Confronted - Anfisa Letyago Stranger Remix,1,387413,False,0,6,Pan-Pot,single,Confronted Remixes,...,0.503,826.0,207.0,minimal-techno,Pan-Pot,6OQOvP7RAdmAKVXXQqD0Se,Pan-Pot,36.0,209027.0,"['german techno', 'minimal techno', 'raw techno']"
4,0s3wIBczp6TdSJ2y8cveJl,Confronted - Anfisa Letyago Stranger Remix,1,387413,False,0,6,Anfisa Letyago,single,Confronted Remixes,...,0.503,826.0,207.0,minimal-techno,Pan-Pot,7icoOm5fKKPo49jVxoj1Cq,Anfisa Letyago,40.0,87586.0,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
153332,127uq83uGFapbddqiMUKky,Sexy Bitch (feat. Akon),1,195853,True,80,3,Akon,album,One More Love,...,0.330,417.0,106.0,edm,David Guetta,0z4gvV4rjIZ9wHck67ucSV,Akon,79.0,3902855.0,['dance pop']
153333,0ZdUHFxifUJNqo7G4aJzoF,Savannah,1,208698,False,60,1,Diviners,single,Savannah,...,0.604,359.0,89.0,edm,Diviners,22lnnGKlaDxk8sfzCNRJuA,Diviners,47.0,74955.0,['gaming edm']
153334,0ZdUHFxifUJNqo7G4aJzoF,Savannah,1,208698,False,60,1,Philly K.,single,Savannah,...,0.604,359.0,89.0,edm,Diviners,1v8lhkt5jZgHT8xi1wYNUS,Philly K.,38.0,2152.0,[]
153335,0M4HcGtxIWVVH4rSNp6XhQ,Dejalo,1,202826,False,57,5,King,album,Champagne Talk,...,0.710,304.0,75.0,hip-hop,King,5NHm4TU5Twz7owibYxJfFU,King,71.0,5284466.0,"['desi hip hop', 'desi pop', 'hindi hip hop']"
