In [3]:
import numpy as np
import pandas as pd

In [None]:
# Load the Spotify song table and the train/test splits.
# Spotify rows are de-duplicated on (track_name, artist_name), after which any
# column whose name starts with "Unnamed" (a stray saved index) is dropped.
spotify = (
    pd.read_csv('spotify_hot100.csv')
    .drop_duplicates(subset=["track_name", "artist_name"])
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
)

train = pd.read_csv('../data/final_train.csv')
test = pd.read_csv('../data/final_test.csv')

In [7]:
# Show the column index of each table so the join keys can be checked by eye.
for frame in (train, spotify):
    print(frame.columns)

Index(['chords', 'simplified_chords', 'decade', 'main_genre',
       'spotify_song_id'],
      dtype='object')
Index(['Unnamed: 0', 'track_id', 'artist_id', 'success', 'track_name',
       'artists', 'album_name', 'release_date', 'popularity', 'duration_ms',
       'artist_name', 'genres', 'artist_popularity', 'followers', 'song_key',
       'on_hot100'],
      dtype='object')


In [None]:
# Measure how much of the training and test data falls inside the popularity
# subset, i.e. how many of their Spotify ids also appear in the Spotify table.

# Unique ids on each side of the comparison.
spotify_ids = set(spotify['track_id'])
train_spotify_ids = set(train['spotify_song_id'])
test_spotify_ids = set(test['spotify_song_id'])

# Ids shared between the Spotify table and each split.
matches = spotify_ids & train_spotify_ids
matches2 = spotify_ids & test_spotify_ids
num_matches, num_matches2 = len(matches), len(matches2)

# Shared unique ids as a percentage of the row count of each split.
per = num_matches / len(train) * 100
per2 = num_matches2 / len(test) * 100

print(num_matches)
print(num_matches2)

print(f"Percentage of Training data songs in popularity subset: {per}")
print(f"Percentage of Test data songs in popularity subset: {per2}")



182012
32206
Percentage of Training data songs in popularity subset: 71.20803111038082
Percentage of Test data songs in popularity subset: 71.39911765357927


In [None]:
# Inner-join the training split onto the Spotify table by track id, keeping
# only the songs that are present in both.
train_pop = train.merge(
    spotify,
    how="inner",
    left_on="spotify_song_id",
    right_on="track_id",
)

# Quick sanity summary: joined shape, Hot 100 label balance (NaN included),
# and the distribution of the popularity score.
print(train_pop.shape)
print(train_pop['on_hot100'].value_counts(dropna=False))
print(train_pop['popularity'].describe())

# Note: some on_hot100 values are NaN. These belong to songs with duplicate
# song keys that could not be assessed for the Hot 100 correctly.
train_pop.head(n=5)


(182012, 20)
on_hot100
0.0    174764
1.0      6395
NaN       853
Name: count, dtype: int64
count    182012.000000
mean         22.748220
std          16.964062
min           0.000000
25%           9.000000
50%          21.000000
75%          34.000000
max          89.000000
Name: popularity, dtype: float64


Unnamed: 0,chords,simplified_chords,decade,main_genre,spotify_song_id,track_id,artist_id,success,track_name,artists,album_name,release_date,popularity,duration_ms,artist_name,genres,artist_popularity,followers,song_key,on_hot100
0,<intro_1> G A Fsmin Bmin G A Fsmin Bmin <verse...,"G,A,Fsmin,Bmin,G,A,Fsmin,Bmin,G,A,Fsmin,Bmin,G...",2010.0,pop,7vpGKEUPrA4UEsS4o4W1tP,7vpGKEUPrA4UEsS4o4W1tP,6tzRZ39aZlNqlUzQlkuhDV,True,Amor de Que,Pabllo Vittar,111,2020-03-24,52,157928,Pabllo Vittar,"brazilian pop, funk pop, tecnobrega",66,4394595,amordeque_pabllo,0.0
1,C F G C F G F Dmin G C F Dmin G C F G C F G F ...,"C,F,G,C,F,G,F,Dmin,G,C,F,Dmin,G,C,F,G,C,F,G,F,...",2000.0,alternative,7MTpNQUBKyyymbS3gPuqwQ,7MTpNQUBKyyymbS3gPuqwQ,1w4UUVXVsk63VT2vTwx1e1,True,The Mummy,Benji Hughes,A Love Extreme,2008-07-22,0,98573,Benji Hughes,,29,18540,themummy_benji,0.0
2,C F C G Amin G F C F C G Amin G F C G C F C G ...,"C,F,C,G,Amin,G,F,C,F,C,G,Amin,G,F,C,G,C,F,C,G,...",2000.0,alternative,6jIIMhcBPRTrkTWh3PXIc7,6jIIMhcBPRTrkTWh3PXIc7,6arKuGbH3PuYfGN6yJ7RA9,True,First Impression,DeYarmond Edison,Silent Signs,2005-01-01,17,210000,DeYarmond Edison,,24,27807,firstimpression_deyarmond,0.0
3,Amin G Gmin B Amin G Gmin B Amin G Gmin B Amin...,"Amin,G,Gmin,B,Amin,G,Gmin,B,Amin,G,Gmin,B,Amin...",2010.0,pop,2zAfQdoOeYujy7QIgDUq9p,2zAfQdoOeYujy7QIgDUq9p,6idjJt47PjFydVRrAlNw4C,True,Saa Blaa,Hans Philip,Forevigt,2019-03-17,42,303620,Hans Philip,"dansktop, dansk pop, dansk rap",54,55512,saablaa_hans,0.0
4,A Amin Emin A Amin Emin A Amin Emin A Amin Emi...,"A,Amin,Emin,A,Amin,Emin,A,Amin,Emin,A,Amin,Emi...",2010.0,metal,2p58AzW86Z0B0pXgE0K2NO,2p58AzW86Z0B0pXgE0K2NO,1Joel9mDWSEZfHPE2KooW3,True,Hell Freezes over II,Gazpacho,March of Ghosts,2012-03-12,19,276813,Gazpacho,"progressive rock, art rock, progressive metal",27,57284,hellfreezesoverii_gazpacho,0.0


In [36]:
# Inner-join the test split onto the Spotify table by track id, keeping only
# the songs that are present in both.
test_pop = test.merge(
    spotify,
    how="inner",
    left_on="spotify_song_id",
    right_on="track_id",
)

# Quick sanity summary: joined shape, Hot 100 label balance (NaN included),
# and the distribution of the popularity score.
print(test_pop.shape)
print(test_pop['on_hot100'].value_counts(dropna=False))
print(test_pop['popularity'].describe())

test_pop.head(n=5)

(32206, 20)
on_hot100
0.0    30905
1.0     1161
NaN      140
Name: count, dtype: int64
count    32206.000000
mean        22.785816
std         17.006068
min          0.000000
25%          9.000000
50%         21.000000
75%         34.000000
max         86.000000
Name: popularity, dtype: float64


Unnamed: 0,chords,simplified_chords,decade,main_genre,spotify_song_id,track_id,artist_id,success,track_name,artists,album_name,release_date,popularity,duration_ms,artist_name,genres,artist_popularity,followers,song_key,on_hot100
0,<verse_1> G Fsmin Bmin D/A G D/Fs G Emin A D G...,"G,Fsmin,Bmin,D,G,D,G,Emin,A,D,G,D,G,D,G,Fsmin,...",2010.0,country,4d7FN4kiCq8Mh78nCBj1xf,4d7FN4kiCq8Mh78nCBj1xf,5MM1SY6POwJ5AE6WTJsZ0X,True,Belfast Town,P.J. Murrihy,My Father's House,1994,2,170000,P.J. Murrihy,,21,2012,belfasttown_pj,0.0
1,<intro_1> G Emin <verse_1> G Emin G Emin G Emi...,"G,Emin,G,Emin,G,Emin,G,Emin,G,Emin,G,Emin,G,Am...",2020.0,alternative,2fIPEgY8CJ4hh5UDZa2lB9,2fIPEgY8CJ4hh5UDZa2lB9,1b7AEdUSudOQoZF5ebUxCL,True,EL POETA,HUMBE,ENTROPÍA,2021-03-12,63,199614,HUMBE,,70,2853996,elpoeta_humbe,0.0
2,F Amin G F Amin G F Amin G F Amin G F Amin G F...,"F,Amin,G,F,Amin,G,F,Amin,G,F,Amin,G,F,Amin,G,F...",2000.0,pop rock,5ef3mAxAAjk93V6IpriOrz,5ef3mAxAAjk93V6IpriOrz,5PokPZn11xzZXyXSfnvIM3,True,From Where You Are,Lifehouse,Smoke & Mirrors,2010,51,181280,Lifehouse,post-grunge,63,2143945,fromwhereyouare_lifehouse,1.0
3,<verse_1> Emin7 D C G D/Fs Emin7 D C G D/Fs <c...,"Emin7,D,C,G,D,Emin7,D,C,G,D,Emin7,D,C,G,D,Emin...",2010.0,rock,0QkfzeIbcQ3UlRVzKYD996,0QkfzeIbcQ3UlRVzKYD996,1b9KuJbOULGZUuog0LD1rh,True,Lass Sie rein,Stoppok,Lass Sie rein,2019-12-06,2,268266,Stoppok,"singer-songwriter, german pop",28,23175,lasssierein_stoppok,0.0
4,Gmin Cmin F7 Cmin F7 Cmin F7 Cmin F7 Gmin Cmin...,"Gmin,Cmin,F7,Cmin,F7,Cmin,F7,Cmin,F7,Gmin,Cmin...",1990.0,pop,5k199KPNyhWhlqWVrQLVAE,5k199KPNyhWhlqWVrQLVAE,0PbO1lSBsJPgyqdEypJJVb,True,Esa mujer,Dyango,"Alma, corazón y vida",1975-11-01,50,253000,Dyango,bolero,58,1197206,esamujer_dyango,0.0


In [22]:
# Save the training and testing popularity subsets.
#
# Write the merged frames (train_pop / test_pop), not the raw train / test
# splits: the raw splits carry none of the Spotify columns (popularity,
# on_hot100, ...) and still include songs without a Spotify match, so saving
# them under the *_pop names would produce mislabeled files.
# The row index is still written as the first CSV column (pandas default),
# matching the original to_csv calls.
train_pop.to_csv("final_train_pop.csv")
test_pop.to_csv("final_test_pop.csv")