Import data files and concat/reorganize unpopular lists to match intended feature list

Combine with popular to do data clean up work

In [1]:
import pandas as pd
import numpy as np
import os

features = ['uri','year','genre','release_date','popular','key','explicit','mode','chartrank','acousticness','danceability','energy','duration_ms','instrumentalness','valence','tempo','liveness','loudness','speechiness','time_signature']

popIn = pd.read_csv("Data/popularData.csv",header=0)
unPopIn = []

for _, _, files in os.walk("Data/"):
    for file in files:
        if file.find('Popular') == -1:
            unPopIn.append(pd.read_csv(("Data/"+file),header=0))
            
unPopIn = pd.concat(unPopIn)

for i,col in enumerate(unPopIn.columns.tolist()):
    if col not in features:
        unPopIn = unPopIn.drop(col, axis=1)
        
for i,col in enumerate(popIn.columns.tolist()):
    if col not in features:
        popIn = popIn.drop(col, axis=1)

for f in features:
    if f not in unPopIn.columns.tolist():
        unPopIn.insert(0, f, 0)
    if f not in popIn.columns.tolist():
        popIn.insert(0, f, 0)
        
unPopIn = unPopIn[features]
popIn = popIn[features]

rawData = pd.concat([popIn,unPopIn])
rawData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24494 entries, 0 to 3795
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   uri               24494 non-null  object 
 1   year              24494 non-null  int64  
 2   genre             24494 non-null  object 
 3   release_date      24494 non-null  object 
 4   popular           24494 non-null  int64  
 5   key               24494 non-null  int64  
 6   explicit          24494 non-null  int64  
 7   mode              24494 non-null  int64  
 8   chartrank         24494 non-null  int64  
 9   acousticness      24494 non-null  float64
 10  danceability      24494 non-null  float64
 11  energy            24494 non-null  float64
 12  duration_ms       24494 non-null  int64  
 13  instrumentalness  24494 non-null  float64
 14  valence           24494 non-null  float64
 15  tempo             24494 non-null  float64
 16  liveness          24494 non-null  float64

In [2]:
# Matching case for genres
rawData.genre = rawData.genre.str.lower()
pd.unique(rawData.genre)

array(['country', 'jazz', 'latin', 'pop', 'r&b'], dtype=object)

In [3]:
# Normalize duration using min/max
rawData.duration_ms = (rawData.duration_ms-rawData.duration_ms.min())/(rawData.duration_ms.max()-rawData.duration_ms.min())

In [4]:
# Get dummies for key, explicit, mode, time_sig. Leave out popular as that's our target variable
dummies = ['key','explicit','mode','time_signature']
rawData = pd.get_dummies(data=rawData, columns=dummies)

In [5]:
# Clean up release date - some data is only the year, some data is stored year-month-day, and some is stored month-day-year
releaseDates = rawData.release_date.str.split(pat=r"[-,/]").to_numpy()

release_year = []
release_month = []
release_day = []

# year month day
for date in releaseDates:
    a = int(date[0])
    
    if len(date) > 1:
        b = int(date[1])
        c = int(date[2])
        
        if int(date[0]) > 1000:
            release_year.append(a)
            release_month.append(b)
            release_day.append(c)
        else:
            release_year.append(c)
            release_month.append(a)
            release_day.append(b)
    else:
        release_year.append(a)
        release_month.append(7)
        release_day.append(1)
        
rawData = rawData.drop('release_date', axis=1)
rawData.insert(3, 'release_year', release_year)
rawData.insert(4, 'release_month', release_month)
rawData.insert(5, 'release_day', release_day)

In [6]:
releaseDatePercent = []

for i,month in enumerate(release_month):
    releaseDatePercent.append(((month-1) * 30.4 + release_day[i]-1) / 365)
    
releaseDatePercent = (np.array(releaseDatePercent)-min(releaseDatePercent))/(max(releaseDatePercent)-min(releaseDatePercent))
rawData.insert(3, 'releaseDatePercent', releaseDatePercent)

rawData.info()
rawData.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24494 entries, 0 to 3795
Data columns (total 39 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   uri                 24494 non-null  object 
 1   year                24494 non-null  int64  
 2   genre               24494 non-null  object 
 3   releaseDatePercent  24494 non-null  float64
 4   release_year        24494 non-null  int64  
 5   release_month       24494 non-null  int64  
 6   release_day         24494 non-null  int64  
 7   popular             24494 non-null  int64  
 8   chartrank           24494 non-null  int64  
 9   acousticness        24494 non-null  float64
 10  danceability        24494 non-null  float64
 11  energy              24494 non-null  float64
 12  duration_ms         24494 non-null  float64
 13  instrumentalness    24494 non-null  float64
 14  valence             24494 non-null  float64
 15  tempo               24494 non-null  float64
 16  liven

Unnamed: 0,uri,year,genre,releaseDatePercent,release_year,release_month,release_day,popular,chartrank,acousticness,...,key_10,key_11,explicit_0,explicit_1,mode_0,mode_1,time_signature_1,time_signature_3,time_signature_4,time_signature_5
0,spotify:track:1sR3kJi14jA8Gau3a0yXAo,2002,country,0.253019,2002,4,2,1,1,0.255,...,0,0,1,0,0,1,0,0,1,0
1,spotify:track:1FV374EPG5CrjdIbIMLkcv,2002,country,0.038419,2002,1,15,1,2,0.338,...,0,1,1,0,0,1,0,0,1,0
2,spotify:track:3YxKqZFpcxBPvpUssL8FS2,2002,country,0.0,2001,1,1,1,3,0.347,...,0,0,1,0,0,1,0,0,1,0
3,spotify:track:0eHT8N5YQglv8cYYizXvSw,2002,country,0.756312,2000,10,3,1,4,0.452,...,0,0,1,0,0,1,0,0,1,0
4,spotify:track:0qnoS5hMZMphvKWi0D2LQh,2002,country,0.0,2002,1,1,1,5,0.155,...,0,0,1,0,0,1,0,0,1,0


In [30]:
# Consider dropping these tracks since release year is greater than chart year
rawData[rawData.release_year > rawData.year].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1161 entries, 5 to 2803
Data columns (total 39 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   uri                 1161 non-null   object 
 1   year                1161 non-null   int64  
 2   genre               1161 non-null   object 
 3   releaseDatePercent  1161 non-null   float64
 4   release_year        1161 non-null   int64  
 5   release_month       1161 non-null   int64  
 6   release_day         1161 non-null   int64  
 7   popular             1161 non-null   int64  
 8   chartrank           1161 non-null   int64  
 9   acousticness        1161 non-null   float64
 10  danceability        1161 non-null   float64
 11  energy              1161 non-null   float64
 12  duration_ms         1161 non-null   float64
 13  instrumentalness    1161 non-null   float64
 14  valence             1161 non-null   float64
 15  tempo               1161 non-null   float64
 16  livene