In [1]:
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

plt.rcParams["figure.dpi"] = 300

In [75]:
# Load the gzipped track-features table and discard its first column by position
# (every remaining column is kept as-is), then preview the first rows.
df = pd.read_csv("data/track_features.csv.gz").iloc[:, 1:]
df.head()

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,year,release_date
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],1,1,False,0.47,...,0.0727,0.0261,1.1e-05,0.356,0.503,117.906,210133,4.0,1999,1999-11-02
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],2,1,True,0.599,...,0.188,0.0129,7.1e-05,0.155,0.489,103.68,206200,4.0,1999,1999-11-02
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],3,1,False,0.315,...,0.483,0.0234,2e-06,0.122,0.37,149.749,298893,4.0,1999,1999-11-02
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],4,1,True,0.44,...,0.237,0.163,4e-06,0.121,0.574,96.752,213640,4.0,1999,1999-11-02
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,['Rage Against The Machine'],['2d0hyoQ5ynDBnkvAbJKORj'],5,1,False,0.426,...,0.0701,0.00162,0.105,0.0789,0.539,127.059,205600,4.0,1999,1999-11-02


In [76]:
# Sanity check: (rows, columns) of the frame after the first CSV column was dropped.
df.shape

(1204025, 24)

In [77]:
# An artist is kept only if it has strictly more than MIN and strictly fewer than MAX songs.
MIN_SONGS_PER_ARTIST = 10
MAX_SONGS_PER_ARTIST = 1000
# Earliest decade kept (inclusive); 1950 is also the first key of the decade label mapping.
EARLIEST_DECADE = 1950


def _parse_list_literal(value):
    """Turn a stringified Python list (e.g. "['A', 'B']") into a real list.

    Parsing the literal (instead of stripping quotes and splitting on commas)
    keeps names that contain apostrophes or commas intact and does not leave
    leading spaces on the second and later entries.

    Parameters
    ----------
    value : object
        Cell value from the ``artists`` / ``artist_ids`` column.

    Returns
    -------
    object
        The parsed list when ``value`` is a string holding a list literal;
        a best-effort comma split when the string cannot be parsed; and
        ``value`` itself when it is not a string (e.g. NaN, or an already
        parsed list when this cell is re-run).
    """
    if not isinstance(value, str):
        return value
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    # Malformed literal: fall back to a plain split, trimming whitespace around each item.
    return [item.strip() for item in value[1:-1].replace("'", "").split(",")]


# artists / artist_ids are read from the CSV as strings; convert them to real lists
df["artists"] = df["artists"].map(_parse_list_literal)
df["artist_ids"] = df["artist_ids"].map(_parse_list_literal)

# create 2 new columns based on artist list
df["primary_artist"] = df["artists"].str[0]
df["num_artists"] = df["artists"].str.len()

# create new dependent var by rounding the year variable down to its decade
df["decade"] = df["year"] // 10 * 10

# remove artists with 1000 or more songs in this dataset; most of them are old classical
# artists like Beethoven who have a ton of cover artists / musicians play their music and
# list them as primary artist.
# also keep only artists with more than 10 songs, approximately meaning they've put out an album.
# Counts are looked up per row (vectorised) instead of running a Python callback per artist;
# rows whose primary_artist is missing get a NaN count and are dropped, as before.
songs_per_artist = df["primary_artist"].map(df["primary_artist"].value_counts())
keep_artist = (songs_per_artist > MIN_SONGS_PER_ARTIST) & (songs_per_artist < MAX_SONGS_PER_ARTIST)

# filter out songs older than 1950 since there's too few of them (~.1% of the dataset).
# The comparison is inclusive so the 1950s themselves are retained, consistent with the
# 1950 entry in the decade label mapping.
keep_decade = df["decade"] >= EARLIEST_DECADE

# .copy() gives later cells an independent frame to assign columns on.
df = df[keep_artist & keep_decade].copy()

In [78]:
# Encode each decade from 1950 through 2020 as a string class label "0" .. "7".
decade_mapper = {
    decade: str(label) for label, decade in enumerate(range(1950, 2030, 10))
}
df["decade"] = df["decade"].replace(decade_mapper)

In [79]:
# Preview the frame to confirm the new primary_artist, num_artists and decade columns.
df.head()

Unnamed: 0,id,name,album,album_id,artists,artist_ids,track_number,disc_number,explicit,danceability,...,liveness,valence,tempo,duration_ms,time_signature,year,release_date,primary_artist,num_artists,decade
0,7lmeHLHBe4nmXzuXc0HDjk,Testify,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,[Rage Against The Machine],[2d0hyoQ5ynDBnkvAbJKORj],1,1,False,0.47,...,0.356,0.503,117.906,210133,4.0,1999,1999-11-02,Rage Against The Machine,1,4
1,1wsRitfRRtWyEapl0q22o8,Guerrilla Radio,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,[Rage Against The Machine],[2d0hyoQ5ynDBnkvAbJKORj],2,1,True,0.599,...,0.155,0.489,103.68,206200,4.0,1999,1999-11-02,Rage Against The Machine,1,4
2,1hR0fIFK2qRG3f3RF70pb7,Calm Like a Bomb,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,[Rage Against The Machine],[2d0hyoQ5ynDBnkvAbJKORj],3,1,False,0.315,...,0.122,0.37,149.749,298893,4.0,1999,1999-11-02,Rage Against The Machine,1,4
3,2lbASgTSoDO7MTuLAXlTW0,Mic Check,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,[Rage Against The Machine],[2d0hyoQ5ynDBnkvAbJKORj],4,1,True,0.44,...,0.121,0.574,96.752,213640,4.0,1999,1999-11-02,Rage Against The Machine,1,4
4,1MQTmpYOZ6fcMQc56Hdo7T,Sleep Now In the Fire,The Battle Of Los Angeles,2eia0myWFgoHuttJytCxgX,[Rage Against The Machine],[2d0hyoQ5ynDBnkvAbJKORj],5,1,False,0.426,...,0.0789,0.539,127.059,205600,4.0,1999,1999-11-02,Rage Against The Machine,1,4


In [80]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed seed keeps the split reproducible.
train, test = train_test_split(df, random_state=88, test_size=0.2)

# Persist both partitions as gzip-compressed CSV files.
for _out_path, _partition in (("data/train.csv.gz", train), ("data/test.csv.gz", test)):
    _partition.to_csv(_out_path, compression="gzip")