In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.rcParams["figure.dpi"] = 300

In [16]:
train = pd.read_csv("data/train.csv.gz")
test = pd.read_csv("data/test.csv.gz")

train = train.iloc[:, 1:]
test = test.iloc[:, 1:]

train["artists"] = train["artists"].str[1:-1].str.replace("'", "").str.split(",")
train["artist_ids"] = train["artist_ids"].str[1:-1].str.replace("'", "").str.split(",")

test["artists"] = test["artists"].str[1:-1].str.replace("'", "").str.split(",")
test["artist_ids"] = test["artist_ids"].str[1:-1].str.replace("'", "").str.split(",")

In [17]:
y_train = train["decade"] - 1
y_test = test["decade"] - 1

X_train = train.drop(["year", "decade"], axis=1)
X_test = test.drop(["year", "decade"], axis=1)

In [18]:
continuous_cols = ["explicit", "danceability", "energy", "key", "loudness", "mode", "speechiness", "acousticness", "instrumentalness", "liveness", "valence", "tempo", "duration_ms", "time_signature", "num_artists"]
X_train = X_train[continuous_cols + ["primary_artist"]]
X_test = X_test[continuous_cols + ["primary_artist"]]

In [19]:
import category_encoders as ce

te = ce.TargetEncoder()
te.fit(X_train["primary_artist"], train["year"])

X_train["primary_artist"] = te.transform(X_train["primary_artist"])
X_train["explicit"] = X_train["explicit"].astype(int)

X_test["primary_artist"] = te.transform(X_test["primary_artist"])
X_test["explicit"] = X_test["explicit"].astype(int)

In [20]:
X_train

Unnamed: 0,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,num_artists,primary_artist
0,0,0.434,0.248,1,-17.622,1,0.0594,0.99400,0.804000,0.0968,0.2520,102.034,254693,4.0,1,2007.032433
1,0,0.585,0.187,9,-17.351,1,0.0428,0.86800,0.000000,0.2010,0.5310,98.907,208507,4.0,1,1978.529700
2,0,0.442,0.421,0,-6.483,1,0.1980,0.52900,0.000001,0.1130,0.9650,205.581,200293,3.0,1,2013.137622
3,0,0.338,0.212,7,-17.780,1,0.0949,0.84700,0.003370,0.1670,0.1790,81.949,640933,4.0,3,2003.890173
4,0,0.511,0.825,11,-10.767,1,0.0350,0.00281,0.033200,0.3590,0.8990,167.625,200707,4.0,1,1994.305755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
771813,0,0.735,0.380,0,-9.730,1,0.0365,0.57700,0.439000,0.1500,0.4660,169.963,156671,3.0,1,2016.246368
771814,0,0.355,0.074,2,-17.670,1,0.0317,0.97600,0.928000,0.0892,0.0605,77.003,197667,4.0,1,2013.794955
771815,0,0.757,0.231,9,-11.634,1,0.0361,0.83000,0.000013,0.2980,0.6050,125.227,144627,4.0,1,2008.709622
771816,0,0.439,0.219,11,-14.741,1,0.0337,0.82800,0.001170,0.1430,0.4020,137.498,212693,3.0,1,1997.927018


In [21]:
from sklearn.model_selection import train_test_split

X_dev, X_val, y_dev, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=88)

In [22]:
X_dev.shape, X_val.shape

((617454, 16), (154364, 16))

In [23]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [24]:
np.unique(y_dev)

array([0, 1, 2, 3, 4, 5, 6], dtype=int64)

In [28]:
model = keras.Sequential([
                        layers.Dense(16, input_shape=(16,)), 
                        layers.Activation("relu"), 
                        layers.Dense(12), 
                        layers.Activation("relu"), 
                        layers.Dense(7), 
                        layers.Activation("softmax")])

In [29]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 16)                272       
                                                                 
 activation_6 (Activation)   (None, 16)                0         
                                                                 
 dense_7 (Dense)             (None, 12)                204       
                                                                 
 activation_7 (Activation)   (None, 12)                0         
                                                                 
 dense_8 (Dense)             (None, 7)                 91        
                                                                 
 activation_8 (Activation)   (None, 7)                 0         
                                                                 
Total params: 567
Trainable params: 567
Non-trainable 

In [30]:
model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])
history_callback = model.fit(X_dev, y_dev, validation_data=(X_val, y_val), batch_size=128, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [33]:
from sklearn.metrics import accuracy_score

dev_probs = model.predict(X_dev)
test_probs = model.predict(X_test)

print("dev accuracy: ", accuracy_score(dev_probs.argmax(axis=1), y_dev))
print("test accuracy: ", accuracy_score(test_probs.argmax(axis=1), y_test))

dev accuracy:  0.40869117375545383
test accuracy:  0.41113731180845275
