In [1]:
import pandas as pd
import numpy as np
#
import torch
import torch.nn as nn 
from tab_transformer_pytorch import TabTransformer
#
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
#


In [2]:
# Load the Spotify tracks table from the CSV at the relative path below.
dataset = pd.read_csv(filepath_or_buffer="spotify-tracks-dataset/dataset.csv")

In [3]:
# First look at the raw table.
# `info()` prints its report and returns None, so it is called as a plain
# statement; `head()` is left as the last expression so the notebook renders
# the preview as a table instead of a `(None, <text>)` tuple.
dataset.info()
dataset.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Unnamed: 0        114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  liveness          11

(None,
    Unnamed: 0                track_id                 artists  \
 0           0  5SuOikwiRyPMVoIQDJUgSV             Gen Hoshino   
 1           1  4qPNDBW1i3p13qLCt0Ki3A            Ben Woodward   
 2           2  1iJBSr7s7jYXzM8EGcbK5b  Ingrid Michaelson;ZAYN   
 3           3  6lfxq3CG4xtTiEg7opyCyx            Kina Grannis   
 4           4  5vjLSffimiIP26QG5WcN2K        Chord Overstreet   
 
                                           album_name  \
 0                                             Comedy   
 1                                   Ghost (Acoustic)   
 2                                     To Begin Again   
 3  Crazy Rich Asians (Original Motion Picture Sou...   
 4                                            Hold On   
 
                    track_name  popularity  duration_ms  explicit  \
 0                      Comedy          73       230666     False   
 1            Ghost - Acoustic          55       149610     False   
 2              To Begin Again          57  

In [4]:
# Drop columns that add nothing to song recommendation: 'Unnamed: 0' (it
# mirrors the row index in the preview above) and the 'track_id' identifier.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns have already been removed.
dataset.drop(columns=['Unnamed: 0', 'track_id'], errors='ignore', inplace=True)

# `info()` prints and returns None; `head()` stays last so it is displayed as a table.
dataset.info()
dataset.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   artists           113999 non-null  object 
 1   album_name        113999 non-null  object 
 2   track_name        113999 non-null  object 
 3   popularity        114000 non-null  int64  
 4   duration_ms       114000 non-null  int64  
 5   explicit          114000 non-null  bool   
 6   danceability      114000 non-null  float64
 7   energy            114000 non-null  float64
 8   key               114000 non-null  int64  
 9   loudness          114000 non-null  float64
 10  mode              114000 non-null  int64  
 11  speechiness       114000 non-null  float64
 12  acousticness      114000 non-null  float64
 13  instrumentalness  114000 non-null  float64
 14  liveness          114000 non-null  float64
 15  valence           114000 non-null  float64
 16  tempo             11

(None,
                   artists                                         album_name  \
 0             Gen Hoshino                                             Comedy   
 1            Ben Woodward                                   Ghost (Acoustic)   
 2  Ingrid Michaelson;ZAYN                                     To Begin Again   
 3            Kina Grannis  Crazy Rich Asians (Original Motion Picture Sou...   
 4        Chord Overstreet                                            Hold On   
 
                    track_name  popularity  duration_ms  explicit  \
 0                      Comedy          73       230666     False   
 1            Ghost - Acoustic          55       149610     False   
 2              To Begin Again          57       210826     False   
 3  Can't Help Falling In Love          71       201933     False   
 4                     Hold On          82       198853     False   
 
    danceability  energy  key  loudness  mode  speechiness  acousticness  \
 0         0.

In [5]:
# Remove every row that has a missing (NA) value in any column.
# The operation stays in place so `dataset` remains the same DataFrame object
# that the following cells assign new columns to.
dataset.dropna(inplace=True)

# `info()` prints and returns None; `head()` stays last so it is displayed as a
# table rather than as the second element of a `(None, <text>)` tuple.
dataset.info()
dataset.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113999 entries, 0 to 113999
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   artists           113999 non-null  object 
 1   album_name        113999 non-null  object 
 2   track_name        113999 non-null  object 
 3   popularity        113999 non-null  int64  
 4   duration_ms       113999 non-null  int64  
 5   explicit          113999 non-null  bool   
 6   danceability      113999 non-null  float64
 7   energy            113999 non-null  float64
 8   key               113999 non-null  int64  
 9   loudness          113999 non-null  float64
 10  mode              113999 non-null  int64  
 11  speechiness       113999 non-null  float64
 12  acousticness      113999 non-null  float64
 13  instrumentalness  113999 non-null  float64
 14  liveness          113999 non-null  float64
 15  valence           113999 non-null  float64
 16  tempo             11

(None,
                   artists                                         album_name  \
 0             Gen Hoshino                                             Comedy   
 1            Ben Woodward                                   Ghost (Acoustic)   
 2  Ingrid Michaelson;ZAYN                                     To Begin Again   
 3            Kina Grannis  Crazy Rich Asians (Original Motion Picture Sou...   
 4        Chord Overstreet                                            Hold On   
 
                    track_name  popularity  duration_ms  explicit  \
 0                      Comedy          73       230666     False   
 1            Ghost - Acoustic          55       149610     False   
 2              To Begin Again          57       210826     False   
 3  Can't Help Falling In Love          71       201933     False   
 4                     Hold On          82       198853     False   
 
    danceability  energy  key  loudness  mode  speechiness  acousticness  \
 0         0.

In [6]:
# Encode the artist, album and genre columns as integer category codes.
# Each column gets its own fitted encoder, stored in `label_encoders`, so the
# integer ids can later be mapped back to the original strings with
# `label_encoders[column].inverse_transform(...)`. Re-fitting a single shared
# encoder would keep only the mapping of the last column it was fitted on.
label_encoders = {}
for column in ['artists', 'album_name', 'track_genre']:
    label_encoders[column] = LabelEncoder()
    dataset[column] = label_encoders[column].fit_transform(dataset[column])

# Backward compatibility: this cell used to leave `labelEncoder` fitted on 'track_genre'.
labelEncoder = label_encoders['track_genre']

# `info()` prints and returns None; `head()` stays last so it is displayed as a table.
dataset.info()
dataset.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113999 entries, 0 to 113999
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   artists           113999 non-null  int64  
 1   album_name        113999 non-null  int64  
 2   track_name        113999 non-null  object 
 3   popularity        113999 non-null  int64  
 4   duration_ms       113999 non-null  int64  
 5   explicit          113999 non-null  bool   
 6   danceability      113999 non-null  float64
 7   energy            113999 non-null  float64
 8   key               113999 non-null  int64  
 9   loudness          113999 non-null  float64
 10  mode              113999 non-null  int64  
 11  speechiness       113999 non-null  float64
 12  acousticness      113999 non-null  float64
 13  instrumentalness  113999 non-null  float64
 14  liveness          113999 non-null  float64
 15  valence           113999 non-null  float64
 16  tempo             11

(None,
    artists  album_name                  track_name  popularity  duration_ms  \
 0    10357        8100                      Comedy          73       230666   
 1     3287       14796            Ghost - Acoustic          55       149610   
 2    12397       39162              To Begin Again          57       210826   
 3    14839        8580  Can't Help Falling In Love          71       201933   
 4     5255       16899                     Hold On          82       198853   
 
    explicit  danceability  energy  key  loudness  mode  speechiness  \
 0     False         0.676  0.4610    1    -6.746     0       0.1430   
 1     False         0.420  0.1660    1   -17.235     1       0.0763   
 2     False         0.438  0.3590    0    -9.734     1       0.0557   
 3     False         0.266  0.0596    0   -18.515     1       0.0363   
 4     False         0.618  0.4430    2    -9.681     1       0.0526   
 
    acousticness  instrumentalness  liveness  valence    tempo  time_signatur

In [7]:
# Model preparation starts here.

# Target: the track title. Features: every remaining column of the dataset.
Y = dataset['track_name']
X = dataset.drop(columns='track_name')
(X.shape, Y.shape)

((113999, 18), (113999,))

In [8]:
# Split into training and test sets. test_size=0.3 is a first try and may be
# tuned later. The fixed random_state makes the split, and everything derived
# from it, reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((79799, 18), (34200, 18), (79799,), (34200,))

In [9]:
# Standardize the features.
# The scaler learns its mean/variance from the training split only and is then
# re-used unchanged on the test split. Re-fitting it on the test data would
# scale the two splits with different statistics and leak test information.
# NOTE(review): X still contains the integer-coded categorical columns
# (artists, album_name, track_genre, ...), so those are standardized as well.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Both splits must expose the same number of feature columns.
assert X_train_scaled.shape[1] == X_test_scaled.shape[1]

# Encode the target (track names of the training split) as integer class ids.
labelEncoder = LabelEncoder()
Y_train_encoded = labelEncoder.fit_transform(Y_train)
Y_train_encoded.shape, type(Y_train_encoded), type(Y_train_encoded[0])

((79799,), numpy.ndarray, numpy.int64)

In [21]:
# Convert the training data to PyTorch tensors.
# The standardized features are real-valued, so they are stored as float32.
# Casting them to an integer dtype would drop the fractional part and collapse
# most standardized values to 0 or +/-1.
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
# Class ids are stored as int64 (torch.long).
Y_train_tensor = torch.tensor(Y_train_encoded, dtype=torch.long)

In [22]:
# Show the dtype of every column. Leaving the Series as the cell's last
# expression lets the notebook display it, instead of printing plain text.
dataset.dtypes

artists               int64
album_name            int64
track_name           object
popularity            int64
duration_ms           int64
explicit               bool
danceability        float64
energy              float64
key                   int64
loudness            float64
mode                  int64
speechiness         float64
acousticness        float64
instrumentalness    float64
liveness            float64
valence             float64
tempo               float64
time_signature        int64
track_genre           int64
dtype: object


In [31]:
# Build the TabTransformer.
# The model takes two kinds of input: integer category codes (embedded per
# categorical column) and real-valued continuous features. The feature columns
# are therefore split by kind instead of counting every column as both.
categorical_columns = ['artists', 'album_name', 'explicit', 'key', 'mode',
                       'time_signature', 'track_genre']
continuous_columns = [col for col in X.columns if col not in categorical_columns]

modelo = TabTransformer(
    # Cardinality of each categorical column. Using max + 1 (rather than the
    # number of distinct values) keeps every raw code inside its own range
    # even when the codes are not contiguous.
    # Assumes the codes are non-negative integers - TODO confirm.
    categories=[int(dataset[col].max()) + 1 for col in categorical_columns],
    num_continuous=len(continuous_columns),
    dim=32,
    # One output score per encoded track-name class; the library default of a
    # single output cannot represent a multi-class target.
    dim_out=int(Y_train_encoded.max()) + 1,
    depth=6,
    heads=8,
    attn_dropout=0.1,
    ff_dropout=.1,
    mlp_hidden_mults=(4,2),
)

In [None]:
# Display the dtype the training feature tensor was created with.
X_train_tensor.dtype

In [33]:
# Call the model on the training tensor and keep its output.
# NOTE(review): no loss, optimizer or backward pass appears in this cell, so
# this is a single forward call over the whole training set, not a training loop.
# NOTE(review): the tab-transformer-pytorch README documents the call as
# model(x_categ, x_cont); here the second argument is Y_train_encoded (the
# encoded labels, a NumPy array) - confirm this is what is intended.
predicciones = modelo(X_train_tensor, Y_train_encoded)

: 

In [None]:
# Accuracy of the model output on the training split.
# A torch nn.Module has no scikit-learn style `.score()` method, so the
# predicted class is taken as the arg-max over the last dimension of the model
# output and compared with the encoded training labels via accuracy_score.
# NOTE(review): no training step is visible in this part of the notebook, so
# this presumably measures an untrained model - confirm.
predicted_labels = predicciones.detach().argmax(dim=-1).cpu().numpy()
accuracy = accuracy_score(Y_train_encoded, predicted_labels)
print(f"Precisión del modelo: accuracy = {accuracy}")
