In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import r2_score

from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import lightgbm as lgb #!conda install -c conda-forge lightgbm -y

import plotly.graph_objects as go

In [2]:
def reducir(X_train, y_train, X_test=None, y_test=None, k=2, metodo=""):
    transformadores = {"pca":PCA, "svd":TruncatedSVD, "lda":LinearDiscriminantAnalysis}
    metodo = metodo.lower()
    if metodo not in transformadores:
        raise ValueError("Ponle un valor chido al método")
        
    transformador = transformadores[metodo](n_components=k)
    X_train_r = transformador.fit_transform(X=X_train, y=y_train)
    if X_test is not None:
        X_test_r = transformador.transform(X_test)
        return X_train_r, X_test_r
    
    return X_train_r

In [3]:
data = pd.read_csv("../spotify/data.csv")
data.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.991,['Mamie Smith'],0.598,168333,0.224,0,0cS0A1fUEUd1EW3FcF8AEI,0.000522,5,0.379,-12.628,0,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.634,1920
1,0.643,"[""Screamin' Jay Hawkins""]",0.852,150200,0.517,0,0hbkKFIJm7Z05H8Zl9w30f,0.0264,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.95,1920
2,0.993,['Mamie Smith'],0.647,163827,0.186,0,11m7laMUgmOKqI3oYzuhne,1.8e-05,0,0.519,-12.098,1,Golfing Papa,4,1920,0.174,97.6,0.689,1920
3,0.000173,['Oscar Velazquez'],0.73,422087,0.798,0,19Lc5SfJJ5O1oaxY0fpwfh,0.801,2,0.128,-7.311,1,True House Music - Xavier Santos & Carlos Gomi...,17,1920-01-01,0.0425,127.997,0.0422,1920
4,0.295,['Mixe'],0.704,165224,0.707,1,2hJjbsLCytGsnAHfdsLejp,0.000246,10,0.402,-6.036,0,Xuniverxe,2,1920-10-01,0.0768,122.076,0.299,1920


In [4]:
quitar = ["popularity", "artists", "id", "release_date", "name"]
X = data.drop(columns=quitar).values
y = data["popularity"].values

In [5]:
X_notest, X_test, y_notest, y_test = train_test_split(X, y, test_size=0.25, random_state=3)

In [16]:
X_notest_r, X_test_r = reducir(X_notest,y_notest, X_test, y_test, k = 3, metodo="pca")

#fig = go.Figure(data = go.Scatter3d(x = X_notest_r[:,0], y = X_notest_r[:,1], z = X_notest_r[:,2], 
                                     #mode = "markers", marker = {"color": y_notest, "size": 5}))
#fig.show()

In [None]:
X_notest_r, X_test_r = reducir(X_notest,y_notest, X_test, y_test, k = 3, metodo="svd")

#fig = go.Figure(data = go.Scatter3d(x = X_notest_r[:,0], y = X_notest_r[:,1], z = X_notest_r[:,2], 
                                     #mode = "markers", marker = {"color": y_notest, "size": 5}))
#fig.show()

In [17]:
X_notest_r, X_test_r = reducir(X_notest,y_notest, X_test, y_test, k = 3, metodo="lda")

#fig = go.Figure(data = go.Scatter3d(x = X_notest_r[:,0], y = X_notest_r[:,1], z = X_notest_r[:,2], 
                                     #mode = "markers", marker = {"color": y_notest, "size": 5}))
#fig.show()

In [25]:
def probar_metodo(estimador, X_notest, y_notest, X_test, y_test, 
                  k, metodo, r2=False, **estimator_params):
    
    X_notest_r, X_test_r = reducir(X_notest, y_notest, 
                                   X_test, y_test, 
                                   k = k, metodo=metodo)
    
    kf = KFold(n_splits=4)

    scores_fold = []
    modelos = []
    for train_index, valid_index in kf.split(X_notest_r, y_notest):
        X_train, X_valid = X_notest_r[train_index,:], X_notest_r[valid_index,:]
        y_train, y_valid = y_notest[train_index], y_notest[valid_index]

        modelo = estimador(**estimator_params)
        modelo.fit(X_train, y_train)
        
        modelos.append(modelo)
        if r2:
            scores_fold.append(r2_score(modelo.predict(X_valid), y_valid))
        else:
            scores_fold.append(modelo.score(X_valid, y_valid))

    nombre_estimador = modelo.__class__.__name__
    print(f"{nombre_estimador} con {metodo}")
    print(f"R^2: {np.mean(scores_fold)} +- {np.std(scores_fold)}")
    
    best_i = np.argmax(scores_fold)
    best_model = modelos[best_i]
    best_score = scores_fold[best_i]
    
    print(f"Test: {best_model.score(X_test_r, y_test)}")
    
    return best_model

# Regresión lineal

In [50]:
%%time
probar_metodo(LinearRegression, 
                   X_notest, y_notest, 
                   X_test, y_test, 
                   k = 3, metodo="pca")

LinearRegression con pca
R^2: 0.26400524845011314 +- 0.0022675107294706184
Test: 0.27323753660597505
CPU times: user 588 ms, sys: 775 ms, total: 1.36 s
Wall time: 202 ms


LinearRegression()

In [51]:
%%time
probar_metodo(LinearRegression, 
                   X_notest, y_notest, 
                   X_test, y_test, 
                   k = 3, metodo="svd")

LinearRegression con svd
R^2: 0.2619127809634059 +- 0.0023531695773167565
Test: 0.2710588571406949
CPU times: user 841 ms, sys: 1 s, total: 1.85 s
Wall time: 261 ms


LinearRegression()

In [52]:
%%time
probar_metodo(LinearRegression, 
                   X_notest, y_notest, 
                   X_test, y_test, 
                   k = 3, metodo="lda")

LinearRegression con lda
R^2: 0.36184019708173365 +- 0.004650323828497517
Test: 0.36856598690593967
CPU times: user 451 ms, sys: 726 ms, total: 1.18 s
Wall time: 242 ms


LinearRegression()

# Polinomial

In [46]:
PolinomialRegression = Pipeline
pasos = [("poly_features", PolynomialFeatures(degree=3)),
         ("model", LinearRegression())]

In [47]:
%%time
probar_metodo(PolinomialRegression, 
                   X_notest, y_notest, 
                   X_test, y_test, 
                   k = 3, metodo="pca", steps=pasos)

Pipeline con pca
R^2: 0.30264679157360064 +- 0.005701685036675731
Test: 0.30443021233952106
CPU times: user 1.04 s, sys: 1.82 s, total: 2.86 s
Wall time: 399 ms


Pipeline(steps=[('poly_features', PolynomialFeatures(degree=3)),
                ('model', LinearRegression())])

In [48]:
%%time
probar_metodo(PolinomialRegression, 
                   X_notest, y_notest, 
                   X_test, y_test, 
                   k = 3, metodo="svd", steps=pasos)

Pipeline con svd
R^2: 0.38838779730303263 +- 0.002757045271950412
Test: 0.39510297358575686
CPU times: user 1.11 s, sys: 1.89 s, total: 3 s
Wall time: 404 ms


Pipeline(steps=[('poly_features', PolynomialFeatures(degree=3)),
                ('model', LinearRegression())])

In [49]:
%%time
probar_metodo(PolinomialRegression, 
                   X_notest, y_notest, 
                   X_test, y_test, 
                   k = 3, metodo="lda", steps=pasos)

Pipeline con lda
R^2: 0.4035123350586728 +- 0.002673981948807318
Test: 0.406888981705244
CPU times: user 859 ms, sys: 1.39 s, total: 2.25 s
Wall time: 379 ms


Pipeline(steps=[('poly_features', PolynomialFeatures(degree=3)),
                ('model', LinearRegression())])

# GBT

In [55]:
%%time
probar_metodo(GradientBoostingRegressor, X_notest, y_notest, X_test, y_test, 
                  k=3, metodo="pca", n_estimators=31, learning_rate=1.0, max_depth=2, random_state=42)

GradientBoostingRegressor con pca
R^2: 0.46574607587115613 +- 0.0050116511685733235
Test: 0.47133351864180784
CPU times: user 7.07 s, sys: 1.06 s, total: 8.13 s
Wall time: 6.61 s


GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=31,
                          random_state=42)

In [56]:
%%time
probar_metodo(GradientBoostingRegressor, X_notest, y_notest, X_test, y_test, 
                  k=3, metodo="svd", n_estimators=31, learning_rate=1.0, max_depth=2, random_state=42)

GradientBoostingRegressor con svd
R^2: 0.26720199269917 +- 0.0012467928510788513
Test: 0.269704911394694
CPU times: user 7.5 s, sys: 1.09 s, total: 8.59 s
Wall time: 6.87 s


GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=31,
                          random_state=42)

In [57]:
%%time
probar_metodo(GradientBoostingRegressor, X_notest, y_notest, X_test, y_test, 
                  k=3, metodo="lda", n_estimators=31, learning_rate=1.0, max_depth=2, random_state=42)

GradientBoostingRegressor con lda
R^2: 0.4437121746800048 +- 0.005654737367492657
Test: 0.4484815864430197
CPU times: user 7.1 s, sys: 578 ms, total: 7.68 s
Wall time: 6.93 s


GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=31,
                          random_state=42)

# K-nn regressor

In [14]:
%%time
probar_metodo(KNeighborsRegressor, X_notest, y_notest, X_test, y_test, 
                  k=3, metodo="pca", n_neighbors=10)

KNeighborsRegressor con pca
R^2: 0.27302025968648413 +- 0.0006544103886522212
Test: 0.28298053733918804
CPU times: user 1.64 s, sys: 1.21 s, total: 2.86 s
Wall time: 1.13 s


KNeighborsRegressor(n_neighbors=10)

In [15]:
%%time
probar_metodo(KNeighborsRegressor, X_notest, y_notest, X_test, y_test, 
                  k=3, metodo="svd", n_neighbors=10)

KNeighborsRegressor con svd
R^2: 0.27180944722693845 +- 0.0008436563815081078
Test: 0.2819018488853876
CPU times: user 1.7 s, sys: 1.29 s, total: 2.99 s
Wall time: 1.14 s


KNeighborsRegressor(n_neighbors=10)

In [16]:
%%time
probar_metodo(KNeighborsRegressor, X_notest, y_notest, X_test, y_test, 
                  k=3, metodo="lda", n_neighbors=10)

KNeighborsRegressor con lda
R^2: 0.44862801122775714 +- 0.003253384235854219
Test: 0.4547489278911776
CPU times: user 1.32 s, sys: 598 ms, total: 1.92 s
Wall time: 1.15 s


KNeighborsRegressor(n_neighbors=10)

# Random Forest

In [38]:
%%time
probar_metodo(RandomForestRegressor,X_notest, y_notest, X_test, y_test,
             k=3, metodo= "pca", n_estimators = 31, random_state = 0)

RandomForestRegressor con pca
R^2: 0.5157523493618581 +- 0.0048419038990016846
Test: 0.5254741565369527
CPU times: user 33.6 s, sys: 1.11 s, total: 34.7 s
Wall time: 33.6 s


RandomForestRegressor(n_estimators=31, random_state=0)

In [39]:
%%time
probar_metodo(RandomForestRegressor,X_notest, y_notest, X_test, y_test,
             k=3, metodo= "lda", n_estimators = 31, random_state = 0)

RandomForestRegressor con lda
R^2: 0.4640239297740527 +- 0.004150582027362178
Test: 0.4730679403231738
CPU times: user 34.1 s, sys: 731 ms, total: 34.9 s
Wall time: 34.5 s


RandomForestRegressor(n_estimators=31, random_state=0)

In [37]:
%%time
probar_metodo(RandomForestRegressor,X_notest, y_notest, X_test, y_test,
             k=3, metodo= "svd", n_estimators = 31, random_state = 0)

RandomForestRegressor con svd
R^2: 0.47305136751529386 +- 0.003636777817901041
Test: 0.48029354812160574
CPU times: user 38.7 s, sys: 1.57 s, total: 40.2 s
Wall time: 38.7 s


RandomForestRegressor(n_estimators=31, random_state=0)

# Arbol de decision

In [34]:
%%time
probar_metodo(DecisionTreeRegressor,X_notest, y_notest, X_test, y_test,
             k=3, metodo= "pca", max_depth=9)

DecisionTreeRegressor con pca
R^2: 0.48867798265706364 +- 0.005604125832801798
Test: 0.4961624199227388
CPU times: user 1.47 s, sys: 1.44 s, total: 2.91 s
Wall time: 1.07 s


DecisionTreeRegressor(max_depth=9)

In [35]:
%%time
probar_metodo(DecisionTreeRegressor,X_notest, y_notest, X_test, y_test,
             k=3, metodo= "svd", max_depth=11)

DecisionTreeRegressor con svd
R^2: 0.28801769753762985 +- 0.005734186224648218
Test: 0.3025459893668959
CPU times: user 1.87 s, sys: 1.2 s, total: 3.07 s
Wall time: 1.2 s


DecisionTreeRegressor(max_depth=11)

In [36]:
%%time
probar_metodo(DecisionTreeRegressor,X_notest, y_notest, X_test, y_test,
             k=3, metodo= "lda", max_depth=9)

DecisionTreeRegressor con lda
R^2: 0.45595283838028355 +- 0.0051475290394939624
Test: 0.4637360703618869
CPU times: user 1.26 s, sys: 501 ms, total: 1.76 s
Wall time: 1.01 s


DecisionTreeRegressor(max_depth=9)

# LGBM

In [26]:
%%time
probar_metodo(lgb.LGBMRegressor, X_notest, y_notest, X_test, y_test, k=3, r2=True, metodo="pca", 
              boosting_type="dart", n_estimators=31, learning_rate=1.95)

LGBMRegressor con pca
R^2: 0.4730147704138652 +- 0.006113759900406764
Test: 0.036273937399711076
CPU times: user 3.98 s, sys: 995 ms, total: 4.98 s
Wall time: 681 ms


LGBMRegressor(boosting_type='dart', learning_rate=1.95, n_estimators=31)

In [33]:
%%time
probar_metodo(lgb.LGBMRegressor, X_notest, y_notest, X_test, y_test, k=3, r2=True, metodo="svd", 
              boosting_type="dart", n_estimators=31, learning_rate=2.15)

LGBMRegressor con svd
R^2: 0.2013684622820602 +- 0.004891387930848354
Test: -6.168702294750761
CPU times: user 4.18 s, sys: 1.3 s, total: 5.49 s
Wall time: 738 ms


LGBMRegressor(boosting_type='dart', learning_rate=2.15, n_estimators=31)

In [28]:
%%time
probar_metodo(lgb.LGBMRegressor, X_notest, y_notest, X_test, y_test, k=3, r2=True, metodo="lda", 
              boosting_type="dart", n_estimators=31, learning_rate=1.95)

LGBMRegressor con lda
R^2: 0.4467073295980709 +- 0.003721549571627958
Test: 0.037745985319599495
CPU times: user 3.53 s, sys: 354 ms, total: 3.88 s
Wall time: 666 ms


LGBMRegressor(boosting_type='dart', learning_rate=1.95, n_estimators=31)