# Índice
1. [Módulos y lectura de datos](#Módulos-y-lectura-de-datos)
1. Modelos
    1. [Regresión lineal](#Regresión-lineal)
    1. [Regresión polinomial](#Polinomial)
    1. [K vecinos](#K-nn-regressor)
    1. [Árboles de decisión](#Árbol-de-decisión)
    1. [Random forests](#Random-Forest)
    1. [Gradient Boosting Trees](#GBT)
    1. [Light Gradient Boosting Machine](#LGBM)
    
1. [Búsqueda de hiperparámetros](#Búsqueda-de-hiperpárametros)

# Módulos y lectura de datos

In [80]:
# Data science
import pandas as pd
import numpy as np

# División de datos
from sklearn.model_selection import train_test_split, KFold

from sklearn.preprocessing import StandardScaler, MinMaxScaler # Preprocesamiento

# Reducción de dimensionalidad
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.metrics import r2_score # métrica

# Modelos
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb

# Hiperparámetros
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform, randint

# visualización
import plotly.express as px
import plotly.graph_objects as go 

In [2]:
data = pd.read_csv("../spotify/data.csv")
data.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.991,['Mamie Smith'],0.598,168333,0.224,0,0cS0A1fUEUd1EW3FcF8AEI,0.000522,5,0.379,-12.628,0,Keep A Song In Your Soul,12,1920,0.0936,149.976,0.634,1920
1,0.643,"[""Screamin' Jay Hawkins""]",0.852,150200,0.517,0,0hbkKFIJm7Z05H8Zl9w30f,0.0264,5,0.0809,-7.261,0,I Put A Spell On You,7,1920-01-05,0.0534,86.889,0.95,1920
2,0.993,['Mamie Smith'],0.647,163827,0.186,0,11m7laMUgmOKqI3oYzuhne,1.8e-05,0,0.519,-12.098,1,Golfing Papa,4,1920,0.174,97.6,0.689,1920
3,0.000173,['Oscar Velazquez'],0.73,422087,0.798,0,19Lc5SfJJ5O1oaxY0fpwfh,0.801,2,0.128,-7.311,1,True House Music - Xavier Santos & Carlos Gomi...,17,1920-01-01,0.0425,127.997,0.0422,1920
4,0.295,['Mixe'],0.704,165224,0.707,1,2hJjbsLCytGsnAHfdsLejp,0.000246,10,0.402,-6.036,0,Xuniverxe,2,1920-10-01,0.0768,122.076,0.299,1920


In [3]:
quitar = ["popularity", "artists", "id", "release_date", "name"]
variables = data.drop(columns=quitar).columns
X = data.drop(columns=quitar).values
y = data["popularity"].values

In [4]:
X_notest, X_test, y_notest, y_test = train_test_split(X, y, test_size=0.25, random_state=3)

In [5]:
sc = StandardScaler()
X_notest_st = sc.fit_transform(X_notest)
X_test_st = sc.transform(X_test)

# Análisis y visualizaciones

In [6]:
def reducir(X_train, y_train=None, X_test=None, y_test=None, k=2, metodo="", verbose=False):
    """Realiza reducción de dimensionalidad por PCA, SVD o LDA
    Parámetros:
        X_train: arreglo 2D
        Los datos con los que se inferirá y aplicará la reducción
        
        y_train: arreglo 1D, default=None
        Etiquetas de los datos, solamente usado con LDA
        
        X_test: arreglo 2D, default=None
        Datos a los que se les aplicará la transformación
        
        y_test: arreglo 1D, default=None
        Etiquetas para X_test, solamente usado con LDA
        
        k: int, default=2
        Número de dimensiones a la que se reducirá la información
        
        Método: {"pca", "svd", "lda"}
        El nombre del método a utilizar
        
        Verbose: bool, default=False
        Si se elige PCA o SVD, imprime la varianza explicada conseguida
        
    Regresa:
        X_train_r: arreglo 2D
        Datos tras haberse aplicado la reducción con la técnica elegida
        
        X_test_r: arreglo 2D
        Los otros datos con la reducción tras haberse aplicado la técnica, si es que hay
    """
    transformadores = {"pca":PCA, "svd":TruncatedSVD, "lda":LinearDiscriminantAnalysis}
    metodo = metodo.lower()
    if metodo not in transformadores:
        raise ValueError("Ponle un valor chido al método")
        
    transformador = transformadores[metodo](n_components=k)
    X_train_r = transformador.fit_transform(X=X_train, y=y_train)
    if verbose and metodo != "lda":
        print("Varianza explicada: ", 
              transformador.explained_variance_ratio_.sum())
        
    if X_test is not None:
        X_test_r = transformador.transform(X_test)
        return X_train_r, X_test_r
    
    return X_train_r

In [7]:
def correlaciones_componentes(original, componentes):
    n_rows, n_cols = original.shape
    _, n_comps = componentes.shape
    correlaciones = np.corrcoef(original, componentes, rowvar=False)
    return correlaciones[:-n_comps, n_cols:]

In [18]:
def plot_correlaciones(original, componentes, nombres):
    correlaciones = correlaciones_componentes(original, componentes)
    df = pd.DataFrame(correlaciones, columns=["PC 1", "PC 2"], index=nombres)
    df["size"] = df["PC 1"]**2 + df["PC 2"]**2
    #factor_size = 10
    #df["size"] *= factor_size
    
    # Scatterplot de las correlaciones
    ar = 800
    fig = px.scatter(df, x="PC 1", y="PC 2", size="size", text=nombres,width=ar, height=ar)
    
    # Circulo unitario
    θ = np.linspace(0, 2*np.pi, 100)
    fig.add_trace(
        go.Scatter(
            x=np.cos(θ),
            y=np.sin(θ),
            mode="lines",
            line=go.scatter.Line(color="red", dash="dash"),
            showlegend=False,
            hoverinfo="skip")
    )
    
    # Ejes coordenados
    fig.add_hline(0, opacity=0.3)
    fig.add_vline(0, opacity=0.3)
    #fig.show()
    return fig

## Con PCA

In [19]:
X_notest_r, X_test_r = reducir(X_notest_st, y_notest, X_test_st, y_test, k = 2, metodo="pca", verbose=True)

plot_correlaciones(X_notest_st, X_notest_r, variables)

Varianza explicada:  0.3654786266615734


In [None]:
X_notest_r, X_test_r = reducir(X_notest,y_notest, X_test, y_test, k = 3, metodo="pca")

fig = go.Figure(data = go.Scatter3d(x = X_notest_r[:,0], y = X_notest_r[:,1], z = X_notest_r[:,2], 
                                     mode = "markers", marker = {"color": y_notest, "size": 1}))
fig.show()

## Con SVD

In [20]:
X_notest_r, X_test_r = reducir(X_notest_st, y_notest, X_test_st, y_test, k = 2, metodo="svd", verbose=True)

plot_correlaciones(X_notest_st, X_notest_r, variables)

Varianza explicada:  0.3654786351210018


In [None]:
X_notest_r, X_test_r = reducir(X_notest,y_notest, X_test, y_test, k = 3, metodo="svd")

fig = go.Figure(data = go.Scatter3d(x = X_notest_r[:,0], y = X_notest_r[:,1], z = X_notest_r[:,2], 
                                     mode = "markers", marker = {"color": y_notest, "size": 1}))
fig.show()

## Con LDA

In [24]:
X_notest_r, X_test_r = reducir(X_notest_st, y_notest, X_test_st, y_test, k = 2, metodo="lda", verbose=True)

plot_correlaciones(X_notest_st, X_notest_r, variables)

In [None]:
X_notest_r, X_test_r = reducir(X_notest,y_notest, X_test, y_test, k = 3, metodo="lda")

fig = go.Figure(data = go.Scatter3d(x = X_notest_r[:,0], y = X_notest_r[:,1], z = X_notest_r[:,2], 
                                     mode = "markers", marker = {"color": y_notest, "size": 1}))
fig.show()

# Modelos

In [30]:
def probar_metodo(estimador, X_notest, y_notest, X_test, y_test, 
                  k=None, metodo=None, **estimator_params):
    
    if k and metodo:
        X_notest_r, X_test_r = reducir(X_notest, y_notest, 
                                       X_test, y_test, 
                                       k = k, metodo=metodo)
        X_notest, X_test = X_notest_r, X_test_r

    kf = KFold(n_splits=4)

    scores_fold = []
    modelos = []
    for train_index, valid_index in kf.split(X_notest, y_notest):
        X_train, X_valid = X_notest[train_index,:], X_notest[valid_index,:]
        y_train, y_valid = y_notest[train_index], y_notest[valid_index]

        modelo = estimador(**estimator_params)
        modelo.fit(X_train, y_train)
        
        modelos.append(modelo)
        y_pred = modelo.predict(X_valid)
        scores_fold.append(r2_score(y_valid, y_pred))

    nombre_estimador = modelo.__class__.__name__
    print(f"{nombre_estimador} con {metodo}")
    print(f"R^2: {np.mean(scores_fold)} +- {np.std(scores_fold)}")
    
    best_i = np.argmax(scores_fold)
    best_model = modelos[best_i]
    best_score = scores_fold[best_i]
    
    print(f"Test: {best_model.score(X_test, y_test)}")
    
    return best_model

# Regresión lineal
### PCA

In [31]:
%%time
probar_metodo(LinearRegression, 
                   X_notest, y_notest, 
                   X_test, y_test, 
                   k = 3, metodo="pca")

LinearRegression con pca
R^2: 0.26400524845011303 +- 0.002267510729470586
Test: 0.27323753660597494
CPU times: user 930 ms, sys: 1.78 s, total: 2.71 s
Wall time: 377 ms


LinearRegression()

In [32]:
%%time
probar_metodo(LinearRegression, 
                   X_notest_st, y_notest, 
                   X_test_st, y_test, 
                   k = 3, metodo="pca")

LinearRegression con pca
R^2: 0.2231333825680294 +- 0.006186248685811095
Test: 0.23044243816979215
CPU times: user 673 ms, sys: 837 ms, total: 1.51 s
Wall time: 208 ms


LinearRegression()

### SVD

In [43]:
%%time
probar_metodo(LinearRegression, 
                   X_notest, y_notest, 
                   X_test, y_test, 
                   k = 3, metodo="svd")

LinearRegression con svd
R^2: 0.261912780963406 +- 0.002353169577316616
Test: 0.2710588571406949
CPU times: user 745 ms, sys: 843 ms, total: 1.59 s
Wall time: 223 ms


LinearRegression()

In [44]:
%%time
probar_metodo(LinearRegression, 
                   X_notest_st, y_notest, 
                   X_test_st, y_test, 
                   k = 3, metodo="svd")

LinearRegression con svd
R^2: 0.2231333934921746 +- 0.006186247510525911
Test: 0.2304424388905143
CPU times: user 968 ms, sys: 1.2 s, total: 2.17 s
Wall time: 298 ms


LinearRegression()

### LDA

In [45]:
%%time
probar_metodo(LinearRegression, 
                   X_notest, y_notest, 
                   X_test, y_test, 
                   k = 3, metodo="lda")

LinearRegression con lda
R^2: 0.36184019708173365 +- 0.004650323828497517
Test: 0.36856598690593967
CPU times: user 349 ms, sys: 205 ms, total: 554 ms
Wall time: 207 ms


LinearRegression()

In [46]:
%%time
probar_metodo(LinearRegression, 
                   X_notest_st, y_notest, 
                   X_test_st, y_test, 
                   k = 3, metodo="lda")

LinearRegression con lda
R^2: 0.36184019708173376 +- 0.004650323828497484
Test: 0.3685659869059398
CPU times: user 312 ms, sys: 243 ms, total: 555 ms
Wall time: 207 ms


LinearRegression()

En regresión lineal, **estandarizar no ayuda** y el mejor método para reducir es **LDA**

# Polinomial

In [47]:
PolinomialRegression = Pipeline
pasos = [("poly_features", PolynomialFeatures(degree=3)),
         ("model", LinearRegression())]

### PCA

In [48]:
%%time
probar_metodo(PolinomialRegression, 
                   X_notest, y_notest, 
                   X_test, y_test, 
                   k = 3, metodo="pca", steps=pasos)

Pipeline con pca
R^2: 0.30264679446765697 +- 0.005701675020086246
Test: 0.30443021782602475
CPU times: user 1.14 s, sys: 1.76 s, total: 2.91 s
Wall time: 377 ms


Pipeline(steps=[('poly_features', PolynomialFeatures(degree=3)),
                ('model', LinearRegression())])

In [49]:
%%time
probar_metodo(PolinomialRegression, 
                   X_notest_st, y_notest, 
                   X_test_st, y_test, 
                   k = 3, metodo="pca", steps=pasos)

Pipeline con pca
R^2: 0.31956079421694844 +- 0.005078292177280694
Test: 0.32637508484245525
CPU times: user 967 ms, sys: 1.77 s, total: 2.74 s
Wall time: 361 ms


Pipeline(steps=[('poly_features', PolynomialFeatures(degree=3)),
                ('model', LinearRegression())])

### SVD

In [50]:
%%time
probar_metodo(PolinomialRegression, 
                   X_notest, y_notest, 
                   X_test, y_test, 
                   k = 3, metodo="svd", steps=pasos)

Pipeline con svd
R^2: 0.3885691854456713 +- 0.0026472341146824264
Test: 0.39514469766098737
CPU times: user 1.11 s, sys: 1.84 s, total: 2.95 s
Wall time: 389 ms


Pipeline(steps=[('poly_features', PolynomialFeatures(degree=3)),
                ('model', LinearRegression())])

In [51]:
%%time
probar_metodo(PolinomialRegression, 
                   X_notest_st, y_notest, 
                   X_test_st, y_test, 
                   k = 3, metodo="svd", steps=pasos)

Pipeline con svd
R^2: 0.31956086592978045 +- 0.005078276126323144
Test: 0.32637511433961797
CPU times: user 1.15 s, sys: 1.8 s, total: 2.95 s
Wall time: 373 ms


Pipeline(steps=[('poly_features', PolynomialFeatures(degree=3)),
                ('model', LinearRegression())])

### LDA

In [52]:
%%time
probar_metodo(PolinomialRegression, 
                   X_notest, y_notest, 
                   X_test, y_test, 
                   k = 3, metodo="lda", steps=pasos)

Pipeline con lda
R^2: 0.4035123350586728 +- 0.002673981948807318
Test: 0.406888981705244
CPU times: user 726 ms, sys: 1.13 s, total: 1.86 s
Wall time: 372 ms


Pipeline(steps=[('poly_features', PolynomialFeatures(degree=3)),
                ('model', LinearRegression())])

In [53]:
%%time
probar_metodo(PolinomialRegression, 
                   X_notest_st, y_notest, 
                   X_test_st, y_test, 
                   k = 3, metodo="lda", steps=pasos)

Pipeline con lda
R^2: 0.40351233505867723 +- 0.0026739819488075427
Test: 0.4068889817052481
CPU times: user 816 ms, sys: 1.12 s, total: 1.93 s
Wall time: 444 ms


Pipeline(steps=[('poly_features', PolynomialFeatures(degree=3)),
                ('model', LinearRegression())])

Con polinomios de tercer grado, realmente **estandarizar no ayudó** (solamente un poquito con PCA).

El mejor método para reducir fue con **LDA**

# K-nn regressor

### PCA

In [54]:
%%time
probar_metodo(KNeighborsRegressor, X_notest, y_notest, X_test, y_test, 
                  k=3, metodo="pca", n_neighbors=10)

KNeighborsRegressor con pca
R^2: 0.2730202411296422 +- 0.0006544227170101532
Test: 0.28298053733918804
CPU times: user 1.69 s, sys: 1.17 s, total: 2.86 s
Wall time: 1.29 s


KNeighborsRegressor(n_neighbors=10)

In [55]:
%%time
probar_metodo(KNeighborsRegressor, X_notest_st, y_notest, X_test_st, y_test, 
                  k=3, metodo="pca", n_neighbors=10)

KNeighborsRegressor con pca
R^2: 0.2901262353470088 +- 0.0050933034028981675
Test: 0.296206474472102
CPU times: user 1.71 s, sys: 1.14 s, total: 2.85 s
Wall time: 1.1 s


KNeighborsRegressor(n_neighbors=10)

### SVD

In [56]:
%%time
probar_metodo(KNeighborsRegressor, X_notest, y_notest, X_test, y_test, 
                  k=3, metodo="svd", n_neighbors=10)

KNeighborsRegressor con svd
R^2: 0.27180944722693845 +- 0.0008436563815081078
Test: 0.2819018965171851
CPU times: user 1.66 s, sys: 1.39 s, total: 3.05 s
Wall time: 1.16 s


KNeighborsRegressor(n_neighbors=10)

In [57]:
%%time
probar_metodo(KNeighborsRegressor, X_notest_st, y_notest, X_test_st, y_test, 
                  k=3, metodo="svd", n_neighbors=10)

KNeighborsRegressor con svd
R^2: 0.2901716710298181 +- 0.005082628560314938
Test: 0.296258996948782
CPU times: user 1.75 s, sys: 1.21 s, total: 2.96 s
Wall time: 1.12 s


KNeighborsRegressor(n_neighbors=10)

### LDA

In [58]:
%%time
probar_metodo(KNeighborsRegressor, X_notest, y_notest, X_test, y_test, 
                  k=3, metodo="lda", n_neighbors=10)

KNeighborsRegressor con lda
R^2: 0.44862801122775714 +- 0.003253384235854219
Test: 0.4547489278911776
CPU times: user 1.35 s, sys: 617 ms, total: 1.97 s
Wall time: 1.25 s


KNeighborsRegressor(n_neighbors=10)

In [59]:
%%time
probar_metodo(KNeighborsRegressor, X_notest_st, y_notest, X_test_st, y_test, 
                  k=3, metodo="lda", n_neighbors=10)

KNeighborsRegressor con lda
R^2: 0.44862801122775714 +- 0.003253384235854219
Test: 0.45474903855090953
CPU times: user 1.43 s, sys: 525 ms, total: 1.96 s
Wall time: 1.2 s


KNeighborsRegressor(n_neighbors=10)

Con K-vecinos, podemos decir que **estandarizar sí ayudó** ya que tanto en PCA como SVD fue beneficioso (aunque tampoco tanto) y con LDA fue indiferente.

La mejor metodología fue **LDA**

# Árbol de decisión

### PCA

In [60]:
%%time
probar_metodo(DecisionTreeRegressor,X_notest, y_notest, X_test, y_test,
             k=3, metodo= "pca", max_depth=9)

DecisionTreeRegressor con pca
R^2: 0.48856814631457746 +- 0.0056340640041783316
Test: 0.49627251315168586
CPU times: user 1.52 s, sys: 1.27 s, total: 2.79 s
Wall time: 1.27 s


DecisionTreeRegressor(max_depth=9)

In [61]:
%%time
probar_metodo(DecisionTreeRegressor, X_notest_st, y_notest, X_test_st, y_test,
             k=3, metodo= "pca", max_depth=9)

DecisionTreeRegressor con pca
R^2: 0.3257816216992423 +- 0.005179331232367274
Test: 0.32945439488024564
CPU times: user 1.48 s, sys: 1.14 s, total: 2.62 s
Wall time: 962 ms


DecisionTreeRegressor(max_depth=9)

### SVD

In [62]:
%%time
probar_metodo(DecisionTreeRegressor, X_notest, y_notest, X_test, y_test,
             k=3, metodo= "svd", max_depth=11)

DecisionTreeRegressor con svd
R^2: 0.28808160603224076 +- 0.005637470567837026
Test: 0.30245403523180225
CPU times: user 1.8 s, sys: 1.24 s, total: 3.03 s
Wall time: 1.16 s


DecisionTreeRegressor(max_depth=11)

In [63]:
%%time
probar_metodo(DecisionTreeRegressor, X_notest_st, y_notest, X_test_st, y_test,
             k=3, metodo= "svd", max_depth=11)

DecisionTreeRegressor con svd
R^2: 0.3056459955542706 +- 0.005209417884258692
Test: 0.3119518895338639
CPU times: user 1.75 s, sys: 1.16 s, total: 2.91 s
Wall time: 1.12 s


DecisionTreeRegressor(max_depth=11)

### LDA

In [64]:
%%time
probar_metodo(DecisionTreeRegressor,X_notest, y_notest, X_test, y_test,
             k=3, metodo= "lda", max_depth=9)

DecisionTreeRegressor con lda
R^2: 0.455979790042313 +- 0.005322256207899627
Test: 0.46423269182243876
CPU times: user 1.16 s, sys: 576 ms, total: 1.74 s
Wall time: 977 ms


DecisionTreeRegressor(max_depth=9)

In [65]:
%%time
probar_metodo(DecisionTreeRegressor, X_notest_st, y_notest, X_test_st, y_test,
             k=3, metodo= "lda", max_depth=9)

DecisionTreeRegressor con lda
R^2: 0.4560782053594653 +- 0.005249339906841818
Test: 0.464214012149749
CPU times: user 1.2 s, sys: 560 ms, total: 1.76 s
Wall time: 986 ms


DecisionTreeRegressor(max_depth=9)

Para árboles de decisión, en general podemos decir que **estandarizar no ayudó**, ya que en PCA le fue peor, en SVD ayudó tantito y en LDA fue indiferente.

La mejor metodología fue **PCA**

# Random Forest

### PCA

In [66]:
%%time
probar_metodo(RandomForestRegressor, X_notest, y_notest, X_test, y_test,
             k=3, metodo= "pca", n_estimators = 31, random_state = 0)

RandomForestRegressor con pca
R^2: 0.5157523493618581 +- 0.0048419038990016846
Test: 0.5254741565369527
CPU times: user 32.2 s, sys: 1.42 s, total: 33.6 s
Wall time: 31.9 s


RandomForestRegressor(n_estimators=31, random_state=0)

In [67]:
%%time
probar_metodo(RandomForestRegressor, X_notest_st, y_notest, X_test_st, y_test,
             k=3, metodo= "pca", n_estimators = 31, random_state = 0)

RandomForestRegressor con pca
R^2: 0.31244793985217045 +- 0.005405077917113563
Test: 0.3139869657598242
CPU times: user 33.8 s, sys: 1.09 s, total: 34.9 s
Wall time: 33.4 s


RandomForestRegressor(n_estimators=31, random_state=0)

### SVD

In [68]:
%%time
probar_metodo(RandomForestRegressor, X_notest, y_notest, X_test, y_test,
             k=3, metodo= "svd", n_estimators = 31, random_state = 0)

RandomForestRegressor con svd
R^2: 0.47305136751529386 +- 0.003636777817901041
Test: 0.48029354812160574
CPU times: user 37.6 s, sys: 1.17 s, total: 38.8 s
Wall time: 37.1 s


RandomForestRegressor(n_estimators=31, random_state=0)

In [69]:
%%time
probar_metodo(RandomForestRegressor, X_notest_st, y_notest, X_test_st, y_test,
             k=3, metodo= "svd", n_estimators = 31, random_state = 0)

RandomForestRegressor con svd
R^2: 0.31198261878334943 +- 0.005718131551015409
Test: 0.316942569986315
CPU times: user 34.3 s, sys: 1.34 s, total: 35.7 s
Wall time: 33.9 s


RandomForestRegressor(n_estimators=31, random_state=0)

### LDA

In [100]:
%%time
probar_metodo(RandomForestRegressor, X_notest, y_notest, X_test, y_test,
             k=3, metodo= "lda", n_estimators = 31, random_state = 0)

RandomForestRegressor con lda
R^2: 0.4640239297740527 +- 0.004150582027362178
Test: 0.4730679403231738
CPU times: user 41.1 s, sys: 59.8 ms, total: 41.2 s
Wall time: 40.2 s


RandomForestRegressor(n_estimators=31, random_state=0)

In [101]:
%%time
probar_metodo(RandomForestRegressor, X_notest_st, y_notest, X_test_st, y_test,
             k=3, metodo= "lda", n_estimators = 31, random_state = 0)

RandomForestRegressor con lda
R^2: 0.4640239297740527 +- 0.004150582027362178
Test: 0.4730679403231738
CPU times: user 40.4 s, sys: 156 ms, total: 40.6 s
Wall time: 39.8 s


RandomForestRegressor(n_estimators=31, random_state=0)

Para RandomForest, **estandarizar no fue de ayuda**.

El mejor método para reducir fue **PCA**

# GBT

### PCA

In [102]:
%%time
probar_metodo(GradientBoostingRegressor, X_notest, y_notest, X_test, y_test, 
                  k=3, metodo="pca", n_estimators=31, learning_rate=1.0, max_depth=2, random_state=42)

GradientBoostingRegressor con pca
R^2: 0.46574607587115613 +- 0.0050116511685733235
Test: 0.47133351864180784
CPU times: user 9.41 s, sys: 31.8 ms, total: 9.44 s
Wall time: 7.92 s


GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=31,
                          random_state=42)

In [103]:
%%time
probar_metodo(GradientBoostingRegressor, X_notest_st, y_notest, X_test_st, y_test, 
                  k=3, metodo="pca", n_estimators=31, learning_rate=1.0, max_depth=2, random_state=42)

GradientBoostingRegressor con pca
R^2: 0.3744863535403422 +- 0.006086798439417348
Test: 0.37962194741450317
CPU times: user 9.48 s, sys: 29.2 ms, total: 9.51 s
Wall time: 8.05 s


GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=31,
                          random_state=42)

### SVD

In [104]:
%%time
probar_metodo(GradientBoostingRegressor, X_notest, y_notest, X_test, y_test, 
                  k=3, metodo="svd", n_estimators=31, learning_rate=1.0, max_depth=2, random_state=42)

GradientBoostingRegressor con svd
R^2: 0.26720199269917 +- 0.0012467928510788513
Test: 0.269704911394694
CPU times: user 9.7 s, sys: 32.6 ms, total: 9.73 s
Wall time: 8.24 s


GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=31,
                          random_state=42)

In [105]:
%%time
probar_metodo(GradientBoostingRegressor, X_notest_st, y_notest, X_test_st, y_test, 
                  k=3, metodo="svd", n_estimators=31, learning_rate=1.0, max_depth=2, random_state=42)

GradientBoostingRegressor con svd
R^2: 0.25520186569086856 +- 0.0049415299268875355
Test: 0.26111552842601515
CPU times: user 9.67 s, sys: 43.6 ms, total: 9.71 s
Wall time: 8.16 s


GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=31,
                          random_state=42)

### LDA

In [106]:
%%time
probar_metodo(GradientBoostingRegressor, X_notest, y_notest, X_test, y_test, 
                  k=3, metodo="lda", n_estimators=31, learning_rate=1.0, max_depth=2, random_state=42)

GradientBoostingRegressor con lda
R^2: 0.4437121746800048 +- 0.005654737367492657
Test: 0.4484815864430197
CPU times: user 9.11 s, sys: 48.2 ms, total: 9.16 s
Wall time: 8.18 s


GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=31,
                          random_state=42)

In [107]:
%%time
probar_metodo(GradientBoostingRegressor, X_notest_st, y_notest, X_test_st, y_test, 
                  k=3, metodo="lda", n_estimators=31, learning_rate=1.0, max_depth=2, random_state=42)

GradientBoostingRegressor con lda
R^2: 0.4437121746800048 +- 0.005654737367492657
Test: 0.4484815864430197
CPU times: user 9.05 s, sys: 23.9 ms, total: 9.08 s
Wall time: 8.23 s


GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=31,
                          random_state=42)

Con Gradient Boosting Machine vemos que en PCA empeoró, en SVD sí mejoró y en LDA fue indiferente, por lo que podemos decir que, en general, **estandarizar no ayudó**.

Se alcanzó mejores resultados con **PCA**

# LGBM

### PCA

In [108]:
%%time
probar_metodo(lgb.LGBMRegressor, X_notest, y_notest, X_test, y_test, k=3, metodo="pca", 
              boosting_type="dart", n_estimators=31, learning_rate=1.95)

LGBMRegressor con pca
R^2: 0.047352449577081096 +- 0.019373112925562447
Test: 0.050175506844550744
CPU times: user 4.78 s, sys: 108 ms, total: 4.89 s
Wall time: 962 ms


LGBMRegressor(boosting_type='dart', learning_rate=1.95, n_estimators=31)

In [109]:
%%time
probar_metodo(lgb.LGBMRegressor, X_notest_st, y_notest, X_test_st, y_test, k=3, metodo="pca", 
              boosting_type="dart", n_estimators=31, learning_rate=1.95)

LGBMRegressor con pca
R^2: 0.02573107255799667 +- 0.00973646363499737
Test: 0.02679881976400489
CPU times: user 4.36 s, sys: 51.4 ms, total: 4.41 s
Wall time: 834 ms


LGBMRegressor(boosting_type='dart', learning_rate=1.95, n_estimators=31)

### SVD

In [110]:
%%time
probar_metodo(lgb.LGBMRegressor, X_notest, y_notest, X_test, y_test, k=3, metodo="svd", 
              boosting_type="dart", n_estimators=31, learning_rate=2.15)

LGBMRegressor con svd
R^2: -6.233646443780534 +- 0.09855816068573812
Test: -6.168702294750761
CPU times: user 4.05 s, sys: 51.9 ms, total: 4.11 s
Wall time: 763 ms


LGBMRegressor(boosting_type='dart', learning_rate=2.15, n_estimators=31)

In [111]:
%%time
probar_metodo(lgb.LGBMRegressor, X_notest_st, y_notest, X_test_st, y_test, k=3, metodo="svd", 
              boosting_type="dart", n_estimators=31, learning_rate=2.15)

LGBMRegressor con svd
R^2: -6.859726322439025 +- 0.07278221420479009
Test: -6.766730251916532
CPU times: user 4.05 s, sys: 72 ms, total: 4.13 s
Wall time: 758 ms


LGBMRegressor(boosting_type='dart', learning_rate=2.15, n_estimators=31)

### LDA

In [112]:
%%time
probar_metodo(lgb.LGBMRegressor, X_notest, y_notest, X_test, y_test, k=3, metodo="lda", 
              boosting_type="dart", n_estimators=31, learning_rate=1.95)

LGBMRegressor con lda
R^2: 0.02882809643943912 +- 0.0072701717618820485
Test: 0.037745985319599495
CPU times: user 5.61 s, sys: 84.2 ms, total: 5.69 s
Wall time: 1.16 s


LGBMRegressor(boosting_type='dart', learning_rate=1.95, n_estimators=31)

In [113]:
%%time
probar_metodo(lgb.LGBMRegressor, X_notest_st, y_notest, X_test_st, y_test, k=3, metodo="lda", 
              boosting_type="dart", n_estimators=31, learning_rate=1.95)

LGBMRegressor con lda
R^2: 0.02882809643943912 +- 0.0072701717618820485
Test: 0.037745985319599495
CPU times: user 4.22 s, sys: 63.9 ms, total: 4.28 s
Wall time: 806 ms


LGBMRegressor(boosting_type='dart', learning_rate=1.95, n_estimators=31)

En LGBM podemos decir que **estandarizar no ayudó**

la mejor metodología fue **PCA**

Hay algo raro con este modelo, ya que parece ser que si usas su propio score, da otras cosas

-------------------

En general, creo que estandarizar no resultó útil

De 7 modelos, con 4 (curiosamente los relacionados a árboles) resultó mejor usar PCA y con los otros 3 usar LDA.


---------------

Actualización:
La reducción de dimensionalidad no nos está resultando útil.
* Si no se hace la estandarización, resulta que la variabilidad se la lleva principalmente la variable de duración, porque está en milisegundos y eso hace que se tengan valores grandes en comparación con las otras. Con una sola componente ya se tenía el 99% de la variabilidad

* Si se realiza la estandarización, se resuelve el problema anterior, pero el problema es que con 3 componentes solamente se consigue explicar el 46% de la varianza, lo cual puede explicar porque "no está ayudando" la estandarización.

Lo que se debería probar ahora es usar las variables estandarizadas **sin** reducción

In [70]:
%%time
probar_metodo(LinearRegression, 
                   X_notest_st, y_notest, 
                   X_test_st, y_test)

LinearRegression con None
R^2: 0.3617012768308049 +- 0.0046287555777005445
Test: 0.3684928688975092
CPU times: user 200 ms, sys: 591 ms, total: 791 ms
Wall time: 321 ms


LinearRegression()

In [71]:
%%time
probar_metodo(PolinomialRegression, 
                   X_notest_st, y_notest, 
                   X_test_st, y_test, steps=pasos)

Pipeline con None
R^2: 0.5928728741564682 +- 0.0021989859626463226
Test: 0.5913428537132784
CPU times: user 1min 8s, sys: 20.3 s, total: 1min 29s
Wall time: 16 s


Pipeline(steps=[('poly_features', PolynomialFeatures(degree=3)),
                ('model', LinearRegression())])

In [72]:
%%time
probar_metodo(KNeighborsRegressor, X_notest_st, y_notest, X_test_st, y_test, 
                  n_neighbors=10)

KNeighborsRegressor con None
R^2: 0.5401158770759624 +- 0.0020193106951685554
Test: 0.5393321183496611
CPU times: user 1min 53s, sys: 421 ms, total: 1min 54s
Wall time: 1min 53s


KNeighborsRegressor(n_neighbors=10)

In [73]:
%%time
probar_metodo(DecisionTreeRegressor, X_notest_st, y_notest, X_test_st, y_test,
             max_depth=9)

DecisionTreeRegressor con None
R^2: 0.5974628947946692 +- 0.0052266149435213115
Test: 0.6009178074789587
CPU times: user 2.07 s, sys: 823 µs, total: 2.07 s
Wall time: 2.07 s


DecisionTreeRegressor(max_depth=9)

In [74]:
%%time
probar_metodo(RandomForestRegressor, X_notest_st, y_notest, X_test_st, y_test,
             n_estimators = 31, random_state = 0)

RandomForestRegressor con None
R^2: 0.6691521524160086 +- 0.004136119138327282
Test: 0.6709557084992585
CPU times: user 1min 32s, sys: 62.2 ms, total: 1min 32s
Wall time: 1min 32s


RandomForestRegressor(n_estimators=31, random_state=0)

In [75]:
%%time
probar_metodo(GradientBoostingRegressor, X_notest_st, y_notest, X_test_st, y_test, 
                  n_estimators=31, learning_rate=1.0, max_depth=2, random_state=42)

GradientBoostingRegressor con None
R^2: 0.5985679461792845 +- 0.00594668728787715
Test: 0.6034899340050586
CPU times: user 14.8 s, sys: 818 µs, total: 14.8 s
Wall time: 14.9 s


GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=31,
                          random_state=42)

In [76]:
%%time
probar_metodo(lgb.LGBMRegressor, X_notest_st, y_notest, X_test_st, y_test, 
              boosting_type="dart", n_estimators=31, learning_rate=1.95)

LGBMRegressor con None
R^2: 0.02284778422656286 +- 0.013179546868704092
Test: 0.021201253132549946
CPU times: user 6.11 s, sys: 21.7 ms, total: 6.13 s
Wall time: 2.51 s


LGBMRegressor(boosting_type='dart', learning_rate=1.95, n_estimators=31)

# Búsqueda de hiperpárametros

## Random Forest

In [77]:
params = {"n_estimators":[10,20,25,30],
          "max_depth":[6,9,12,15],
          "min_samples_leaf":[1,2,4,8], 
          "max_features":["auto", "sqrt", "log2"],
          "bootstrap":[True, False]}

combinaciones = np.prod([len(l_params) for _,l_params in params.items()])
combinaciones

384

In [None]:
%%time
grid_rf = GridSearchCV(RandomForestRegressor(max_samples=0.75, random_state=0), params,
                          n_jobs=-1, cv=4, verbose=0)
grid_rf.fit(X_notest_st, y_notest)

print(grid_rf.best_score_)
grid_rf.best_params_

## Gradient Boosting Machine

In [None]:
params = {"loss":["ls", "lad", "huber"],
          "learning_rate":[0.1, 0.01],
          "n_estimators":[25,50,100,150],
          "subsample":[1.0, 0.75],
          "min_samples_leaf":[2,4,8]}

In [None]:
%%time
grid_gbm = GridSearchCV(GradientBoostingRegressor(random_state=3,
                                              n_iter_no_change=10), 
                    params, n_jobs=-1, cv=4, verbose=0)
grid_gbm.fit(X_notest_r, y_notest)

print(grid_gbm.best_score_)
grid_gbm.best_params_

## LGBM

In [86]:
%%time
lgbm = lgb.LGBMRegressor(boosting_type="dart")
distributions = dict(learning_rate=uniform(1.,3), 
                     sub_feature=uniform(0, 2), num_leaves=randint(20, 300),
                     min_data=randint(20, 100), max_depth=randint(5, 200),
                     n_estimators=randint(10, 81))
clf = RandomizedSearchCV(lgbm, distributions, random_state=42, n_iter=1000,n_jobs=-1,cv=4,scoring="r2")

search = clf.fit(X_notest_st, y_notest)
search.best_params_

CPU times: user 9.81 s, sys: 1.85 s, total: 11.7 s
Wall time: 6min 27s


{'learning_rate': 1.1913836687607935,
 'max_depth': 87,
 'min_data': 73,
 'n_estimators': 60,
 'num_leaves': 29,
 'sub_feature': 0.6089808487080608}

In [87]:
search.best_score_

0.6150414448273998

[Vuelve al principio](#Índice)