# Spotify Modeling
**Jacob Torres**

In [42]:
"""Imports"""

# Data manipulation
import numpy as np
import pandas as pd
import sqlite3

# Modeling
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

---
## Data Collection and Massaging

In [2]:
# Load song data from database
DB_FILE = '../app/spotify_db.sqlite3'
GET_TRAIN_QUERY = 'select * from train;'
GET_LIKED_QUERY = 'select * from liked_songs_jt;'
conn = sqlite3.connect(DB_FILE)

train_df = pd.read_sql(GET_TRAIN_QUERY, conn)
liked_df = pd.read_sql(GET_LIKED_QUERY, conn)

num_likes = liked_df.shape[0]
num_songs = train_df.shape[0]
print(f"""
    Liked songs: {num_likes}
    Total songs: {num_songs}
""")


    Liked songs: 6610
    Total songs: 16610



In [3]:
# Create feature matrix and target vector
X = train_df.drop(['index', 'name', 'id', 'mode'], axis=1)
y = pd.Series(
    list(np.zeros(num_songs - num_likes, np.int64)) +
    list(np.ones(num_likes, np.int64))
)

assert len(X) == len(y)

In [4]:
X.isnull().sum()

acousticness        0
danceability        0
duration_ms         0
energy              0
tempo               0
instrumentalness    0
key                 0
liveness            0
loudness            0
valence             0
speechiness         0
dtype: int64

In [5]:
y.value_counts(normalize=True) * 100

0    60.204696
1    39.795304
dtype: float64

In [6]:
X.describe(include='all')

Unnamed: 0,acousticness,danceability,duration_ms,energy,tempo,instrumentalness,key,liveness,loudness,valence,speechiness
count,16610.0,16610.0,16610.0,16610.0,16610.0,16610.0,16610.0,16610.0,16610.0,16610.0,16610.0
mean,0.462809,0.570025,227049.4,0.495101,116.776362,0.12253,5.157255,0.188872,-10.504243,0.519277,0.093196
std,0.354704,0.168414,103870.9,0.245836,30.333933,0.269373,3.534552,0.157127,5.218244,0.251389,0.139699
min,3e-06,0.0,16653.0,2e-05,0.0,0.0,0.0,0.015,-52.22,0.0,0.0
25%,0.105,0.459,176388.2,0.3,93.717,0.0,2.0,0.0972,-13.369,0.321,0.0345
50%,0.434,0.5825,212805.5,0.49,114.016,8.5e-05,5.0,0.124,-9.561,0.514,0.0454
75%,0.823,0.694,255183.5,0.688,135.306,0.0285,8.0,0.228,-6.615,0.723,0.0802
max,0.996,0.977,3551152.0,1.0,221.741,0.999,11.0,0.996,0.101,1.0,0.968


In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3
)

print(f"""
    X_train: {X_train.shape}
    X_test: {X_test.shape}
    y_train: {y_train.shape}
    y_test: {y_test.shape}
""")


    X_train: (11627, 11)
    X_test: (4983, 11)
    y_train: (11627,)
    y_test: (4983,)



In [16]:
# Feature selection
selector = SelectKBest(f_classif, k=5)
selector.fit(X_train, y_train)
feature_mask = selector.get_support()

In [20]:
print(selector.scores_)
best_features = X_train.columns[feature_mask]

X_train = X_train[list(best_features)]
X_test = X_test[list(best_features)]

print(X_train.shape)
X_train.head()

[229.86449267 883.05657746  43.56816091  55.24863678   1.44303414
 514.71981128   3.56291113 218.42555759 669.36032357  14.37483877
   9.35929062]
(11627, 5)


Unnamed: 0,acousticness,danceability,instrumentalness,liveness,loudness
14565,0.844,0.593,5e-06,0.127,-14.029
11839,0.0952,0.611,0.0,0.11,-6.372
5100,0.638,0.575,0.0,0.525,-2.944
6461,0.978,0.287,0.62,0.137,-18.274
13729,0.0292,0.761,2e-06,0.39,-3.068


---
## Model Building

In [49]:
rf_pipe = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('clf', RandomForestClassifier())
    ]
)

gb_pipe = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('clf', GradientBoostingClassifier())
    ]
)

kn_pipe = make_pipeline(
    [
        ('scaler', StandardScaler()),
        ('clf', KNeighborsClassifier())
    ]
)

TypeError: Last step of Pipeline should implement fit or be the string 'passthrough'. '[('scaler', StandardScaler()), ('clf', KNeighborsClassifier())]' (type <class 'list'>) doesn't

[Pipeline] ....... (step 1 of 2) Processing transformer, total=   0.0s
[Pipeline] ............... (step 2 of 2) Processing clf, total=   2.6s


GridSearchCV(estimator=Pipeline(steps=[('transformer', StandardScaler()),
                                       ('clf', RandomForestClassifier())],
                                verbose=True),
             n_jobs=-1,
             param_grid={'clf__max_depth': [None, 50, 100],
                         'clf__max_features': [None, 'auto', 3],
                         'clf__n_estimators': [100, 250, 500]},
             return_train_score=True)

In [46]:
rf_gs.best_params_

{'clf__max_depth': None, 'clf__max_features': 'auto', 'clf__n_estimators': 100}

dict_keys(['memory', 'steps', 'verbose', 'transformer', 'clf', 'transformer__copy', 'transformer__with_mean', 'transformer__with_std', 'clf__bootstrap', 'clf__ccp_alpha', 'clf__class_weight', 'clf__criterion', 'clf__max_depth', 'clf__max_features', 'clf__max_leaf_nodes', 'clf__max_samples', 'clf__min_impurity_decrease', 'clf__min_impurity_split', 'clf__min_samples_leaf', 'clf__min_samples_split', 'clf__min_weight_fraction_leaf', 'clf__n_estimators', 'clf__n_jobs', 'clf__oob_score', 'clf__random_state', 'clf__verbose', 'clf__warm_start'])