# Spotify Modeling
**Jacob Torres**

In [20]:
"""Imports"""

# Data manipulation
import numpy as np
import pandas as pd
import sqlite3

# Modeling
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

---
## Data Collection and Massaging

In [2]:
# Load song data from database
DB_FILE = '../app/spotify_db.sqlite3'
GET_TRAIN_QUERY = 'select * from train;'
GET_LIKED_QUERY = 'select * from liked_songs_jt;'
conn = sqlite3.connect(DB_FILE)

train_df = pd.read_sql(GET_TRAIN_QUERY, conn)
liked_df = pd.read_sql(GET_LIKED_QUERY, conn)

num_likes = liked_df.shape[0]
num_songs = train_df.shape[0]
print(f"""
    Liked songs: {num_likes}
    Total songs: {num_songs}
""")


    Liked songs: 6610
    Total songs: 16610



In [3]:
# Create feature matrix and target vector
X = train_df.drop(['index', 'name', 'id', 'mode'], axis=1)
y = pd.Series(
    list(np.zeros(num_songs - num_likes, np.int64)) +
    list(np.ones(num_likes, np.int64))
)

assert len(X) == len(y)

In [4]:
X.isnull().sum()

acousticness        0
danceability        0
duration_ms         0
energy              0
tempo               0
instrumentalness    0
key                 0
liveness            0
loudness            0
valence             0
speechiness         0
dtype: int64

In [5]:
y.value_counts(normalize=True) * 100

0    60.204696
1    39.795304
dtype: float64

In [6]:
X.describe(include='all')

Unnamed: 0,acousticness,danceability,duration_ms,energy,tempo,instrumentalness,key,liveness,loudness,valence,speechiness
count,16610.0,16610.0,16610.0,16610.0,16610.0,16610.0,16610.0,16610.0,16610.0,16610.0,16610.0
mean,0.462809,0.570025,227049.4,0.495101,116.776362,0.12253,5.157255,0.188872,-10.504243,0.519277,0.093196
std,0.354704,0.168414,103870.9,0.245836,30.333933,0.269373,3.534552,0.157127,5.218244,0.251389,0.139699
min,3e-06,0.0,16653.0,2e-05,0.0,0.0,0.0,0.015,-52.22,0.0,0.0
25%,0.105,0.459,176388.2,0.3,93.717,0.0,2.0,0.0972,-13.369,0.321,0.0345
50%,0.434,0.5825,212805.5,0.49,114.016,8.5e-05,5.0,0.124,-9.561,0.514,0.0454
75%,0.823,0.694,255183.5,0.688,135.306,0.0285,8.0,0.228,-6.615,0.723,0.0802
max,0.996,0.977,3551152.0,1.0,221.741,0.999,11.0,0.996,0.101,1.0,0.968


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3
)

print(f"""
    X_train: {X_train.shape}
    X_test: {X_test.shape}
    y_train: {y_train.shape}
    y_test: {y_test.shape}
""")


    X_train: (11627, 11)
    X_test: (4983, 11)
    y_train: (11627,)
    y_test: (4983,)



In [8]:
# Feature selection
selector = SelectKBest(f_classif, k=5)
selector.fit(X_train, y_train)
feature_mask = selector.get_support()

In [9]:
print(selector.scores_)
best_features = X_train.columns[feature_mask]

X_train = X_train[list(best_features)]
X_test = X_test[list(best_features)]

print(X_train.shape)
X_train.head()

[214.96067391 870.85694893  37.78627015  48.39582964   3.97898621
 489.92870568   2.66118808 228.14477409 645.01210739  16.44361497
  11.75367942]
(11627, 5)


Unnamed: 0,acousticness,danceability,instrumentalness,liveness,loudness
10392,0.765,0.687,8e-06,0.0897,-11.649
15467,0.152,0.615,0.0,0.109,-4.808
7127,0.0958,0.702,0.00067,0.1,-10.903
6120,0.407,0.415,0.000501,0.114,-4.988
11754,0.167,0.743,0.00951,0.103,-11.174


---
## Model Selection and Evaluation

### Gradient Boosting Classifier

In [10]:
gb_pipe = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('clf', GradientBoostingClassifier(warm_start=True))
    ]
)

In [11]:
%%time
gb_grid = {
    'clf__n_estimators': [100, 500, 1000],
    'clf__max_depth': [5, 10]
}

gb_gs = GridSearchCV(gb_pipe, gb_grid, n_jobs=-1)

gb_gs.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Wall time: 4min 26s


GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('clf',
                                        GradientBoostingClassifier(warm_start=True))]),
             n_jobs=-1,
             param_grid={'clf__max_depth': [5, 10],
                         'clf__n_estimators': [100, 500, 1000]},
             verbose=1)

In [12]:
print(gb_gs.best_params_)

{'clf__max_depth': 10, 'clf__n_estimators': 1000}


In [13]:
gb_pipe['clf'].n_estimators = 1000
gb_pipe['clf'].max_depth = 10

print(gb_pipe['clf'].get_params())

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 10, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 1000, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': True}


In [21]:
scores = cross_val_score(gb_pipe, X_train, y_train, n_jobs=-1)
print(scores.max())

[0.98839209 0.98194325 0.98322581 0.98580645 0.98580645]


### K Neighbors Classifier

In [14]:
kn_pipe = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('clf', KNeighborsClassifier())
    ]
)

In [15]:
%%time
kn_grid = {
    'clf__n_neighbors': [3, 5, 7],
    'clf__weights': ['uniform', 'distance']
}

kn_gs = GridSearchCV(kn_pipe, kn_grid, n_jobs=-1)

kn_gs.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Wall time: 4.57 s


GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('clf', KNeighborsClassifier())]),
             n_jobs=-1,
             param_grid={'clf__n_neighbors': [3, 5, 7],
                         'clf__weights': ['uniform', 'distance']},
             verbose=1)

In [16]:
print(kn_gs.best_params_)

{'clf__n_neighbors': 3, 'clf__weights': 'distance'}


In [17]:
kn_pipe['clf'].n_neighbors = 3
kn_pipe['clf'].weights = 'distance'

print(kn_pipe['clf'].get_params())

{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 3, 'p': 2, 'weights': 'distance'}


In [22]:
scores = cross_val_score(kn_pipe, X_train, y_train, n_jobs=-1)
print(scores.max())

0.8989681857265692
