# Spotify Modeling
**Jacob Torres**

In [1]:
"""Imports"""

# Data manipulation
import numpy as np
import pandas as pd
import sqlite3

# Modeling
import pickle
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

---
## Data Collection and Massaging

In [2]:
# Load song data from database
DB_FILE = '../app/spotify_db.sqlite3'
GET_TRAIN_QUERY = 'select * from train;'
GET_LIKED_QUERY = 'select * from liked_songs_jt;'
conn = sqlite3.connect(DB_FILE)

train_df = pd.read_sql(GET_TRAIN_QUERY, conn)
liked_df = pd.read_sql(GET_LIKED_QUERY, conn)

num_likes = liked_df.shape[0]
num_songs = train_df.shape[0]
print(f"""
    Liked songs: {num_likes}
    Total songs: {num_songs}
""")


    Liked songs: 5874
    Total songs: 15874



In [3]:
# Create feature matrix and target vector
X = train_df.drop(['index', 'name', 'id', 'mode'], axis=1)
y = pd.Series(
    list(np.zeros(num_songs - num_likes, np.int64)) +
    list(np.ones(num_likes, np.int64))
)

assert len(X) == len(y)

In [4]:
X.isnull().sum()

acousticness        0
danceability        0
duration_ms         0
energy              0
tempo               0
instrumentalness    0
key                 0
liveness            0
loudness            0
valence             0
speechiness         0
dtype: int64

In [5]:
y.value_counts(normalize=True) * 100

0    62.996094
1    37.003906
dtype: float64

In [6]:
X.describe(include='all')

Unnamed: 0,acousticness,danceability,duration_ms,energy,tempo,instrumentalness,key,liveness,loudness,valence,speechiness
count,15874.0,15874.0,15874.0,15874.0,15874.0,15874.0,15874.0,15874.0,15874.0,15874.0,15874.0
mean,0.459044,0.569318,225268.4,0.494837,117.251375,0.134743,5.226156,0.191452,-10.701819,0.523939,0.094003
std,0.358577,0.169377,107818.7,0.249254,29.89741,0.276817,3.538558,0.15934,5.201994,0.254266,0.143338
min,0.0,0.0,5108.0,0.0,0.0,0.0,0.0,0.0,-60.0,0.0,0.0
25%,0.0959,0.458,173631.0,0.295,94.451,0.0,2.0,0.0988,-13.538,0.319,0.0343
50%,0.428,0.582,209766.0,0.49,115.138,0.000189,5.0,0.127,-9.79,0.527,0.045
75%,0.822,0.696,254353.2,0.692,135.7385,0.0602,8.0,0.23,-6.8275,0.734,0.079
max,0.996,0.977,4269407.0,1.0,217.943,1.0,11.0,0.996,0.878,0.985,0.969


In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3
)

print(f"""
    X_train: {X_train.shape}
    X_test: {X_test.shape}
    y_train: {y_train.shape}
    y_test: {y_test.shape}
""")


    X_train: (11111, 11)
    X_test: (4763, 11)
    y_train: (11111,)
    y_test: (4763,)



In [8]:
# Feature selection
selector = SelectKBest(f_classif, k=5)
selector.fit(X_train, y_train)
feature_mask = selector.get_support()

In [9]:
print(selector.scores_)
best_features = X_train.columns[feature_mask]

X_train = X_train[list(best_features)]
X_test = X_test[list(best_features)]

print(X_train.shape)
X_train.head()

[189.80455189 650.90525285  58.59475335  27.4197593    4.27056359
 295.72031626   0.92745185 163.70724726 345.39532521   5.54351338
  10.46782438]
(11111, 5)


Unnamed: 0,acousticness,danceability,instrumentalness,liveness,loudness
14807,0.908,0.419,5e-06,0.34,-19.387
3128,0.445,0.717,5e-06,0.0973,-6.598
9518,0.949,0.392,0.246,0.0683,-16.603
7370,0.797,0.421,0.921,0.266,-14.332
2855,0.531,0.636,0.753,0.331,-13.228


---
## Model Selection and Evaluation

In [10]:
gb_pipe = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('clf', GradientBoostingClassifier(warm_start=True))
    ]
)

In [11]:
gb_grid = {
    'clf__n_estimators': [100, 500, 1000],
    'clf__max_depth': [5, 10]
}

gb_gs = GridSearchCV(gb_pipe, gb_grid, n_jobs=-1)

gb_gs.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('clf',
                                        GradientBoostingClassifier(warm_start=True))]),
             n_jobs=-1,
             param_grid={'clf__max_depth': [5, 10],
                         'clf__n_estimators': [100, 500, 1000]})

In [12]:
print(gb_gs.best_params_)

{'clf__max_depth': 10, 'clf__n_estimators': 1000}


In [13]:
gb_pipe['clf'].n_estimators = 1000
gb_pipe['clf'].max_depth = 10

print(gb_pipe['clf'].get_params())

{'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.1, 'loss': 'deviance', 'max_depth': 10, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 1000, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': True}


In [14]:
scores = cross_val_score(gb_pipe, X_train, y_train, n_jobs=-1)
print(scores.max())

0.9896536212325686


---
## Save Model 1

In [15]:
MODEL_FILE = '../models/spotify_clf_model_1.sav'
try:
    pickle.dump(gb_pipe, open(MODEL_FILE, mode='wb'))
    print('Successfully saved the model.')

except:
    print('Something went wrong saving the model.')

Successfully saved the model.
