# Spotify Modeling
**Jacob Torres**

In [1]:
"""Imports"""

# Data manipulation
import numpy as np
import pandas as pd
import sqlite3

# Modeling
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

---
## Data Collection and Massaging

In [3]:
# Load song data from database
DB_FILE = '../app/spotify_db.sqlite3'
GET_TRAIN_QUERY = 'select * from train;'
GET_LIKED_QUERY = 'select * from liked_songs_jt;'

# Both tables are read fully into DataFrames, so the connection is only
# needed for the two reads below. Close it explicitly (even if a read
# raises) instead of leaking the handle for the lifetime of the kernel;
# note that sqlite3's own `with conn:` block only manages transactions
# and does NOT close the connection.
# NOTE(review): if a later cell needs database access, it should open
# its own connection rather than reuse `conn`.
conn = sqlite3.connect(DB_FILE)
try:
    train_df = pd.read_sql(GET_TRAIN_QUERY, conn)
    liked_df = pd.read_sql(GET_LIKED_QUERY, conn)
finally:
    conn.close()

# Row counts of each table; used downstream to size the target vector.
num_likes = liked_df.shape[0]
num_songs = train_df.shape[0]
print(f"""
    Liked songs: {num_likes}
    Total songs: {num_songs}
""")


    Liked songs: 6509
    Total songs: 16509



In [9]:
# Feature matrix: every column of the training table except the stored
# 'index' column.
X = train_df.drop(columns='index')

# Target vector: 0 for the first (num_songs - num_likes) rows, 1 for the
# final num_likes rows.
# NOTE(review): this assumes the liked songs are the LAST num_likes rows of
# train_df; the query has no ORDER BY, so confirm the table's row order.
y = pd.Series(
    np.concatenate([
        np.zeros(num_songs - num_likes, dtype=np.int64),
        np.ones(num_likes, dtype=np.int64),
    ])
)

# One label per feature row.
assert len(X) == len(y)

In [12]:
# Show the names of the columns that remain in the feature matrix.
print(f"Features: {X.columns.tolist()}")

Features: ['acousticness', 'danceability', 'duration_ms', 'energy', 'tempo', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'valence', 'speechiness']


In [14]:
# Class balance of the target, expressed as percentages of all rows.
y.value_counts(normalize=True).mul(100)

0    60.573021
1    39.426979
dtype: float64

---
## Model Building

In [33]:
# Fixed seed for both tree ensembles: without it every Restart & Run All
# trains a different model, so scores cannot be reproduced or compared.
RANDOM_STATE = 42

# Random-forest pipeline. make_pipeline names the steps after the
# lowercased class names ('standardscaler', 'randomforestclassifier'),
# which is what any grid-search parameter keys must be prefixed with.
# verbose=True prints the elapsed time of each step while fitting.
rf_pipe = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(random_state=RANDOM_STATE),
    verbose=True
)

# Gradient-boosting pipeline, built the same way so the two models are
# compared on identically preprocessed features.
gb_pipe = make_pipeline(
    StandardScaler(),
    GradientBoostingClassifier(random_state=RANDOM_STATE),
    verbose=True
)