## Clustering

In [32]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import cluster, pipeline, preprocessing, compose
from sklearn.cluster import DBSCAN, KMeans
from sklearn.model_selection import train_test_split

In [33]:
# Load the track table written out by the EDA step and report (rows, columns).
tracks_path = 'EDA-tracks.csv'
df = pd.read_csv(tracks_path)
print(df.shape)

(114000, 19)


In [34]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


For clustering, we need to drop a few more columns because they contain text:
- `artists`
- `album_name`
- `track_name`
- `track_genre`

The numeric `duration_ms` column is dropped in the same step.

In [35]:
# Text columns that are removed before clustering.
columns = ['artists', 'album_name', 'track_name', 'track_genre']

# `duration_ms` is numeric but is left out of the clustering features as well.
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so a
# second execution would otherwise raise KeyError on the already-dropped labels.
# NOTE(review): this also silences a misspelled column name on the first run.
df = df.drop(columns=columns + ['duration_ms'], errors='ignore')

In [36]:
# Preview the first five rows to confirm the text columns and `duration_ms` are gone.
df.head(n=5)

Unnamed: 0,popularity,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,73,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4
1,55,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4
2,57,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4
3,71,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3
4,82,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4


### Clustering pipelines

In [37]:
# Pipelines
#
# Builds two unfitted clustering pipelines:
#   * kmeans_pipe - per-column-group preprocessing followed by KMeans
#   * dbscan_pipe - StandardScaler over every input column followed by DBSCAN

k = 5        # number of clusters requested from KMeans
EPS = 1      # DBSCAN `eps`
MIN = 10     # DBSCAN `min_samples`
SEED = 42    # fixed seed so KMeans returns the same clusters on every run

# Column groups; each group is routed to its own transformer below.
# NOTE(review): presumably grouped by the distribution shapes seen during EDA - confirm.
outliers = ['speechiness', 'explicit', 'liveness', 'instrumentalness']
outliers_zero = ['acousticness']
non_outliers = ['tempo', 'valence', 'popularity', 'key', 'danceability',
                'mode', 'energy', 'loudness', 'time_signature']

# log1p followed by robust scaling for the `outliers` group.
outliers_pipe = pipeline.Pipeline(
    steps=[
        # log1p instead of log because it works for 0
        ("log_transform", preprocessing.FunctionTransformer(np.log1p)),
        ("scaler", preprocessing.RobustScaler()),
    ]
)

# Cube root followed by robust scaling for the `outliers_zero` group.
outliers_zero_pipe = pipeline.Pipeline(
    steps=[
        ("cubic_transform", preprocessing.FunctionTransformer(np.cbrt)),
        ("scaler", preprocessing.RobustScaler()),
    ]
)

# Route each column group to its transformer. Columns not named in any group
# are passed through unchanged (remainder="passthrough").
# NOTE(review): the `non_outliers` group gets only a cube-root transform and no
# scaler step, unlike the other two groups - confirm this is intended, since
# KMeans is distance-based.
preprocessing_pipe = compose.ColumnTransformer(
    transformers=[
        ("outliers", outliers_pipe, outliers),
        ("outliers_zeros", outliers_zero_pipe, outliers_zero),
        ("non_outliers", preprocessing.FunctionTransformer(np.cbrt), non_outliers),
    ],
    remainder="passthrough",
)

# KMeans on the preprocessed features. random_state pins the centroid
# initialisation so cluster assignments are reproducible across kernel restarts.
kmeans_pipe = pipeline.Pipeline(
    steps=[
        ('preprocess', preprocessing_pipe),
        ('kmeans', KMeans(n_clusters=k, n_init=10, max_iter=300,
                          random_state=SEED)),
    ]
)

# DBSCAN on standard-scaled inputs; this pipeline does not use preprocessing_pipe.
dbscan_pipe = pipeline.Pipeline(
    steps=[
        ('scaler', preprocessing.StandardScaler()),
        ('dbscan', DBSCAN(eps=EPS, min_samples=MIN)),
    ]
)

### TODO: use elbow method to find best k value