# Library

In [1]:
import pandas as pd
import sys

In [2]:
# Looks like a kernel/stdout smoke test (prints a fixed message) - TODO confirm it is still needed.
print("Can you see this?")

Can you see this?


In [3]:
# Install PyAthena with the pip module of the interpreter stored in sys.executable.
# NOTE(review): no version is pinned, so the installed release can differ between runs.
!{sys.executable} -m pip install PyAthena

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m


In [4]:
from pyathena import connect

In [5]:
import numpy as np
import matplotlib.pyplot as plt

In [6]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans,DBSCAN, SpectralClustering, MiniBatchKMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score
from sklearn.impute import SimpleImputer
from sklearn.decomposition import SparsePCA
from scipy import sparse as sp
import scipy

In [7]:
# Seed NumPy's global RNG; the np.random.choice draw used for the train/test split depends on it.
np.random.seed(1)

### Getting Data


In [8]:
### UNCOMMENT THE LOGIC BELOW ON THE FIRST RUN###
# conn = connect(s3_staging_dir='s3://athena-results-c7fhgh8/',
#                region_name='us-east-1')

# df = pd.read_sql("select * from \"millionsongdataset-intermediate\".songdata;", conn)
# %store df
%store -r df   

Stored 'df' (DataFrame)


In [9]:
# (rows, columns) of the loaded frame.
df.shape

(1000000, 53)

In [10]:
# Number of missing values in each column.
df.isna().sum()

analysis_sample_rate                   0
audio_md5                              0
danceability                           0
duration                               0
end_of_fade_in                         0
energy                                 0
idx_bars_confidence                    0
idx_bars_start                         0
idx_beats_confidence                   0
idx_beats_start                        0
idx_sections_confidence                0
idx_sections_start                     0
idx_segments_confidence                0
idx_segments_loudness_max              0
idx_segments_loudness_max_time         0
idx_segments_loudness_start            0
idx_segments_pitches                   0
idx_segments_start                     0
idx_segments_timbre                    0
idx_tatums_confidence                  0
idx_tatums_start                       0
key                                    0
key_confidence                         0
loudness                               0
mode            

# Preprocessing

## Create Clean Frame
* Filter out rows whose `year` is 0 or greater than 2022. Note that this SIGNIFICANTLY reduces the number of records we can work with, so the filter is currently not applied (it is kept as commented-out code below).
* Select a subset of columns

In [11]:
# Alternative selection, currently disabled because it drops a large share of the rows:
# keep only rows with 0 < year <= 2022 and carry the metadata columns along.
# filtered_df = df[(df['year'] > 0) & (df['year'] <= 2022)][[
#     'loudness', 'tempo', 'artist_hotttness', 'artist_familiarity', 'genre',
#     'song_hotttness', 'track_id', 'song_id', 'artist_id', 'artist_name', 'title',
# ]].copy()

# Working frame: an independent copy holding only the three columns used for clustering.
filtered_df_2 = df.loc[:, ['loudness', 'tempo', 'artist_familiarity']].copy()

## Pipeline for Feature Selection

In [12]:
# Scaling branch: replace missing values with the constant 0, then standardise.
scaler_step = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
    ("scaler", StandardScaler()),
])

# Encoding branch: one-hot encoding. It is defined here but only referenced from
# the commented-out entry of the ColumnTransformer.
encoder_step = Pipeline(steps=[("encoder", OneHotEncoder())])

In [13]:
# Route the three selected columns through the impute-and-scale branch.
transformers = ColumnTransformer(
    transformers=[
        (
            "scaler_process",
            scaler_step,
            ['loudness', 'tempo', 'artist_familiarity'],
        ),
        # Disabled: one-hot encoding of 'genre'.
        # ("encoder_process", encoder_step, ['genre']),
    ]
)

In [14]:
# Full model: preprocess the columns, then cluster with mini-batch k-means.
# The number of clusters is not set here, so the estimator's default applies.
feature_pipeline = Pipeline(
    steps=[
        ("processor", transformers),
        ("kmeans_modeller", MiniBatchKMeans(random_state=1)),
    ]
)

### Split the data

In [15]:
# Draw 80% of the row labels, without repetition, for the training split
# (the draw uses NumPy's global RNG).
train_indices = np.random.choice(
    filtered_df_2.index,
    size=int(len(filtered_df_2) * 0.8),
    replace=False,
)

In [16]:
# Rows whose label was drawn go to the training frame; every other row is held out.
# Boolean masks keep both frames in the original row order.
train_df_2 = filtered_df_2.loc[filtered_df_2.index.isin(train_indices)]
test_df_2 = filtered_df_2.loc[~filtered_df_2.index.isin(train_indices)]

### Feed the train set to feature pipeline

In [17]:
# Fit the preprocessing step and the k-means step on the training frame.
# The fitted pipeline is the cell's last expression, so its repr is displayed.
feature_pipeline.fit(train_df_2)

Pipeline(memory=None,
         steps=[('processor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('scaler_process',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=0,
                                                                                 missing_values=nan,
                                                                                 strategy='constant',
                                                      

### Extract model

In [18]:
# Handle on the k-means step of the pipeline, used below for labels_ and cluster_centers_.
feat_selection_kmeans = feature_pipeline.named_steps['kmeans_modeller']


### Extract transformed dataframe

In [19]:
# Reuse the preprocessing step that `feature_pipeline.fit` already fitted. Calling
# `fit_transform` here would fit the imputer/scaler a second time on the same data.
transformed_train_df_2 = feature_pipeline['processor'].transform(train_df_2)

### Scoring the clustering methods

#### Kmeans

In [20]:
# Silhouette score of the k-means labelling, computed on a random 30% subsample
# of the training rows. `random_state` pins that subsample: without it the draw
# comes from NumPy's global RNG and the score depends on which cells ran before.
silhouette_score(
    transformed_train_df_2,
    feat_selection_kmeans.labels_,
    metric='euclidean',
    sample_size=int(train_df_2.shape[0] * 0.3),
    random_state=1,
)

0.21168536423914605

In [22]:
# Calinski-Harabasz index of the same k-means labelling, on the full training matrix.
calinski_harabasz_score(
    X=transformed_train_df_2,
    labels=feat_selection_kmeans.labels_,
)

219826.0400121672

In [23]:
# Distinct cluster labels assigned to the training rows (sorted by np.unique).
kmeans_classes = np.unique(feat_selection_kmeans.labels_)
kmeans_classes

array([0, 1, 2, 3, 4, 5, 6, 7], dtype=int32)

In [24]:
# Cluster centres of the fitted k-means step (coordinates in the preprocessed feature space).
centers=feat_selection_kmeans.cluster_centers_

# Persist the centres with IPython's %store so they can be restored in a later session.
%store centers

# To restore previously stored centres instead, run: %store -r centers

Stored 'centers' (ndarray)


In [25]:
# (number of clusters, number of features).
centers.shape

(8, 3)

In [27]:
# Display the centre coordinates, one row per cluster.
centers


array([[ 2.28362118e-01,  7.48000810e-01,  1.02494212e+00],
       [-1.21867730e+00, -8.69793974e-01, -2.40404941e-01],
       [-2.58821271e+00, -6.20277267e-02,  6.74067263e-02],
       [ 5.42469568e-01,  2.28491356e-01, -1.93333940e-01],
       [-1.85289207e-01, -4.88769529e-05, -1.75603318e+00],
       [ 4.88057919e-01, -5.88692384e-01,  1.18063576e+00],
       [ 2.33997403e-01,  1.77828271e+00, -2.11741623e-01],
       [ 3.35059674e-01, -9.42984785e-01, -2.11734860e-01]])