In [None]:
import pandas as pd
import altair as alt
import numpy as np
import matplotlib.pyplot as plt
import IPython
from IPython.display import Audio
import pygame

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV

from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, recall_score, classification_report, accuracy_score

import sys
import os

# Put the directory two levels above the working directory at the front of the
# module search path so the project-local `library` package can be imported.
sys.path.insert(0, '../../')

from library.notebook_api.data_loader import CombinedDataLoader, ModelDataLoader

# Switch Altair to its "vegafusion" data transformer for every chart below.
alt.data_transformers.enable("vegafusion")

In [None]:
#data_df = pd.read_csv('/Users/tetyananesdill/music_modeling_capstone/notebooks/exploratory/output2.csv')

In [None]:
# Build the loader ('003' presumably selects a dataset/model version -- confirm),
# expose its frame as `data_df`, and keep a CSV snapshot next to the notebook.
full_model_data = ModelDataLoader('003')
data_df = full_model_data.df
data_df.to_csv('output2.csv', index=False)

In [None]:
# Keep only the rows whose label is one of the six labels under study.
in_scope_labels = ['rock', 'electronic', 'hiphop', 'classical', 'jazz', 'country']
label_in_scope = data_df['label'].isin(in_scope_labels)
data_df = data_df.loc[label_in_scope]

In [None]:
# Columns fed to the scaler, PCA and K-Means cells below.
feature_columns = [
    'spectral_centroids_mean',
    'spectral_centroids_delta_mean',
    'spectral_centroids_accelerate_mean',
    'spectral_bandwidth_mean',
    'spectral_rolloff_mean',
    'zero_crossing_rate_mean',
    'rms_mean',
    'chroma_stft_mean',
    'mfccs_mean',
    'tempo',
    'onset',
    'contrast',
    'tonnetz',
    'mfccs_min',
    'mfccs_max',
]
feature_df = data_df[feature_columns]

# Label column, kept separately so it can be re-attached to derived frames.
label = data_df['label']

# Peek at a single row to confirm the selection.
feature_df.head(1)

## PCA

In [None]:
# Standardise every feature, then project onto the first three principal
# components.
x = StandardScaler().fit_transform(feature_df)
# random_state is pinned because, depending on the scikit-learn version and the
# data shape, the default solver choice can be a randomised one; the seed keeps
# the projection identical across kernel restarts.
pca = PCA(n_components=3, random_state=42)
pca_comp = pca.fit_transform(x)

In [None]:
# Wrap the projected array in a frame with one named column per component.
pc_columns = [
    'principal component 1',
    'principal component 2',
    'principal component 3',
]
pca_df = pd.DataFrame(pca_comp, columns=pc_columns)

In [None]:
# Display the projected coordinates (one column per principal component).
pca_df


In [None]:
# `pca_df` was built from a bare NumPy array, so it carries a fresh 0..n-1
# index, while `label` still carries the row labels left over from filtering
# `data_df`. concat(axis=1) aligns on index, so both sides are renumbered
# positionally first; otherwise labels end up on the wrong rows and unmatched
# rows are padded with NaN.
pca_df_2 = pd.concat(
    [pca_df.reset_index(drop=True), label.reset_index(drop=True)],
    axis=1)


In [None]:
# Share of total variance captured by each of the three fitted components.
pca.explained_variance_ratio_


In [None]:
# Refit PCA keeping every component so the full variance profile is available.
# Note: this rebinds `pca`, replacing the three-component model fitted earlier.
pca = PCA().fit(x)

# Per-component variance share, its running total, and the running total as a
# fraction of the overall sum; reset_index() exposes the component number as an
# 'index' column for plotting.
variance_ratio = pd.Series(pca.explained_variance_ratio_)
running_total = variance_ratio.cumsum()
explained_var = pd.DataFrame({
    'PCs': variance_ratio,
    'count cumsum': running_total,
    'cum_%': running_total / variance_ratio.sum(),
}).reset_index()
explained_var

In [None]:
# Pareto-style view of the PCA result: bars show each component's share of the
# variance, the line and its labels show the running total.
#
# Changes from the previous version of this cell:
#   * a stand-alone bar chart that was built first but never assigned or
#     displayed (only the last expression of a cell is rendered) was removed;
#   * the line mark no longer carries a `text` encoding, which only applies to
#     text marks;
#   * the shared x axis uses sort=None (data order) instead of sort='-y'. With
#     '-y' each layer asks to sort the same axis by its own y field (PCs for
#     the bars, cum_% for the line/labels). PCA already returns components in
#     decreasing order of explained variance, so data order is the intended
#     order.
base = alt.Chart(explained_var).encode(
    x=alt.X('index:N', title=None, sort=None))

# Variance share per component.
bar_chart = base.mark_bar(color='#ADD8E6').encode(
    y=alt.Y('PCs:Q', title=None)).properties(width=600)

# Cumulative share, drawn on its own percentage axis.
line = base.mark_line(strokeWidth=3, color='#cb4154').encode(
    y=alt.Y('cum_%:Q', title=None, axis=alt.Axis(format=".0%")))

# Percentage labels placed at each point of the cumulative line.
text = base.mark_text(strokeWidth=3,
                      color='#cb4154',
                      align='left',
                      baseline='middle',
                      dx=-10,
                      dy=-10).encode(y=alt.Y('cum_%:Q', axis=None),
                                     text=alt.Text('cum_%:Q', format="0.0%"))

# Independent y scales let the bars and the cumulative line each use their own
# range.
(bar_chart + line +
 text).resolve_scale(y='independent').properties(title='Principal Components')

## K-Means

In [None]:
# Elbow search: fit K-Means for k = 1..10 on the standardised matrix `x` and
# record the inertia of each fit.
clusters = []

for i in range(1, 11):
    # K-Means starts from random centroids; a fixed seed and an explicit
    # n_init (its default differs between scikit-learn releases) make the
    # curve identical on every run.
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=42)
    kmeans.fit(x)
    clusters.append(kmeans.inertia_)

clusters

In [None]:
# Inertia against the number of clusters, drawn through the explicit
# figure/axes interface.
fig, ax = plt.subplots()
ax.plot(range(1, 11), clusters, marker='o')
ax.set_title('Elbow method')
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Inertia')
plt.show()

In [None]:
# Final three-cluster model. It is fitted on the standardised matrix `x` -- the
# same input the elbow search uses -- rather than on the raw `feature_df`,
# where columns on large numeric scales would dominate the Euclidean distances.
# The seed and explicit n_init keep the cluster ids stable between runs.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
pred_cl = kmeans.fit_predict(x)

In [None]:
# Turn the cluster assignments into a one-column frame. Going through a flat
# array keeps this cell re-runnable whether `pred_cl` is still the raw
# prediction array or already the frame produced by an earlier run.
pred_cl = pd.DataFrame({'cluster': np.asarray(pred_cl).ravel()})

# Labels with the original row id moved into an 'index' column. Deriving this
# from `data_df` instead of from `label` itself means a second run does not
# stack an extra index column onto the frame.
label = data_df['label'].reset_index()

In [None]:
# Side-by-side frame of cluster id, original row id and label.
clust_label = pd.concat([pred_cl, label], axis='columns').set_axis(
    ['cluster', 'index', 'label'], axis=1)

In [None]:
# The 'index' column is deliberately kept (the projection below stays disabled):
# the following cell counts it per (cluster, label) group.
#clust_label = clust_label[['cluster', 'label']]
clust_label

In [None]:
# Number of rows per (cluster, label) pair; the count lands in the 'index'
# column. The result of reset_index() is assigned back so that 'cluster' and
# 'label' are ordinary columns of `groups` -- previously it was only displayed,
# leaving them in a MultiIndex where the chart in the next cell cannot reference
# them as fields.
groups = (clust_label
          .groupby(by=['cluster', 'label'])
          .count()
          .reset_index())

groups

In [None]:
# One bar panel per cluster: row count for each label, longest bar first.
label_count_bars = alt.Chart(groups).mark_bar()
label_count_bars.encode(
    alt.X('index', title=None),
    alt.Y('label:N', title=None, sort='-x'),
    alt.Column('cluster:N', title=None),
).properties(width=200)

In [None]:
# Standardised copy of the features, with the original column names restored.
scaled_values = StandardScaler().fit_transform(feature_df)
df_scaled = pd.DataFrame(scaled_values, columns=feature_df.columns)

In [None]:
# Attach the cluster id to each standardised row. reset_index() is applied to a
# temporary copy, not in place: mutating `df_scaled` would add one more index
# column every time this cell is re-run and corrupt the melt that follows.
features_cl = pd.concat([pred_cl, df_scaled.reset_index()], axis=1)

In [None]:
# Reshape to long form: one row per (row id, cluster, feature) with its
# standardised value.
features_cl = features_cl.melt(
    id_vars=['index', 'cluster'],
    var_name='features',
    value_name='values',
).set_axis(['index', 'cluster', 'features', 'values'], axis=1)
features_cl

In [None]:
# One bar panel per cluster: standardised feature values by feature name.
feature_value_bars = alt.Chart(features_cl).mark_bar()
feature_value_bars.encode(
    alt.X('values', title=None),
    alt.Y('features:N', title=None, sort='-x'),
    alt.Column('cluster:N', title=None),
).properties(width=200)

In [None]:
# Attach the cluster id to every row of the filtered dataset and export it.
# `pred_cl` is numbered 0..n-1 while `data_df` still carries the row labels it
# had before filtering; concat(axis=1) aligns on index, so both sides are
# renumbered positionally (on temporary copies, leaving `data_df` untouched).
# Without this the cluster ids are attached to the wrong rows and unmatched
# rows are padded with NaN.
data_df_2 = pd.concat(
    [pred_cl.reset_index(drop=True), data_df.reset_index(drop=True)],
    axis=1)
data_df_2.to_csv('clusters.csv', index=False)

In [None]:
# Display the exported frame (cluster id followed by the dataset columns).
data_df_2