In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly as py
import plotly.express as px
import seaborn as sns
from pca import pca
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from joblib import dump

In [None]:
# Load the track data (rows indexed by song name) and discard the
# identifier-style columns that would only add noise to the models.
music_df = (
    pd.read_csv('data.csv', index_col=['name'])
      .drop(columns=['artists', 'release_date', 'id'])
)
# Preview the frame that feeds KMeans and PCA.
music_df.head()

In [None]:
# Standardize every feature, then wrap the scaled array back into a frame
# that keeps the original column names and song-name index.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(music_df)
music_scale = pd.DataFrame(
    data=scaled_values,
    index=music_df.index,
    columns=music_df.columns,
)
music_scale.head()

In [None]:
# Fit a seeded 10-cluster KMeans model on the scaled feature matrix
# (`fit` returns the estimator itself, so the calls can be chained).
kmeans = KMeans(n_clusters=10, random_state=42).fit(music_scale.values)
# Cluster centres in scaled feature space.
centers = kmeans.cluster_centers_
# Per-song cluster assignment, used later to colour the plots.
labels = kmeans.labels_
labels

In [None]:
def elbow_plot(df, num_clusters_test=10, random_state=42):
  """Plot KMeans inertia against the number of clusters (elbow method).

  One KMeans model is fitted for every cluster count from 1 up to and
  including ``num_clusters_test``; each model's ``inertia_`` (sum of squared
  distances of samples to their closest centre) is collected and plotted.

  Args:
    df: Feature matrix accepted by ``KMeans.fit``.
    num_clusters_test: Largest number of clusters to try (inclusive).
    random_state: Seed handed to every KMeans model so the curve is the same
      on each run. Pass ``None`` to get unseeded initialisation.

  Returns:
    None. The chart is drawn with ``plt.show()``.
  """
  cluster_counts = range(1, num_clusters_test+1)
  inertias = []
  for num in cluster_counts:
    # Seeded so that re-running the notebook reproduces the same curve.
    model = KMeans(n_clusters=num, random_state=random_state)
    model.fit(df)
    inertias.append(model.inertia_)
    # Progress marker: each fit can take a while on the full dataset.
    print(num)
  plt.title('Cluster Variance')
  plt.xlabel('Number of Clusters')
  plt.ylabel('Sum of Squared Distances')
  plt.plot(cluster_counts, inertias, 'rx-')
  plt.show()

In [None]:
# Elbow curve over 1..20 clusters on the scaled features.
elbow_plot(music_scale, num_clusters_test=20)

### K-means 3D scatter plots

In [None]:
# Show the column names of the scaled frame (useful when picking plot axes).
music_scale.columns

In [None]:
# Attach each song's K-means label (as a string, so plotly treats the groups
# as discrete categories). `assign` returns a new frame, which keeps
# `music_df` purely numeric: the scaling and clustering cells above can then
# be re-run without tripping over a string column.
music_grouped = music_df.assign(group=labels.astype(str))
# Sample the dataset to keep the graph manageable; the size is capped at the
# frame length so a smaller dataset does not raise.
music_sample = music_grouped.sample(min(5000, len(music_grouped)), random_state=42)
# Colour for each cluster label.
colors={'0':'crimson',
        '1':'#4da6ff',
        '2':'#ff66cc',
        '3':'#993366',
        '4':'#ffff66',
        '5':'#99ff33',
        '6':'#009933',
        '7':'#00ffff',
        '8':'#cc3300',
        '9':'#cc33ff'
       }
# 3D scatter of the sampled songs, coloured by K-means cluster.
fig = px.scatter_3d(music_sample,
                    x='energy', # the first important feature
                    y='danceability', # the second important feature
                    z='liveness', # the third important feature
                    color='group', # the K-means label column drives the colour of each point
                    color_discrete_map=colors, # apply the label -> colour dictionary defined above
                    width=700,
                    height=500,
                    hover_name=music_sample.index) # show the song name when hovering over a point
fig.update_traces(marker = dict(size=1), # small markers so 5000 points stay readable
                  showlegend=False) # the legend would only list cluster numbers, so hide it
fig.show()

In [None]:
# Colours in cluster-label order ('0' .. '9'), matching the plotly colour map.
sncolors=('crimson',
          '#4da6ff',
          '#ff66cc',
          '#993366',
          '#ffff66',
          '#99ff33',
          '#009933',
          '#00ffff',
          '#cc3300',
          '#cc33ff')
# Keep this palette as the default colour cycle for later plots.
# `sns.set_palette` returns None, so its result must not be used as a palette.
sns.set_palette(sns.color_palette(sncolors))
# Explicit label -> colour mapping. Without it seaborn assigns colours in
# order of first appearance in the sample, which does not line up with the
# per-cluster colours used in the plotly chart.
customPalette = {str(cluster): color for cluster, color in enumerate(sncolors)}

sns.pairplot(music_sample,
             hue='group',
             # numeric ordering of the string labels for a tidy legend
             hue_order=sorted(music_sample['group'].unique(), key=int),
             vars=['acousticness',
                   'danceability',
                   'energy',
                   'instrumentalness',
                   'liveness',
                   'loudness',
                   'popularity',
                   'speechiness',
                   'tempo',
                   'valence',
                   'year'],
             palette=customPalette,
             corner=True)
plt.savefig('kmeans_pairplots.png')
plt.show()

### PCA analysis

In [None]:
def scree_plot(pca):
  """Draw a scree plot: per-component and cumulative explained variance.

  Bars show the share of variance explained by each principal component and
  the line shows the running total. Everything is expressed in percent so the
  bars, the annotations and the y-axis label agree.

  Args:
    pca: Fitted object exposing ``explained_variance_ratio_`` as a sequence
      of fractions (for example a fitted ``sklearn.decomposition.PCA``).
      Inside this function the parameter name hides the ``pca`` class
      imported by the notebook.

  Returns:
    None. The chart is drawn on a new matplotlib figure.
  """
  # Convert fractions to percent once, up front.
  vals = np.asarray(pca.explained_variance_ratio_, dtype=float) * 100
  num_length = len(vals) # how many components were kept
  ind = np.arange(num_length) # x position of each component
  cumval = np.cumsum(vals) # running total of explained variance

  plt.figure(figsize=(18,6))
  ax = plt.subplot(111)
  ax.bar(ind, vals)
  ax.plot(ind, cumval)
  for i in range(num_length):
    # Fixed one-decimal formatting; slicing str() output produced labels such
    # as "50." and misread values printed in scientific notation.
    ax.annotate('%.1f' % vals[i], (ind[i], vals[i]), va='bottom', ha='center', fontsize=7)

  ax.xaxis.set_tick_params(width=0)
  ax.yaxis.set_tick_params(width=2, length=12)
  # Label components 1..n and size the axis to the data, so the first bar is
  # not cut in half and components beyond the tenth are still visible.
  ax.set_xticks(ind)
  ax.set_xticklabels(ind + 1)
  ax.set_xlim(-0.5, max(num_length, 1) - 0.5)

  ax.set_xlabel("Principal Component")
  ax.set_ylabel("Variance Explained (%)")
  plt.title('Explained Variance Per Principal Component')

In [None]:
# sklearn PCA keeping as many components as needed to explain 90% of variance.
mod1 = PCA(n_components=0.9)

In [None]:
# Fit the sklearn PCA on the scaled features and project them onto the kept components.
pca_fit= mod1.fit_transform(music_scale)

In [None]:
# Scree plot of the explained variance for the fitted sklearn PCA.
scree_plot(mod1)

### Second PCA method to determine the most important features

In [None]:
# Model from the `pca` package, configured with the same 0.9 setting as mod1.
mod2 = pca(n_components=0.9)

In [None]:
# Fit the `pca` package model on the scaled features; the result is looked up
# by key in a later cell (e.g. 'topfeat').
pca_fit2= mod2.fit_transform(music_scale)

In [None]:
# Built-in plot of the fitted `pca` package model
# (presumably explained variance per component; confirm in the package docs).
mod2.plot()

In [None]:
# Return the 'feature' column of the 'topfeat' table as a plain Python list
# (presumably the most influential feature per component; verify against the
# `pca` package docs).
pca_fit2['topfeat']['feature'].to_list()

In [None]:

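# TODO: `model` is not defined anywhere in this notebook; choose the fitted
# estimator to persist (e.g. `kmeans`) before re-enabling the lines below.
# pipeline = model
# dump(pipeline, 'pipeline.joblib', compress=True) 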