In [63]:
# The goal of this notebook is to lay the foundation
# for two types of supervised machine learning classifiers.
from collections import Counter
from collections import defaultdict
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore")

import ast
import pandas as pd
import numpy as np



In [61]:
def condition_raw_data(raw_df):
    """
    Reduce a raw track dataframe to the musical attributes and genre columns.

    The ``artist_genres`` column is exposed under the name ``genre``. The
    input dataframe is left untouched; a new dataframe is returned.

    inputs:
        raw_df: dataframe holding every column listed in ``required_cols``
            below, with the label stored as ``artist_genres`` (or already
            named ``genre``).
    returns:
        a new dataframe containing only the feature columns and ``genre``,
        in the order given by ``required_cols``.
    raises:
        KeyError if any required column is missing.
    """
    # features and y val cols.
    required_cols = ['danceability', 'energy', 'key', 'loudness', 'mode',
                     'speechiness', 'acousticness', 'instrumentalness',
                     'liveness', 'valence', 'tempo', 'time_signature',
                     'duration_ms', 'genre']
    # Rename on a new frame instead of in place, so the caller's dataframe
    # keeps its original column names and this function can be re-run safely.
    renamed_df = raw_df.rename(columns={'artist_genres': 'genre'})
    # remove unnecessary cols; copy so the result is independent of the input
    return renamed_df[required_cols].copy()
def generate_train_test(tracks_df, random_val=42, split_ratio=0.8):
    """
    Shuffle a dataframe and split it row-wise into a train and a test set.

    inputs:
        tracks_df: a dataframe containing song attributes and genre.
        random_val: seed used for the shuffle, for repeatability.
        split_ratio: decimal pct of samples to use for training (0.0 - 1.0).
    returns:
        a two-element list [train_df, test_df]. Rows keep the index labels
        they had in tracks_df; tracks_df itself is not modified.
    raises:
        ValueError if split_ratio is outside the range [0, 1].
    """
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError(
            "split_ratio must be between 0 and 1, got {!r}".format(split_ratio)
        )
    # step 1 shuffle the df
    shuffled_df = tracks_df.sample(frac=1.0, random_state=random_val)
    # establish a number to split the frame at.
    num_train_samples = int(split_ratio * len(shuffled_df))
    # Split with positional slicing rather than np.split: np.split on a
    # DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
    return [shuffled_df.iloc[:num_train_samples],
            shuffled_df.iloc[num_train_samples:]]
# PCA
# Goal: apply PCA to the data set and evaluate its impact on the F1 score.
def apply_pca(train_df, test_df, n_dim = 2):
    """
    Standardize the features and apply PCA dimensionality reduction.

    Both the scaler and the PCA model are fitted on the training data only
    and then applied to the training and test sets. The input dataframes are
    not modified.

    inputs:
        train_df: training dataframe; every column other than 'genre' is
            treated as a feature.
        test_df: test dataframe containing the same feature columns and
            'genre'.
        n_dim: value passed to PCA as n_components.
    returns:
        pca_train_df, pca_test_df: new dataframes holding the PCA-transformed
        features plus a 'genre' column, each with a fresh 0..n-1 row index.
    """
    X_cols = [col for col in train_df.columns if col != 'genre']
    scaler = StandardScaler()
    X_train = scaler.fit_transform(train_df[X_cols])
    X_test = scaler.transform(test_df[X_cols])

    # start off with stock settings
    pca = PCA(n_components=n_dim)
    # only use the training data to fit the pca model
    pca.fit(X_train)
    pca_train_df = pd.DataFrame(pca.transform(X_train))
    pca_test_df = pd.DataFrame(pca.transform(X_test))

    # The transformed frames get a fresh 0..n-1 index, so the labels need the
    # same index for pandas to line them up row by row. reset_index(drop=True)
    # on the label series returns a new series, which leaves the caller's
    # dataframes unmodified (the previous in-place reset mutated them).
    pca_train_df['genre'] = train_df['genre'].reset_index(drop=True)
    pca_test_df['genre'] = test_df['genre'].reset_index(drop=True)
    return pca_train_df, pca_test_df


In [135]:
def train_kmeans_cluster(train, num_clusses=7):
    """
    Fit a KMeans model and label every cluster with its most frequent genre.

    inputs:
        train: dataframe whose 'genre' column holds the labels; every other
            column is used as a feature.
        num_clusses: number of clusters to fit.
    returns:
        km: the fitted KMeans model (random_state fixed at 42).
        clust_dict: defaultdict(str) mapping cluster id -> most common genre
            among the training rows assigned to that cluster. A cluster with
            no training rows gets no entry, so looking it up yields ''.
    """
    X_train = train[[col for col in train.columns if col != 'genre']]
    y_train = train['genre']

    km = KMeans(n_clusters=num_clusses, random_state=42)
    km.fit(X_train)
    clus = km.predict(X_train)

    # Tally genres per cluster in a single pass over the rows, instead of
    # rescanning every row once per cluster. Rows are visited in their
    # original order, so ties in most_common() resolve the same way as before.
    genre_counts = defaultdict(Counter)
    for cluster_id, genre in zip(clus.tolist(), y_train.values):
        genre_counts[cluster_id][genre] += 1

    # map each cluster label to its most frequent genre label
    clust_dict = defaultdict(str)
    for tc in range(num_clusses):
        counts = genre_counts.get(tc)
        if counts:
            clust_dict[tc] = counts.most_common(1)[0][0]
    return km, clust_dict

def test_kmeans_cluster(km, clust_dict, test):
    """
    Score a fitted KMeans model on held-out data.

    inputs:
        km: a fitted kmeans cluster model
        clust_dict: a dictionary mapping cluster ids to genre labels
        test: test dataframe; 'genre' holds the true labels and every other
            column is passed to km.predict as a feature.
    returns:
        macro-averaged f1 score for the model
    """
    # Take the feature columns from the `test` argument itself. The previous
    # code read them from a global `train` dataframe, so the function only
    # worked when such a variable happened to exist in the notebook namespace.
    X_test = test[[col for col in test.columns if col != 'genre']]
    y_test = test['genre']
    # translate each predicted cluster id into the genre assigned to it
    cluster_ids = km.predict(X_test)
    preds = [clust_dict[cl] for cl in cluster_ids]
    return f1_score(y_test, preds, average='macro')
#Counter(num_1)


In [136]:
# Load the raw track data and keep only the feature and genre columns.
raw_df = pd.read_csv('../raw_spotify_data/pure_genre_data.csv')
clean_df = condition_raw_data(raw_df)
# Shuffle with a fixed seed, split 80/20, then reduce both sets to 2 PCA dims.
train, test = generate_train_test(clean_df, random_val=42, split_ratio=0.8)
train, test = apply_pca(train, test, n_dim=2)
# Fit 20 clusters on the training set; the last expression displays the
# macro F1 score obtained on the test set.
km, clust_dict = train_kmeans_cluster(train, 20)
test_kmeans_cluster(km, clust_dict, test)

0.429790830295275

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=659c715d-e2b5-478e-9116-4d32a5174810' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>