In [2]:
import pandas as pd
import plotly.express as px
from sklearn.utils import shuffle
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix

pd.options.mode.chained_assignment = None  # default='warn'

dataset = pd.read_csv('msd_genre_dataset0.csv', header=0)

classicPopAndRock = dataset.iloc[0:23895]
punk = dataset.iloc[23895:27095]
folk = dataset.iloc[27095:40287]
pop = dataset.iloc[40287:41904]
danceAndElectronica = dataset.iloc[41904:46839]
metal = dataset.iloc[46839:48942]
jazz = dataset.iloc[48942:53276]
classical = dataset.iloc[53276:55150]
hipHop = dataset.iloc[55150:55584]
soulAndReggae = dataset.iloc[55584:59600]

timbreFeaturesAndGenre = ['avg_timbre1','avg_timbre2','avg_timbre3','avg_timbre4', 'avg_timbre5',
                            'avg_timbre6','avg_timbre7','avg_timbre8', 'avg_timbre9','avg_timbre10',
                            'avg_timbre11','avg_timbre12', 'var_timbre1', 'var_timbre2', 'var_timbre3',
                            'var_timbre4', 'var_timbre5','var_timbre6','var_timbre7','var_timbre8',
                            'var_timbre9','var_timbre10','var_timbre11','var_timbre12', 'genre']

datasetFrames = [classicPopAndRock.head(10), punk.head(10), folk.head(10), pop.head(10),
                 danceAndElectronica.head(10), metal.head(10), jazz.head(10), classical.head(10),
                 hipHop.head(10), soulAndReggae.head(10)]

datasetSample = pd.concat(datasetFrames)
shuffledDatasetSample = shuffle(datasetSample)

featureVector = shuffledDatasetSample[timbreFeaturesAndGenre]

kMeans = KMeans(n_clusters=10)
timbreFeatures = timbreFeaturesAndGenre[:-1]
kMeans.fit(featureVector[timbreFeatures])
featureVector.loc[:, 'genre'] = kMeans.labels_

In [51]:
genreNumbers = featureVector[['genre']]
genreTitles = shuffledDatasetSample[['genre']]
genreNumbers['genre'] = genreNumbers['genre'].astype(str)

clusterRelationshipDataframe = pd.DataFrame()
clusterRelationshipDataframe['genreN'] = genreNumbers['genre']
clusterRelationshipDataframe['genreT'] = genreTitles['genre']

def inspectClusters(relationshipDataframe):  
    songClusters = showGenreClusterDistribution(relationshipDataframe)
    print('Genre Cluster Locations:')
    clusterDistribution = clusterTallyPerGenre(songClusters)
    print('\n')
    return clusterDistribution

def showGenreClusterDistribution(clusterGenreCorrelationDataframe):
    clusterDistribution = {}
    for index, row in clusterGenreCorrelationDataframe.iterrows():
        clusterNumber = row['genreN']
        songGenre = row['genreT']
        if songGenre in clusterDistribution:
            clusterDistribution[songGenre].append(clusterNumber)
        else:
            clusterDistribution[songGenre] = [clusterNumber]
    return clusterDistribution

def clusterTallyPerGenre(genreSongsDistribution):
    clusterMap = {}
    clusterMap['pop'] = {}
    clusterMap['dance and electronica'] = {}
    clusterMap['punk'] = {}
    clusterMap['soul and reggae'] = {}
    clusterMap['folk'] = {}
    clusterMap['metal'] = {}
    clusterMap['hip-hop'] = {}
    clusterMap['classic pop and rock'] = {}
    clusterMap['classical'] = {}
    clusterMap['jazz and blues'] = {}
    for key in genreSongsDistribution:
        print(key, genreSongsDistribution[key])
        for value in genreSongsDistribution[key]:
            if value in clusterMap[key]:
                clusterMap[key][value] += 1
            else:
                clusterMap[key][value] = 1
    return clusterMap

def removeReservedClusters(clusteredIncorrectly, groupedClusters):
    for clusterIdentifier in clusteredIncorrectly:
        genreList = list(clusteredIncorrectly[clusterIdentifier])
        counter = 0
        while counter < len(genreList):
            clusterNumberList = list(groupedClusters[genreList[counter]])
            listCopy = clusterNumberList
            print('LIST', clusterNumberList)
            print('NUMBER', clusterIdentifier)
            while clusterIdentifier in clusterNumberList: clusterNumberList.remove(clusterIdentifier)
            print('RED', clusterNumberList)
            groupedClusters[clusterIdentifier] = clusterNumberList
            counter += 1
    return groupedClusters

songClusters = {}
songClusters = showGenreClusterDistribution(clusterRelationshipDataframe)
print('Genre Cluster Locations:')
clusterDistribution = clusterTallyPerGenre(songClusters)
print('\n')
print('Cluster Popularity:')
clusterLabels = {}
for key in clusterDistribution:
    clusterLabels[key] = max(clusterDistribution[key], key=clusterDistribution[key].get)
for key in clusterLabels:
    print(key, clusterLabels[key])
print('\n')

print('Cluster Ties:')
reverseClusterLabels = {}
for key, value in clusterLabels.items():
    reverseClusterLabels.setdefault(value, set()).add(key)
clusterDuplicates = {}
for key in reverseClusterLabels:
    value = reverseClusterLabels[key]
    if len(value) > 1:
        clusterDuplicates[key] = value
for key in clusterDuplicates:
    print(key, clusterDuplicates[key])
print('\n')

genreTieBreaker = {}
print(clusterDuplicates)
for clusterNumberLabel in clusterDuplicates:
    clusterDuplicateList = list(clusterDuplicates[clusterNumberLabel])
    maxGenre = clusterDuplicateList[0]
    maximum = clusterDistribution[clusterDuplicateList[0]][clusterNumberLabel]
    counter = 1
    while counter < len(clusterDuplicateList):
        if maximum < clusterDistribution[clusterDuplicateList[counter]][clusterNumberLabel]:
            maximum = clusterDistribution[clusterDuplicateList[counter]][clusterNumberLabel]
            maxGenre = clusterDuplicateList[counter]
        counter += 1
    clusterDuplicates[clusterNumberLabel].remove(maxGenre)
    genreTieBreaker[maxGenre] = clusterNumberLabel

print('tiebreaker', genreTieBreaker)
print('clusterdup', clusterDuplicates)

print('\n')
print('clusterDistribution')
for key in songClusters:
    print(key, songClusters[key])
print('\n')
updatedClusterLocations = removeReservedClusters(clusterDuplicates, songClusters)
# print('updated')
# for key in songClusters:
#     print(key, songClusters[key])

# for label in genreTieBreaker:
#     clusterLabels[label] = genreTieBreaker[label]
    

confusionMatrix = pd.DataFrame(confusion_matrix(featureVector[['genre']], predictatedClusters[['genre']]))

print(confusionMatrix)



YOU ARE WORKING ON REMOVING THE RESERVED CLUSTERS FROM THE ORIGINAL KMEANS CLUSTERING GROUPS

Genre Cluster Locations:
classical ['9', '9', '9', '9', '9', '1', '9', '9', '9', '2']
punk ['2', '2', '7', '7', '2', '7', '0', '2', '2', '6']
dance and electronica ['4', '4', '8', '3', '1', '8', '4', '7', '8', '5']
folk ['7', '0', '7', '0', '7', '6', '7', '1', '7', '7']
metal ['7', '1', '0', '2', '7', '2', '2', '2', '2', '2']
soul and reggae ['6', '6', '6', '6', '6', '6', '6', '6', '6', '6']
jazz and blues ['1', '1', '7', '1', '7', '1', '0', '1', '7', '0']
hip-hop ['9', '0', '0', '0', '0', '7', '2', '2', '7', '9']
classic pop and rock ['2', '2', '7', '5', '1', '2', '2', '2', '2', '2']
pop ['7', '7', '7', '7', '7', '0', '7', '7', '7', '7']


Cluster Popularity:
pop 7
dance and electronica 4
punk 2
soul and reggae 6
folk 7
metal 2
hip-hop 0
classic pop and rock 2
classical 9
jazz and blues 1


Cluster Ties:
7 {'pop', 'folk'}
2 {'metal', 'punk', 'classic pop and rock'}


{'7': {'pop', 'folk'}, '2': {'metal', 'punk', 'classic pop and rock'}}
tiebreaker {'pop': '7', 'classic pop and rock': 

NameError: name 'predictatedClusters' is not defined

In [61]:
# genre = ['classicPopAndRock', 'punk', 'folk', 'pop', 'danceAndElectronica', 'metal',
#          'jazz', 'classical', 'hipHop', 'soulAndReggae']

# predictatedClusters = shuffledDatasetSample[timbreFeaturesAndGenre]
# for index, row in predictatedClusters.iterrows():
#     if row['genre'] == 'dance and electronica':
#         predictatedClusters.at[index, 'genre'] = 0
#     elif row['genre'] == 'classic pop and rock':
#         predictatedClusters.at[index, 'genre'] = 1
#     elif row['genre'] == 'soul and reggae':
#         predictatedClusters.at[index, 'genre'] = 2
#     elif row['genre'] == 'punk':
#         predictatedClusters.at[index, 'genre'] = 3
#     elif row['genre'] == 'classical':
#         predictatedClusters.at[index, 'genre'] = 4
#     elif row['genre'] == 'folk':
#         predictatedClusters.at[index, 'genre'] = 5
#     elif row['genre'] == 'jazz and blues':
#         predictatedClusters.at[index, 'genre'] = 6
#     elif row['genre'] == 'metal':
#         predictatedClusters.at[index, 'genre'] = 7
#     elif row['genre'] == 'hip-hop':
#         predictatedClusters.at[index, 'genre'] = 8
#     elif row['genre'] == 'pop':
#         predictatedClusters.at[index, 'genre'] = 9

# punkSong = punk.iloc[11]
# punkSongTimbre = punkSong[timbreFeaturesAndGenre]
# noGenre = punkSongTimbre[:-1]
# print(noGenre)
# [noGenre[i:i+1] for i in range(0, len(noGenre), 1)]
# #noGenre.reshape(-1,1)
# print(noGenre)

# kMeans.predict(noGenre)

# classicPopAndRock.name = 'classicPopAndRock'
# punk.name = 'punk'
# folk.name = 'folk'
# pop.name = 'pop'
# danceAndElectronica.name = 'danceAndElectronica'
# jazz.name = 'jazz'
# classical.name = 'classical'
# hipHop.name = 'hipHop'
# soulAndReggae.name = 'soulAndReggae'

# def extractTimbreMeans(df):
#     avgTimbreDf = df[['avg_timbre1','avg_timbre2','avg_timbre3','avg_timbre4',
#                       'avg_timbre5','avg_timbre6','avg_timbre7','avg_timbre8',
#                       'avg_timbre9','avg_timbre10','avg_timbre11','avg_timbre12']]
#     df['avgTimbreMean'] = avgTimbreDf.sum(axis=1)
#     varTimbreDf = df[['var_timbre1','var_timbre2','var_timbre3','var_timbre4',
#                       'var_timbre5','var_timbre6','var_timbre7','var_timbre8',
#                       'var_timbre9','var_timbre10','var_timbre11','var_timbre12']]
#     df['varTimbreMean'] = varTimbreDf.sum(axis=1)
#     return df

# def extractVarTimbre(df):
#     varTimbreDf = df[['var_timbre1','var_timbre2','var_timbre3','var_timbre4',
#                       'var_timbre5','var_timbre6','var_timbre7','var_timbre8',
#                       'var_timbre9','var_timbre10','var_timbre11','var_timbre12']]
#     df['mean'] = varTimbreDf.mean(axis=1)
#     return df

# classicPopAndRockAvgTimbre = extractAvgTimbre(classicPopAndRock)
# punkAvgTimbre = extractAvgTimbre(punk)
# folkAvgTimbre = extractAvgTimbre(folk)
# popAvgTimbre = extractAvgTimbre(classicPopAndRock)
# danceAndElectronicaAvgTimbre = extractAvgTimbre(danceAndElectronica)
# metalAvgTimbre = extractAvgTimbre(metal)
# jazzAvgTimbre = extractAvgTimbre(jazz)
# classicalAvgTimbre = extractAvgTimbre(classical)
# hipHopAvgTimbre = extractAvgTimbre(hipHop)
# soulAndReggaeAvgTimbre = extractAvgTimbre(soulAndReggae)

# classicPopAndRockVarTimbre = extractAvgTimbre(classicPopAndRock)
# punkVarTimbre = extractVarTimbre(punk)
# folkVarTimbre = extractVarTimbre(folk)
# popVarTimbre = extractVarTimbre(classicPopAndRock)
# danceAndElectronicaVarTimbre = extractVarTimbre(danceAndElectronica)
# metalVarTimbre = extractVarTimbre(metal)
# jazzVarTimbre = extractVarTimbre(jazz)
# classicalVarTimbre = extractVarTimbre(classical)
# hipHopVarTimbre = extractVarTimbre(hipHop)
# soulAndReggaeVarTimbre = extractVarTimbre(soulAndReggae)

# fig = plt.figure(figsize=(12, 9))
# ax = Axes3D(fig, elev=21, azim=-136)
# ax.scatter(dataframe['Top_Tail'], dataframe['Middle_Segment'], dataframe['Bottom_Tail'], 
#            c=labels.astype(np.float))
# ax.set_xlabel('TopTail')
# ax.set_ylabel('MiddleSegment')
# ax.set_zlabel('BottomTail')
# plt.show()

# fig = px.scatter(avgTimbreSample, x=avgTimbre1, y=avgTimbre2, color='genre')
# fig.show()

# fig = px.box(avgTimbreSample, x="genre", y=avgTimbre1)
# fig.show()