In [23]:
import pandas as pd
import plotly.express as px
from sklearn.utils import shuffle
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix

pd.options.mode.chained_assignment = None  # default='warn'

dataset = pd.read_csv('msd_genre_dataset0.csv', header=0)

classicPopAndRock = dataset.iloc[0:23895]
punk = dataset.iloc[23895:27095]
folk = dataset.iloc[27095:40287]
pop = dataset.iloc[40287:41904]
danceAndElectronica = dataset.iloc[41904:46839]
metal = dataset.iloc[46839:48942]
jazz = dataset.iloc[48942:53276]
classical = dataset.iloc[53276:55150]
hipHop = dataset.iloc[55150:55584]
soulAndReggae = dataset.iloc[55584:59600]

timbreFeaturesAndGenre = ['avg_timbre1','avg_timbre2','avg_timbre3','avg_timbre4', 'avg_timbre5',
                            'avg_timbre6','avg_timbre7','avg_timbre8', 'avg_timbre9','avg_timbre10',
                            'avg_timbre11','avg_timbre12', 'var_timbre1', 'var_timbre2', 'var_timbre3',
                            'var_timbre4', 'var_timbre5','var_timbre6','var_timbre7','var_timbre8',
                            'var_timbre9','var_timbre10','var_timbre11','var_timbre12', 'genre']

datasetFrames = [classicPopAndRock.head(10), punk.head(10), folk.head(10), pop.head(10),
                 danceAndElectronica.head(10), metal.head(10), jazz.head(10), classical.head(10),
                 hipHop.head(10), soulAndReggae.head(10)]

datasetSample = pd.concat(datasetFrames)
shuffledDatasetSample = shuffle(datasetSample)

featureVector = shuffledDatasetSample[timbreFeaturesAndGenre]

kMeans = KMeans(n_clusters=10)
timbreFeatures = timbreFeaturesAndGenre[:-1]
kMeans.fit(featureVector[timbreFeatures])
featureVector.loc[:, 'genre'] = kMeans.labels_

In [125]:
genreNumbers = featureVector[['genre']]
genreTitles = shuffledDatasetSample[['genre']]
genreNumbers['genre'] = genreNumbers['genre'].astype(str)

newDf = pd.DataFrame()
newDf['genreN'] = genreNumbers['genre']
newDf['genreT'] = genreTitles['genre']

def inspectClusters():  
    songClusters = {}
    clustersSize = {}
    for index, row in newDf.iterrows():
        clusterNumber = row['genreN']
        songGenre = row['genreT']
        if clusterNumber in clustersSize:
            clustersSize[clusterNumber] += 1
        else:
            clustersSize[clusterNumber] = 1
        if songGenre in songClusters:
            songClusters[songGenre].append(clusterNumber)
        else:
            songClusters[songGenre] = [clusterNumber]
    print('Genre Cluster Locations:')
    for key in songClusters:
        print(key, songClusters[key])
        for value in songClusters[key]:
            if value in clusterMapping[key]:
                clusterMapping[key][value] += 1
            else:
                clusterMapping[key][value] = 1
    print('\n')
#     print('Cluster Size')
#     for key in clustersSize:
#         print(key, clustersSize[key])
#     print('\n')

clusterMapping = {}
clusterMapping['pop'] = {}
clusterMapping['dance and electronica'] = {}
clusterMapping['punk'] = {}
clusterMapping['soul and reggae'] = {}
clusterMapping['folk'] = {}
clusterMapping['metal'] = {}
clusterMapping['hip-hop'] = {}
clusterMapping['classic pop and rock'] = {}
clusterMapping['classical'] = {}
clusterMapping['jazz and blues'] = {}

inspectClusters()
print('Cluster Popularity:')
clusterLabels = {}
for key in clusterMapping:
    clusterLabels[key] = max(clusterMapping[key], key=clusterMapping[key].get)
for key in clusterLabels:
    print(key, clusterLabels[key])
print('\n')

print('Cluster Ties:')
reverseClusterLabels = {}
for key, value in clusterLabels.items():
    reverseClusterLabels.setdefault(value, set()).add(key)
clusterDuplicates = {}
for key in reverseClusterLabels:
    value = reverseClusterLabels[key]
    if len(value) > 1:
        clusterDuplicates[key] = value
for key in clusterDuplicates:
    print(key, clusterDuplicates[key])
print('\n')

genreTieBreaker = {}
for key in clusterDuplicates:
    print('Cluster in question:', key)
    clusterDuplicateList = list(clusterDuplicates[key])
    print('Number of tied genres in ', clusterDuplicateList, ':', len(clusterDuplicateList))
    genreTieBreaker[clusterDuplicateList[0]] = clusterMapping[clusterDuplicateList[0]][key]
    counter = 1
    while (counter < len(clusterDuplicateList)):
        print('current genre is:', clusterDuplicateList[counter-1])
        print('with', genreTieBreaker[clusterDuplicateList[counter-1]], 'songs in the cluster')
        print('next genre is:', clusterDuplicateList[counter])
        print('with', genreTieBreaker[clusterDuplicateList[counter]], 'songs in the cluster')
        if genreTieBreaker[clusterDuplicateList[counter-1]] < clusterMapping[clusterDuplicateList[counter]][key]:
            genreTieBreaker[clusterDuplicateList[counter]] = clusterMapping[clusterDuplicateList[counter]][key]
        counter += 1
        
confusionMatrix = pd.DataFrame(confusion_matrix(featureVector[['genre']], predictatedClusters[['genre']]))

print(confusionMatrix)

Genre Cluster Locations:
metal ['8', '5', '0', '5', '5', '5', '1', '5', '5', '0']
classical ['5', '7', '7', '7', '7', '8', '7', '7', '7', '7']
jazz and blues ['8', '0', '1', '2', '1', '1', '1', '0', '0', '2']
folk ['1', '0', '0', '8', '8', '0', '0', '4', '0', '0']
punk ['0', '0', '5', '0', '8', '4', '5', '5', '5', '5']
pop ['0', '0', '0', '0', '0', '0', '0', '0', '0', '0']
classic pop and rock ['5', '0', '5', '9', '5', '1', '5', '5', '5', '5']
hip-hop ['5', '0', '0', '7', '7', '8', '7', '0', '5', '7']
dance and electronica ['1', '2', '9', '3', '6', '8', '6', '2', '2', '6']
soul and reggae ['4', '4', '4', '4', '4', '4', '4', '4', '4', '4']


Cluster Popularity:
pop 0
dance and electronica 2
punk 5
soul and reggae 4
folk 0
metal 5
hip-hop 7
classic pop and rock 5
classical 7
jazz and blues 1


Cluster Ties:
0 {'pop', 'folk'}
5 {'classic pop and rock', 'punk', 'metal'}
7 {'classical', 'hip-hop'}


Cluster in question: 0
Number of tied genres in  ['pop', 'folk'] : 2
current genre is: pop
w

KeyError: 'folk'

In [61]:
# genre = ['classicPopAndRock', 'punk', 'folk', 'pop', 'danceAndElectronica', 'metal',
#          'jazz', 'classical', 'hipHop', 'soulAndReggae']

# predictatedClusters = shuffledDatasetSample[timbreFeaturesAndGenre]
# for index, row in predictatedClusters.iterrows():
#     if row['genre'] == 'dance and electronica':
#         predictatedClusters.at[index, 'genre'] = 0
#     elif row['genre'] == 'classic pop and rock':
#         predictatedClusters.at[index, 'genre'] = 1
#     elif row['genre'] == 'soul and reggae':
#         predictatedClusters.at[index, 'genre'] = 2
#     elif row['genre'] == 'punk':
#         predictatedClusters.at[index, 'genre'] = 3
#     elif row['genre'] == 'classical':
#         predictatedClusters.at[index, 'genre'] = 4
#     elif row['genre'] == 'folk':
#         predictatedClusters.at[index, 'genre'] = 5
#     elif row['genre'] == 'jazz and blues':
#         predictatedClusters.at[index, 'genre'] = 6
#     elif row['genre'] == 'metal':
#         predictatedClusters.at[index, 'genre'] = 7
#     elif row['genre'] == 'hip-hop':
#         predictatedClusters.at[index, 'genre'] = 8
#     elif row['genre'] == 'pop':
#         predictatedClusters.at[index, 'genre'] = 9

# punkSong = punk.iloc[11]
# punkSongTimbre = punkSong[timbreFeaturesAndGenre]
# noGenre = punkSongTimbre[:-1]
# print(noGenre)
# [noGenre[i:i+1] for i in range(0, len(noGenre), 1)]
# #noGenre.reshape(-1,1)
# print(noGenre)

# kMeans.predict(noGenre)

# classicPopAndRock.name = 'classicPopAndRock'
# punk.name = 'punk'
# folk.name = 'folk'
# pop.name = 'pop'
# danceAndElectronica.name = 'danceAndElectronica'
# jazz.name = 'jazz'
# classical.name = 'classical'
# hipHop.name = 'hipHop'
# soulAndReggae.name = 'soulAndReggae'

# def extractTimbreMeans(df):
#     avgTimbreDf = df[['avg_timbre1','avg_timbre2','avg_timbre3','avg_timbre4',
#                       'avg_timbre5','avg_timbre6','avg_timbre7','avg_timbre8',
#                       'avg_timbre9','avg_timbre10','avg_timbre11','avg_timbre12']]
#     df['avgTimbreMean'] = avgTimbreDf.sum(axis=1)
#     varTimbreDf = df[['var_timbre1','var_timbre2','var_timbre3','var_timbre4',
#                       'var_timbre5','var_timbre6','var_timbre7','var_timbre8',
#                       'var_timbre9','var_timbre10','var_timbre11','var_timbre12']]
#     df['varTimbreMean'] = varTimbreDf.sum(axis=1)
#     return df

# def extractVarTimbre(df):
#     varTimbreDf = df[['var_timbre1','var_timbre2','var_timbre3','var_timbre4',
#                       'var_timbre5','var_timbre6','var_timbre7','var_timbre8',
#                       'var_timbre9','var_timbre10','var_timbre11','var_timbre12']]
#     df['mean'] = varTimbreDf.mean(axis=1)
#     return df

# classicPopAndRockAvgTimbre = extractAvgTimbre(classicPopAndRock)
# punkAvgTimbre = extractAvgTimbre(punk)
# folkAvgTimbre = extractAvgTimbre(folk)
# popAvgTimbre = extractAvgTimbre(classicPopAndRock)
# danceAndElectronicaAvgTimbre = extractAvgTimbre(danceAndElectronica)
# metalAvgTimbre = extractAvgTimbre(metal)
# jazzAvgTimbre = extractAvgTimbre(jazz)
# classicalAvgTimbre = extractAvgTimbre(classical)
# hipHopAvgTimbre = extractAvgTimbre(hipHop)
# soulAndReggaeAvgTimbre = extractAvgTimbre(soulAndReggae)

# classicPopAndRockVarTimbre = extractAvgTimbre(classicPopAndRock)
# punkVarTimbre = extractVarTimbre(punk)
# folkVarTimbre = extractVarTimbre(folk)
# popVarTimbre = extractVarTimbre(classicPopAndRock)
# danceAndElectronicaVarTimbre = extractVarTimbre(danceAndElectronica)
# metalVarTimbre = extractVarTimbre(metal)
# jazzVarTimbre = extractVarTimbre(jazz)
# classicalVarTimbre = extractVarTimbre(classical)
# hipHopVarTimbre = extractVarTimbre(hipHop)
# soulAndReggaeVarTimbre = extractVarTimbre(soulAndReggae)

# fig = plt.figure(figsize=(12, 9))
# ax = Axes3D(fig, elev=21, azim=-136)
# ax.scatter(dataframe['Top_Tail'], dataframe['Middle_Segment'], dataframe['Bottom_Tail'], 
#            c=labels.astype(np.float))
# ax.set_xlabel('TopTail')
# ax.set_ylabel('MiddleSegment')
# ax.set_zlabel('BottomTail')
# plt.show()

# fig = px.scatter(avgTimbreSample, x=avgTimbre1, y=avgTimbre2, color='genre')
# fig.show()

# fig = px.box(avgTimbreSample, x="genre", y=avgTimbre1)
# fig.show()