In [104]:
import pandas as pd
import plotly.express as px
import operator
from sklearn.utils import shuffle
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix

pd.options.mode.chained_assignment = None  # default='warn'

dataset = pd.read_csv('msd_genre_dataset0.csv', header=0)

classicPopAndRock = dataset.iloc[0:23895]
punk = dataset.iloc[23895:27095]
folk = dataset.iloc[27095:40287]
pop = dataset.iloc[40287:41904]
danceAndElectronica = dataset.iloc[41904:46839]
metal = dataset.iloc[46839:48942]
jazz = dataset.iloc[48942:53276]
classical = dataset.iloc[53276:55150]
hipHop = dataset.iloc[55150:55584]
soulAndReggae = dataset.iloc[55584:59600]

timbreFeaturesAndGenre = ['avg_timbre1','avg_timbre2','avg_timbre3','avg_timbre4', 'avg_timbre5',
                            'avg_timbre6','avg_timbre7','avg_timbre8', 'avg_timbre9','avg_timbre10',
                            'avg_timbre11','avg_timbre12', 'var_timbre1', 'var_timbre2', 'var_timbre3',
                            'var_timbre4', 'var_timbre5','var_timbre6','var_timbre7','var_timbre8',
                            'var_timbre9','var_timbre10','var_timbre11','var_timbre12', 'genre']

datasetFrames = [classicPopAndRock.head(10), punk.head(10), folk.head(10), pop.head(10),
                 danceAndElectronica.head(10), metal.head(10), jazz.head(10), classical.head(10),
                 hipHop.head(10), soulAndReggae.head(10)]

datasetSample = pd.concat(datasetFrames)
shuffledDatasetSample = shuffle(datasetSample)

featureVector = shuffledDatasetSample[timbreFeaturesAndGenre]

kMeans = KMeans(n_clusters=10)
timbreFeatures = timbreFeaturesAndGenre[:-1]
kMeans.fit(featureVector[timbreFeatures])
featureVector.loc[:, 'genre'] = kMeans.labels_

In [112]:
def showGenreClusterDistribution(clusterGenreCorrelationDataframe):
    clusterDistribution = {}
    for index, row in clusterGenreCorrelationDataframe.iterrows():
        clusterNumber = row['genreN']
        songGenre = row['genreT']
        if songGenre in clusterDistribution:
            clusterDistribution[songGenre].append(clusterNumber)
        else:
            clusterDistribution[songGenre] = [clusterNumber]
    return clusterDistribution

def clusterGenrePopularity(clusterGenreDistribution):
    clusterLabels = {}
    counter = 0
    while counter < 10:
        clusterLabels[counter] = {}
        clusterLabels[counter]['pop'] = 0
        clusterLabels[counter]['dance and electronica'] = 0
        clusterLabels[counter]['punk'] = 0
        clusterLabels[counter]['soul and reggae'] = 0
        clusterLabels[counter]['folk'] = 0
        clusterLabels[counter]['metal'] = 0
        clusterLabels[counter]['hip-hop'] = 0
        clusterLabels[counter]['classic pop and rock'] = 0
        clusterLabels[counter]['classical'] = 0
        clusterLabels[counter]['jazz and blues'] = 0
        counter += 1
    counter = 0
    for genre in clusterGenreDistribution:
        clusterList = clusterGenreDistribution[genre]
        for cluster in clusterList:
            clusterLabels[int(cluster)][genre] += 1
    return clusterLabels

def createClusterLabels(clusterPopularity):
    clusterLabeled = {}
    for cluster in clusterPopularity:
        clusterLabeled[cluster] = max(clusterPopularity[cluster].items(), key=operator.itemgetter(1))[0]
    return clusterLabeled

def convertClustarLabels(dataframe, labelsDictionary):
    convertedDataframe = dataframe.replace(labelsDictionary)
    return convertedDataframe

genreNumbers = featureVector[['genre']]
genreTitles = shuffledDatasetSample[['genre']]
genreNumbers['genre'] = genreNumbers['genre'].astype(str)

clusterRelationshipDataframe = pd.DataFrame()
clusterRelationshipDataframe['genreN'] = genreNumbers['genre']
clusterRelationshipDataframe['genreT'] = genreTitles['genre']

songClusters = showGenreClusterDistribution(clusterRelationshipDataframe)

clusterDistribution = {}
clusterDistribution = clusterGenrePopularity(songClusters)

clusterLabelsDictionary = createClusterLabels(clusterDistribution)

convertedFeatureDataframe = convertClustarLabels(featureVector[['genre']], clusterLabelsDictionary)

for key in clusterLabelsDictionary:
    print(key, clusterLabelsDictionary[key])
print('\n')
confusionMatrix = pd.DataFrame(confusion_matrix(convertedFeatureDataframe[['genre']], shuffledDatasetSample[['genre']]))
print(confusionMatrix)

0 pop
1 jazz and blues
2 dance and electronica
3 classical
4 soul and reggae
5 dance and electronica
6 dance and electronica
7 pop
8 classic pop and rock
9 dance and electronica


   0  1  2  3  4  5  6   7  8   9
0  7  1  0  0  2  0  6   0  5   0
1  0  9  0  0  4  0  0   0  0   0
2  1  0  8  0  0  1  0   0  0   0
3  0  0  0  0  0  0  0   0  0   0
4  0  0  0  0  0  0  0   0  0   0
5  1  0  1  1  0  4  1   0  0   0
6  0  0  0  0  0  0  0   0  0   0
7  1  0  1  8  3  5  3  10  4   0
8  0  0  0  0  0  0  0   0  0   0
9  0  0  0  1  1  0  0   0  1  10


In [107]:

# def inspectClusters(relationshipDataframe):  
#     songClusters = showGenreClusterDistribution(relationshipDataframe)
#     print('Genre Cluster Locations:')
#     clusterDistribution = clusterTallyPerGenre(songClusters)
#     print('\n')
#     return clusterDistribution



# def clusterTallyPerGenre(genreSongsDistribution):
#     clusterMap = {}
#     counter = 0
#     while counter < 10:
#         clusterMap[counter] = {}
#         for key in genreSongsDistribution:
#             clusterList = list(genreSongsDistribution[key])
#             print(key, genreSongsDistribution[key])
#             for cluster in clusterList:
#                 if cluster in clusterMap[counter]:
#                     clusterMap[counter][key] += 1
#                 else:
#                     clusterMap[counter][key] = 1
#         counter += 1
#     return clusterMap

# def removeReservedClusters(clusteredIncorrectly, groupedClusters):
#     for clusterIdentifier in clusteredIncorrectly:
#         genreList = list(clusteredIncorrectly[clusterIdentifier])
#         counter = 0
#         while counter < len(genreList):
#             clusterNumberList = list(groupedClusters[genreList[counter]])
#             listCopy = clusterNumberList
#             while clusterIdentifier in clusterNumberList: clusterNumberList.remove(clusterIdentifier)
#             groupedClusters[genreList[counter]] = clusterNumberList
#             counter += 1
#     return groupedClusters


# print('Genre Cluster Locations:')
# clusterDistribution = clusterTallyPerGenre(songClusters)
# print('HEY', clusterDistribution)
# print('\n')
# print('Cluster Popularity:')
# clusterLabels = {}
# for key in clusterDistribution:
#     clusterLabels[key] = max(clusterDistribution[key], key=clusterDistribution[key].get)
# for key in clusterLabels:
#     print(key, clusterLabels[key])
# print('\n')

# print('Cluster Ties:')
# reverseClusterLabels = {}
# for key, value in clusterLabels.items():
#     reverseClusterLabels.setdefault(value, set()).add(key)
# clusterDuplicates = {}
# for key in reverseClusterLabels:
#     value = reverseClusterLabels[key]
#     if len(value) > 1:
#         clusterDuplicates[key] = value
# for key in clusterDuplicates:
#     print(key, clusterDuplicates[key])
# print('\n')

# genreTieBreaker = {}
# for clusterNumberLabel in clusterDuplicates:
#     clusterDuplicateList = list(clusterDuplicates[clusterNumberLabel])
#     maxGenre = clusterDuplicateList[0]
#     maximum = clusterDistribution[clusterDuplicateList[0]][clusterNumberLabel]
#     counter = 1
#     while counter < len(clusterDuplicateList):
#         if maximum < clusterDistribution[clusterDuplicateList[counter]][clusterNumberLabel]:
#             maximum = clusterDistribution[clusterDuplicateList[counter]][clusterNumberLabel]
#             maxGenre = clusterDuplicateList[counter]
#         counter += 1
#     clusterDuplicates[clusterNumberLabel].remove(maxGenre)
#     genreTieBreaker[maxGenre] = clusterNumberLabel

# print('tiebreaker', genreTieBreaker)
# print('clusterdup', clusterDuplicates)

# print('\n')
# updatedClusterLocations = removeReservedClusters(clusterDuplicates, songClusters)
# for key in updatedClusterLocations:
#     print(key, updatedClusterLocations[key])
# print('updated')
# for key in songClusters:
#     print(key, songClusters[key])

# for label in genreTieBreaker:
#     clusterLabels[label] = genreTieBreaker[label]
    

# confusionMatrix = pd.DataFrame(confusion_matrix(featureVector[['genre']], predictatedClusters[['genre']]))

# print(confusionMatrix)

In [61]:
# genre = ['classicPopAndRock', 'punk', 'folk', 'pop', 'danceAndElectronica', 'metal',
#          'jazz', 'classical', 'hipHop', 'soulAndReggae']

# predictatedClusters = shuffledDatasetSample[timbreFeaturesAndGenre]
# for index, row in predictatedClusters.iterrows():
#     if row['genre'] == 'dance and electronica':
#         predictatedClusters.at[index, 'genre'] = 0
#     elif row['genre'] == 'classic pop and rock':
#         predictatedClusters.at[index, 'genre'] = 1
#     elif row['genre'] == 'soul and reggae':
#         predictatedClusters.at[index, 'genre'] = 2
#     elif row['genre'] == 'punk':
#         predictatedClusters.at[index, 'genre'] = 3
#     elif row['genre'] == 'classical':
#         predictatedClusters.at[index, 'genre'] = 4
#     elif row['genre'] == 'folk':
#         predictatedClusters.at[index, 'genre'] = 5
#     elif row['genre'] == 'jazz and blues':
#         predictatedClusters.at[index, 'genre'] = 6
#     elif row['genre'] == 'metal':
#         predictatedClusters.at[index, 'genre'] = 7
#     elif row['genre'] == 'hip-hop':
#         predictatedClusters.at[index, 'genre'] = 8
#     elif row['genre'] == 'pop':
#         predictatedClusters.at[index, 'genre'] = 9

# punkSong = punk.iloc[11]
# punkSongTimbre = punkSong[timbreFeaturesAndGenre]
# noGenre = punkSongTimbre[:-1]
# print(noGenre)
# [noGenre[i:i+1] for i in range(0, len(noGenre), 1)]
# #noGenre.reshape(-1,1)
# print(noGenre)

# kMeans.predict(noGenre)

# classicPopAndRock.name = 'classicPopAndRock'
# punk.name = 'punk'
# folk.name = 'folk'
# pop.name = 'pop'
# danceAndElectronica.name = 'danceAndElectronica'
# jazz.name = 'jazz'
# classical.name = 'classical'
# hipHop.name = 'hipHop'
# soulAndReggae.name = 'soulAndReggae'

# def extractTimbreMeans(df):
#     avgTimbreDf = df[['avg_timbre1','avg_timbre2','avg_timbre3','avg_timbre4',
#                       'avg_timbre5','avg_timbre6','avg_timbre7','avg_timbre8',
#                       'avg_timbre9','avg_timbre10','avg_timbre11','avg_timbre12']]
#     df['avgTimbreMean'] = avgTimbreDf.sum(axis=1)
#     varTimbreDf = df[['var_timbre1','var_timbre2','var_timbre3','var_timbre4',
#                       'var_timbre5','var_timbre6','var_timbre7','var_timbre8',
#                       'var_timbre9','var_timbre10','var_timbre11','var_timbre12']]
#     df['varTimbreMean'] = varTimbreDf.sum(axis=1)
#     return df

# def extractVarTimbre(df):
#     varTimbreDf = df[['var_timbre1','var_timbre2','var_timbre3','var_timbre4',
#                       'var_timbre5','var_timbre6','var_timbre7','var_timbre8',
#                       'var_timbre9','var_timbre10','var_timbre11','var_timbre12']]
#     df['mean'] = varTimbreDf.mean(axis=1)
#     return df

# classicPopAndRockAvgTimbre = extractAvgTimbre(classicPopAndRock)
# punkAvgTimbre = extractAvgTimbre(punk)
# folkAvgTimbre = extractAvgTimbre(folk)
# popAvgTimbre = extractAvgTimbre(classicPopAndRock)
# danceAndElectronicaAvgTimbre = extractAvgTimbre(danceAndElectronica)
# metalAvgTimbre = extractAvgTimbre(metal)
# jazzAvgTimbre = extractAvgTimbre(jazz)
# classicalAvgTimbre = extractAvgTimbre(classical)
# hipHopAvgTimbre = extractAvgTimbre(hipHop)
# soulAndReggaeAvgTimbre = extractAvgTimbre(soulAndReggae)

# classicPopAndRockVarTimbre = extractAvgTimbre(classicPopAndRock)
# punkVarTimbre = extractVarTimbre(punk)
# folkVarTimbre = extractVarTimbre(folk)
# popVarTimbre = extractVarTimbre(classicPopAndRock)
# danceAndElectronicaVarTimbre = extractVarTimbre(danceAndElectronica)
# metalVarTimbre = extractVarTimbre(metal)
# jazzVarTimbre = extractVarTimbre(jazz)
# classicalVarTimbre = extractVarTimbre(classical)
# hipHopVarTimbre = extractVarTimbre(hipHop)
# soulAndReggaeVarTimbre = extractVarTimbre(soulAndReggae)

# fig = plt.figure(figsize=(12, 9))
# ax = Axes3D(fig, elev=21, azim=-136)
# ax.scatter(dataframe['Top_Tail'], dataframe['Middle_Segment'], dataframe['Bottom_Tail'], 
#            c=labels.astype(np.float))
# ax.set_xlabel('TopTail')
# ax.set_ylabel('MiddleSegment')
# ax.set_zlabel('BottomTail')
# plt.show()

# fig = px.scatter(avgTimbreSample, x=avgTimbre1, y=avgTimbre2, color='genre')
# fig.show()

# fig = px.box(avgTimbreSample, x="genre", y=avgTimbre1)
# fig.show()