In [1]:
import numpy as np
import pandas as pd
from nltk import ConfusionMatrix
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Shared transformer instances; later cells re-fit them on each dataset
# through fit_transform, so they carry no state worth keeping between cells.
label_encoder = LabelEncoder()
min_max_scaler = MinMaxScaler()

In [2]:
# Load the full song list; the path is relative, so the JSON file must sit in
# the notebook's working directory.
genres_dataset = pd.read_json('MasterSongList.json')

In [3]:
# Preview the first five rows of the raw dataset.
genres_dataset.head(5)


Unnamed: 0,_id,album,artist,audio_features,context,decades,genres,lyrics_features,moods,name,new_context,picture,recording_id,sub_context,yt_id,yt_views
0,{'$oid': '52fdfb440b9398049f3d7a8c'},Gangnam Style (강남스타일),PSY,"[11, 0.912744, 0.083704, 132.069, 0.293137, 0....",[work out],[],[pop],"[oppa, gangnam, style, gangnam, style, najeneu...","[energetic, motivational]",Gangnam Style (강남스타일),work out,http://images.musicnet.com/albums/073/463/405/...,50232.0,[working out: cardio],9bZkp7q19f0,2450112089
1,{'$oid': '52fdfb3d0b9398049f3cbc8e'},Native,OneRepublic,"[6, 0.7457039999999999, 0.11995499999999999, 1...",[energetic],[2012],[pop],"[lately, i, ve, been, i, ve, been, losing, sle...",[happy],Counting Stars,energetic,http://images.musicnet.com/albums/081/851/887/...,5839.0,[energy boost],hT_nvWreIhg,1020297206
2,{'$oid': '52fdfb420b9398049f3d3ea5'},Party Rock Anthem,LMFAO,"[5, 0.709932, 0.231455, 130.03, 0.121740999999...","[energetic, energetic, energetic, energetic]",[],[],"[party, rock, yeah, woo, let, s, go, party, ro...","[happy, celebratory, rowdy]",Party Rock Anthem,housework,http://images.musicnet.com/albums/049/414/127/...,52379.0,"[energy boost, pleasing a crowd, housework, dr...",KQ6zr6kCPj8,971128436
3,{'$oid': '52fdfb410b9398049f3d1eac'},Gentleman,PSY,"[3, 0.705822, 0.053292, 126.009, 0.126016, 0.0...","[party, party, party, party, party, party]",[2010s],[dance],"[alagamun, lan, weh, wakun, heya, hanun, gon, ...","[happy, energetic, celebratory]",Gentleman,energetic,http://images.musicnet.com/albums/082/950/461/...,12353.0,"[driving in the left lane, energy boost, girls...",ASO_zypdnsQ,892096527
4,{'$oid': '52fdfb400b9398049f3d0b19'},On The Floor,Jennifer Lopez,"[3, 0.741757, 0.07277399999999999, 129.985, 0....","[party, party]",[2000s],[reggaeton],"[j, lo, the, other, side, out, my, mine, it, s...",[energetic],On The Floor,work out,http://images.musicnet.com/albums/050/131/765/...,29502.0,"[working out: cardio, dance party: sweaty]",t4H_Zoh7G5A,873285189


In [4]:
## Preprocess the data and consolidate the genre labels

def preprocess_data(random_state=101):
    """Normalise the genre labels and build the sampled four-genre working set.

    Rewrites ``genres_dataset['genres']`` in place: each list of genre strings
    is concatenated into a single string, which ``consolidate_genre`` then
    reduces to the text before the first ``':'``.  Fixed-size random samples of
    the dance, jazz, country and rock rows are concatenated (in that order)
    into the module-level ``filtered_genres_dataset``.

    Args:
        random_state: Seed forwarded to every ``DataFrame.sample`` call so that
            re-running the notebook selects the same rows.  Pass ``None`` to
            draw a fresh random sample on each call.

    Raises:
        ValueError: Raised by ``DataFrame.sample`` when a genre has fewer rows
            than the requested sample size.
    """
    global filtered_genres_dataset

    # NOTE(review): ''.join concatenates multi-genre lists without a separator,
    # so a row tagged ['dance', 'pop'] becomes 'dancepop' and is not picked up
    # by the genre filters below -- confirm that excluding such rows is intended.
    genres_dataset.loc[:, 'genres'] = genres_dataset['genres'].apply(''.join)
    genres_dataset.loc[:, 'genres'] = genres_dataset['genres'].apply(consolidate_genre)

    # (genre, sample size) pairs.  The last entry is rock; it used to be bound
    # to a variable misleadingly called `pop_dataset`.
    sample_sizes = (('dance', 2000), ('jazz', 2000), ('country', 1200), ('rock', 2000))
    filtered_genres_dataset = pd.concat([
        genres_dataset.loc[genres_dataset['genres'] == genre].sample(
            n=size, random_state=random_state)
        for genre, size in sample_sizes
    ])
                             
def consolidate_genre(genre):
    """Return the top-level genre: the part of ``genre`` before the first ':'.

    An empty value is handed back unchanged; a value without ':' is returned
    whole.
    """
    if len(genre) == 0:
        return genre
    top_level, _, _ = genre.partition(':')
    return top_level
    
# Run the preprocessing; this populates the global `filtered_genres_dataset`
# that the following cells read.
preprocess_data()


In [5]:
# Preview the first five rows of the sampled four-genre dataset.
filtered_genres_dataset.head(5)

Unnamed: 0,_id,album,artist,audio_features,context,decades,genres,lyrics_features,moods,name,new_context,picture,recording_id,sub_context,yt_id,yt_views
17237,{'$oid': '52fdfb460b9398049f3da219'},Versatile 98,I:Cube,"[1, 0.796964, 0.053065999999999995, 129.235, 0...","[party, party, party, party]",['00s electronic],dance,[],"[energetic, visceral]",Disco Cubizm (Daft Punk Remix),coding,http://images.musicnet.com/albums/048/911/063/...,48043.0,"[driving in the left lane, dance party: sweaty...",2uKja4Ij75Y,82725
18053,{'$oid': '52fdfb430b9398049f3d5fff'},"Factory Records: Communications, 1978-1992",Quando Quango,"[11, 0.745764, 0.08394399999999999, 120.373, 0...",[energetic],[1970s],dance,[],[energetic],Love Tempo,,http://images.musicnet.com/albums/031/288/965/...,68539.0,,xGil0GvMWUo,69577
16557,{'$oid': '52fdfb420b9398049f3d3401'},Mädchen In Uniform,Nachtmahr,"[11, 0.9443600000000001, 0.614977, 160.006, 0....",[coding],[2000s],dance,[],"[energetic, cold, visceral]",Mädchen In Unform (Faderhead Remix),coding,http://images.musicnet.com/albums/036/735/947/...,,[coding],AGVgOKPqch0,95739
4005,{'$oid': '52fdfb3e0b9398049f3ccdff'},Platinum & Gold Collection: The Best Of Real M...,Real McCoy,"[1, 0.9238609999999999, 0.08168299999999999, 1...","[party, party]",[1990s],dance,"[runaway, runaway, runaway, and, save, your, l...","[energetic, celebratory]",Run Away (Single Version),party,http://images.musicnet.com/albums/028/777/195/...,4413.0,"[girls night out, dance party: fun & funky]",jnM67j9So8w,3127898
31371,{'$oid': '52fdfb3d0b9398049f3cc38e'},"Hits, Remixes And Rarities: The Warner Brother...",Ashford & Simpson,"[3, 0.665794, 0.072278, 122.837, 0.04891099999...",[housework],[1970s],dance,"[found, a, cure, ooh, for, your, heartache, be...","[celebratory, campy, lush]",Found A Cure (12-inch Disco Mix),housework,http://images.musicnet.com/albums/013/006/127/...,54039.0,[housework],rdq3BZfnxmU,3161


In [6]:
### Build the feature frame: one row per song, one column per feature
# Names for the positional entries of each song's `audio_features` list,
# followed by the two extra columns appended below.
audio_feature_headers = ['key',
                         'energy',
                         'liveliness',
                         'tempo',
                         'speechiness',
                         'acousticness',
                         'instrumentalness',
                         'time_signature',
                         'duration',
                         'loudness',
                         'valence',
                         'danceability',
                         'mode',
                         'time_signature_confidence',
                         'tempo_confidence',
                         'key_confidence',
                         'mode_confidence',
                         'moods',
                         'new_context',
                        ]

# Build a *new* list per song.  Appending to the lists stored in the DataFrame
# would mutate the source data in place, and re-running this cell would then
# append `moods`/`new_context` a second time and break the column count.
audio_features_list = [
    list(audio_feature) + [mood, context]
    for audio_feature, mood, context in zip(
        filtered_genres_dataset['audio_features'],
        filtered_genres_dataset['moods'],
        filtered_genres_dataset['new_context'],
    )
]

df_features = pd.DataFrame(audio_features_list, columns=audio_feature_headers)

In [7]:
# df_features has a fresh 0..n-1 index while filtered_genres_dataset keeps the
# sampled row labels, so assign the genres by position (raw values) instead of
# by index alignment.  Both frames hold the same rows in the same order.
# This replaces an in-place reset_index on the column Series, which only works
# when pandas hands back that same cached Series on the next column access.
df_features['genres'] = filtered_genres_dataset['genres'].values


In [8]:
# Drop every row that has a missing value in any column rather than imputing it.
# The frame is modified in place, so the row index keeps gaps afterwards.
df_features.dropna(inplace=True)

In [9]:
# Two dance-vs-jazz datasets, both scored later with a hold-out split and
# 10-fold cross-validation:
#   * two_genres    -- balanced:   1200 dance + 1200 jazz rows
#   * ub_two_genres -- unbalanced: 1200 dance +  600 jazz rows
# Sampling is seeded (same seed as the train/test split) so the notebook is
# reproducible; as a consequence both datasets share the same dance rows.

two_genres_dance_type = df_features.loc[df_features['genres'] == 'dance'].sample(n=1200, random_state=101)
two_genres_jazz_type = df_features.loc[df_features['genres'] == 'jazz'].sample(n=1200, random_state=101)
two_genres = pd.concat([two_genres_dance_type, two_genres_jazz_type])

ub_two_genres_dance_type = df_features.loc[df_features['genres'] == 'dance'].sample(n=1200, random_state=101)
ub_two_genres_jazz_type = df_features.loc[df_features['genres'] == 'jazz'].sample(n=600, random_state=101)
ub_two_genres = pd.concat([ub_two_genres_dance_type, ub_two_genres_jazz_type])

In [10]:
## Encode the categorical columns as integers and keep the target labels
# The shared encoder is re-fitted for every (frame, column) pair, so the integer
# codes are not comparable across frames or columns.
for frame in (two_genres, ub_two_genres):
    for column in ('moods', 'new_context'):
        frame[column] = label_encoder.fit_transform(frame[column])

two_genres_labels = two_genres['genres']
ub_two_genres_labels = ub_two_genres['genres']

In [11]:
# Drop the target column and the audio features left out of the two-genre
# models; the same columns are removed from both frames.
two_genre_unused_columns = [
    'genres',
    'mode_confidence',
    'time_signature',
    'duration',
    'time_signature_confidence',
    'key',
    'tempo',
    'key_confidence',
    'tempo_confidence',
    'liveliness',
    'loudness',
    'instrumentalness',
]

two_genres = two_genres.drop(two_genre_unused_columns, axis=1)
ub_two_genres = ub_two_genres.drop(two_genre_unused_columns, axis=1)

In [13]:
# Min-max scale every remaining feature; the shared scaler is re-fitted for
# each frame (balanced first, then unbalanced).
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed in the modelling cells, so test rows influence the scaling.
two_genres_features, ub_two_genres_features = (
    min_max_scaler.fit_transform(frame) for frame in (two_genres, ub_two_genres)
)

two_genres_df = pd.DataFrame(data=two_genres_features, columns=two_genres.columns)
ub_two_genres_df = pd.DataFrame(data=ub_two_genres_features, columns=ub_two_genres.columns)

In [14]:
# cross_val_score is imported from sklearn.model_selection: the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbour classifier.
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30,
           metric_params=None, n_jobs=1, n_neighbors=3, p=2)

# Hold out 33% of the unbalanced two-genre data; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(ub_two_genres_df,
                                                    ub_two_genres_labels,
                                                    test_size=0.33,
                                                    random_state=101)

# `pred` and `y_test` feed the metric cells that follow.
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

# Cell output: mean score over 10-fold cross-validation on the full (unsplit) data.
np.mean(cross_val_score(knn, ub_two_genres_df, ub_two_genres_labels, cv=10))



0.97277777777777763

In [15]:
# Accuracy of the KNN predictions on the held-out split from the preceding cell.
print(accuracy_score(y_test, pred))

0.96632996633


In [16]:
# Per-class precision, recall and F1 for the same held-out predictions.
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

      dance       0.97      0.97      0.97       400
       jazz       0.95      0.95      0.95       194

avg / total       0.97      0.97      0.97       594



## Unbalanced data can affect the score, IMHO. Here, however, the imbalance did not change the score significantly. Also, because each dataset is a random sample, there will be some deviation between datasets.

In [17]:
# cross_val_score is imported from sklearn.model_selection: the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbour classifier.
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30,
           metric_params=None, n_jobs=1, n_neighbors=3, p=2)

# Hold out 33% of the balanced two-genre data; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(two_genres_df,
                                                    two_genres_labels,
                                                    test_size=0.33,
                                                    random_state=101)

# `pred` and `y_test` feed the metric cells that follow.
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

# Cell output: mean score over 10-fold cross-validation on the full (unsplit) data.
np.mean(cross_val_score(knn, two_genres_df, two_genres_labels, cv=10))

0.97041666666666659

In [18]:
# nltk confusion matrix: true genres (y_test) as reference, predictions as test.
print(ConfusionMatrix(list(y_test), list(pred)))

      |   d     |
      |   a   j |
      |   n   a |
      |   c   z |
      |   e   z |
------+---------+
dance |<394> 12 |
 jazz |  14<372>|
------+---------+
(row = reference; col = test)



In [19]:
# Accuracy of the KNN predictions on the held-out balanced two-genre split.
print(accuracy_score(y_test, pred))

0.967171717172


In [20]:
# Per-class precision, recall and F1 for the same held-out predictions.
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

      dance       0.97      0.97      0.97       406
       jazz       0.97      0.96      0.97       386

avg / total       0.97      0.97      0.97       792



In [21]:
# Unbalanced three-genre dataset: 1200 dance + 1200 jazz + 600 country rows,
# scored later with a hold-out split and 10-fold cross-validation.
# Sampling is seeded (same seed as the train/test split) for reproducibility.
three_genres_dance_type = df_features.loc[df_features['genres'] == 'dance'].sample(n=1200, random_state=101)
three_genres_jazz_type = df_features.loc[df_features['genres'] == 'jazz'].sample(n=1200, random_state=101)
three_genres_country_type = df_features.loc[(df_features['genres'] == 'country')].sample(n=600, random_state=101)
three_genres = pd.concat([three_genres_dance_type, three_genres_jazz_type, three_genres_country_type])

In [22]:
# Encode the categorical columns as integers (encoder re-fitted per column)
# and keep the target labels before the `genres` column is dropped.
for column in ('moods', 'new_context'):
    three_genres[column] = label_encoder.fit_transform(three_genres[column])

three_genres_labels = three_genres['genres']

In [23]:
# Drop the target column and the audio features left out of the three-genre
# model (unlike the two-genre frames, `loudness` is kept here).
three_genre_unused_columns = [
    'genres',
    'mode_confidence',
    'time_signature',
    'duration',
    'time_signature_confidence',
    'key',
    'tempo',
    'key_confidence',
    'tempo_confidence',
    'liveliness',
    'instrumentalness',
]
three_genres = three_genres.drop(three_genre_unused_columns, axis=1)

In [24]:
# Min-max scale the three-genre features (scaler re-fitted on this frame) and
# wrap the resulting array back into a DataFrame with the original column names.
three_genres_features = min_max_scaler.fit_transform(three_genres)
three_genres_df = pd.DataFrame(data=three_genres_features,
                               columns=three_genres.columns)

In [25]:
# cross_val_score is imported from sklearn.model_selection: the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbour classifier.
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30,
           metric_params=None, n_jobs=1, n_neighbors=3, p=2)

# Hold out 33% of the three-genre data; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(three_genres_df,
                                                    three_genres_labels,
                                                    test_size=0.33,
                                                    random_state=101)

# `pred` and `y_test` feed the metric cells that follow.
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

# Cell output: mean score over 10-fold cross-validation on the full (unsplit) data.
np.mean(cross_val_score(knn, three_genres_df, three_genres_labels, cv=10))

0.93800000000000006

In [26]:
# nltk confusion matrix: true genres (y_test) as reference, predictions as test.
print(ConfusionMatrix(list(y_test), list(pred)))

        |   c         |
        |   o         |
        |   u   d     |
        |   n   a   j |
        |   t   n   a |
        |   r   c   z |
        |   y   e   z |
--------+-------------+
country |<180>  3   6 |
  dance |  14<390> 12 |
   jazz |   7  11<367>|
--------+-------------+
(row = reference; col = test)



In [27]:
# Accuracy of the KNN predictions on the held-out three-genre split.
print(accuracy_score(y_test, pred))

0.946464646465


In [28]:
# Per-class precision, recall and F1 for the same held-out predictions.
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

    country       0.90      0.95      0.92       189
      dance       0.97      0.94      0.95       416
       jazz       0.95      0.95      0.95       385

avg / total       0.95      0.95      0.95       990



In [29]:
# Encode the categorical columns of the full four-genre frame (encoder
# re-fitted per column) and keep the target labels.
# NOTE: this cell overwrites df_features; once it has run, the earlier sampling
# cells (which read df_features['genres']) need df_features to be rebuilt first.
for column in ('moods', 'new_context'):
    df_features[column] = label_encoder.fit_transform(df_features[column])
labels_final = df_features['genres']

# Drop the target column and the audio features left out of the four-genre model.
four_genre_unused_columns = [
    'genres',
    'mode_confidence',
    'time_signature',
    'duration',
    'time_signature_confidence',
    'key',
    'tempo',
    'key_confidence',
    'tempo_confidence',
    'liveliness',
    'valence',
]
df_features = df_features.drop(four_genre_unused_columns, axis=1)

In [30]:
# Min-max scale the four-genre features (scaler re-fitted on this frame) and
# wrap the resulting array back into a DataFrame with the original column names.
df_scaled_features = min_max_scaler.fit_transform(df_features)
df_features_final = pd.DataFrame(data=df_scaled_features,
                                 columns=df_features.columns)

In [31]:
# cross_val_score is imported from sklearn.model_selection: the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbour classifier.
knn = KNeighborsClassifier(algorithm='auto', leaf_size=30,
           metric_params=None, n_jobs=1, n_neighbors=3, p=2)

# Hold out 33% of the full four-genre data; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(df_features_final,
                                                    labels_final,
                                                    test_size=0.33,
                                                    random_state=101)

# `pred` and `y_test` feed the metric cells that follow.
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

# Cell output: mean score over 10-fold cross-validation on the full (unsplit) data.
np.mean(cross_val_score(knn, df_features_final, labels_final, cv=10))

0.84626812202818602

In [32]:
# Per-class precision, recall and F1 on the held-out four-genre split.
print(classification_report(y_test, pred))

             precision    recall  f1-score   support

    country       0.67      0.85      0.75       208
      dance       0.88      0.90      0.89       466
       jazz       0.93      0.92      0.92       521
       rock       0.81      0.68      0.74       359

avg / total       0.85      0.85      0.85      1554



In [33]:
# Accuracy of the KNN predictions on the held-out four-genre split.
print(accuracy_score(y_test, pred))

0.848133848134


In [None]:
### 