In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statistics import mode

# Define a function to load data and select specific columns
def load_and_select_columns(file_path, columns):
    """Read a CSV file, keeping only the requested columns.

    Parameters
    ----------
    file_path
        Location of the CSV file; passed straight to ``pd.read_csv``.
    columns
        Column labels to keep; passed to ``pd.read_csv`` as ``usecols``.

    Returns
    -------
    The DataFrame returned by ``pd.read_csv``.
    """
    return pd.read_csv(file_path, usecols=columns)

# Columns read from the reference songs file: identifiers, popularity,
# the audio features and, last, the genre label.
columns_to_use = [
    'track_id',
    'track_name',
    'track_artist',
    'track_popularity',
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
    'genre',
]
# Same columns plus the 'prediction' column, as a separate list object.
columns_to_use_2 = [*columns_to_use, 'prediction']

# Load the reference songs, restricted to the columns in columns_to_use
song_data = load_and_select_columns('../data/songs_data.csv', columns_to_use)

# Genre labels present in the reference songs. This is the cell's only
# rendered output: a notebook displays just the final expression of a cell,
# so a mid-cell song_data.head() would be evaluated and discarded.
song_data['genre'].unique()

array(['edm', 'rock', 'rap', 'latin', 'r&b', 'pop'], dtype=object)

In [60]:
# One sub-frame per genre, selected with a boolean mask on the 'genre' column
genre_column = song_data['genre']
edm_data = song_data.loc[genre_column == 'edm']
rock_data = song_data.loc[genre_column == 'rock']
rap_data = song_data.loc[genre_column == 'rap']
latin_data = song_data.loc[genre_column == 'latin']
rb_data = song_data.loc[genre_column == 'r&b']
pop_data = song_data.loc[genre_column == 'pop']

# Mean and standard deviation of every numeric column over all songs
global_mean, global_std = (
    song_data.mean(numeric_only=True),
    song_data.std(numeric_only=True),
)

# Mean of every numeric column within each genre
edm_mean, rock_mean, rap_mean, latin_mean, rb_mean, pop_mean = (
    genre_frame.mean(numeric_only=True)
    for genre_frame in (edm_data, rock_data, rap_data, latin_data, rb_data, pop_data)
)

In [61]:
# Standardise each genre mean against the whole data set:
# (genre mean - global mean) / global standard deviation
(normalized_edm, normalized_rock, normalized_rap,
 normalized_latin, normalized_rb, normalized_pop) = (
    (genre_mean - global_mean) / global_std
    for genre_mean in (edm_mean, rock_mean, rap_mean, latin_mean, rb_mean, pop_mean)
)

# The same standardised means as plain NumPy arrays, one per genre
(vector_edm, vector_rock, vector_rap,
 vector_latin, vector_rb, vector_pop) = (
    standardised.values
    for standardised in (normalized_edm, normalized_rock, normalized_rap,
                         normalized_latin, normalized_rb, normalized_pop)
)

In [62]:
# Load the input song. 'genre' is excluded by name rather than by position,
# so reordering columns_to_use cannot silently drop a different column.
input_columns = [column for column in columns_to_use if column != 'genre']
input_data = load_and_select_columns('../data/Input_Song.csv', input_columns)

# Normalize the input with the mean / standard deviation of the reference songs
normalized_input = (input_data.select_dtypes(include='number') - global_mean) / global_std
# The genre vectors are plain arrays ordered like global_mean, and they are
# compared position by position below. Put the input columns in that same
# order before dropping the labels, in case the input CSV stores its columns
# in a different order than the reference CSV.
vector_input = normalized_input.reindex(columns=global_mean.index).values

# Display name and vector of every genre, kept in matching order
display_names = ['EDM', 'Rock', 'Rap', 'Latin', 'R&B', 'Pop']
candidate_vectors = [vector_edm, vector_rock, vector_rap, vector_latin, vector_rb, vector_pop]

# Calculate magnitudes: distance from the input to each genre vector.
# NOTE(review): vector_input is 2-D (rows x features); with more than one row
# np.linalg.norm returns a single Frobenius norm over all rows, so this
# assumes Input_Song.csv holds one song - confirm.
magnitude_list = [
    np.linalg.norm(vector_input - candidate)
    for candidate in candidate_vectors
]

# Find the genre closest to the input song
min_mag_index = np.argmin(magnitude_list)
print("The song is ")
print(display_names[min_mag_index])

The song is 
Rap


In [63]:
# Load a list of input songs so that many of them can be tested at once
input_data = load_and_select_columns(
    file_path='../data/Input_Song_List.csv',
    columns=columns_to_use_2,
)
input_data.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,prediction
0,6ho0GyrWZN3mhi9zVRW7xi,Losing It,FISHER,79,edm,0.76,0.964,2,-5.844,1,0.0576,0.00182,0.7,0.0974,0.641,125.0,248036,none
1,4bYJKHG6KLdGzHQRJMaFc8,Rock And Roll All Nite,KISS,3,rock,0.581,0.831,1,-8.045,1,0.107,0.0458,0.0,0.0782,0.885,144.684,168160,none
2,1VGlMYqPO1LHyclvfIMvWI,Snälla bli min (Hjalm Remix),Robin Lukovic,0,edm,0.661,0.76,8,-4.769,1,0.0268,0.000406,0.0407,0.611,0.439,99.993,271238,none
3,6BbINUfGabVyiNFJpQXn3x,Wicked,Future,62,rap,0.718,0.679,11,-5.899,0,0.0706,0.00191,0.0104,0.385,0.205,157.97,173347,none
4,1Ly0ssAgeM7YqdHptao8Oe,Paranoid,Black Sabbath,15,rock,0.426,0.619,4,-11.501,0,0.0722,8.4e-05,0.00239,0.17,0.382,162.721,168440,none


In [64]:
# Standardise the numeric columns of the input songs with the mean and
# standard deviation computed over the reference songs
numeric_input = input_data.select_dtypes(include='number')
normalized_input = (numeric_input - global_mean) / global_std
normalized_input

Unnamed: 0,track_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,1.673124,0.731242,1.447610,-0.932028,0.320619,0.876723,-0.491017,-0.787065,2.618002,-0.599849,0.557237,0.150036,0.351373
1,-1.532749,-0.496479,0.722750,-1.208738,-0.404242,0.876723,-0.009645,-0.589764,-0.391903,-0.722914,1.598597,0.880219,-0.956428
2,-1.659297,0.052223,0.335795,0.728229,0.674652,0.876723,-0.791143,-0.793409,-0.216898,2.692136,-0.304873,-0.777605,0.731256
3,0.956021,0.443173,-0.105661,1.558357,0.302506,-1.140571,-0.364340,-0.786662,-0.347184,1.243560,-1.303554,1.373066,-0.871502
4,-1.026559,-1.559590,-0.432666,-0.378609,-1.542416,-1.140571,-0.348749,-0.794852,-0.381626,-0.134510,-0.548141,1.549305,-0.951844
...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,-0.098543,0.429456,-0.519867,-1.485447,-0.249785,0.876723,2.085392,2.439295,-0.391903,0.275706,0.087771,0.882667,-0.742107
112,0.660743,0.113952,0.488397,1.558357,1.444961,-1.140571,-0.618668,-0.785136,-0.391903,-0.519088,-0.932249,-1.426586,-0.771300
113,0.407648,0.724383,-0.323664,1.004938,-1.504543,0.876723,-0.638157,-0.418392,-0.249577,-0.892128,0.834649,-0.176143,0.146007
114,1.251299,-1.216651,1.594762,-0.378609,1.304994,-1.140571,0.409362,-0.786662,-0.391903,-0.448582,-1.316358,-1.408076,-0.003706


In [65]:
# One row vector per input song. The columns are put in the order of
# global_mean first: the genre vectors are plain arrays in that order and are
# compared position by position, so a differently ordered input CSV must not
# change which feature sits at which position.
vector_list_input = normalized_input.reindex(columns=global_mean.index).values

In [66]:
# Genre label -> normalized mean vector of that genre. This mapping is the
# single place that fixes both the labels and their order.
genre_vectors = {
    'edm': vector_edm,
    'rock': vector_rock,
    'rap': vector_rap,
    'latin': vector_latin,
    'r&b': vector_rb,
    'pop': vector_pop,
}
genres = list(genre_vectors)

# Calculate magnitudes: for every genre, a NumPy array holding the Euclidean
# distance from each input vector to that genre's vector
magnitude_list = {
    genre: np.array([
        np.linalg.norm(vector - genre_vector)
        for vector in vector_list_input
    ])
    for genre, genre_vector in genre_vectors.items()
}

# Find the closest genre for each input vector: stack the distances as
# (genre, song) and take, per song, the genre with the smallest distance
distance_table = np.array([magnitude_list[genre] for genre in genres])
closest_genres = [genres[position] for position in distance_table.argmin(axis=0)]

closest_genres

['edm',
 'rock',
 'edm',
 'edm',
 'rock',
 'latin',
 'edm',
 'r&b',
 'r&b',
 'rap',
 'edm',
 'latin',
 'latin',
 'edm',
 'latin',
 'rock',
 'edm',
 'latin',
 'latin',
 'r&b',
 'r&b',
 'edm',
 'edm',
 'rap',
 'rock',
 'pop',
 'rap',
 'latin',
 'rock',
 'rap',
 'rap',
 'edm',
 'rock',
 'edm',
 'latin',
 'rap',
 'latin',
 'rock',
 'latin',
 'rap',
 'pop',
 'rap',
 'rap',
 'latin',
 'rock',
 'latin',
 'latin',
 'rap',
 'latin',
 'edm',
 'r&b',
 'rap',
 'rap',
 'edm',
 'pop',
 'r&b',
 'rap',
 'rap',
 'rap',
 'latin',
 'latin',
 'edm',
 'latin',
 'pop',
 'latin',
 'latin',
 'latin',
 'r&b',
 'pop',
 'r&b',
 'edm',
 'rap',
 'pop',
 'edm',
 'pop',
 'latin',
 'edm',
 'r&b',
 'rock',
 'rap',
 'rock',
 'latin',
 'r&b',
 'pop',
 'r&b',
 'latin',
 'pop',
 'latin',
 'latin',
 'latin',
 'rock',
 'pop',
 'edm',
 'pop',
 'r&b',
 'pop',
 'edm',
 'rock',
 'latin',
 'latin',
 'edm',
 'pop',
 'edm',
 'rap',
 'r&b',
 'pop',
 'latin',
 'rap',
 'rock',
 'latin',
 'pop',
 'rap',
 'pop',
 'r&b',
 'edm',
 'pop']

In [67]:
# Store the predicted genre of every song in the 'prediction' column
input_data = input_data.assign(prediction=closest_genres)

input_data

Unnamed: 0,track_id,track_name,track_artist,track_popularity,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,prediction
0,6ho0GyrWZN3mhi9zVRW7xi,Losing It,FISHER,79,edm,0.760,0.964,2,-5.844,1,0.0576,0.001820,0.70000,0.0974,0.641,125.000,248036,edm
1,4bYJKHG6KLdGzHQRJMaFc8,Rock And Roll All Nite,KISS,3,rock,0.581,0.831,1,-8.045,1,0.1070,0.045800,0.00000,0.0782,0.885,144.684,168160,rock
2,1VGlMYqPO1LHyclvfIMvWI,Snälla bli min (Hjalm Remix),Robin Lukovic,0,edm,0.661,0.760,8,-4.769,1,0.0268,0.000406,0.04070,0.6110,0.439,99.993,271238,edm
3,6BbINUfGabVyiNFJpQXn3x,Wicked,Future,62,rap,0.718,0.679,11,-5.899,0,0.0706,0.001910,0.01040,0.3850,0.205,157.970,173347,edm
4,1Ly0ssAgeM7YqdHptao8Oe,Paranoid,Black Sabbath,15,rock,0.426,0.619,4,-11.501,0,0.0722,0.000084,0.00239,0.1700,0.382,162.721,168440,rock
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,691aH4q1gTSajAdBLckRey,Quisiera - Unplugged,Pasabordo,37,latin,0.716,0.603,0,-7.576,1,0.3220,0.721000,0.00000,0.2340,0.531,144.750,181250,rap
112,59q0gDvWPoL2DQ1SAt5ZLY,What Up Gangsta,50 Cent,55,rap,0.670,0.788,11,-2.430,0,0.0445,0.002250,0.00000,0.1100,0.292,82.498,179467,pop
113,3mJ6pNcFM2CkykCYSREdKT,When I Think Of You,Janet Jackson,49,r&b,0.759,0.639,9,-11.386,1,0.0425,0.084000,0.03310,0.0518,0.706,116.207,235493,r&b
114,4Htt3QaBWdLggq88rJI5MU,Inside Out,Five Finger Death Punch,69,rock,0.476,0.991,4,-2.855,0,0.1500,0.001910,0.00000,0.1210,0.202,82.997,226349,edm


In [68]:
# True and predicted genre of every song as arrays
# (note: `genres` is rebound here to the array of true labels)
genres, predictions = (
    input_data[column].values for column in ('genre', 'prediction')
)

# Fraction of songs whose predicted genre equals the true genre
is_correct = genres == predictions
accurracy = np.mean(is_correct)

accurracy

0.4482758620689655