In [273]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statistics import mode

def load_and_select_columns(file_path, columns):
    """Read a CSV file, keeping only the requested columns.

    Parameters
    ----------
    file_path : anything ``pandas.read_csv`` accepts as its first argument
        (a path or a file-like object).
    columns : forwarded unchanged to ``read_csv(usecols=...)``.

    Returns
    -------
    pandas.DataFrame
        The selected columns. Per the pandas documentation ``usecols`` ignores
        element order, so the columns come back in file order, not in the
        order of ``columns``.
    """
    return pd.read_csv(file_path, usecols=columns)

def remove_outliers(df, columns, whisker=1.5):
    """Drop every row that holds an IQR outlier in any of the given columns.

    A value is an outlier when it lies below ``Q1 - whisker * IQR`` or above
    ``Q3 + whisker * IQR``, where Q1/Q3 are the 25 % / 75 % quantiles of its
    column computed on ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    columns : str or list of str
        Column name(s) to check. A single name is accepted as well as a list.
    whisker : float, default 1.5
        Multiple of the inter-quartile range that defines the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (all columns) without an outlier in ``columns``.
    """
    # A bare column name would select a Series, on which ``any(axis=1)`` fails;
    # wrap it so one column behaves like a one-element list.
    if isinstance(columns, str):
        columns = [columns]

    values = df[columns]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1

    too_low = values < (q1 - whisker * spread)
    too_high = values > (q3 + whisker * spread)

    # NaN compares False on both sides, so rows with missing values are kept.
    keep = ~(too_low | too_high).any(axis=1)
    return df[keep]
# Columns to be used.
# An earlier version of these lists also contained 'key', 'mode', 'liveness'
# and 'duration_ms'; those four columns were removed from the analysis.

# Numeric features. The order matters: the relevance weights are applied to
# these features by position.
columns_to_use_3 = [
    'track_popularity',
    'danceability', 'energy', 'loudness', 'speechiness',
    'acousticness', 'instrumentalness', 'valence', 'tempo',
]

# Track metadata + features + the genre label ('genre' must stay last, the
# single-song cell drops it with columns_to_use[:-1]).
columns_to_use = ['track_id', 'track_name', 'track_artist', *columns_to_use_3, 'genre']

# Same selection plus the 'prediction' column of the batch input file.
columns_to_use_2 = [*columns_to_use, 'prediction']


# Load only the columns needed for the analysis.
# (A mid-cell `song_data.head()` used to sit here; it was never rendered
# because only the last expression of a cell is displayed, so it was dropped.)
song_data = load_and_select_columns('../data/songs_data.csv', columns_to_use)

# Genre labels present in the data (rendered as the cell output)
song_data['genre'].unique()

array(['edm', 'rock', 'rap', 'latin', 'r&b', 'pop'], dtype=object)

In [274]:
# Genre labels, in the fixed order used for the per-genre variables below
genre_labels = ['edm', 'rock', 'rap', 'latin', 'r&b', 'pop']

# Separate the data by genre and remove IQR outliers within each genre.
# The fences are computed per genre, on the feature columns only.
genre_frames = {
    label: remove_outliers(song_data[song_data['genre'] == label], columns_to_use_3)
    for label in genre_labels
}

# Compute global mean and standard deviation.
# These are taken over ALL songs, i.e. before any outlier removal.
global_mean = song_data.mean(numeric_only=True)
global_std = song_data.std(numeric_only=True)

# Compute the mean of every numeric column for each (outlier-free) genre subset
genre_means = {
    label: frame.mean(numeric_only=True) for label, frame in genre_frames.items()
}

# Bind the individual names that the following cells refer to
edm_data, rock_data, rap_data, latin_data, rb_data, pop_data = (
    genre_frames[label] for label in genre_labels
)
edm_mean, rock_mean, rap_mean, latin_mean, rb_mean, pop_mean = (
    genre_means[label] for label in genre_labels
)

In [275]:
# Normalize the data: z-score every genre mean against the global statistics
(normalized_edm, normalized_rock, normalized_rap,
 normalized_latin, normalized_rb, normalized_pop) = (
    (genre_mean - global_mean) / global_std
    for genre_mean in (edm_mean, rock_mean, rap_mean, latin_mean, rb_mean, pop_mean)
)

# A vector to control the impact of a category on the magnitude of the vector.
# One weight per feature, in the order of columns_to_use_3:
# 'track_popularity','danceability', 'energy', 'loudness', 'speechiness','acousticness', 'instrumentalness', 'valence', 'tempo'
relevance_vector = [0.8, 1.4, 1.2, 1, 1, 0.7, 0.6, 0.8, 0.8]

# The weights are applied by position, so a length mismatch would silently
# weight the wrong features (or fail later with a broadcasting error).
if len(relevance_vector) != len(columns_to_use_3):
    raise ValueError(
        f"relevance_vector has {len(relevance_vector)} weights "
        f"but there are {len(columns_to_use_3)} features"
    )

# Convert to weighted vectors. Selecting the features by name pins the element
# order to columns_to_use_3 instead of relying on the CSV column order.
vector_edm, vector_rock, vector_rap, vector_latin, vector_rb, vector_pop = (
    normalized.loc[columns_to_use_3].values * relevance_vector
    for normalized in (normalized_edm, normalized_rock, normalized_rap,
                       normalized_latin, normalized_rb, normalized_pop)
)

In [276]:
# Load the input song data with the selected columns.
# 'genre' is the last entry of columns_to_use, so slicing it off leaves the
# metadata and feature columns only.
input_data = load_and_select_columns('../data/Input_Song.csv', columns_to_use[:-1])

# Normalize the input data with the global statistics. The features are picked
# by name so their order is exactly the one relevance_vector is written for.
normalized_input = (
    input_data[columns_to_use_3] - global_mean[columns_to_use_3]
) / global_std[columns_to_use_3]
vector_input = normalized_input.values * relevance_vector

# A norm over the whole 2-D array would mix all rows into one number, so this
# cell is only meaningful for exactly one song.
if len(vector_input) != 1:
    raise ValueError(
        f"Input_Song.csv must contain exactly one song, found {len(vector_input)}"
    )

# Calculate magnitudes: Euclidean distance from the song to each genre vector
genre_vectors = [vector_edm, vector_rock, vector_rap, vector_latin, vector_rb, vector_pop]
display_names = ["EDM", "Rock", "Rap", "Latin", "R&B", "Pop"]  # same order as genre_vectors
magnitude_list = [
    np.linalg.norm(vector_input[0] - genre_vector) for genre_vector in genre_vectors
]

# Find the genre closest to the input song
min_mag_index = np.argmin(magnitude_list)
print("The song is ")
print(display_names[min_mag_index])

The song is 
Rap


In [277]:
# Input song list: a batch of songs, so many predictions can be tested at once.
# columns_to_use_2 is the usual selection plus the 'prediction' column.
# Note: this rebinds `input_data`, replacing the single-song frame loaded above.
input_data = load_and_select_columns('../data/Input_Song_List.csv', columns_to_use_2) 
# Preview the first rows (last expression of the cell, so it is rendered)
input_data.head()

Unnamed: 0,track_id,track_name,track_artist,track_popularity,genre,danceability,energy,loudness,speechiness,acousticness,instrumentalness,valence,tempo,prediction
0,7givkGZW1cwM8MJYzMd2Na,Ain't No Time,Future,51,rap,0.842,0.587,-6.677,0.069,0.00941,0.000337,0.135,145.016,none
1,6i4hX1H06CZsc5GPSjnVRB,What's Your Name,Lynyrd Skynyrd,0,rock,0.666,0.593,-7.612,0.0284,0.677,1.5e-05,0.96,135.196,none
2,4f7xpneUOcob0RoRyae6ax,Giovane Fuoriclasse,Capo Plaza,0,latin,0.555,0.702,-5.002,0.26,0.518,0.0,0.0883,133.036,none
3,3qIFYKztoCRa8I4a5J3BLF,Muskrat Love,Captain & Tennille,0,r&b,0.394,0.256,-16.528,0.056,0.885,0.000133,0.22,204.527,none
4,0DFRrABd2ppHhS3QaZuCJ1,Fuck Tha Police,N.W.A.,57,rap,0.85,0.837,-6.37,0.291,0.00956,0.0,0.737,98.694,none


In [278]:
# Normalize the input data with the global statistics.
# The feature columns are selected by name rather than by dtype, so an extra
# numeric column or a different CSV column order cannot shift the features
# away from the positions relevance_vector expects.
normalized_input = (
    input_data[columns_to_use_3] - global_mean[columns_to_use_3]
) / global_std[columns_to_use_3]
normalized_input

Unnamed: 0,track_popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,valence,tempo
0,0.492013,1.293661,-0.607068,0.046285,-0.379931,-0.753015,-0.390454,-1.602305,0.892534
1,-1.659297,0.086517,-0.574368,-0.261641,-0.775552,2.241904,-0.391840,1.918687,0.528259
2,-1.659297,-0.674807,0.019690,0.597917,1.481242,1.528604,-0.391903,-1.801614,0.448133
3,-1.659297,-1.779070,-2.411043,-3.197971,-0.506608,3.175026,-0.391331,-1.239536,3.100109
4,0.745108,1.348532,0.755450,0.147390,1.783317,-0.752342,-0.391903,0.966953,-0.825792
...,...,...,...,...,...,...,...,...,...
111,-0.098543,0.429456,-0.519867,-0.249785,2.085392,2.439295,-0.391903,0.087771,0.882667
112,0.660743,0.113952,0.488397,1.444961,-0.618668,-0.785136,-0.391903,-0.932249,-1.426586
113,0.407648,0.724383,-0.323664,-1.504543,-0.638157,-0.418392,-0.249577,0.834649,-0.176143
114,1.251299,-1.216651,1.594762,1.304994,0.409362,-0.786662,-0.391903,-1.316358,-1.408076


In [279]:
# Apply the per-feature weights to every normalized song (one row per song)
vector_list_input = np.multiply(normalized_input.to_numpy(), relevance_vector)

In [280]:
# Weighted genre vectors keyed by genre label
genre_vectors = {
    'edm': vector_edm,
    'rock': vector_rock,
    'rap': vector_rap,
    'latin': vector_latin,
    'r&b': vector_rb,
    'pop': vector_pop,
}

# Calculate magnitudes: Euclidean distance from every input song to each genre
# vector. `axis=1` yields one distance per row, so each entry is a numpy array
# with one value per input song.
magnitude_list = {
    genre: np.linalg.norm(vector_list_input - genre_vector, axis=1)
    for genre, genre_vector in genre_vectors.items()
}

# Find the closest genre for each input vector
genres = ['edm', 'rock', 'rap', 'latin', 'r&b', 'pop']

# Rows follow `genres`, columns follow the input songs; argmin over the rows
# picks the nearest genre per song (the first genre in `genres` wins a tie).
distance_table = np.vstack([magnitude_list[genre] for genre in genres])
closest_genres = [genres[index] for index in distance_table.argmin(axis=0)]

closest_genres

['rap',
 'r&b',
 'rap',
 'r&b',
 'rap',
 'latin',
 'rock',
 'r&b',
 'r&b',
 'r&b',
 'latin',
 'r&b',
 'edm',
 'edm',
 'latin',
 'rock',
 'edm',
 'rap',
 'rap',
 'rap',
 'rock',
 'r&b',
 'r&b',
 'pop',
 'rock',
 'latin',
 'edm',
 'latin',
 'pop',
 'rock',
 'rock',
 'edm',
 'edm',
 'latin',
 'rock',
 'edm',
 'r&b',
 'r&b',
 'r&b',
 'rock',
 'edm',
 'rap',
 'r&b',
 'pop',
 'edm',
 'rap',
 'edm',
 'rock',
 'rock',
 'r&b',
 'r&b',
 'rock',
 'r&b',
 'pop',
 'edm',
 'latin',
 'latin',
 'r&b',
 'latin',
 'rap',
 'rock',
 'rap',
 'pop',
 'latin',
 'r&b',
 'rap',
 'rock',
 'r&b',
 'pop',
 'pop',
 'edm',
 'rock',
 'r&b',
 'latin',
 'rock',
 'latin',
 'latin',
 'rap',
 'rap',
 'rap',
 'edm',
 'r&b',
 'pop',
 'r&b',
 'latin',
 'rap',
 'latin',
 'rap',
 'rock',
 'latin',
 'r&b',
 'latin',
 'latin',
 'edm',
 'r&b',
 'latin',
 'pop',
 'edm',
 'r&b',
 'edm',
 'r&b',
 'rap',
 'rap',
 'latin',
 'r&b',
 'r&b',
 'r&b',
 'rap',
 'rap',
 'r&b',
 'pop',
 'rap',
 'pop',
 'r&b',
 'rock',
 'pop']

In [281]:
# Store the predicted genre next to each song. `input_data` already has a
# 'prediction' column (loaded from the CSV), so this overwrites it in place.
input_data ['prediction'] = closest_genres

# Show the frame with the predictions filled in
input_data

Unnamed: 0,track_id,track_name,track_artist,track_popularity,genre,danceability,energy,loudness,speechiness,acousticness,instrumentalness,valence,tempo,prediction
0,7givkGZW1cwM8MJYzMd2Na,Ain't No Time,Future,51,rap,0.842,0.587,-6.677,0.0690,0.00941,0.000337,0.1350,145.016,rap
1,6i4hX1H06CZsc5GPSjnVRB,What's Your Name,Lynyrd Skynyrd,0,rock,0.666,0.593,-7.612,0.0284,0.67700,0.000015,0.9600,135.196,r&b
2,4f7xpneUOcob0RoRyae6ax,Giovane Fuoriclasse,Capo Plaza,0,latin,0.555,0.702,-5.002,0.2600,0.51800,0.000000,0.0883,133.036,rap
3,3qIFYKztoCRa8I4a5J3BLF,Muskrat Love,Captain & Tennille,0,r&b,0.394,0.256,-16.528,0.0560,0.88500,0.000133,0.2200,204.527,r&b
4,0DFRrABd2ppHhS3QaZuCJ1,Fuck Tha Police,N.W.A.,57,rap,0.850,0.837,-6.370,0.2910,0.00956,0.000000,0.7370,98.694,rap
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,691aH4q1gTSajAdBLckRey,Quisiera - Unplugged,Pasabordo,37,latin,0.716,0.603,-7.576,0.3220,0.72100,0.000000,0.5310,144.750,rap
112,59q0gDvWPoL2DQ1SAt5ZLY,What Up Gangsta,50 Cent,55,rap,0.670,0.788,-2.430,0.0445,0.00225,0.000000,0.2920,82.498,pop
113,3mJ6pNcFM2CkykCYSREdKT,When I Think Of You,Janet Jackson,49,r&b,0.759,0.639,-11.386,0.0425,0.08400,0.033100,0.7060,116.207,r&b
114,4Htt3QaBWdLggq88rJI5MU,Inside Out,Five Finger Death Punch,69,rock,0.476,0.991,-2.855,0.1500,0.00191,0.000000,0.2020,82.997,rock


In [282]:
# Labelled genres and predicted genres as aligned arrays (one entry per song).
# Note: this rebinds `genres`, which the distance cell uses for the list of
# genre labels; that cell redefines it itself, so re-running it is safe.
genres = input_data['genre'].values
predictions = input_data['prediction'].values

# Fraction of songs whose predicted genre equals the labelled genre
accuracy = np.mean(genres == predictions)
# Misspelled original name, kept as an alias so existing references still work
accurracy = accuracy

accuracy

0.4827586206896552