In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statistics import mode

# Define a function to load data and select specific columns
def load_and_select_columns(file_path, columns):
    """Read a CSV file and keep only the requested columns.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; passed straight to ``pd.read_csv``.
    columns : list-like or callable
        Forwarded to ``pd.read_csv`` as ``usecols``.

    Returns
    -------
    pandas.DataFrame
        The selected columns. When ``columns`` is a list or tuple of column
        names, the frame's columns follow the order of ``columns``; for any
        other kind of ``usecols`` value the order produced by ``read_csv``
        is kept.
    """
    data = pd.read_csv(file_path, usecols=columns)
    # read_csv ignores the element order of usecols and returns the columns
    # in file order. Reorder to the requested order so that frames loaded
    # from different files share one column layout regardless of how each
    # file happens to arrange its header.
    if isinstance(columns, (list, tuple)) and all(isinstance(name, str) for name in columns):
        # dict.fromkeys drops repeated names while keeping first-seen order,
        # so a duplicated request does not duplicate the column.
        data = data[list(dict.fromkeys(columns))]
    return data

# Columns read from the CSV files. 'genre' is kept as the final entry
# because the input-song cell drops it with columns_to_use[:-1].
columns_to_use = (
    ['track_id', 'track_name', 'track_artist', 'track_popularity']
    + ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness']
    + ['acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
    + ['duration_ms', 'genre']
)

# Read the songs dataset, keeping only the columns listed above
songs_csv_path = '../data/songs_data.csv'
song_data = load_and_select_columns(songs_csv_path, columns_to_use)
song_data.head()

# List the distinct genre labels present in the dataset
song_data.loc[:, 'genre'].unique()

array(['edm', 'rock', 'rap', 'latin', 'r&b', 'pop'], dtype=object)

In [5]:
# Split the dataset into one frame per genre. The frames are unpacked in the
# same order as the label tuple, so each name receives its own genre's rows.
genre_column = song_data['genre']
edm_data, rock_data, rap_data, latin_data, rb_data, pop_data = (
    song_data[genre_column == label]
    for label in ('edm', 'rock', 'rap', 'latin', 'r&b', 'pop')
)

# Dataset-wide mean and standard deviation of every numeric column
global_mean, global_std = (
    song_data.mean(numeric_only=True),
    song_data.std(numeric_only=True),
)

# Per-genre mean of every numeric column, in the same genre order as above
edm_mean, rock_mean, rap_mean, latin_mean, rb_mean, pop_mean = (
    genre_frame.mean(numeric_only=True)
    for genre_frame in (edm_data, rock_data, rap_data, latin_data, rb_data, pop_data)
)

In [6]:
# Express each genre mean as a z-score relative to the whole dataset:
# subtract the global mean, then divide by the global standard deviation.
(
    normalized_edm,
    normalized_rock,
    normalized_rap,
    normalized_latin,
    normalized_rb,
    normalized_pop,
) = (
    (genre_mean - global_mean) / global_std
    for genre_mean in (edm_mean, rock_mean, rap_mean, latin_mean, rb_mean, pop_mean)
)

# Pull out the underlying NumPy arrays, keeping the same genre order
vector_edm, vector_rock, vector_rap, vector_latin, vector_rb, vector_pop = (
    normalized_mean.values
    for normalized_mean in (
        normalized_edm,
        normalized_rock,
        normalized_rap,
        normalized_latin,
        normalized_rb,
        normalized_pop,
    )
)

In [7]:
# Load the input song data with the selected columns.
# 'genre' is the last entry of columns_to_use, so slicing it off leaves the
# columns the input file is asked to provide.
input_data = load_and_select_columns('../data/Input_Song.csv', columns_to_use[:-1])
if input_data.empty:
    # With no rows every distance below would be 0.0 and the first genre
    # would be reported, which is meaningless.
    raise ValueError("Input song file contains no rows to classify.")

# Normalize the input data with the dataset-wide statistics.
# The arithmetic aligns on column labels, but the column order of the result
# is not guaranteed to equal global_mean's order when the input CSV arranges
# its columns differently. The genre vectors are plain arrays in
# global_mean's order, so reindex explicitly before dropping the labels.
normalized_input = (
    (input_data.select_dtypes(include='number') - global_mean) / global_std
).reindex(columns=global_mean.index)
vector_input = normalized_input.values

# Display label paired with its genre vector. The position in this list is
# the position in magnitude_list, so the label lookup cannot drift out of
# sync with the distance calculation.
genre_vectors = [
    ("EDM", vector_edm),
    ("Rock", vector_rock),
    ("Rap", vector_rap),
    ("Latin", vector_latin),
    ("R&B", vector_rb),
    ("Pop", vector_pop),
]

# Calculate magnitudes: distance between the input and each genre vector.
# NOTE(review): norm is taken over the whole 2-D array, so an input file with
# several rows yields one pooled distance per genre, not one per song.
magnitude_list = [
    np.linalg.norm(vector_input - genre_vector)
    for _, genre_vector in genre_vectors
]

# A NaN or inf distance (missing or non-numeric input feature, or a feature
# with zero global spread) would make argmin pick an arbitrary genre.
if not np.all(np.isfinite(magnitude_list)):
    raise ValueError(
        "Cannot classify the input song: at least one genre distance is not "
        "finite. Check the input file for missing or non-numeric features."
    )

# Find the genre closest to the input song
min_mag_index = np.argmin(magnitude_list)
print("The song is ")
print(genre_vectors[min_mag_index][0])

The song is 
Rap
