In [2]:
# Pandas DataFrames
# Print the DataFrame's info and summary statistics.
# Slice the songs with a danceability of at least 0.7 and calculate the min, max and median of the acousticness feature.
# Choose two danceability thresholds v1 and v2, where v1 < v2.
# Add a column that labels each song as low_danceability (< v1), medium_danceability (>= v1 and < v2) or high_danceability (>= v2).
# Group by this new column and standardize the numerical columns.

import ast

import numpy as np
import pandas as pd

# Parse the "artists" cells with ast.literal_eval rather than eval: eval would
# execute any Python expression embedded in the CSV, whereas literal_eval only
# accepts literals (strings, numbers, lists, tuples, dicts, ...).
# NOTE(review): assumes every "artists" cell is a Python literal such as a list
# of names -- confirm against the data file.
spotify_data = pd.read_csv("data/data.csv", converters={"artists": ast.literal_eval})

print("Dataset Overview:")
spotify_data.info()

# Danceability thresholds: scores below v1 are "low", scores from v1 up to
# (but excluding) v2 are "medium", and scores of v2 or more are "high".
v1, v2 = 0.4, 0.7
assert v1 < v2, "danceability thresholds must satisfy v1 < v2"

# Classify into danceability
def classify_danceability(danceability, low_threshold=None, high_threshold=None):
    """Map a danceability score to one of three category labels.

    Parameters
    ----------
    danceability : float
        Score to classify.
    low_threshold : float, optional
        Scores strictly below this value are labelled "low_danceability".
        Defaults to the module-level ``v1``.
    high_threshold : float, optional
        Scores greater than or equal to this value are labelled
        "high_danceability". Defaults to the module-level ``v2``.

    Returns
    -------
    str or None
        "low_danceability", "medium_danceability" or "high_danceability";
        ``None`` when the score is missing (NaN or None).
    """
    # A missing score fails every comparison, so without this guard it would
    # fall through to the last branch and be labelled "high_danceability".
    if pd.isna(danceability):
        return None

    # Resolve the defaults at call time so the current v1 / v2 are honoured.
    if low_threshold is None:
        low_threshold = v1
    if high_threshold is None:
        high_threshold = v2

    if danceability < low_threshold:
        return "low_danceability"
    if danceability < high_threshold:
        return "medium_danceability"
    return "high_danceability"

# Label every track with its danceability category.
danceability_labels = spotify_data["danceability"].apply(classify_danceability)
spotify_data["danceability_category"] = danceability_labels

# Numeric feature columns; the string-valued category column is not among them.
numeric_frame = spotify_data.select_dtypes(include=[np.number])
numerical_columns = numeric_frame.columns

# One group per danceability category.
grouped_data = spotify_data.groupby("danceability_category")

def standardize_columns(group):
    """Return the numeric columns of ``group`` as z-scores.

    Each column listed in the module-level ``numerical_columns`` is centred on
    its own mean and divided by its own standard deviation (``Series.std``),
    both computed over the rows of ``group`` only. The result keeps the row
    index of ``group`` and contains just those columns.
    """
    def _zscore(column):
        centered = column - column.mean()
        return centered / column.std()

    numeric_part = group[numerical_columns]
    return numeric_part.apply(_zscore)

# Standardize every numeric column within its own danceability group.
standardized_tracks = grouped_data.apply(standardize_columns)

# Do not reset the index here. On pandas >= 2.0 the result of groupby.apply is
# ordered by group and carries the group key as an extra outer index level;
# throwing the index away would pair each standardized row with a different
# track when the frames are concatenated below. Drop only the group-key level
# so the original row labels survive (older pandas adds no extra level).
if standardized_tracks.index.nlevels > spotify_data.index.nlevels:
    standardized_tracks = standardized_tracks.droplevel(0)

# Restore the original row order; rows whose category is missing (and were
# therefore left out of every group) come back as NaN.
standardized_tracks = standardized_tracks.reindex(spotify_data.index)

# Non-numeric columns as they were, followed by the standardized numeric columns.
final_data = pd.concat([spotify_data.drop(columns=numerical_columns), standardized_tracks], axis=1)

print("\nStandardized Data by Danceability Category:")
print(final_data.head())

Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169909 entries, 0 to 169908
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   acousticness      169909 non-null  float64
 1   artists           169909 non-null  object 
 2   danceability      169909 non-null  float64
 3   duration_ms       169909 non-null  int64  
 4   energy            169909 non-null  float64
 5   explicit          169909 non-null  int64  
 6   id                169909 non-null  object 
 7   instrumentalness  169909 non-null  float64
 8   key               169909 non-null  int64  
 9   liveness          169909 non-null  float64
 10  loudness          169909 non-null  float64
 11  mode              169909 non-null  int64  
 12  name              169909 non-null  object 
 13  popularity        169909 non-null  int64  
 14  release_date      169909 non-null  object 
 15  speechiness       169909 non-null  float64
 16  te