### Feature engineering

* Load the cleaned data
* Scale numeric features (e.g., tempo, loudness)
* Encode categorical variables (key, mode)
  * Explicit column to check whether a song contains explicit content
* Process text data for artists (label encoding)
* Extract the release date (could be used for additional insight)
* Combine features into a single dataset
  * Merge scaled numeric features and encoded categorical features
  * Ensure there's no mismatch
* Save the data

In [32]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler



In [33]:
# Read the cleaned track dataset from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../datatset/data.csv")

# Preview the first five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.995,['Carl Woitschach'],0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.779,1928
1,0.994,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928
2,0.604,['Seweryn Goszczyński'],0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.929,107.177,0.88,1928
3,0.995,['Francisco Canaro'],0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.72,1928
4,0.99,"['Frédéric Chopin', 'Vladimir Horowitz']",0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928


In [34]:
# Min-max scale the selected numeric columns into the 0 - 1 range.
# NOTE: the scaled values overwrite the original columns of `df` in place.
# Re-running this cell without first re-running the load cell refits the
# scaler on already-scaled values and overwrites the saved scaler with it.
columns_to_scale = ['loudness', 'popularity', 'tempo']

# minmax scaler (default feature range)
scaler = MinMaxScaler()

# Fit on, and replace, only the selected columns; every other column is untouched.
df[columns_to_scale] = scaler.fit_transform(df[columns_to_scale])

# Persist the fitted scaler so the same transformation can be reapplied later.
scaler_path = "../models/scaler.pkl"  # Path to save the scaler
# Create the target directory first: open() raises FileNotFoundError when the
# parent directory does not exist.
os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
with open(scaler_path, 'wb') as f:
    pickle.dump(scaler, f)

# Preview the frame with the scaled columns.
df.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.995,['Carl Woitschach'],0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,0.745,1,Singende Bataillone 1. Teil,0.0,1928,0.0506,0.485348,0.779,1928
1,0.994,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,0.494026,1,"Fantasiestücke, Op. 111: Più tosto lento",0.0,1928,0.0462,0.344019,0.0767,1928
2,0.604,['Seweryn Goszczyński'],0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,0.627609,0,Chapter 1.18 - Zamek kaniowski,0.0,1928,0.929,0.439086,0.88,1928
3,0.995,['Francisco Canaro'],0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,0.708887,0,Bebamos Juntos - Instrumental (Remasterizado),0.0,1928-09-25,0.0926,0.44247,0.72,1928
4,0.99,"['Frédéric Chopin', 'Vladimir Horowitz']",0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,0.676079,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",0.01,1928,0.0424,0.254614,0.0693,1928


In [35]:

# Encode 'key' cyclically: treat key / 12 as a fraction of a full turn and
# store the sine and cosine of that angle, so the encoding wraps around
# instead of treating the first and last keys as far apart.
# Assumes 'key' takes 12 distinct values (0-11) -- TODO confirm against the data.
key_angle = 2 * np.pi * df['key'] / 12

df['key_sin'] = np.sin(key_angle)
df['key_cos'] = np.cos(key_angle)

# Show the first four rows, now including the two new columns.
df.head(n=4)

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,...,mode,name,popularity,release_date,speechiness,tempo,valence,year,key_sin,key_cos
0,0.995,['Carl Woitschach'],0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,...,1,Singende Bataillone 1. Teil,0.0,1928,0.0506,0.485348,0.779,1928,-0.866025,0.5
1,0.994,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,...,1,"Fantasiestücke, Op. 111: Più tosto lento",0.0,1928,0.0462,0.344019,0.0767,1928,-0.866025,-0.5
2,0.604,['Seweryn Goszczyński'],0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,...,0,Chapter 1.18 - Zamek kaniowski,0.0,1928,0.929,0.439086,0.88,1928,0.5,-0.866025
3,0.995,['Francisco Canaro'],0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,...,0,Bebamos Juntos - Instrumental (Remasterizado),0.0,1928-09-25,0.0926,0.44247,0.72,1928,0.5,0.866025


In [36]:
# Write the transformed feature table to CSV, omitting the DataFrame index.
df.to_csv(path_or_buf="../datatset/features.csv", index=False)

In [37]:
# Count null entries per column; an all-zero result means no gaps in the features.
missing_values = df.isna().sum()
print(missing_values)


acousticness        0
artists             0
danceability        0
duration_ms         0
energy              0
explicit            0
id                  0
instrumentalness    0
key                 0
liveness            0
loudness            0
mode                0
name                0
popularity          0
release_date        0
speechiness         0
tempo               0
valence             0
year                0
key_sin             0
key_cos             0
dtype: int64
