In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

In [36]:
spotify_df_org = pd.read_csv('data/raw/Spotify-dataset.csv')
spotify_df_org.shape

(114000, 21)

In [37]:
display(spotify_df_org.info())
display(spotify_df_org.isnull().sum())
display(spotify_df_org[spotify_df_org.duplicated()].shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114000 entries, 0 to 113999
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   index             114000 non-null  int64  
 1   track_id          114000 non-null  object 
 2   artists           113999 non-null  object 
 3   album_name        113999 non-null  object 
 4   track_name        113999 non-null  object 
 5   popularity        114000 non-null  int64  
 6   duration_ms       114000 non-null  int64  
 7   explicit          114000 non-null  bool   
 8   danceability      114000 non-null  float64
 9   energy            114000 non-null  float64
 10  key               114000 non-null  int64  
 11  loudness          114000 non-null  float64
 12  mode              114000 non-null  int64  
 13  speechiness       114000 non-null  float64
 14  acousticness      114000 non-null  float64
 15  instrumentalness  114000 non-null  float64
 16  liveness          11

None

index               0
track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64

(0, 21)

In [38]:
df_cleaned = spotify_df_org.dropna()
df_cleaned = df_cleaned.sort_values(by='popularity', ascending=False)
#df_cleaned = df_cleaned.drop_duplicates()
df_cleaned = df_cleaned.drop_duplicates(subset=['track_id','artists', 'track_name'])
print(df_cleaned.shape)
#display(df_cleaned[df_cleaned.duplicated(subset='track_id')])

(89740, 21)


In [39]:
from src.transformation import key_cyclic_encoding
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

def classify_duration(z):
    """
    Classify the log-duration based on the different classes.
    We used the standardized (Z-score) method.
    """
    if z < -0.3:
        return 'short'
    elif -0.3 <= z < 0.3:
        return 'normal'
    elif 0.3 <= z < 1:
        return 'mid_normal'
    elif 1 <= z < 2:
        return 'long'
    else:
        return 'very_long'


def feature_engineering_on_duration_col() -> None:
    """Add a minute column, class column, and one-hot encoding."""

    if 'duration_ms' in df_cleaned.columns:
        # Minute Column
        df_cleaned['duration_min'] = df_cleaned['duration_ms'] / 60_000

        # Log transformation
        df_cleaned['duration_log'] = np.log1p(df_cleaned['duration_min'])       
        
        # Standardize log-duration
        scaler = StandardScaler()
        df_cleaned['duration_class'] = df_cleaned['duration_log_z'] = scaler.fit_transform(df_cleaned[['duration_log']])
        
        # Make a class
        df_cleaned['duration_class'] = df_cleaned['duration_log_z'].apply(classify_duration)
        
    print("Columns formed by 'duration' column : ", [col for col in df_cleaned.columns if col.startswith('duration')], "\n")
    
    
def feature_engineering_on_time_signature_col():
    """ 
    1. Make class (1 -> Common, 0 -> Others)
    """
    df_cleaned['time_signature_class_boolean'] = df_cleaned['time_signature'].apply(lambda x : 1 if x in [3, 4] else 0)    

    print("Columns formed by duration column : ", [col for col in df_cleaned.columns if col.startswith('time_signature')], "\n")
    

def feature_engineering_on_loudness_col():
    pt = PowerTransformer(method='yeo-johnson')
    df_cleaned['loudness_yeo'] = pt.fit_transform(df_cleaned[['loudness']])
    
    df_cleaned['loudness_level'] = pd.qcut(
        df_cleaned['loudness_yeo'], 
        q=5,  
        labels=['Very Quiet', 'Quiet', 'Normal', 'Loud', 'Very Loud']
    )
    
    df_cleaned['loudness_intensity'] = df_cleaned['energy'] * (df_cleaned['loudness'] + (abs(df_cleaned['loudness']).min()))

    
    print("Columns formed by 'loudness' column : ", [col for col in df_cleaned.columns if col.startswith('loudness')], "\n")



def feature_engineering_on_multicolumn_col():
    df_cleaned['is_instrumental'] = (df_cleaned['instrumentalness'] > 0.8).astype('int')
    df_cleaned['is_dance_hit'] = ((df_cleaned['danceability'] > 0.7840) & (df_cleaned['energy'] > 0.9410)).astype('int')
    
    df_cleaned['happy_dance'] = df_cleaned['valence'] * df_cleaned['danceability']
    df_cleaned['acoustics_instrumental'] = df_cleaned['instrumentalness'] * df_cleaned['acousticness']

    print("Columns formed by multi column : ", [col for col in df_cleaned.columns if col.startswith('is')], \
        "['happy_dance', , 'acoustics_instrumental'] \n" )


def feature_engineering_on_popularity_col():
    df_cleaned['popularity_level'] =  pd.qcut(
    df_cleaned['popularity'],
    q=[0, 0.5, 0.8, 0.9, 0.97, 1.0],
    labels=['low', 'normal', 'medium', 'high', 'very high']
    ) 
    
    print("Columns formed by 'popularity' column : ", [col for col in df_cleaned.columns if col.startswith('popularity')], "\n" )


def feature_engineering_on_artist_col():
    
    artists_popularity =  df_cleaned.groupby(['artists'])['popularity'].mean()
    df_cleaned['artists_avg_popularity'] = df_cleaned['artists'].map(artists_popularity)

    df_cleaned['artist_song_count'] = df_cleaned['artists'].map(df_cleaned['artists'].value_counts())

    print("Columns formed by 'artist' column : ", [col for col in df_cleaned.columns if col.startswith('artist')], "\n" )
    
    
def feature_engineering_on_album_col():
    
    df_cleaned['album_freq'] = df_cleaned['album_name'].map(df_cleaned['album_name'].value_counts())

    print("Columns formed by 'album' column : ", [col for col in df_cleaned.columns if col.startswith('album')], "\n" )


def feature_engineering_on_tempo_col():
    
    df_cleaned['tempo_class'] = pd.cut(
        df_cleaned['tempo'],
        bins=[0, 40, 80, 180, 210, float('inf')],
        labels=['very slow', 'slow', 'normal', 'fast', 'very fast'])

    df_cleaned['temp_zscore'] = (df_cleaned['tempo'] - df_cleaned['tempo'].mean()) / (df_cleaned['tempo'].std())
    
    df_cleaned['tempo_vs_genre'] = df_cleaned['tempo'] - df_cleaned.groupby('track_genre')['tempo'].transform('mean')
    
    print("Columns formed by 'tempo' column : ", [col for col in df_cleaned.columns if col.startswith('temp')], "\n" )


def feature_engineering_on_energy_col():
    
    df_cleaned['energy_rank_pct'] = df_cleaned['energy'].rank(pct=True)
    
    df_cleaned['loud_energy_ratio'] = (df_cleaned['loudness'] + 50) / (df_cleaned['energy'] + 1e-6)
    
    print("Columns formed by 'energy' column : ", [col for col in df_cleaned.columns if 'energy' in col], "\n" )


def clustering_the_columns():

    features = df_cleaned[['danceability', 'energy', 'valence', 'tempo']]
    km = KMeans(n_clusters=11, random_state=0).fit(features)
    df_cleaned['mood_cluster'] = km.labels_
    
    
    features = df_cleaned[['acousticness', 'valence']]
    km = KMeans(n_clusters=11, random_state=0).fit(features)
    df_cleaned['acoustic_valence_mood_cluster'] = km.labels_

    print("Columns formed by 'clustering' : ['mood_cluster', 'acoustic_valence_mood_cluster'] \n" )


def pca_columns():
    pca = PCA(n_components=1)
    df_cleaned['mood_pca'] = pca.fit_transform(df_cleaned[['valence', 'energy', 'danceability']])
    
    print("Columns formed by 'pca' : ['mood_pca'] \n" )


def drop_col(cols):
    for col in cols:
        if col in df_cleaned.columns:
            df_cleaned.drop(columns=[col], inplace=True)
            print(f"'{col}' is drop.\n")
    print(f"Drop Columns are : {cols}", flush=True)

In [40]:
dummy_df = df_cleaned.copy()

In [41]:
key_cyclic_encoding(df_cleaned,'key')
feature_engineering_on_duration_col()
feature_engineering_on_time_signature_col()
feature_engineering_on_loudness_col()
feature_engineering_on_multicolumn_col()
feature_engineering_on_popularity_col()
feature_engineering_on_artist_col()
feature_engineering_on_album_col()
feature_engineering_on_tempo_col()
feature_engineering_on_energy_col()
clustering_the_columns()
pca_columns()

        key   key_sin  key_cos
108885    0  0.000000      1.0
27822     4  0.866025     -0.5
17360     8 -0.866025     -0.5 

Columns formed by 'duration' column :  ['duration_ms', 'duration_min', 'duration_log', 'duration_class', 'duration_log_z'] 

Columns formed by duration column :  ['time_signature', 'time_signature_class_boolean'] 

Columns formed by 'loudness' column :  ['loudness', 'loudness_yeo', 'loudness_level', 'loudness_intensity'] 

Columns formed by multi column :  ['is_instrumental', 'is_dance_hit'] ['happy_dance', , 'acoustics_instrumental'] 

Columns formed by 'popularity' column :  ['popularity', 'popularity_level'] 

Columns formed by 'artist' column :  ['artists', 'artists_avg_popularity', 'artist_song_count'] 

Columns formed by 'album' column :  ['album_name', 'album_freq'] 

Columns formed by 'tempo' column :  ['tempo', 'tempo_class', 'temp_zscore', 'tempo_vs_genre'] 

Columns formed by 'energy' column :  ['energy', 'energy_rank_pct', 'loud_energy_ratio'] 

Colu

In [42]:
print('---'*15 + "  Drop Columns  " +'---'*15)
drop_col(['index', 'album_name', 'track_name', 'duration_ms', 'duration_min', 'key', 'artists', 'track_id'])
print('---'*30)
print(f"{df_cleaned.shape[1] - dummy_df.shape[1]} New Columns formed. ")

---------------------------------------------  Drop Columns  ---------------------------------------------
'index' is drop.

'album_name' is drop.

'track_name' is drop.

'duration_ms' is drop.

'duration_min' is drop.

'key' is drop.

'artists' is drop.

'track_id' is drop.

Drop Columns are : ['index', 'album_name', 'track_name', 'duration_ms', 'duration_min', 'key', 'artists', 'track_id']
------------------------------------------------------------------------------------------
18 New Columns formed. 


In [43]:
df_cleaned.columns

Index(['popularity', 'explicit', 'danceability', 'energy', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'time_signature', 'track_genre', 'key_sin',
       'key_cos', 'duration_log', 'duration_class', 'duration_log_z',
       'time_signature_class_boolean', 'loudness_yeo', 'loudness_level',
       'loudness_intensity', 'is_instrumental', 'is_dance_hit', 'happy_dance',
       'acoustics_instrumental', 'popularity_level', 'artists_avg_popularity',
       'artist_song_count', 'album_freq', 'tempo_class', 'temp_zscore',
       'tempo_vs_genre', 'energy_rank_pct', 'loud_energy_ratio',
       'mood_cluster', 'acoustic_valence_mood_cluster', 'mood_pca'],
      dtype='object')

In [44]:
from sklearn.preprocessing import StandardScaler,OrdinalEncoder,OneHotEncoder

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import  Pipeline
from sklearn import set_config

set_config(display='diagram')

df_cleaned['explicit'] = df_cleaned['explicit'].astype(int)

# Define columns by type
onehot_cols = ['track_genre', 'duration_class']
ordinal_cols = df_cleaned.select_dtypes(include='category').columns.tolist()

int64_cols = [
    'popularity', 'time_signature', 'artist_song_count', 'album_freq'
]

float64_cols = [
    'danceability','loudness','speechiness','acousticness','valence','tempo',
    'loudness_intensity','happy_dance','acoustics_instrumental',
    'artists_avg_popularity','tempo_vs_genre','energy_rank_pct','loud_energy_ratio','mood_pca'
]

int32_cols = [
    'mood_cluster', 'acoustic_valence_mood_cluster'
]

numerical_cols = int64_cols + float64_cols + int32_cols

# One hot encoder
onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Scaler
scaler_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Ordinal Encoder
ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder()),
])

# Combine all transformers 
preprocessor = ColumnTransformer(transformers=[
    ('onehot', onehot_transformer, onehot_cols),
    ('ordinal', ordinal_transformer, ordinal_cols),
    ('scaler', scaler_transformer, numerical_cols)
], remainder='passthrough')

# Create full pipeline
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor)
])

# Show the pipeline
pipeline

0,1,2
,steps,"[('preprocessing', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('onehot', ...), ('ordinal', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True


In [None]:
processed_array = pipeline.fit_transform(df_cleaned)

# Get OHE columns names
ohe_feature_names = pipeline.named_steps['preprocessing']\
    .named_transformers_['onehot']\
    .named_steps['onehot']\
    .get_feature_names_out(onehot_cols)

# Get passed through columns
all_input_cols = df_cleaned.columns.to_list()
specified_cols = onehot_cols + ordinal_cols + numerical_cols
passthrough_cols = [col for col in all_input_cols if col not in specified_cols]

# Final columns names list
# Here we write similar like our pipe 
final_columns = np.concatenate([
    ohe_feature_names,
    ordinal_cols,
    numerical_cols,
    passthrough_cols
])

df = pd.DataFrame(processed_array, columns=final_columns)
df = df.astype(np.float64)

print("Our New Data Frame Created ")
print("Processed array shape:", processed_array.shape)
print("Number of column names:", len(final_columns))
print('--'*30)
df.sample(5)

Our New Data Frame Created 🥳
Processed array shape: (89740, 156)
Number of column names: 156
------------------------------------------------------------


Unnamed: 0,track_genre_acoustic,track_genre_afrobeat,track_genre_alt-rock,track_genre_alternative,track_genre_ambient,track_genre_anime,track_genre_black-metal,track_genre_bluegrass,track_genre_blues,track_genre_brazil,...,liveness,key_sin,key_cos,duration_log,duration_log_z,time_signature_class_boolean,loudness_yeo,is_instrumental,is_dance_hit,temp_zscore
71232,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.361,-0.5,-0.8660254,1.622288,0.32606,1.0,0.639521,0.0,0.0,-0.565952
19039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.225,0.8660254,-0.5,1.589871,0.218345,1.0,0.392246,0.0,0.0,1.252152
73237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0471,0.5,0.8660254,2.147373,2.070796,1.0,0.291178,1.0,1.0,0.139117
66910,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.317,-1.0,-1.83697e-16,1.621452,0.323282,1.0,1.728978,0.0,0.0,2.587448
8498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.11,1.224647e-16,-1.0,1.602749,0.261135,1.0,-0.98062,0.0,0.0,-0.069001


In [50]:
df.to_csv('data/procesed/modified-spotify-data.csv')

(89740, 39)