In [28]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler

# Anchor all input paths at the absolute current working directory.
# NOTE: this is the process's working directory, which is not necessarily
# the folder that contains the notebook file itself.
base_dir = Path('.').resolve()

# Folder that holds the raw input files
data_dir = base_dir.joinpath('data')

# Full path of the raw dataset CSV
data_path = data_dir.joinpath('dataset.csv')

# Load the raw CSV and clean it in a single pipeline:
#   1. drop every row that has a missing value in any column
#   2. keep only the first row encountered for each track_id
#   3. add duration_sec (duration_ms / 1000) and remove duration_ms
#   4. cast the explicit column to bool
df = (
    pd.read_csv(data_path)
    .dropna()
    .drop_duplicates(subset='track_id')
    .assign(duration_sec=lambda frame: frame['duration_ms'] / 1000)
    .drop(columns='duration_ms')
    .astype({'explicit': bool})
)

# Columns that get rescaled by the scaler below
features_to_normalize = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]

# Fit a MinMaxScaler on the selected columns, then overwrite those columns
# in place with their scaled values. The fitted scaler stays available in
# `scaler` afterwards.
scaler = MinMaxScaler()
scaler.fit(df[features_to_normalize])
df[features_to_normalize] = scaler.transform(df[features_to_normalize])

# Keep only the first listed artist, since keeping all artists might dilute
# clustering. The artists string is split on ';' at most once and element 0
# is taken; when no ';' is present that element is the whole string, so no
# separate membership check is needed.
df['artist'] = df['artists'].str.split(';', n=1).str[0]

# Convert genre to a categorical once, so the numeric labels and the
# code -> name mapping below are derived from the same category ordering.
genre_cat = df['track_genre'].astype('category')

# Encode each row's genre as its numeric category code
df['genre_label'] = genre_cat.cat.codes

# Mapping of genre codes to genre names
genre_mapping = dict(enumerate(genre_cat.cat.categories))


# Write the processed frame to a CSV in the current working directory,
# leaving the DataFrame index out of the file.
df.to_csv(path_or_buf='processed_data.csv', index=False)
