In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Load dataset
file_path = "/content/dataset.csv"  # Change this to the correct path

# 🔹 1. Check if file exists (fail early with a clear message)
if not os.path.exists(file_path):
    raise FileNotFoundError(f"File not found: {file_path}")

df = pd.read_csv(file_path)

# 🔹 2. Check if required features exist
# Columns that are coerced to numeric here and used by the later steps.
features = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
            'instrumentalness', 'liveness', 'valence', 'tempo']

missing_features = [col for col in features if col not in df.columns]
if missing_features:
    raise KeyError(f"Missing features in dataset: {missing_features}")

# 🔹 3. Convert Columns to Correct Data Types (Ensure numeric values)
# errors='coerce' turns any value that cannot be parsed as a number into NaN.
df[features] = df[features].apply(pd.to_numeric, errors='coerce')

# 🔹 4. Handle Missing Values
# This must run AFTER the coercion above: dropping first would leave the NaNs
# created by the coercion in the frame, and every comparison against NaN is
# False, so such rows would silently end up in the fallback mood category.
df = df.dropna()  # Remove rows with missing (or non-numeric feature) values

if df.empty:
    # Nothing left to scale or classify; stop here with an explicit message.
    raise ValueError(f"No usable rows left after cleaning: {file_path}")

# 🔹 5. Define Mood Categories Based on Valence & Energy
def classify_mood(row):
    """Return a mood label for one row from its 'valence' and 'energy' values.

    Args:
        row: A mapping-like object (e.g. a DataFrame row) that supports
            ``row['valence']`` and ``row['energy']``.

    Returns:
        'Happy'     when valence > 0.5 and energy > 0.5,
        'Calm'      when valence > 0.5 and energy <= 0.5,
        'Energetic' when valence <= 0.5 and energy > 0.5,
        'Sad'       otherwise (this also catches NaN inputs, because every
                    comparison against NaN evaluates to False).
    """
    if row['valence'] > 0.5 and row['energy'] > 0.5:
        return 'Happy'
    elif row['valence'] > 0.5 and row['energy'] <= 0.5:
        return 'Calm'
    elif row['valence'] <= 0.5 and row['energy'] > 0.5:
        return 'Energetic'
    else:
        return 'Sad'


# Classify BEFORE scaling. StandardScaler re-centres each column to zero mean
# and unit variance, so after scaling a value of 0.5 means "half a standard
# deviation above the mean", not the midpoint of the feature.
# NOTE(review): the 0.5 cut-offs are assumed to target the raw feature scale
# (Spotify documents valence/energy as 0.0-1.0) - confirm against the dataset.
# Only the two columns the classifier reads are passed to apply().
df['mood'] = df[['valence', 'energy']].apply(classify_mood, axis=1)

# 🔹 6. Normalize Numerical Features
scaler = StandardScaler()  # Use MinMaxScaler() if you prefer 0-1 normalization
df[features] = scaler.fit_transform(df[features])

# 🔹 7. Persist the processed frame as CSV (the row index is not written)
processed_file_path = "/content/processed_spotify_data.csv"
df.to_csv(path_or_buf=processed_file_path, index=False)

# Report where the output was written.
print(f"✅ Data preprocessing complete. Processed file saved at: {processed_file_path}")