# 🎧 Spotify Mood Recommender - Data Exploration

This notebook explores the Spotify dataset to understand the audio features and their distributions.

## Objectives:
1. Load and examine the dataset
2. Analyze audio feature distributions
3. Identify correlations between features
4. Visualize feature relationships
5. Prepare data for clustering



In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# deprecation notices from the plotting libraries; narrow it if warnings are needed.
warnings.filterwarnings('ignore')

# Set style
# The bundled 'seaborn' style sheet was renamed 'seaborn-v0_8' in newer
# matplotlib releases and the old name no longer resolves there, so pick
# whichever name this installation actually provides.
for style_name in ('seaborn-v0_8', 'seaborn'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
else:
    # Neither style sheet is installed: fall back to seaborn's own theme.
    sns.set_theme()
sns.set_palette('viridis')


In [None]:
# Read the dataset CSV (path is relative to the notebook's working directory)
# and report its dimensions and column names.
df = pd.read_csv('../data/dataset.csv')
print(
    f"Dataset shape: {df.shape}",
    f"Columns: {list(df.columns)}",
    sep="\n",
)


In [None]:
# Basic dataset information
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nFirst few rows:")
# Bare last expression so the notebook renders the preview as a rich table.
df.head()


In [None]:
# Candidate audio-feature columns to look for in the loaded dataset
audio_features = [
    'danceability',
    'energy',
    'valence',
    'tempo',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'mode',
    'key',
]

# Report, for each candidate, whether the DataFrame has a column of that name
print("Audio features available:")
for feature in audio_features:
    print(f"✓ {feature}" if feature in df.columns else f"✗ {feature}")


In [None]:
# Keep only the candidate features that exist as columns (list order preserved),
# then show summary statistics for them.
available_features = list(filter(lambda name: name in df.columns, audio_features))
print("Descriptive Statistics for Audio Features:")
df.loc[:, available_features].describe()


In [None]:
# Check for missing values
# Per-feature null counts, plus the overall total computed once and reused.
missing_values = df[available_features].isnull().sum()
total_missing = int(missing_values.sum())

if total_missing == 0:
    # Skip the per-feature table here: filtering to counts > 0 would leave an
    # empty Series whose repr ("Series([], dtype: int64)") is just noise.
    print("No missing values found!")
else:
    print("Missing values per feature:")
    print(missing_values[missing_values > 0])
    print(f"Total missing values: {total_missing}")


In [None]:
# Distribution of audio features
# Size the grid from the number of features instead of a fixed 3x4 layout, so
# no feature is silently left out when more than 12 are available.
n_cols = 4
n_rows = max(1, (len(available_features) + n_cols - 1) // n_cols)  # ceiling division
# squeeze=False keeps `axes` two-dimensional even for a single row, so ravel() is always valid.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False)
axes = axes.ravel()

for ax, feature in zip(axes, available_features):
    # Drop NaNs explicitly so the histogram does not depend on how the
    # plotting library treats missing values.
    ax.hist(df[feature].dropna(), bins=50, alpha=0.7, edgecolor='black')
    ax.set_title(feature.replace("_", " ").title())
    ax.set_xlabel('Value')
    ax.set_ylabel('Frequency')

# Remove empty subplots (grid cells beyond the last feature)
for ax in axes[len(available_features):]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()


In [None]:
# Pairwise correlations between the available audio features
correlation_matrix = df[available_features].corr()

# Static annotated heatmap with the colour map centred on zero
plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
)
heatmap_ax.set_title('Audio Features Correlation Matrix')
plt.tight_layout()
plt.show()


In [None]:
# Interactive correlation plot
# Mirror the static heatmap: two-decimal cell labels and a diverging colour
# scale pinned to [-1, 1] so that zero correlation sits at the neutral midpoint.
fig = px.imshow(
    correlation_matrix,
    text_auto='.2f',
    aspect="auto",
    color_continuous_scale='RdBu_r',
    zmin=-1,
    zmax=1,
    title="Audio Features Correlation Matrix",
)
fig.show()


In [None]:
# Summary statistics
# Total null count across the audio-feature columns, computed once so the
# summary line and the insight lines below cannot disagree.
n_missing = int(df[available_features].isnull().sum().sum())

print("=== DATASET SUMMARY ===")
print(f"Total songs: {len(df):,}")
print(f"Total features: {len(df.columns)}")
print(f"Audio features: {len(available_features)}")
print(f"Missing values: {n_missing}")

# Genre / artist counts are only reported when those columns are present.
if 'track_genre' in df.columns:
    print(f"Unique genres: {df['track_genre'].nunique()}")

if 'artists' in df.columns:
    print(f"Unique artists: {df['artists'].nunique()}")

print("\n=== KEY INSIGHTS ===")
# Only claim readiness / completeness when the data actually supports it.
if n_missing == 0:
    print("• Dataset is ready for clustering analysis")
    print("• No missing values in audio features")
else:
    print(f"• {n_missing:,} missing values in audio features — handle them before clustering")
# NOTE(review): the two statements below are static text, not derived from any
# computation in this cell; confirm them against the outputs above.
print("• Good diversity in genres and artists")
print("• Features show expected correlations (energy-valence, etc.)")
