# In [None]:
# 1. Setup and Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import os

# Use the seaborn dark-grid look. Newer Matplotlib releases ship it under the
# "seaborn-v0_8-" prefix, older ones only under the un-prefixed name, so try
# the new name first and fall back to the legacy one.
try:
    plt.style.use("seaborn-v0_8-darkgrid")
except OSError:
    # style.use() reports an unknown style name as OSError. Catching only that
    # (instead of a bare `except:`) keeps KeyboardInterrupt and real bugs
    # from being silently swallowed.
    plt.style.use("seaborn-darkgrid")

# 2. Check data directories
def find_directories(root: Path, max_depth: int = 4) -> list:
    """Return the directories below *root* that are at most *max_depth* deep.

    Depth is counted in path components relative to *root*, so a direct
    child directory has depth 1.

    Args:
        root: Directory whose tree is searched recursively.
        max_depth: Deepest nesting level (inclusive) to report.

    Returns:
        A list of ``Path`` objects, in the order ``Path.rglob`` yields them.
    """
    return [
        path
        for path in root.rglob("*")
        # `path.parts` is a tuple; its *length* is the depth. Comparing the
        # tuple itself to an int raises TypeError.
        if path.is_dir() and len(path.relative_to(root).parts) <= max_depth
    ]


print("Project structure:")
base_path = Path(".")
for path in find_directories(base_path):
    print(f"    {path}/")

# 3. Load Processed Data
print("\n" + "="*50)
print("Loading GIZON audio data...")

# Load the pre-computed audio arrays and the accompanying metadata table.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects,
# which can execute code. Keep it only while these files are produced by this
# project; drop it if the arrays turn out to be plain numeric.
try:
    audio_features = np.load("data/processed/Gizon_features.npy", allow_pickle=True)
    audio_labels = np.load("data/processed/Gizon_labels.npy", allow_pickle=True)
    audio_metadata = pd.read_csv("data/processed/Gizon_metadata.csv")
except FileNotFoundError as e:
    print(f"Error loading audio data: {e}")
    print("Attempting to find files...")

    # Search the whole tree for similarly named files to help locate data
    # that was saved somewhere other than data/processed/.
    audio_files = list(Path(".").rglob("*Gizon*"))
    if audio_files:
        print(f"Found potential Gizon files: {audio_files[:5]}")
else:
    # Reporting lives in `else` so the `try` only guards the actual loads.
    print(f"Audio features shape: {audio_features.shape}")
    print(f"Audio labels shape: {audio_labels.shape}")
    print(f"Metadata shape: {audio_metadata.shape}")

    # Display basic info about metadata
    print("\nAudio metadata info:")
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line to the output.
    audio_metadata.info()
    print("\nAudio metadata head:")
    print(audio_metadata.head())

print("\n" + "="*50)
print("Loading lyrics data...")

# Load the processed lyrics table; a missing file is reported, not fatal.
try:
    lyrics_df = pd.read_csv("data/processed/lyrics.csv")
except FileNotFoundError as e:
    print(f"Error loading lyrics data: {e}")
else:
    # Reporting lives in `else` so the `try` only guards the actual load.
    print(f"Lyrics data shape: {lyrics_df.shape}")
    print("\nLyrics data columns:")
    print(lyrics_df.columns.tolist())
    print("\nLyrics data info:")
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line to the output.
    lyrics_df.info()
    print("\nLyrics data head:")
    print(lyrics_df.head())

print("\n" + "="*50)
print("Exploratory Data Analysis:")


def audio_feature_summary(features: np.ndarray) -> list:
    """Build the printable summary lines for a loaded feature array.

    Args:
        features: Array as returned by ``np.load``; any rank is accepted.

    Returns:
        A list of already formatted text lines. Arrays with at least two
        axes report axis 1 as the feature count and axis 0 as the sample
        count; lower-rank arrays report their raw shape instead. Range,
        mean and standard deviation are added only for 2D numeric arrays.
    """
    lines = []
    if features.ndim >= 2:
        lines.append(f"  - Dimensionality: {features.shape[1]} features")
        lines.append(f"  - Samples: {features.shape[0]}")
    else:
        # A 0-D or 1-D array has no shape[1]; indexing it would raise
        # IndexError, so describe the shape as-is.
        lines.append(f"  - Shape: {features.shape} (no separate feature axis)")
    lines.append(f"  - Data type: {features.dtype}")

    # Object arrays (possible because the loader allows pickles) cannot be
    # formatted with ".4f", so the statistics are limited to numeric dtypes.
    if features.ndim == 2 and np.issubdtype(features.dtype, np.number):
        lines.append(f"  - Feature range: [{features.min():.4f}, {features.max():.4f}]")
        lines.append(f"  - Mean: {features.mean():.4f}, Std: {features.std():.4f}")
    return lines


# 4. Basic data exploration if data loaded successfully
# The `in locals()` check covers the case where loading failed and the name
# was never assigned.
if 'audio_features' in locals() and audio_features.size > 0:
    print("\nAudio Features Stats:")
    print("\n".join(audio_feature_summary(audio_features)))

# Summarise the lyrics table, but only when it was loaded and has content.
if 'lyrics_df' in locals() and not lyrics_df.empty:
    print(
        "\nLyrics Data Stats:",
        f"  - Number of songs: {lyrics_df.shape[0]}",
        f"  - Columns: {lyrics_df.shape[1]}",
        sep="\n",
    )

    # Total count of NaN/None cells across every column; reported only
    # when there is at least one.
    missing = lyrics_df.isna().sum().sum()
    if missing > 0:
        print(f"  - Missing values: {missing}")

# 5. Summary figure: audio feature means (left) and metadata label counts
# (right). A panel whose data is unavailable shows a short note instead of
# being left as an empty, unexplained axes.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))


def _show_note(ax, message):
    """Write *message* centred inside *ax* as a placeholder for a plot."""
    ax.text(0.5, 0.5, message, ha='center', va='center', transform=ax.transAxes)


# Plot 1: Feature distribution (if audio features exist)
if 'audio_features' in locals() and audio_features.size > 0:
    if len(audio_features.shape) == 2:
        # Plot mean of each feature, capped at the first 50 for readability
        feature_means = audio_features.mean(axis=0)
        shown_means = feature_means[:50]
        axes[0].bar(range(len(shown_means)), shown_means)
        axes[0].set_title('Audio Feature Means (First 50)')
        axes[0].set_xlabel('Feature Index')
        axes[0].set_ylabel('Mean Value')
    else:
        _show_note(axes[0], 'Audio features loaded\nbut not 2D array')
else:
    _show_note(axes[0], 'No audio features loaded')

# Plot 2: Label distribution from the audio metadata
if 'audio_metadata' in locals() and not audio_metadata.empty:
    if 'label' in audio_metadata.columns:
        label_counts = audio_metadata['label'].value_counts()
        top_labels = label_counts.head(10)
        # Labels are categories: passing them as strings gives evenly spaced
        # bars even when the label values are numeric.
        axes[1].bar(top_labels.index.astype(str), top_labels.values)
        axes[1].set_title('Top 10 Labels in Metadata')
        axes[1].set_xlabel('Label')
        axes[1].set_ylabel('Count')
        plt.setp(axes[1].xaxis.get_majorticklabels(), rotation=45, ha='right')
    else:
        _show_note(axes[1], 'No label column in metadata')
else:
    _show_note(axes[1], 'No audio metadata loaded')

plt.tight_layout()
plt.show()

# Closing banner followed by a checklist of suggested follow-up work.
print(
    "\n" + "=" * 50,
    "Exploratory analysis complete!",
    "Next steps:",
    "1. Check if all expected files are loaded",
    "2. Examine the data structures",
    "3. Consider data cleaning/preprocessing",
    "4. Plan clustering/analysis approach",
    sep="\n",
)