# AI & Music Sociology: Data Exploration

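This notebook provides an initial exploration of music data for the AI & Music Sociology research project. It runs on a randomly generated sample dataset that stands in for real Spotify data, so the outputs illustrate the analysis workflow rather than empirical findings.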

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Import our custom modules
import sys
sys.path.append('../src')

from data_collection.spotify_scraper import SpotifyPlaylistAnalyzer
from analysis.emotion_mapping import EmotionMapper

# Set up plotting
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*'; older
# releases only know 'seaborn'. Pick whichever is installed (falling back to
# Matplotlib's default style) so this cell does not raise OSError.
_seaborn_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    'default',
)
plt.style.use(_seaborn_style)
sns.set_palette("husl")

# Display options: show every column and allow wide frames without wrapping
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

print("Libraries imported successfully!")

## 1. Generate and Explore Sample Data

In [None]:
# Generate sample data for exploration.
# Seed NumPy's global random state so the np.random draws made by the data
# generator below yield the same synthetic dataset on every Restart & Run All.
np.random.seed(42)

# Create sample playlist data
def generate_sample_data(n_tracks=100, random_state=None):
    """Generate a synthetic track table with Spotify-style columns.

    Every column is drawn independently, so the result carries no real
    relationships between features; it exists to exercise the notebook's
    analysis steps.

    Parameters
    ----------
    n_tracks : int, default 100
        Number of rows (tracks) to generate.
    random_state : int or None, default None
        Seed for a private ``np.random.RandomState``. When ``None`` the global
        ``np.random`` state is used, so a preceding ``np.random.seed(...)``
        call still controls reproducibility.

    Returns
    -------
    pandas.DataFrame
        One row per track with the columns ``track_id``, ``track_name``,
        ``artist_name``, ``genre``, ``popularity`` (int, 0-100 inclusive),
        ``duration_ms`` (int, 30000-600000), seven audio features in [0, 1],
        ``tempo`` (60-200), ``loudness`` (-25-0), ``lyric_sentiment`` (-1-1)
        and ``emotional_valence`` (0-1).
    """
    # The np.random module and RandomState expose the same sampling functions,
    # so one code path serves both the global and the privately seeded case.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    genres = ['pop', 'rock', 'jazz', 'classical', 'electronic', 'hip_hop', 'country', 'folk']
    artists = [f'Artist_{i}' for i in range(20)]

    data = {
        'track_id': [f'track_{i:04d}' for i in range(n_tracks)],
        'track_name': [f'Song Title {i}' for i in range(n_tracks)],
        'artist_name': rng.choice(artists, n_tracks),
        'genre': rng.choice(genres, n_tracks),
        # randint's upper bound is exclusive: 101 makes a popularity of 100 reachable
        'popularity': rng.randint(0, 101, n_tracks),
        'duration_ms': rng.normal(210000, 45000, n_tracks),

        # Spotify-style audio features; Beta draws already lie in [0, 1]
        'valence': rng.beta(2, 2, n_tracks),
        'energy': rng.beta(2, 2, n_tracks),
        'danceability': rng.beta(2, 2, n_tracks),
        'acousticness': rng.beta(2, 5, n_tracks),
        'instrumentalness': rng.beta(1, 10, n_tracks),
        'liveness': rng.beta(1, 8, n_tracks),
        'speechiness': rng.beta(1, 15, n_tracks),
        'tempo': rng.normal(120, 25, n_tracks),
        'loudness': rng.normal(-8, 4, n_tracks),

        # Simulated sentiment analysis results
        'lyric_sentiment': rng.normal(0.1, 0.4, n_tracks),
        'emotional_valence': rng.normal(0.5, 0.3, n_tracks)
    }

    # Normal draws are unbounded: clip them into realistic ranges.
    # Durations are millisecond counts, so store them as whole numbers.
    data['duration_ms'] = (
        np.clip(data['duration_ms'], 30000, 600000).round().astype(np.int64)
    )
    data['tempo'] = np.clip(data['tempo'], 60, 200)
    data['loudness'] = np.clip(data['loudness'], -25, 0)
    data['lyric_sentiment'] = np.clip(data['lyric_sentiment'], -1, 1)
    data['emotional_valence'] = np.clip(data['emotional_valence'], 0, 1)

    return pd.DataFrame(data)

# Build the 200-track synthetic dataset that the rest of the notebook analyses
df = generate_sample_data(n_tracks=200)

track_total = len(df)
feature_names = df.columns.tolist()
print(f"Generated dataset with {track_total} tracks")
print(f"Features: {feature_names}")

# Last expression of the cell: rich preview of the first rows
df.head()

## 2. Basic Data Statistics

In [None]:
# Headline numbers for the dataset, collected first and printed in one go
overview_lines = [
    f"Number of tracks: {len(df)}",
    f"Number of unique artists: {df['artist_name'].nunique()}",
    f"Number of genres: {df['genre'].nunique()}",
    # duration_ms is in milliseconds; 60000 ms per minute
    f"Average track duration: {df['duration_ms'].mean()/60000:.2f} minutes",
    f"Average popularity: {df['popularity'].mean():.2f}",
]
print("Dataset Overview:")
print("\n".join(overview_lines))

# Columns treated as audio features; reused by the correlation heatmap later on
audio_features = [
    'valence', 'energy', 'danceability', 'acousticness',
    'instrumentalness', 'liveness', 'speechiness', 'tempo',
]

print("\nAudio Features Summary:")
# Count / mean / std / quartiles for each audio feature
df[audio_features].describe()

## 3. Exploratory Visualizations

In [None]:
# Static 2x2 overview: genre mix, popularity vs. valence, tempo by genre,
# and the correlation matrix of the audio features
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Genre distribution
genre_counts = df['genre'].value_counts()
axes[0,0].pie(genre_counts.values, labels=genre_counts.index, autopct='%1.1f%%')
axes[0,0].set_title('Genre Distribution')

# Popularity vs Valence
scatter = axes[0,1].scatter(df['popularity'], df['valence'], alpha=0.6, c=df['energy'], cmap='viridis')
axes[0,1].set_xlabel('Popularity')
axes[0,1].set_ylabel('Valence')
axes[0,1].set_title('Popularity vs Valence (colored by Energy)')
plt.colorbar(scatter, ax=axes[0,1], label='Energy')

# Tempo distribution by genre
df.boxplot(column='tempo', by='genre', ax=axes[1,0])
axes[1,0].set_title('Tempo Distribution by Genre')
axes[1,0].set_xlabel('Genre')
axes[1,0].set_ylabel('Tempo')
axes[1,0].tick_params(axis='x', rotation=45)
# DataFrame.boxplot(by=...) writes a figure-level "Boxplot grouped by genre"
# suptitle, which would sit above all four panels; clear it.
fig.suptitle('')

# Audio features correlation heatmap
corr_matrix = df[audio_features].corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0, ax=axes[1,1])
axes[1,1].set_title('Audio Features Correlation Matrix')

plt.tight_layout()
plt.show()

In [None]:
# Interactive plotly visualizations

# 3D scatter plot of key audio features
fig = px.scatter_3d(
    df, x='valence', y='energy', z='danceability',
    color='genre', size='popularity',
    hover_data=['track_name', 'artist_name'],
    title='3D Audio Feature Space'
)
fig.show()

# Audio features radar chart by genre
features_for_radar = ['valence', 'energy', 'danceability', 'acousticness', 'speechiness']
genre_means = df.groupby('genre')[features_for_radar].mean()

# "Top 5" means the five genres with the most tracks. Slicing genre_means.index
# would instead take the first five in sorted (alphabetical) group order.
top_genres = df['genre'].value_counts().head(5).index

fig = go.Figure()

for genre in top_genres:
    profile = genre_means.loc[genre, features_for_radar].values
    fig.add_trace(go.Scatterpolar(
        # Repeat the first point so the outline closes back on itself
        r=[*profile, profile[0]],
        theta=[*features_for_radar, features_for_radar[0]],
        fill='toself',
        name=genre.capitalize()
    ))

fig.update_layout(
    polar=dict(
        radialaxis=dict(visible=True, range=[0, 1])
    ),
    showlegend=True,
    title="Audio Features Profile by Genre"
)

fig.show()

## 4. Emotional Clustering Analysis

In [None]:
# Perform emotional clustering using KMeans
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Prepare features for clustering (missing values replaced by the column mean)
cluster_features = ['valence', 'energy', 'danceability', 'acousticness', 'lyric_sentiment']
X = df[cluster_features].fillna(df[cluster_features].mean())

# Standardize features so each contributes on the same scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply K-means clustering.
# n_init is pinned explicitly: its default differs between scikit-learn
# versions, which would otherwise change the clustering across environments.
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

# Add cluster labels to dataframe
df['emotion_cluster'] = clusters

# Analyze clusters: per-cluster feature means in the original (unscaled) units
cluster_analysis = df.groupby('emotion_cluster')[cluster_features].mean()
print("Cluster Centroids (average feature values):")
print(cluster_analysis.round(3))


def _describe_cluster(cluster, means):
    """Map one cluster's mean feature values to a human-readable label.

    Rules are checked in order; the first match wins. ``cluster`` is only used
    to number the fallback label.
    """
    if means['valence'] > 0.6 and means['energy'] > 0.6:
        return "Energetic & Positive"
    if means['valence'] < 0.4 and means['energy'] < 0.4:
        return "Melancholic & Calm"
    if means['energy'] > 0.7:
        return "High Energy"
    if means['acousticness'] > 0.6:
        return "Acoustic & Intimate"
    return f"Mixed Emotions {cluster}"


# Create descriptive names for clusters. Iterate over the clusters that actually
# received tracks rather than range(n_clusters), so a missing cluster id cannot
# raise KeyError.
cluster_labels = pd.Series({
    int(cluster): _describe_cluster(int(cluster), means)
    for cluster, means in cluster_analysis.iterrows()
})

# If two clusters match the same rule, append the cluster id so they remain
# distinguishable when the description is used as a plot hue.
shared_label = cluster_labels.duplicated(keep=False)
cluster_descriptions = {
    cluster: f"{label} {cluster}" if shared_label[cluster] else label
    for cluster, label in cluster_labels.items()
}

df['cluster_description'] = df['emotion_cluster'].map(cluster_descriptions)

In [None]:
# Project the standardized clustering features onto two principal components
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Store both component scores on the dataframe for plotting
df['pca1'], df['pca2'] = X_pca[:, 0], X_pca[:, 1]

# Scatter of the tracks in PCA space, coloured by cluster description
pca_fig, pca_ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    data=df,
    x='pca1',
    y='pca2',
    hue='cluster_description',
    palette='Set2',
    alpha=0.7,
    ax=pca_ax,
)
pca_ax.set_title("Emotional Clusters in PCA Space")
pca_ax.set_xlabel("PCA Component 1")
pca_ax.set_ylabel("PCA Component 2")
# Anchor the legend outside the plotting area, to the right of the axes
pca_ax.legend(title="Cluster Description", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# Preview up to five tracks from every labelled cluster
preview_columns = ['track_name', 'artist_name', 'genre']
for cluster_id, label in cluster_descriptions.items():
    print(f"\nCluster {cluster_id} - {label}")
    in_cluster = df['emotion_cluster'] == cluster_id
    display(df.loc[in_cluster, preview_columns].head(5))

## 5. Key Takeaways

- The synthetic dataset captures **musical diversity** across genres, artists, and audio features.  
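- Exploratory analysis **visualized popularity, valence, and energy** and compared genres; because every synthetic feature is drawn independently at random, any apparent relationships or genre-based differences seen here are sampling noise and must be re-examined on real data.  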
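- The **clustering step grouped tracks into emotional profiles** using rule-based labels (e.g., *Energetic & Positive*, *Melancholic & Calm*, or a *Mixed Emotions* fallback); which labels actually appear depends on the cluster means of the generated data.  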
- These exploratory insights provide a foundation for the **AI & Music Sociology research project**, where we can later integrate real Spotify datasets, lyrics analysis, and sociological interpretations of music consumption.