# Gaia Star Cluster Analysis

This notebook combines the data fetching and analysis steps for the Gaia Star Cluster Visualizer.

In [None]:
import os
import time
import numpy as np
import pandas as pd
from astroquery.gaia import Gaia
import hdbscan
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import warnings

# Install a global 'ignore' filter: with no category/module restriction this
# hides every Python warning raised after this cell runs (presumably to keep
# cell output uncluttered). NOTE(review): this also hides deprecation and
# numerical warnings from the libraries used below.
warnings.filterwarnings('ignore')

## 1. Fetch Data from Gaia DR3

In [None]:
def fetch_gaia_data(limit_pc=650, max_stars=50000):
    """Download a sample of Gaia DR3 sources closer than a distance limit.

    The distance limit is applied as a parallax cut
    (``parallax > 1000 / limit_pc``), combined with ``phot_g_mean_mag < 13``
    and a non-null ``radial_velocity``.

    Note that the query uses ``SELECT TOP`` without an ``ORDER BY``, so when
    more than ``max_stars`` sources match, the archive decides which rows are
    returned; the result is then a subset, not a complete sample.

    Parameters
    ----------
    limit_pc : float, default 650
        Maximum distance in parsecs. Must be a positive, finite number.
    max_stars : int, default 50000
        Maximum number of rows to request. Must be a positive whole number.

    Returns
    -------
    pandas.DataFrame
        One row per source with the columns ``source_id``, ``ra``, ``dec``,
        ``parallax``, ``pmra``, ``pmdec``, ``radial_velocity``,
        ``phot_g_mean_mag`` and ``bp_rp``.

    Raises
    ------
    ValueError
        If ``limit_pc`` is not a positive finite number, or ``max_stars`` is
        not a positive whole number.
    """
    # Validate up front: both values are interpolated into the ADQL text, so
    # they must be plain numbers, and a zero/negative/NaN distance would
    # otherwise produce a division error or a meaningless parallax cut.
    try:
        distance_pc = float(limit_pc)
    except (TypeError, ValueError):
        raise ValueError(f"limit_pc must be a positive number, got {limit_pc!r}") from None
    # The chained comparison is False for NaN and for +/-inf as well.
    if not (0.0 < distance_pc < float("inf")):
        raise ValueError(f"limit_pc must be a positive finite number, got {limit_pc!r}")

    try:
        max_rows = int(max_stars)
    except (TypeError, ValueError, OverflowError):
        raise ValueError(f"max_stars must be a positive integer, got {max_stars!r}") from None
    if max_rows != max_stars or max_rows <= 0:
        raise ValueError(f"max_stars must be a positive integer, got {max_stars!r}")

    print(f"Fetching Gaia DR3 data within {limit_pc} pc...")

    # Parallax in milliarcseconds corresponding to the distance limit.
    parallax_limit = 1000.0 / distance_pc

    query = f"""
    SELECT TOP {max_rows}
        source_id, ra, dec, parallax, pmra, pmdec, radial_velocity,
        phot_g_mean_mag, bp_rp
    FROM gaiadr3.gaia_source
    WHERE parallax > {parallax_limit}
      AND phot_g_mean_mag < 13
      AND radial_velocity IS NOT NULL
    """

    # Asynchronous job: submitted to the archive, then its result table is
    # retrieved and converted to a DataFrame.
    job = Gaia.launch_job_async(query)
    r = job.get_results()
    df = r.to_pandas()
    print(f"Fetched {len(df)} stars.")
    return df

# Run the query with the function's default arguments (limit_pc=650,
# max_stars=50000) and keep the resulting table in `df`, which the
# following cells read and extend with new columns.
df = fetch_gaia_data()

## 2. Preprocessing & Coordinate Transformation

In [None]:
# --- Positions -------------------------------------------------------------
# Distance in parsecs is the reciprocal of the parallax in milliarcseconds.
df['dist_pc'] = 1000.0 / df['parallax']

# Sky angles in radians.
ra_rad, dec_rad = np.deg2rad(df['ra']), np.deg2rad(df['dec'])

# Spherical -> Cartesian, with axes defined by (ra, dec):
# first project the distance onto the dec = 0 plane, then split by ra.
in_plane_pc = df['dist_pc'] * np.cos(dec_rad)
df['x'] = in_plane_pc * np.cos(ra_rad)
df['y'] = in_plane_pc * np.sin(ra_rad)
df['z'] = df['dist_pc'] * np.sin(dec_rad)

# --- Velocities --------------------------------------------------------------
# The clustering step uses pmra / pmdec / radial_velocity as they are; the
# tangential components below are only combined into a total speed column.
k = 4.74047  # scale factor applied to proper motion / parallax
inv_parallax_scale = df['parallax']
v_ra = k * df['pmra'] / inv_parallax_scale
v_dec = k * df['pmdec'] / inv_parallax_scale
v_rad = df['radial_velocity']

speed_squared = v_ra**2 + v_dec**2 + v_rad**2
df['v_tot'] = np.sqrt(speed_squared)

## 3. Clustering with HDBSCAN

In [None]:
print("Clustering...")

# Six-dimensional feature table: position plus the raw kinematic columns.
feature_columns = ['x', 'y', 'z', 'pmra', 'pmdec', 'radial_velocity']
features = df[feature_columns].copy()

# Standardise each column so that no single feature dominates the
# Euclidean distances used by the clusterer.
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

hdbscan_params = {
    'min_cluster_size': 20,
    'min_samples': 10,
    'metric': 'euclidean',
}
clusterer = hdbscan.HDBSCAN(**hdbscan_params)
labels = clusterer.fit_predict(features_scaled)

df['cluster_id'] = labels

# The label -1 marks unclustered points, so it is not counted as a cluster.
distinct_labels = set(labels)
distinct_labels.discard(-1)
n_clusters = len(distinct_labels)
print(f"Found {n_clusters} clusters.")

## 4. Visualization (2D Projection)

In [None]:
# Scatter plot of the X-Y positions, with unclustered stars drawn faintly
# underneath the cluster members.
fig, ax = plt.subplots(figsize=(10, 8))

# Stars labelled -1 (not assigned to any cluster): small, grey, transparent.
noise = df[df['cluster_id'] == -1]
ax.scatter(noise['x'], noise['y'], c='grey', s=1, alpha=0.1, label='Field Stars')

# Cluster members, coloured by their cluster label.
clustered = df[df['cluster_id'] != -1]
ax.scatter(
    clustered['x'],
    clustered['y'],
    c=clustered['cluster_id'],
    cmap='tab20',
    s=10,
    label='Cluster Members',
)

# x and y are built from ra/dec, so this X-Y plane is the equatorial plane,
# not the Galactic plane; the title is worded accordingly.
ax.set_title('Equatorial Plane Projection (X-Y)')
ax.set_xlabel('X (pc)')
ax.set_ylabel('Y (pc)')
ax.legend()
ax.axis('equal')  # same scale on both axes so distances are not distorted
plt.show()