In [5]:
# Shell escape: ask uv to add pyspark as a dependency of the current project.
!uv add pyspark

[2mResolved [1m135 packages[0m [2min 0.59ms[0m[0m
[2mAudited [1m129 packages[0m [2min 6ms[0m[0m


In [17]:
import os
import urllib.request
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr

# --- CONFIGURATION ---
# Local directories: downloaded archives land in raw_dir, the cleaned
# Parquet output is written to clean_dir.
raw_dir = "data/raw"
clean_dir = "data/clean"

# Remote directory the archive names below are appended to.
base_url = "http://cdn.gea.esac.esa.int/Gaia/gdr3/gaia_source/"

# Gzipped CSV archives to fetch, in download order.
files_to_download = [
    "GaiaSource_000000-003111.csv.gz",
    "GaiaSource_003112-005263.csv.gz",
    "GaiaSource_005264-006601.csv.gz",
    "GaiaSource_006602-007952.csv.gz",
    "GaiaSource_007953-010234.csv.gz",
    "GaiaSource_010235-012597.csv.gz",
    "GaiaSource_012598-014045.csv.gz",
    "GaiaSource_014046-015369.csv.gz",
    "GaiaSource_015370-016240.csv.gz",
    "GaiaSource_016241-017018.csv.gz",
    "GaiaSource_017019-017658.csv.gz",
    "GaiaSource_017659-018028.csv.gz",
    "GaiaSource_018029-018472.csv.gz",
    "GaiaSource_018473-019161.csv.gz",
    "GaiaSource_019162-019657.csv.gz",
    "GaiaSource_019658-020091.csv.gz",
    "GaiaSource_020092-020493.csv.gz",
    "GaiaSource_020494-020747.csv.gz",
    "GaiaSource_020748-020984.csv.gz",
    "GaiaSource_020985-021233.csv.gz",
    "GaiaSource_021234-021441.csv.gz",
    "GaiaSource_021442-021665.csv.gz",
    "GaiaSource_021666-021919.csv.gz",
    "GaiaSource_021920-022158.csv.gz",
    "GaiaSource_022159-022410.csv.gz",
]



In [18]:
# --- STEP 1: DOWNLOAD (Only runs if files missing) ---
# exist_ok avoids the separate existence check and is safe on re-runs.
os.makedirs(raw_dir, exist_ok=True)

print("Checking raw files...")
for f in files_to_download:
    local_path = os.path.join(raw_dir, f)
    if os.path.exists(local_path):
        continue  # already downloaded on a previous run

    print(f"Downloading {f}...")
    # Download under a temporary name and rename only on success. Writing
    # straight to local_path would leave a truncated archive behind if the
    # transfer is interrupted, and the existence check above would then treat
    # it as complete on the next run.
    # The leading dot is meant to keep a stray partial file out of the later
    # Spark directory read (Spark skips dot-prefixed files when listing inputs).
    part_path = os.path.join(raw_dir, f".{f}.part")
    try:
        urllib.request.urlretrieve(base_url + f, part_path)
        os.replace(part_path, local_path)  # atomic within the same directory
    finally:
        # After a successful replace the temp file no longer exists, so this
        # only cleans up after a failed or interrupted download.
        if os.path.exists(part_path):
            os.remove(part_path)



Checking raw files...
Downloading GaiaSource_020985-021233.csv.gz...
Downloading GaiaSource_021234-021441.csv.gz...
Downloading GaiaSource_021442-021665.csv.gz...
Downloading GaiaSource_021666-021919.csv.gz...
Downloading GaiaSource_021920-022158.csv.gz...
Downloading GaiaSource_022159-022410.csv.gz...


In [19]:
# Shell escape: report the total on-disk size of the raw download directory.
# {raw_dir} is interpolated by IPython from the Python variable of that name.
!du -sh {raw_dir}

5.3G	data/raw


In [20]:
# --- STEP 2: SPARK ETL ---
# Reuse the active Spark session if the kernel already has one, otherwise
# start one, so this cell does not rely on a `spark` variable left behind by
# a cell that is no longer in the notebook.
spark = SparkSession.builder.getOrCreate()

print("Reading raw data...")

# No schema is supplied, so every column arrives as a string; numeric types
# are applied explicitly in the select below.
raw_df = (
    spark.read
    .option("header", "true")
    .option("comment", "#")       # skip lines that start with '#'
    .option("nullValue", "null")  # the literal text "null" is read as NULL
    .option("nanValue", "NaN")
    .csv(raw_dir)  # Reads all chunks
)

cols = [
    "source_id", "ra", "dec", "parallax", "parallax_error",
    "pmra", "pmdec", "phot_g_mean_mag", "bp_rp", "teff_gspphot"
]

# source_id is a 64-bit integer identifier in the Gaia catalogue. Its values
# can exceed 2**53, so casting it to double would round distinct ids onto the
# same value. Keep it as bigint and cast only the measurements to double.
col_types = {"source_id": "bigint"}

# try_cast yields NULL for values that cannot be converted instead of failing
# the whole job.
clean_df = (
    raw_df.select(
        [expr(f"try_cast({c} as {col_types.get(c, 'double')}) as {c}") for c in cols]
    )
    .filter(col("parallax").isNotNull())
    .filter(col("parallax") > 0)
    .filter(col("ra").isNotNull())
    .filter(col("dec").isNotNull())
)

print("data cleaned")

Reading raw data...
data cleaned


In [21]:
# --- STEP 3: OPTIMIZE WRITE ---
print(f"Writing clean data to {clean_dir}...")

# Collapse to 5 partitions so the output holds at most 5 Parquet part files;
# "overwrite" replaces whatever a previous run left in clean_dir.
parquet_writer = clean_df.coalesce(5).write.mode("overwrite")
parquet_writer.parquet(clean_dir)

Writing clean data to data/clean...


                                                                                

In [22]:
# Shell escape: report the total on-disk size of the cleaned Parquet output.
# {clean_dir} is interpolated by IPython from the Python variable of that name.
!du -sh {clean_dir}

664M	data/clean


In [23]:
# Summarize from the Parquet output written in the previous step rather than
# from clean_df: clean_df is lazy, so describing it would re-read and re-parse
# every gzipped CSV in raw_dir just to compute these statistics.
spark.read.parquet(clean_dir).describe().show()



+-------+--------------------+------------------+--------------------+--------------------+------------------+------------------+-------------------+------------------+------------------+-----------------+
|summary|           source_id|                ra|                 dec|            parallax|    parallax_error|              pmra|              pmdec|   phot_g_mean_mag|             bp_rp|     teff_gspphot|
+-------+--------------------+------------------+--------------------+--------------------+------------------+------------------+-------------------+------------------+------------------+-----------------+
|  count|             9343020|           9343020|             9343020|             9343020|           9343020|           9343020|            9343020|           9338985|           9240719|          4804041|
|   mean|1.408268882045705...| 64.30684538115095|  30.279938145397054|  0.8807713878443485|0.3691608931401685|2.3982517220204187|-3.9861548928667103|18.353410679570086|1.660860

                                                                                