In [1]:
import pandas as pd
import numpy as np
import gc # Garbage Collector
import os
import sys
# add src directory to path for module imports
sys.path.append(os.path.abspath('..'))

from tqdm import tqdm # Progress bar

from src import preprocess

# Configuration: location of the raw inputs and of the processed outputs
RAW_DIR = "../data/raw"
PROCESSED_DIR = "../data/processed"
os.makedirs(PROCESSED_DIR, exist_ok=True)  # no error if the directory already exists

# Load Master Metadata (the train log)
print("Loading Master Metadata...")
train_log_path = f"{RAW_DIR}/train_log.csv"
meta_df = pd.read_csv(train_log_path)

# The row count of the log is reported as the number of objects
n_objects = meta_df.shape[0]
print(f"Metadata loaded. Total objects: {n_objects}")

Loading Master Metadata...
Metadata loaded. Total objects: 3043


Because the data is massive, we process it one split at a time: for each split we load the light curves, extract the features, and then delete the raw data to free memory before moving on to the next split.

In [2]:
# Storage for the extracted features from all splits (one DataFrame per split)
all_features_batches = []

# Loop through splits 01 to 20
for i in range(1, 21):
    split_name = f"split_{i:02d}" # Formats to 'split_01', 'split_02'
    lc_path = f"{RAW_DIR}/{split_name}/train_full_lightcurves.csv"

    # A missing split is tolerated: report it and move on to the next one
    if not os.path.exists(lc_path):
        print(f"Skipping {split_name} (File not found)")
        continue

    print(f"Processing {split_name}...")

    # 1. Load Lightcurves for this split only
    chunk_lc = pd.read_csv(lc_path)

    # 2. Keep only the metadata rows whose object_id occurs in this split
    chunk_meta = meta_df[meta_df['object_id'].isin(chunk_lc['object_id'])].copy()

    # 3. Clean Data (Physics)
    # preprocess.clean_data lives in src/preprocess (not shown here); per the
    # notebook author it applies the distance and dust corrections.
    # Only the second returned value (the cleaned light curves) is used.
    _, clean_lc = preprocess.clean_data(chunk_meta, chunk_lc)

    # 4. Extract Features (Stats)
    # Per the notebook author this reduces the light curves to 1 row per object
    chunk_features = preprocess.extract_features(clean_lc)

    # 5. Store Result
    all_features_batches.append(chunk_features)

    # 6. Memory Cleanup: drop this split's frames before loading the next one.
    # The features stay alive through the reference held in all_features_batches.
    del chunk_lc, clean_lc, chunk_features, chunk_meta
    gc.collect() # Force RAM release

# Fail fast if every split was skipped: otherwise the cell would report success
# and the later pd.concat would fail with "No objects to concatenate".
if not all_features_batches:
    raise FileNotFoundError(
        f"No 'train_full_lightcurves.csv' file was found in any split directory under {RAW_DIR}"
    )

print("All splits processed.")

Processing split_01...
Processing split_02...
Processing split_03...
Processing split_04...
Processing split_05...
Processing split_06...
Processing split_07...
Processing split_08...
Processing split_09...
Processing split_10...
Processing split_11...
Processing split_12...
Processing split_13...
Processing split_14...
Processing split_15...
Processing split_16...
Processing split_17...
Processing split_18...
Processing split_19...
Processing split_20...
All splits processed.


In [3]:
print("Combining all batches...")
# 1. Concatenate all feature batches into one frame with a fresh 0..n-1 index
full_features = pd.concat(all_features_batches, ignore_index=True)

# 2. Merge with Labels (Target) and Z
# We grab 'target', 'Z', and 'EBV' from the original metadata.
# This creates the final "X" and "y" matrix for training.
# NOTE: meta_df must still be the train log here; a later cell rebinds
# meta_df to the test log, so run this cell before that one.
# validate='many_to_one' makes pandas raise MergeError if an object_id is
# duplicated in the metadata, instead of silently duplicating feature rows.
final_train_data = full_features.merge(
    meta_df[['object_id', 'target', 'Z', 'EBV']],
    on='object_id',
    how='left',
    validate='many_to_one',
)

# A left merge keeps feature rows that have no metadata match and fills their
# metadata columns with NaN; report that instead of letting it pass unnoticed.
n_unmatched = int((~full_features['object_id'].isin(meta_df['object_id'])).sum())
if n_unmatched:
    print(f"WARNING: {n_unmatched} objects have no metadata row (target/Z/EBV will be NaN)")

# 3. Inspect
print(f"Final Data Shape: {final_train_data.shape}")
print(final_train_data.head())

# 4. Save
# Parquet preserves column dtypes; the author reports it as much faster and
# smaller than CSV for this data.
final_train_data.to_parquet(f'{PROCESSED_DIR}/train.parquet')
print("Successfully saved train.parquet")

Combining all batches...
Final Data Shape: (3043, 8)
                  object_id  peak_flux  rise_slope  fall_slope  color_u_z  \
0  Dornhoth_fervain_onodrim  10.311690    0.021273   -0.009711   5.511201   
1       Dornhoth_galadh_ylf   7.864956    0.000756   -0.003425  -1.381394   
2      Elrim_melethril_thul   8.025164    0.004992    0.002015  -3.384872   
3        Ithil_tobas_rodwen   8.010233    0.001751    0.000975  -0.814407   
4       Mirion_adar_Druadan   7.515978    0.004017   -0.002686  -0.822449   

   target       Z    EBV  
0       0  3.0490  0.110  
1       0  0.4324  0.058  
2       0  0.4673  0.577  
3       0  0.6946  0.012  
4       0  0.4161  0.058  
Successfully saved train.parquet


We'll now do the same for the test dataset.

In [4]:
# Load Master Metadata (the test log)
# NOTE: this rebinds meta_df, replacing the metadata loaded earlier in the notebook.
print("Loading Master Metadata...")
test_log_path = f"{RAW_DIR}/test_log.csv"
meta_df = pd.read_csv(test_log_path)

# The row count of the log is reported as the number of objects
n_objects = meta_df.shape[0]
print(f"Metadata loaded. Total objects: {n_objects}")

Loading Master Metadata...
Metadata loaded. Total objects: 7135


In [5]:
# Storage for the extracted features from all splits (one DataFrame per split)
all_features_batches = []

# Loop through splits 01 to 20
for i in range(1, 21):
    split_name = f"split_{i:02d}" # Formats to 'split_01', 'split_02'
    lc_path = f"{RAW_DIR}/{split_name}/test_full_lightcurves.csv"

    # A missing split is tolerated: report it and move on to the next one
    if not os.path.exists(lc_path):
        print(f"Skipping {split_name} (File not found)")
        continue

    print(f"Processing {split_name}...")

    # 1. Load Lightcurves for this split only
    chunk_lc = pd.read_csv(lc_path)

    # 2. Keep only the metadata rows whose object_id occurs in this split
    chunk_meta = meta_df[meta_df['object_id'].isin(chunk_lc['object_id'])].copy()

    # 3. Clean Data (Physics)
    # preprocess.clean_data lives in src/preprocess (not shown here); per the
    # notebook author it applies the distance and dust corrections.
    # Only the second returned value (the cleaned light curves) is used.
    _, clean_lc = preprocess.clean_data(chunk_meta, chunk_lc)

    # 4. Extract Features (Stats)
    # Per the notebook author this reduces the light curves to 1 row per object
    chunk_features = preprocess.extract_features(clean_lc)

    # 5. Store Result
    all_features_batches.append(chunk_features)

    # 6. Memory Cleanup: drop this split's frames before loading the next one.
    # The features stay alive through the reference held in all_features_batches.
    del chunk_lc, clean_lc, chunk_features, chunk_meta
    gc.collect() # Force RAM release

# Fail fast if every split was skipped: otherwise the cell would report success
# and the later pd.concat would fail with "No objects to concatenate".
if not all_features_batches:
    raise FileNotFoundError(
        f"No 'test_full_lightcurves.csv' file was found in any split directory under {RAW_DIR}"
    )

print("All splits processed.")

Processing split_01...
Processing split_02...
Processing split_03...
Processing split_04...
Processing split_05...
Processing split_06...
Processing split_07...
Processing split_08...
Processing split_09...
Processing split_10...
Processing split_11...
Processing split_12...
Processing split_13...
Processing split_14...
Processing split_15...
Processing split_16...
Processing split_17...
Processing split_18...
Processing split_19...
Processing split_20...
All splits processed.


In [6]:
print("Combining all batches...")
# 1. Concatenate all feature batches into one frame with a fresh 0..n-1 index
full_features = pd.concat(all_features_batches, ignore_index=True)

# 2. Merge with Z and EBV
# Only 'Z' and 'EBV' are taken from the test metadata; no 'target' column is
# merged here, so the result is a feature matrix only (no labels).
# The frame is named final_test_data so that it does not overwrite the
# training frame built earlier in the notebook.
# validate='many_to_one' makes pandas raise MergeError if an object_id is
# duplicated in the metadata, instead of silently duplicating feature rows.
final_test_data = full_features.merge(
    meta_df[['object_id', 'Z', 'EBV']],
    on='object_id',
    how='left',
    validate='many_to_one',
)

# A left merge keeps feature rows that have no metadata match and fills their
# metadata columns with NaN; report that instead of letting it pass unnoticed.
n_unmatched = int((~full_features['object_id'].isin(meta_df['object_id'])).sum())
if n_unmatched:
    print(f"WARNING: {n_unmatched} objects have no metadata row (Z/EBV will be NaN)")

# 3. Inspect
print(f"Final Data Shape: {final_test_data.shape}")
print(final_test_data.head())

# 4. Save
# Parquet preserves column dtypes; the author reports it as much faster and
# smaller than CSV for this data.
final_test_data.to_parquet(f'{PROCESSED_DIR}/test.parquet')
print("Successfully saved test.parquet")

Combining all batches...
Final Data Shape: (7135, 7)
                   object_id  peak_flux  rise_slope  fall_slope  color_u_z  \
0            Elrim_sador_hun   7.038654    0.001275   -0.004790  -0.540200   
1   Eluwaith_Mithrim_nothrim   8.476333    0.005925   -0.006400   0.423939   
2         Eru_heledir_archam   7.947598    0.006409   -0.000427  -1.535087   
3          Gonhir_anann_fuin   7.845152    0.002041   -0.000994  -1.182255   
4  Gwathuirim_eilian_fervain   7.769473   -0.000265   -0.017058   0.772347   

        Z    EBV  
0  0.3003  0.030  
1  0.5393  0.610  
2  0.7282  0.058  
3  0.6026  0.070  
4  0.8305  0.038  
Successfully saved test.parquet
