Intrinsic (extinction-corrected) flux formula:
$$F_{int}(\lambda) = F_{obs}(\lambda) \times 10^{0.4 \cdot A_\lambda}$$

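where $A_\lambda$ is the total extinction in the filter at wavelength $\lambda$:
$$A_\lambda = E(B-V) \cdot [R_V \cdot a(\lambda) + b(\lambda)]$$

In the code below the bracketed term is not evaluated explicitly; it is replaced by one fixed coefficient per filter (`EXTINCTION_COEFFS`), so $A_\lambda = \text{coeff}_{\text{filter}} \cdot E(B-V)$.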

In [11]:
import pandas as pd
import numpy as np
import gc

from pathlib import Path

import sys
import os

sys.path.append(os.path.abspath('..'))

# Data locations. These are relative paths, so they resolve against the
# directory the notebook kernel is running in.
RAW_DATA_PATH = Path("../data/raw")
PROCESSED_DATA_PATH = Path("../data/processed")

# Per-filter extinction coefficients, keyed by filter name.
# feature_extraction multiplies the coefficient of a row's filter by that
# object's EBV to get the total extinction A, and then de-reddens the flux:
#     Flux_corrected = Flux * 10 ** (0.4 * A)
# NOTE(review): the values look like the usual LSST ugrizy coefficients —
# confirm the source and record it here.
EXTINCTION_COEFFS = {
    'u': 4.81,
    'g': 3.64,
    'r': 2.70,
    'i': 2.06,
    'z': 1.58,
    'y': 1.31
}

In [None]:
def _extract_chunk_features(chunk_df, ebv_df, extinction_coeffs):
    """Turn the light-curve rows of one split into one feature row per object.

    Parameters
    ----------
    chunk_df : pandas.DataFrame
        Light-curve rows with the columns ``object_id``, ``Filter``,
        ``Time (MJD)``, ``Flux`` and ``Flux_err``.
    ebv_df : pandas.DataFrame
        Two-column frame (``object_id``, ``EBV``) taken from the log file.
    extinction_coeffs : dict
        Maps a filter name to its extinction coefficient.

    Returns
    -------
    pandas.DataFrame
        One row per ``object_id`` with the columns
        ``<column>_<stat>_<filter>`` and ``Time_to_peak_<filter>``.
    """
    # Only EBV is needed per row; merging the whole log would copy every
    # metadata column onto every single observation.
    chunk_df = chunk_df.merge(ebv_df, on="object_id", how="left")

    # == Step 1: Extinction correction ==
    # A = coefficient(filter) * EBV, corrected = observed * 10 ** (0.4 * A).
    # Rows with an unknown filter or a missing EBV end up as NaN.
    total_extinction = chunk_df["Filter"].map(extinction_coeffs) * chunk_df["EBV"]
    dereddening = 10 ** (0.4 * total_extinction)
    chunk_df["Flux_corrected"] = chunk_df["Flux"] * dereddening
    chunk_df["Flux_err_corrected"] = chunk_df["Flux_err"] * dereddening

    # == Step 2: Time features ==
    # Sorting by object and time makes the per-group diff() calls below
    # operate on consecutive observations.
    chunk_df = chunk_df.sort_values(by=["object_id", "Time (MJD)"])

    # Time since the first observation of the object (across all filters)
    start_times = chunk_df.groupby("object_id")["Time (MJD)"].transform("min")
    chunk_df["Time_relative"] = chunk_df["Time (MJD)"] - start_times

    # Differences between consecutive observations of the same object in the
    # same filter; the first observation of each group has no predecessor
    # and is set to 0.
    band_keys = ["object_id", "Filter"]
    chunk_df["Delta_time"] = chunk_df.groupby(band_keys)["Time_relative"].diff().fillna(0)
    chunk_df["Delta_flux"] = chunk_df.groupby(band_keys)["Flux_corrected"].diff().fillna(0)

    # Flux change rate; a zero time step would divide by zero, so those rows
    # (including every first observation) get a rate of 0.
    rate = chunk_df["Delta_flux"] / chunk_df["Delta_time"].replace(0, np.nan)
    chunk_df["Flux_change_rate"] = rate.fillna(0)

    # == Step 3: Statistical features ==

    # -- Time of peak flux per (object, filter) --
    # idxmax cannot pick a row for a group whose corrected flux is entirely
    # NaN, so only rows with a valid corrected flux take part. Groups without
    # any valid row simply get NaN after the left merge at the end.
    valid = chunk_df.dropna(subset=["Flux_corrected"])
    idx_peaks = valid.groupby(band_keys)["Flux_corrected"].idxmax()
    peak_rows = valid.loc[idx_peaks.to_numpy(), ["object_id", "Filter", "Time_relative"]]

    # Long -> wide:
    #     object_id  Filter  Time_relative         object_id  Time_to_peak_g  Time_to_peak_r
    #     A          g       12.34            ->   A          12.34           15.67
    #     A          r       15.67
    peak_data_wide = peak_rows.pivot(
        index="object_id", columns="Filter", values="Time_relative"
    ).rename(columns=lambda band: f"Time_to_peak_{band}")

    # -- Aggregated statistics per (object, filter) --
    aggs = {
        # 1. Flux statistics
        "Flux_corrected": ["mean", "max", "min", "std", "skew"],
        # 2. Derivative statistics
        "Flux_change_rate": ["mean", "max", "min", "std"],
        # 3. Time statistics: max = duration, count = number of observations
        "Time_relative": ["max", "count"],
        # 4. Error statistics (quality control)
        "Flux_err_corrected": ["mean"],
    }
    agg_df = chunk_df.groupby(band_keys).agg(aggs)

    # Move the Filter index level into the columns: one row per object,
    # columns indexed by (column, stat, filter).
    features_df = agg_df.unstack(level="Filter")

    # Flatten (column, stat, filter) -> "column_stat_filter",
    # e.g. ('Flux_corrected', 'mean', 'u') -> "Flux_corrected_mean_u".
    flat_names = []
    count_cols = []
    for col_name, stat_name, filter_name in features_df.columns:
        flat_name = f"{col_name}_{stat_name}_{filter_name}"
        flat_names.append(flat_name)
        if stat_name == "count":
            count_cols.append(flat_name)
    features_df.columns = flat_names

    # A missing count means "no observations in that filter" -> 0.
    # All other NaNs are left in place for the downstream model to handle.
    features_df[count_cols] = features_df[count_cols].fillna(0)

    # Bring object_id back as a column and attach the peak-time features
    features_df = features_df.reset_index()
    return features_df.merge(peak_data_wide, on="object_id", how="left")


def feature_extraction(raw_path, train=True, n_splits=20, extinction_coeffs=None):
    """Build the per-object feature table from the raw light-curve splits.

    Reads ``<prefix>log.csv`` from ``raw_path`` and then, for each split
    folder ``split_01`` ... ``split_<n_splits>``, reads
    ``<prefix>full_lightcurves.csv``, where ``<prefix>`` is ``"train_"`` or
    ``"test_"``. Splits are processed one at a time to limit peak memory use.

    Parameters
    ----------
    raw_path : pathlib.Path or str
        Directory that contains the log file and the split folders.
    train : bool, default True
        Selects the ``train_`` files and adds the ``target`` column to the
        result; otherwise the ``test_`` files are used and no target is added.
    n_splits : int, default 20
        Number of split folders to process.
    extinction_coeffs : dict, optional
        Filter name -> extinction coefficient. Defaults to the module-level
        ``EXTINCTION_COEFFS``.

    Returns
    -------
    pandas.DataFrame
        Feature rows of all splits, merged with ``Z`` and ``Z_err`` (and
        ``target`` when ``train`` is True) from the log file.
    """
    prefix = "train_" if train else "test_"
    if extinction_coeffs is None:
        extinction_coeffs = EXTINCTION_COEFFS
    raw_path = Path(raw_path)

    # Master metadata
    log_df = pd.read_csv(raw_path / (prefix + "log.csv"))

    # Drop columns that are never used. errors="ignore" because not every
    # log file is guaranteed to contain all of them (e.g. "SpecType").
    log_df = log_df.drop(
        columns=["English Translation", "split", "SpecType"], errors="ignore"
    )
    ebv_df = log_df[["object_id", "EBV"]]

    # Stores the processed feature rows of each split
    processed_chunks = []

    for i in range(1, n_splits + 1):
        folder_name = f"split_{i:02d}"  # 'split_01', 'split_02', ...
        file_path = raw_path / folder_name / (prefix + "full_lightcurves.csv")

        print(f"Processing {folder_name}...")

        # Load this split only and discard observations without a flux value
        chunk_df = pd.read_csv(file_path).dropna(subset=["Flux"])

        processed_chunks.append(
            _extract_chunk_features(chunk_df, ebv_df, extinction_coeffs)
        )

        # Release the raw rows before the next split is loaded
        del chunk_df
        gc.collect()

    # Concatenate all processed chunks into a single DataFrame
    print("Concatenating chunks...")
    final_df = pd.concat(processed_chunks)

    # Attach redshift columns, plus the label for training data
    if train:
        meta_cols = ["object_id", "target", "Z", "Z_err"]
    else:
        meta_cols = ["object_id", "Z", "Z_err"]
    final_df = final_df.merge(log_df[meta_cols], on="object_id", how="left")

    print("Processing complete.")

    return final_df

In [29]:
# Run the feature pipeline on the training files under RAW_DATA_PATH
train_features = feature_extraction(raw_path=RAW_DATA_PATH, train=True)

Processing split_01...
Processing split_02...
Processing split_03...
Processing split_04...
Processing split_05...
Processing split_06...
Processing split_07...
Processing split_08...
Processing split_09...
Processing split_10...
Processing split_11...
Processing split_12...
Processing split_13...
Processing split_14...
Processing split_15...
Processing split_16...
Processing split_17...
Processing split_18...
Processing split_19...
Processing split_20...
Concatenating chunks...
Processing complete.


In [30]:
# Preview the first five rows of the engineered training features
train_features.head(n=5)


Unnamed: 0,object_id,Flux_corrected_mean_g,Flux_corrected_mean_i,Flux_corrected_mean_r,Flux_corrected_mean_u,Flux_corrected_mean_y,Flux_corrected_mean_z,Flux_corrected_max_g,Flux_corrected_max_i,Flux_corrected_max_r,...,Flux_err_corrected_mean_z,Time_to_peak_g,Time_to_peak_i,Time_to_peak_r,Time_to_peak_u,Time_to_peak_y,Time_to_peak_z,target,Z,Z_err
0,Dornhoth_fervain_onodrim,-0.541341,2.601577,1.412578,0.950745,-0.458879,1.48976,1.424,28.277937,13.8027,...,0.667111,510.5107,457.6993,466.5012,466.5012,475.3031,457.6993,0,3.049,
1,Dornhoth_galadh_ylf,0.218875,0.394725,0.317303,0.03493,0.757253,0.573209,1.518313,5.797249,3.027457,...,0.579614,1550.8067,1555.9419,1550.8067,729.1873,1558.5094,1561.077,0,0.4324,
2,Elrim_melethril_thul,3.757406,7.781453,5.462329,0.078012,0.065533,9.287801,8.691179,14.758927,11.552149,...,1.089832,694.1133,733.1571,698.4515,1138.7796,767.8628,733.1571,0,0.4673,
3,Ithil_tobas_rodwen,0.28994,0.455992,0.448044,0.161333,0.348274,0.540509,1.42555,2.26982,1.92208,...,0.441839,1119.8792,1119.8792,2463.1615,2781.0811,2735.2549,1105.5585,0,0.6946,
4,Mirion_adar_Druadan,0.05612,0.428605,0.242114,-0.018189,0.308948,0.322406,1.798828,6.010829,2.709919,...,0.603441,779.7534,1115.5113,996.2289,775.3356,1201.6597,631.7549,0,0.4161,
