Intrinsic (extinction-corrected) flux formula:
$$F_{int}(\lambda) = F_{obs}(\lambda) \times 10^{0.4 \cdot A_\lambda}$$

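Where $A_\lambda$ is the extinction at wavelength $\lambda$:
$$A_\lambda = E(B-V) \cdot [R_V \cdot a(\lambda) + b(\lambda)]$$

In the code below the bracketed term is not evaluated per wavelength: a fixed per-filter coefficient from `EXTINCTION_COEFFS` is multiplied by the object's `EBV` value to obtain $A_\lambda$ for every observation in that filter.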

In [1]:
import pandas as pd
import numpy as np
import gc

from pathlib import Path

import sys
import os

# Add the parent directory of the working directory to the import path.
sys.path.append(os.path.abspath('..'))

# Data locations, relative to the working directory.
RAW_DATA_PATH = Path("../data/raw")
PROCESSED_DATA_PATH = Path("../data/processed")

# Per-filter extinction coefficients.
# feature_extraction multiplies the coefficient of an observation's filter by
# the object's EBV to get the extinction A, then corrects the flux with
# F_int = F_obs * 10**(0.4 * A).
EXTINCTION_COEFFS = {
    'u': 4.81,
    'g': 3.64,
    'r': 2.70,
    'i': 2.06,
    'z': 1.58,
    'y': 1.31
}

In [2]:
# To tackle faint TDEs
def calculate_temperature_proxy(df):
    """Add u/g flux-ratio features used as a temperature (colour) proxy.

    The frame is modified in place and also returned. Columns added:

    * ``color_u_div_g``    -- mean u flux / mean g flux. A higher value means
      relatively more u-band flux (bluer).
    * ``color_change_u_g`` -- (max u / max g) - (mean u / mean g). Near zero
      means the ratio at peak matches the average ratio.

    A feature is skipped (rather than raising ``KeyError``) when one of its
    input columns is missing, e.g. because no u-band data was aggregated.

    Parameters
    ----------
    df : pandas.DataFrame
        Aggregated features; reads ``Flux_corrected_{mean,max}_{u,g}``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # Small offset so an exactly-zero denominator does not divide by zero.
    # NOTE(review): fluxes can be negative, so ratios may still be large or
    # change sign when the g-band denominator is close to zero.
    eps = 1e-6

    mean_u, mean_g = 'Flux_corrected_mean_u', 'Flux_corrected_mean_g'
    max_u, max_g = 'Flux_corrected_max_u', 'Flux_corrected_max_g'

    # Both features need the mean ratio; without it there is nothing to add.
    if mean_u not in df.columns or mean_g not in df.columns:
        return df

    # 1. Mean colour: computed once and reused for the colour-change feature.
    ratio_mean = df[mean_u] / (df[mean_g] + eps)
    df['color_u_div_g'] = ratio_mean

    # 2. Colour evolution: ratio at peak minus the average ratio.
    if max_u in df.columns and max_g in df.columns:
        ratio_peak = df[max_u] / (df[max_g] + eps)
        df['color_change_u_g'] = ratio_peak - ratio_mean

    return df

In [3]:
def calculate_shape_features(df):
    """Add a per-band "plumpness" feature: mean flux / max flux.

    A light curve that stays near its peak gives a value close to 1, while a
    brief spike on a flat baseline gives a value close to 0. The ratio is
    scale-free: multiplying every flux by a constant leaves it unchanged (up
    to the small epsilon in the denominator).

    The frame is modified in place and also returned. Bands whose input
    columns are missing are skipped instead of raising ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Aggregated features; reads ``Flux_corrected_mean_<band>`` and
        ``Flux_corrected_max_<band>`` for each of u, g, r, i, z, y.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``shape_plumpness_<band>`` added for every
        band whose inputs are present.
    """
    for band in ['u', 'g', 'r', 'i', 'z', 'y']:
        col_mean = f'Flux_corrected_mean_{band}'
        col_max = f'Flux_corrected_max_{band}'

        # A band may be absent when no observation in that filter was aggregated.
        if col_mean not in df.columns or col_max not in df.columns:
            continue

        # Epsilon guards against an exactly-zero peak flux.
        df[f'shape_plumpness_{band}'] = df[col_mean] / (df[col_max] + 1e-6)

    return df

In [4]:
# A simplified "Decay Rate" feature
# Compare Flux at Peak vs Flux at the end of the window

def calculate_decay_strength(df):
    """Add ``decay_ratio_<band>`` = min flux / max flux for the u and g bands.

    A low ratio means the minimum flux is small compared with the peak; a high
    ratio means the flux stayed close to its peak (plateau).
    NOTE(review): the minimum is read from an aggregate column and is not
    restricted to observations after the peak, and it can be negative, so the
    ratio is only a rough proxy for how far the source faded.

    The frame is modified in place and also returned. Bands whose input
    columns are missing are skipped instead of raising ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Aggregated features; reads ``Flux_corrected_min_<band>`` and
        ``Flux_corrected_max_<band>`` for u and g.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    for band in ['u', 'g']:  # only the blue bands are used
        col_min = f'Flux_corrected_min_{band}'
        col_max = f'Flux_corrected_max_{band}'

        # A band may be absent when no observation in that filter was aggregated.
        if col_min not in df.columns or col_max not in df.columns:
            continue

        # Epsilon guards against an exactly-zero peak flux.
        df[f'decay_ratio_{band}'] = df[col_min] / (df[col_max] + 1e-6)

    return df

In [5]:
def feature_extraction(raw_path, train=True, n_splits=20):
    """Aggregate raw light curves into a per-object feature table.

    Reads ``<raw_path>/<prefix>log.csv`` (object metadata) and, for every
    folder ``split_01`` .. ``split_<n_splits>``, the file
    ``<raw_path>/split_XX/<prefix>full_lightcurves.csv``, where ``<prefix>`` is
    ``train_`` or ``test_``. Splits are processed one at a time and their
    intermediate frames are deleted before the next one is loaded.

    Per split:

    1. Flux and flux error are extinction-corrected with
       ``10 ** (0.4 * EXTINCTION_COEFFS[Filter] * EBV)``.
    2. Relative time, per-filter time/flux deltas and flux change rate are
       derived for each observation.
    3. Statistics are aggregated per (object_id, Filter) and the filters are
       pivoted into columns named ``<quantity>_<stat>_<filter>``; times of
       max/min flux are added as ``Time_of_{max,min}_flux_<filter>``.
    4. Rise/decay ratio, adjacent-band colour differences, amplitude and a
       global duration are computed.
    5. The temperature-proxy, shape and decay helper functions are applied.

    Parameters
    ----------
    raw_path : pathlib.Path
        Directory holding the log CSV and the ``split_XX`` sub-folders.
    train : bool, default True
        Selects the ``train_`` / ``test_`` file prefix. When True, the
        ``target`` column of the log is attached to the result.
    n_splits : int, default 20
        Number of ``split_XX`` folders to process, numbered from 1.

    Returns
    -------
    pandas.DataFrame
        One row per ``object_id`` found in each split (objects are assumed not
        to span several splits), with the features above plus ``Z`` and
        ``Z_err`` (and ``target`` when ``train``) from the log. Rows with a
        missing ``Flux`` are dropped before aggregation, so an object with no
        valid flux at all does not appear in the result.
    """
    # File-name prefix that selects the training or the test files
    prefix = "train_" if train else "test_"

    # Master metadata
    log_file_path = raw_path / (prefix + "log.csv")
    log_df = pd.read_csv(log_file_path)

    # Drop the "for fun" column and split name column
    log_df = log_df.drop(columns=["English Translation", "split", "SpecType"], errors='ignore')

    # Stores processed data chunks
    processed_chunks = []

    # Loop through the split folders
    for i in range(1, n_splits + 1):
        folder_name = f"split_{i:02d}"  # Formats to 'split_01', 'split_02'
        file_path = raw_path / folder_name / (prefix + "full_lightcurves.csv")

        print(f"Processing {folder_name}...")

        # Load lightcurves data for this split only
        chunk_df = pd.read_csv(file_path)

        # Drop rows with missing Flux values
        chunk_df = chunk_df.dropna(subset=['Flux'])

        # Merge metadata with lightcurves on 'object_id'
        chunk_df = chunk_df.merge(log_df, on="object_id", how="left")

        # == Step 1: Extinction Correction ==
        # ======================================================

        # Calculate total extinction for each row (NaN for filters that are
        # not keys of EXTINCTION_COEFFS)
        extinction_factors = chunk_df["Filter"].map(EXTINCTION_COEFFS)
        chunk_df["Total_extinction"] = extinction_factors * chunk_df["EBV"]

        # The same multiplicative correction applies to flux and flux error,
        # so compute it once
        correction = 10 ** (0.4 * chunk_df["Total_extinction"])
        chunk_df["Flux_corrected"] = chunk_df["Flux"] * correction
        chunk_df["Flux_err_corrected"] = chunk_df["Flux_err"] * correction

        # == Calculate Per-Observation SNR ==
        chunk_df["SNR_obs"] = chunk_df["Flux_corrected"] / chunk_df["Flux_err_corrected"]

        # == Step 2: Extract time features ==
        # ======================================================

        # Sort by object_id and then MJD so that per-group diffs are chronological
        chunk_df = chunk_df.sort_values(by=["object_id", "Time (MJD)"])

        # Set the starting time of observation to zero for each object
        start_times = chunk_df.groupby('object_id')['Time (MJD)'].transform('min')
        chunk_df['Time_relative'] = chunk_df['Time (MJD)'] - start_times

        # Calculate delta time between observations of the object on the same filter
        # (the first observation of each group has no predecessor -> 0)
        chunk_df["Delta_time"] = chunk_df.groupby(["object_id", "Filter"])["Time_relative"].diff().fillna(0)

        # Calculate delta flux between observations of the object on the same filter
        chunk_df["Delta_flux"] = chunk_df.groupby(["object_id", "Filter"])["Flux_corrected"].diff().fillna(0)

        # Calculate flux change rate; a zero time step would divide by zero,
        # so those rows get a rate of 0 instead
        chunk_df["Flux_change_rate"] = chunk_df["Delta_flux"] / chunk_df["Delta_time"].replace(0, np.nan)
        chunk_df["Flux_change_rate"] = chunk_df["Flux_change_rate"].fillna(0)

        # == Step 3: Extract statistical features ==
        # ======================================================

        # -- 1. Extract Peak and Min Times --
        # We need row labels for both Max and Min to get the corresponding Times
        idx_max = chunk_df.groupby(['object_id', 'Filter'])['Flux_corrected'].idxmax().dropna()
        idx_min = chunk_df.groupby(['object_id', 'Filter'])['Flux_corrected'].idxmin().dropna()

        # Extract Time of Max
        time_max_data = chunk_df.loc[idx_max, ['object_id', 'Filter', 'Time_relative']]
        time_max_wide = time_max_data.pivot(index='object_id', columns='Filter', values='Time_relative')
        time_max_wide = time_max_wide.rename(columns=lambda x: f"Time_of_max_flux_{x}")

        # Extract Time of Min
        time_min_data = chunk_df.loc[idx_min, ['object_id', 'Filter', 'Time_relative']]
        time_min_wide = time_min_data.pivot(index='object_id', columns='Filter', values='Time_relative')
        time_min_wide = time_min_wide.rename(columns=lambda x: f"Time_of_min_flux_{x}")

        # -- 2. Extract aggregated statistics --
        aggs = {
            'Flux_corrected': ['mean', 'max', 'min', 'std', 'skew'],
            'Flux_change_rate': ['mean', 'max', 'min', 'std'],
            'Time_relative': ['max', 'count'],
            'Flux_err_corrected': ['mean'],
            'SNR_obs': ['mean', 'max']
        }

        agg_df = chunk_df.groupby(['object_id', 'Filter']).agg(aggs)
        features_df = agg_df.unstack(level='Filter')

        # Flatten hierarchical columns to '<quantity>_<stat>_<filter>' and
        # remember which of them are observation counts
        new_columns = []
        count_cols = []
        for col_name, stat_name, filter_name in features_df.columns:
            new_name = f"{col_name}_{stat_name}_{filter_name}"
            new_columns.append(new_name)
            if stat_name == 'count':
                count_cols.append(new_name)

        features_df.columns = new_columns

        # An object with no observation in a filter has a count of 0, not NaN
        features_df[count_cols] = features_df[count_cols].fillna(0)

        features_df = features_df.reset_index()

        # Merge Time of Max and Time of Min
        features_df = features_df.merge(time_max_wide, on='object_id', how='left')
        features_df = features_df.merge(time_min_wide, on='object_id', how='left')

        # == Step 4: Feature Engineering (Math & Ratios) ==
        # ======================================================

        bands = ['u', 'g', 'r', 'i', 'z', 'y']

        # 1. Rise-to-Decay Ratio
        # Formula: (Time_Max - Time_Start) / (Time_End - Time_Max)
        # Note: Time_Start is always 0 in 'Time_relative', so numerator is just Time_of_max
        for b in bands:
            t_max_col = f"Time_of_max_flux_{b}"
            t_total_col = f"Time_relative_max_{b}"  # Last relative time seen in this filter

            if t_max_col in features_df.columns and t_total_col in features_df.columns:
                # Time to Rise = t_max
                # Time to Decay = t_total - t_max
                # Add epsilon to prevent divide by zero when the peak is the last point
                features_df[f"rise_decay_ratio_{b}"] = features_df[t_max_col] / (
                    (features_df[t_total_col] - features_df[t_max_col]) + 1e-6
                )

        # 2. Color Indices (difference of mean flux between adjacent bands)
        for b1, b2 in zip(bands[:-1], bands[1:]):
            col_1 = f"Flux_corrected_mean_{b1}"
            col_2 = f"Flux_corrected_mean_{b2}"
            if col_1 in features_df.columns and col_2 in features_df.columns:
                features_df[f"color_{b1}_{b2}"] = features_df[col_1] - features_df[col_2]

        # 3. Amplitude
        for b in bands:
            max_col = f"Flux_corrected_max_{b}"
            min_col = f"Flux_corrected_min_{b}"
            if max_col in features_df.columns and min_col in features_df.columns:
                features_df[f"amplitude_{b}"] = features_df[max_col] - features_df[min_col]

        # 4. Global Duration (latest relative time over all filters)
        time_max_cols = [c for c in features_df.columns if "Time_relative_max" in c]
        if time_max_cols:
            features_df['duration_global'] = features_df[time_max_cols].max(axis=1)

        # ======================================================

        # == Step 5: Advanced Feature Calculations ==
        # ======================================================

        # Temperature Proxy Features
        features_df = calculate_temperature_proxy(features_df)

        # Shape Features
        features_df = calculate_shape_features(features_df)

        # Decay Strength Features
        features_df = calculate_decay_strength(features_df)

        processed_chunks.append(features_df)

        # Release this split's intermediates before loading the next one
        del chunk_df, agg_df, features_df, time_max_data, time_max_wide, time_min_data, time_min_wide
        gc.collect()

    print("Concatenating chunks...")
    final_df = pd.concat(processed_chunks, ignore_index=True)

    # Merge with log_df to get redshift columns (and target labels for training data)
    if train:
        final_df = final_df.merge(log_df[['object_id', 'Z', 'Z_err', 'target']], on='object_id', how='left')
    else:
        final_df = final_df.merge(log_df[['object_id', 'Z', 'Z_err']], on='object_id', how='left')

    print("Processing complete.")

    # Clean up unnecessary columns (ignored when they are not present)
    final_df = final_df.drop(columns=['Time (MJD)', 'Flux', 'Flux_err', 'Total_extinction'], errors='ignore')

    return final_df

In [6]:
# Process training data: build the aggregated feature table from the train_*
# CSV files under RAW_DATA_PATH (train=True also attaches the `target` column).
train_features = feature_extraction(RAW_DATA_PATH, train=True)

Processing split_01...
Processing split_02...
Processing split_03...
Processing split_04...
Processing split_05...
Processing split_06...
Processing split_07...
Processing split_08...
Processing split_09...
Processing split_10...
Processing split_11...
Processing split_12...
Processing split_13...
Processing split_14...
Processing split_15...
Processing split_16...
Processing split_17...
Processing split_18...
Processing split_19...
Processing split_20...
Concatenating chunks...
Processing complete.


In [7]:
# Preview the first rows of the training feature table.
train_features.head()


Unnamed: 0,object_id,Flux_corrected_mean_g,Flux_corrected_mean_i,Flux_corrected_mean_r,Flux_corrected_mean_u,Flux_corrected_mean_y,Flux_corrected_mean_z,Flux_corrected_max_g,Flux_corrected_max_i,Flux_corrected_max_r,...,shape_plumpness_g,shape_plumpness_r,shape_plumpness_i,shape_plumpness_z,shape_plumpness_y,decay_ratio_u,decay_ratio_g,Z,Z_err,target
0,Dornhoth_fervain_onodrim,-0.541341,2.601577,1.412578,0.950745,-0.458879,1.48976,1.424,28.277937,13.8027,...,-0.380155,0.102341,0.092,0.05068,-0.068501,0.000166,-0.868267,3.049,,0
1,Dornhoth_galadh_ylf,0.218875,0.394725,0.317303,0.03493,0.757253,0.573209,1.518313,5.797249,3.027457,...,0.144156,0.104808,0.068088,0.072869,0.06207,-0.753647,-0.224853,0.4324,,0
2,Elrim_melethril_thul,3.757406,7.781453,5.462329,0.078012,0.065533,9.287801,8.691179,14.758927,11.552149,...,0.432324,0.472841,0.527237,0.606077,0.005052,-1.164189,0.030279,0.4673,,0
3,Ithil_tobas_rodwen,0.28994,0.455992,0.448044,0.161333,0.348274,0.540509,1.42555,2.26982,1.92208,...,0.203388,0.233104,0.200894,0.177858,0.064116,-0.905091,-0.334633,0.6946,,0
4,Mirion_adar_Druadan,0.05612,0.428605,0.242114,-0.018189,0.308948,0.322406,1.798828,6.010829,2.709919,...,0.031198,0.089344,0.071306,0.109675,0.121521,-1.562661,-0.741583,0.4161,,0


In [None]:
# Verify that we have collapsed the time-series to one row per object.
# `object_id` is a regular column here, not the index: feature_extraction ends
# with a DataFrame.merge, which returns a fresh 0..n-1 index. Checking
# `train_features.index.is_unique` would therefore always pass, so the check
# has to be made on the column itself.
is_unique = train_features['object_id'].is_unique

if is_unique:
    print(f"Data is correctly aggregated. Shape: {train_features.shape}")
else:
    print("CRITICAL ERROR: You still have multiple rows per object.")

✅ Data is correctly aggregated. Shape: (3043, 128)


In [9]:
# List every column name of the training feature table.
print("Available columns are:", train_features.columns.tolist())

Available columns are: ['object_id', 'Flux_corrected_mean_g', 'Flux_corrected_mean_i', 'Flux_corrected_mean_r', 'Flux_corrected_mean_u', 'Flux_corrected_mean_y', 'Flux_corrected_mean_z', 'Flux_corrected_max_g', 'Flux_corrected_max_i', 'Flux_corrected_max_r', 'Flux_corrected_max_u', 'Flux_corrected_max_y', 'Flux_corrected_max_z', 'Flux_corrected_min_g', 'Flux_corrected_min_i', 'Flux_corrected_min_r', 'Flux_corrected_min_u', 'Flux_corrected_min_y', 'Flux_corrected_min_z', 'Flux_corrected_std_g', 'Flux_corrected_std_i', 'Flux_corrected_std_r', 'Flux_corrected_std_u', 'Flux_corrected_std_y', 'Flux_corrected_std_z', 'Flux_corrected_skew_g', 'Flux_corrected_skew_i', 'Flux_corrected_skew_r', 'Flux_corrected_skew_u', 'Flux_corrected_skew_y', 'Flux_corrected_skew_z', 'Flux_change_rate_mean_g', 'Flux_change_rate_mean_i', 'Flux_change_rate_mean_r', 'Flux_change_rate_mean_u', 'Flux_change_rate_mean_y', 'Flux_change_rate_mean_z', 'Flux_change_rate_max_g', 'Flux_change_rate_max_i', 'Flux_change_rate

In [10]:
# Process test data: same pipeline on the test_* CSV files under RAW_DATA_PATH
# (train=False, so no `target` column is attached).
test_features = feature_extraction(RAW_DATA_PATH, train=False)

Processing split_01...
Processing split_02...
Processing split_03...
Processing split_04...
Processing split_05...
Processing split_06...
Processing split_07...
Processing split_08...
Processing split_09...
Processing split_10...
Processing split_11...
Processing split_12...
Processing split_13...
Processing split_14...
Processing split_15...
Processing split_16...
Processing split_17...
Processing split_18...
Processing split_19...
Processing split_20...
Concatenating chunks...
Processing complete.


In [11]:
# Persist both feature tables as Parquet. Create the output directory first so
# the write does not fail when ../data/processed does not exist yet.
PROCESSED_DATA_PATH.mkdir(parents=True, exist_ok=True)

train_features.to_parquet(PROCESSED_DATA_PATH / "train_features.parquet", index=False)
test_features.to_parquet(PROCESSED_DATA_PATH / "test_features.parquet", index=False)