In [50]:
# Setup & configuration
import pandas as pd
import numpy as np
import gc
from pathlib import Path
from scipy.optimize import curve_fit
import warnings

warnings.filterwarnings('ignore')

# --- File Paths ---
# Using pathlib ensures this works on Windows/Linux/Mac
RAW_DATA_PATH = Path("../data/raw")
PROCESSED_DATA_PATH = Path("../data/processed")
TRAIN_MODE = True # Set to False for test data processing

# --- Physics Constants ---
# Effective Wavelengths (Angstroms)
LAMBDA_EFF = {
    'u': 3670, 'g': 4825, 'r': 6261, 
    'i': 7672, 'z': 9097, 'y': 9600
}

# Galactic Extinction Coefficients
EXTINCTION_COEFFICIENTS = {
    'u': 4.81, 'g': 3.64, 'r': 2.70,
    'i': 2.06, 'z': 1.58, 'y': 1.31
}

BANDS = ['u', 'g', 'r', 'i', 'z', 'y']

# Redshift Limits (TDEs are usually Z < 1.9)
Z_LIMIT_TDE = 1.9

In [51]:
# Basic physics functions

def calculate_magnitudes(flux, flux_err):
    """
    Converts Flux to Luptitude (arcsinh magnitude).
    Handles negative flux naturally.
    """
    # 2.5 * log10(e) approx 1.0857
    x = flux / (flux_err + 1e-9)
    return -2.5 * np.arcsinh(x)

def get_weights(flux_err):
    """Inverse variance weighting"""
    return 1.0 / (flux_err**2 + 1e-9)

def weighted_avg_and_std(values, weights):
    """
    Return the weighted average and standard deviation.
    """
    average = np.average(values, weights=weights)
    variance = np.average((values - average)**2, weights=weights)
    return average, np.sqrt(variance)

def calculate_dust_impact(ebv, band_coeff):
    """
    Calculates transparency (fraction of light that survives dust).
    """
    return 10**(-0.4 * band_coeff * ebv)

def calculate_stetson_k(flux, flux_err, band_mean_flux):
    """
    Vectorized calculation of Stetson K residuals (robust kurtosis).
    """
    scaled_residual = (flux - band_mean_flux) / (flux_err + 1e-9)
    return np.abs(scaled_residual)

In [52]:
# Additional feature calculations
from scipy.stats import skew, kurtosis

def calculate_percentile_features(fluxes):
    """
    Captures the distribution shape without curve fitting.
    """
    if len(fluxes) == 0: return {}
    
    # Percentiles
    p = np.percentile(fluxes, [5, 25, 50, 75, 95])
    
    return {
        'flux_min': p[0],
        'flux_25': p[1],
        'flux_median': p[2],
        'flux_75': p[3],
        'flux_max': p[4],
        'flux_amp': p[4] - p[0], # Amplitude
        'flux_ratio_sq': (p[4] / (p[2] + 0.1)) # Peakiness relative to median
    }

def calculate_time_features(times, fluxes):
    """
    Captures Rise/Fall behavior using raw timestamps.
    """
    if len(times) < 2: return {}
    
    # Sort by time
    idx = np.argsort(times)
    t = times[idx]
    f = fluxes[idx]
    
    # Time of Max Flux
    idx_max = np.argmax(f)
    t_max = t[idx_max]
    
    # Rise and Fall durations
    t_rise = t_max - t[0]
    t_fall = t[-1] - t_max
    
    return {
        'time_total': t[-1] - t[0],
        'time_rise': t_rise,
        'time_fall': t_fall,
        'rise_fall_ratio': t_rise / (t_fall + 1e-5) # TDEs have fast rise, slow decay
    }

def calculate_detected_features(times, fluxes, errors, threshold=5.0):
    """
    Calculates statistics only on 'significant' detections (SNR > 5).
    Crucial for distinguishing real events from background noise.
    """
    snr = fluxes / (errors + 1e-9)
    mask = snr > threshold
    
    if not np.any(mask):
        return {'detected_duration': 0, 'detected_count': 0}
    
    t_det = times[mask]
    
    return {
        'detected_duration': t_det.max() - t_det.min(),
        'detected_count': len(t_det),
        'detected_max_snr': np.max(snr[mask])
    }

In [53]:
# Main processing function

def process_chunk(chunk_df, log_df):
    # 1. Merge Metadata
    chunk_df = chunk_df.merge(log_df[['object_id', 'Z', 'EBV']], on="object_id", how="left")
    
    # 2. Vectorized Corrections
    ext_factor = chunk_df["Filter"].map(EXTINCTION_COEFFICIENTS)
    chunk_df["correction_scale"] = 10**(0.4 * ext_factor * chunk_df["EBV"])
    chunk_df["Flux_corr"] = chunk_df["Flux"] * chunk_df["correction_scale"]
    chunk_df["Flux_err_corr"] = chunk_df["Flux_err"] * chunk_df["correction_scale"]
    
    # --- NEW: Pre-calculate Signal-to-Noise Ratio (SNR) ---
    # We add this before aggregation so we can compute statistics on it
    chunk_df["SNR"] = chunk_df["Flux_corr"] / (chunk_df["Flux_err_corr"] + 1e-9)

    # 3. Aggregations (Vectorized)
    aggs = {
        'Flux_corr': [
            'mean', 'std', 'max', 'min', 'median',  # Basics
            skew, kurtosis                          # Shape: TDEs are skewed and peaked
        ],
        'Flux_err_corr': ['mean'],
        'SNR': ['max', 'mean'],                     # Signal Quality: High Max SNR = Real Event
        'correction_scale': ['max'],                # Dust penalty
        'Time (MJD)': ['count', lambda x: x.max() - x.min()] # Density & Duration (Fast Proxy)
    }
    
    agg_df = chunk_df.groupby(['object_id', 'Filter']).agg(aggs)
    
    # Flatten Columns & Rename for clarity
    new_cols = []
    for c in agg_df.columns:
        col_name = f"{c[0]}_{c[1]}"
        col_name = col_name.replace('skew', 'skewness').replace('kurtosis', 'kurt')
        col_name = col_name.replace('<lambda_0>', 'duration_proxy') # Rename the duration lambda
        new_cols.append(col_name)
    
    agg_df.columns = new_cols
    
    # Unstack & Clean
    features = agg_df.unstack(level='Filter')
    features.columns = [f"{c[0]}_{c[1]}".replace('Band ', '').replace(' ', '_') for c in features.columns]
    features = features.reset_index()
    
    # 4. Feature Loop (Robust Logic)
    # Kept as is for the specific percentile logic which is harder to aggregate
    robust_features = []
    
    for obj_id, group in chunk_df.groupby('object_id'):
        obj_feats = {'object_id': obj_id}
        z = group['Z'].iloc[0]
        dist_mod = (5 * np.log10(z + 1e-5) + 42) if (pd.notna(z) and z > 0) else 0

        # --- PER BAND METRICS ---
        for band in ['g', 'r', 'i']:
            band_data = group[group['Filter'] == f'Band {band}']
            if len(band_data) < 3: continue 
            
            t = band_data['Time (MJD)'].values
            f = band_data['Flux_corr'].values
            e = band_data['Flux_err_corr'].values
            
            # A. Shape (Percentiles)
            p_feats = calculate_percentile_features(f)
            for k, v in p_feats.items():
                obj_feats[f'{k}_{band}'] = v
            
            # B. Time (Rise/Fall)
            t_feats = calculate_time_features(t, f)
            for k, v in t_feats.items():
                obj_feats[f'{k}_{band}'] = v
                
            # C. Detection (Signal Strength)
            d_feats = calculate_detected_features(t, f, e)
            for k, v in d_feats.items():
                obj_feats[f'{k}_{band}'] = v

            # D. Absolute Magnitude
            if pd.notna(z) and z > 0:
                obj_feats[f'abs_mag_max_{band}'] = -2.5 * np.arcsinh(p_feats.get('flux_max', 1)) - dist_mod
        
        # --- GLOBAL COLORS ---
        if 'flux_max_g' in obj_feats and 'flux_max_r' in obj_feats:
            obj_feats['peak_color_g_r'] = obj_feats['flux_max_g'] / (obj_feats['flux_max_r'] + 1e-5)
            
        obj_feats['low_Z'] = 1 if (pd.notna(z) and z < Z_LIMIT_TDE) else 0
            
        robust_features.append(obj_feats)

    # Merge
    df_robust = pd.DataFrame(robust_features)
    final_chunk = features.merge(df_robust, on='object_id', how='left')
    
    return final_chunk

In [54]:
# Execution Loop
processed_chunks = []

# Load Metadata
log_df = pd.read_csv(RAW_DATA_PATH / "train_log.csv" if TRAIN_MODE else RAW_DATA_PATH / "test_log.csv")

print(f"Starting Processing... Mode: {'TRAIN' if TRAIN_MODE else 'TEST'}")

# Loop through splits
for i in range(1, 21): # Assuming 20 splits
    folder = f"split_{i:02d}"
    
    # Construct path
    prefix = "train_" if TRAIN_MODE else "test_"
    file_path = RAW_DATA_PATH / folder / (prefix + "full_lightcurves.csv")
    
    if not file_path.exists(): continue
    
    print(f"Processing {folder}...")
    chunk = pd.read_csv(file_path)
    
    # Run Engine
    features = process_chunk(chunk, log_df)
    processed_chunks.append(features)
    
    # Memory Management
    del chunk, features
    gc.collect()

# Concatenate results
full_df = pd.concat(processed_chunks, ignore_index=True)
print(f"Chunk processing complete. Shape: {full_df.shape}")

Starting Processing... Mode: TRAIN
Processing split_01...
Processing split_02...
Processing split_03...
Processing split_04...
Processing split_05...
Processing split_06...
Processing split_07...
Processing split_08...
Processing split_09...
Processing split_10...
Processing split_11...
Processing split_12...
Processing split_13...
Processing split_14...
Processing split_15...
Processing split_16...
Processing split_17...
Processing split_18...
Processing split_19...
Processing split_20...
Chunk processing complete. Shape: (3043, 80)


In [55]:
# Final Assembly and Cleanup

# 1. Post-Process: Calculate Weighted Means
for b in BANDS:
    num_col = f"w_flux_sum_{b}"
    den_col = f"w_sum_{b}"
    
    if num_col in full_df.columns and den_col in full_df.columns:
        # Weighted Mean Flux
        full_df[f"flux_w_mean_{b}"] = full_df[num_col] / (full_df[den_col] + 1e-9)
        # Weighted SNR (approximate)
        full_df[f"snr_weight_{b}"] = np.sqrt(full_df[den_col])

# 2. Post-Process: Calculate Colors
# Color = u-g, g-r, etc.
for b1, b2 in zip(BANDS[:-1], BANDS[1:]):
    col1 = f"flux_w_mean_{b1}" # Using weighted mean flux
    col2 = f"flux_w_mean_{b2}"
    
    if col1 in full_df.columns and col2 in full_df.columns:
        # Simple flux difference or ratio proxy
        full_df[f"color_diff_{b1}_{b2}"] = full_df[col1] - full_df[col2]

# 3. Post-Process: TDE Specifics
# "Confident Blue Flux": High U-band flux that survived dust correction
if 'flux_w_mean_u' in full_df.columns and 'u_band_transparency' in full_df.columns:
    full_df['confident_blue_flux'] = full_df['flux_w_mean_u'] * full_df['u_band_transparency']

# 4. Cleanup: Drop Intermediate/Error Columns
cols_to_drop = [c for c in full_df.columns if c.startswith('w_flux_sum') or c.startswith('w_sum')]
print(f"Dropping {len(cols_to_drop)} intermediate calculation columns...")
full_df.drop(columns=cols_to_drop, inplace=True)

# 5. Final Merge with Target
final_dataset = full_df.merge(log_df, on='object_id', how='left')

# Drop non-feature columns
final_dataset.drop(columns=['English Translation'], errors='ignore', inplace=True)

print("Engineering Complete.")
print("Final Dataset Shape:", final_dataset.shape)
final_dataset.head()

Dropping 0 intermediate calculation columns...
Engineering Complete.
Final Dataset Shape: (3043, 86)


Unnamed: 0,object_id,Flux_corr_mean_g,Flux_corr_mean_i,Flux_corr_mean_r,Flux_corr_mean_u,Flux_corr_mean_y,Flux_corr_mean_z,Flux_corr_std_g,Flux_corr_std_i,Flux_corr_std_r,...,Time_(MJD)_duration_proxy_u,Time_(MJD)_duration_proxy_y,Time_(MJD)_duration_proxy_z,low_Z,Z,Z_err,EBV,SpecType,split,target
0,Dornhoth_fervain_onodrim,-0.541341,2.601577,1.412578,0.950745,-0.458879,1.48976,0.982121,7.802839,4.255918,...,849.3841,1241.0691,1241.0691,0,3.049,,0.11,AGN,split_01,0
1,Dornhoth_galadh_ylf,0.218875,0.394725,0.317303,0.03493,0.757253,0.573209,0.439899,1.080404,0.759543,...,2246.6157,2362.156,2297.967,1,0.4324,,0.058,SN II,split_01,0
2,Elrim_melethril_thul,3.757406,7.781453,5.462329,0.078012,0.065533,9.287801,2.995947,8.62449,4.07802,...,498.8939,767.8628,1093.2284,1,0.4673,,0.577,AGN,split_01,0
3,Ithil_tobas_rodwen,0.28994,0.455992,0.448044,0.161333,0.348274,0.540509,0.363483,0.64355,0.539392,...,2858.4129,2841.2281,2835.4998,1,0.6946,,0.012,AGN,split_01,0
4,Mirion_adar_Druadan,0.05612,0.428605,0.242114,-0.018189,0.308948,0.322406,0.976711,1.50729,1.246042,...,1859.9219,1809.1164,1901.8916,1,0.4161,,0.058,AGN,split_01,0


In [56]:
# === CELL: FEATURE CORRELATION CHECK ===
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

def analyze_high_correlations(df, threshold=0.95):
    """
    Identifies feature pairs with correlation above 'threshold'.
    Returns a dataframe of the pairs and a list of suggested columns to drop.
    """
    # Select only numeric columns
    numeric_df = df.select_dtypes(include=[np.number])
    
    # Calculate Correlation Matrix (Absolute value: -0.99 is just as redundant as 0.99)
    corr_matrix = numeric_df.corr().abs()
    
    # Select upper triangle of correlation matrix to avoid duplicates (A-B vs B-A) and self-correlation (A-A)
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    
    # Find features with correlation greater than threshold
    to_drop = [column for column in upper.columns if any(upper[column] > threshold)]
    
    # Create a detailed report
    record = []
    for col in upper.columns:
        # Get rows where correlation is high
        high_corr = upper.index[upper[col] > threshold].tolist()
        for row in high_corr:
            record.append({
                'Feature 1': row,
                'Feature 2': col,
                'Correlation': upper.loc[row, col]
            })
            
    report_df = pd.DataFrame(record)
    
    if not report_df.empty:
        report_df = report_df.sort_values(by='Correlation', ascending=False).reset_index(drop=True)
        print(f"‚ö†Ô∏è Found {len(report_df)} highly correlated pairs (Threshold: {threshold})")
        print(f"üìâ Suggest dropping {len(to_drop)} columns to reduce noise.")
    else:
        print("‚úÖ No highly correlated pairs found.")
        
    return report_df, to_drop

# --- EXECUTION ---
# Exclude 'target' and 'object_id' from this check
features_to_check = final_dataset.drop(columns=['target', 'object_id', 'Z_err'], errors='ignore')

corr_report, suggested_drops = analyze_high_correlations(features_to_check, threshold=0.95)

# Display Redundant Pairs
print("\n Correlated Feature Pairs:")
display(corr_report)

# Visual Heatmap (Only if < 50 features, otherwise it's unreadable)
if len(features_to_check.columns) < 50:
    plt.figure(figsize=(12, 10))
    sns.heatmap(features_to_check.corr(), cmap='coolwarm', center=0, annot=False)
    plt.title("Feature Correlation Matrix")
    plt.show()

‚ö†Ô∏è Found 26 highly correlated pairs (Threshold: 0.95)
üìâ Suggest dropping 15 columns to reduce noise.

 Correlated Feature Pairs:


Unnamed: 0,Feature 1,Feature 2,Correlation
0,correction_scale_max_y,correction_scale_max_z,0.999454
1,correction_scale_max_i,correction_scale_max_z,0.998196
2,correction_scale_max_i,correction_scale_max_r,0.99646
3,correction_scale_max_i,correction_scale_max_y,0.995571
4,correction_scale_max_g,correction_scale_max_r,0.991678
5,correction_scale_max_r,correction_scale_max_z,0.989628
6,correction_scale_max_y,EBV,0.989543
7,correction_scale_max_g,correction_scale_max_u,0.986763
8,correction_scale_max_z,EBV,0.9848
9,correction_scale_max_r,correction_scale_max_y,0.983937


In [57]:
# Save Processed Data
output_file = PROCESSED_DATA_PATH / ("train_features.parquet" if TRAIN_MODE else "test_features.parquet")
final_dataset.to_parquet(output_file, index=False)
print(f"Saved processed data to {output_file}")

Saved processed data to ..\data\processed\train_features.parquet
