# Neural Network Preprocessing

This notebook prepares data specifically for Neural Network models. While tree-based models (XGBoost, LightGBM, CatBoost) can handle raw data, Neural Networks require:
- Feature engineering (NaN count)
- Imputation (median fill)
- Scaling (StandardScaler)

**Important**: The scaler is fit on training data only, then used to transform both train and test.

In [8]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib
import warnings

# Suppress all warnings for the rest of the session: the 'ignore' action is
# installed with no message/category/module restriction, so nothing is shown.
warnings.filterwarnings('ignore')

In [9]:
# Configuration
# True  -> read the training features, fit and save the scaler.
# False -> read the test features and reuse the saved scaler.
TRAIN_MODE = False

# Paths: every input and output lives in the processed-data directory.
DATA_DIR = os.path.join('..', 'data', 'processed')
(
    TRAIN_INPUT,
    TEST_INPUT,
    TRAIN_OUTPUT,
    TEST_OUTPUT,
    SCALER_PATH,
) = (
    os.path.join(DATA_DIR, file_name)
    for file_name in (
        '2dgp_train_features.parquet',
        '2dgp_test_features.parquet',
        'train_processed_nn.parquet',
        'test_processed_nn.parquet',
        'nn_scaler.pkl',
    )
)

In [10]:
# Load data based on mode: choose the message and the input file first,
# then perform a single read.
if TRAIN_MODE:
    load_message, input_path = "Loading TRAINING data...", TRAIN_INPUT
else:
    load_message, input_path = "Loading TEST data...", TEST_INPUT

print(load_message)
df = pd.read_parquet(input_path)

# Quick sanity check: overall shape plus the first ten column names
first_columns = df.columns.tolist()[:10]
print(f"Loaded data shape: {df.shape}")
print(f"Columns: {first_columns}...")

Loading TEST data...
Loaded data shape: (7135, 289)
Columns: ['object_id', 'Flux_mean_g', 'Flux_mean_i', 'Flux_mean_r', 'Flux_mean_u', 'Flux_mean_y', 'Flux_mean_z', 'Flux_max_g', 'Flux_max_i', 'Flux_max_r']...


In [11]:
# Identify feature columns (exclude metadata)
metadata_cols = ['object_id', 'target', 'split', 'SpecType']

# Metadata columns actually present in this frame, in metadata_cols order
existing_metadata = []
for meta_name in metadata_cols:
    if meta_name in df.columns:
        existing_metadata.append(meta_name)

# Every remaining column is treated as a model feature, in frame order
feature_cols = []
for column_name in df.columns:
    if column_name not in metadata_cols:
        feature_cols.append(column_name)

print(f"Metadata columns found: {existing_metadata}")
print(f"Number of feature columns: {len(feature_cols)}")

Metadata columns found: ['object_id']
Number of feature columns: 288


In [12]:
# Feature Engineering: Create nan_count column
# For each row, count how many of the feature columns are missing.
print("Creating nan_count feature...")
missing_mask = df[feature_cols].isna()
df['nan_count'] = missing_mask.sum(axis='columns')

nan_count_summary = df['nan_count'].describe()
print(f"nan_count statistics:")
print(nan_count_summary)

Creating nan_count feature...
nan_count statistics:
count    7135.000000
mean        1.954450
std         5.159255
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max        37.000000
Name: nan_count, dtype: float64


In [13]:
# Imputation: Fill NaNs with column medians learned from the TRAINING data.
#
# The medians are preprocessing statistics just like the scaler's mean/std, so
# they are fit once in TRAIN_MODE, saved to disk, and reused for the test set.
# Computing them on the test frame would impute train and test with different
# values for the same feature.
print("\nImputing NaN values with median...")
nan_before = df[feature_cols].isna().sum().sum()
print(f"Total NaN values before imputation: {nan_before}")

# Update feature_cols to include nan_count
feature_cols_with_nan_count = feature_cols + ['nan_count']

# The medians file is stored alongside the other artifacts in DATA_DIR.
MEDIANS_PATH = os.path.join(DATA_DIR, 'nn_medians.pkl')

if TRAIN_MODE:
    # Fit: one median per numeric feature column, then persist for the test run.
    medians = df[feature_cols].median(numeric_only=True)
    joblib.dump(medians, MEDIANS_PATH)
    print(f"Training medians saved to: {MEDIANS_PATH}")
elif os.path.exists(MEDIANS_PATH):
    # NOTE: joblib.load unpickles the file; only load artifacts written by this notebook.
    medians = joblib.load(MEDIANS_PATH)
    print(f"Loaded training medians from: {MEDIANS_PATH}")
else:
    # Best-effort fallback so the notebook still runs when no medians file
    # exists yet. Re-run with TRAIN_MODE = True to create it.
    print(f"WARNING: {MEDIANS_PATH} not found; falling back to medians of the current data.")
    medians = df[feature_cols].median(numeric_only=True)

# Fill NaN with median (a Series fills each column by its matching label;
# columns without an entry in `medians` are left untouched).
df[feature_cols] = df[feature_cols].fillna(medians)

nan_after = df[feature_cols].isna().sum().sum()
print(f"Total NaN values after imputation: {nan_after}")
if nan_after > 0:
    # Happens when a column has no usable median (e.g. all-NaN in training).
    still_missing = df[feature_cols].isna().any()
    print(f"WARNING: columns still containing NaN: {still_missing[still_missing].index.tolist()}")


Imputing NaN values with median...
Total NaN values before imputation: 13945
Total NaN values after imputation: 0


In [14]:
# Scaling
if not TRAIN_MODE:
    print("\nLoading fitted StandardScaler for TEST data...")
    # NOTE: joblib.load unpickles the file, so SCALER_PATH must be a trusted artifact.
    scaler = joblib.load(SCALER_PATH)

    # When the scaler recorded the column names it was fitted on, use them as
    # the source of truth; otherwise keep the current column list unchanged
    # (fallback for older sklearn versions, unlikely needed).
    expected_features = getattr(scaler, 'feature_names_in_', None)
    if expected_features is not None:
        # Fail early if the test frame lacks any column the scaler expects
        missing = set(expected_features).difference(df.columns)
        if missing:
            raise ValueError(f"Test data is missing features expected by scaler: {missing}")

        # Adopt the scaler's own column list and ordering
        feature_cols_with_nan_count = list(expected_features)

    # Select the columns in the expected order, then transform in place
    X_test_aligned = df[feature_cols_with_nan_count]
    df[feature_cols_with_nan_count] = scaler.transform(X_test_aligned)

    print("Test data transformed using training scaler.")
else:
    print("\nFitting StandardScaler on TRAINING data...")
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df[feature_cols_with_nan_count])
    df[feature_cols_with_nan_count] = scaled_values

    # Persist the fitted scaler so the test run can reuse it
    joblib.dump(scaler, SCALER_PATH)
    print(f"Scaler saved to: {SCALER_PATH}")


Loading fitted StandardScaler for TEST data...
Test data transformed using training scaler.


In [15]:
# Save output
# The destination depends on which dataset this run processed.
output_path = TRAIN_OUTPUT if TRAIN_MODE else TEST_OUTPUT

# Write the processed frame without its index
df.to_parquet(output_path, index=False)

saved_shape = df.shape
print(f"\nProcessed data saved to: {output_path}")
print(f"Shape: {saved_shape}")


Processed data saved to: ..\data\processed\test_processed_nn.parquet
Shape: (7135, 290)


In [16]:
# Verification
# Report three quick checks on the processed frame: the engineered column is
# present, how many NaNs remain in the scaled features, and a small preview.
has_nan_count = 'nan_count' in df.columns
remaining_nans = df[feature_cols_with_nan_count].isna().sum().sum()
preview_cols = feature_cols_with_nan_count[:3]

print("\n=== Verification ===")
print(f"nan_count column exists: {has_nan_count}")
print(f"Remaining NaN values: {remaining_nans}")
print(f"\nSample of scaled features (first 5 rows, first 3 feature columns):")
print(df[preview_cols].head())


=== Verification ===
nan_count column exists: True
Remaining NaN values: 0

Sample of scaled features (first 5 rows, first 3 feature columns):
   Flux_mean_g  Flux_mean_i  Flux_mean_r
0     2.277641     2.104160     1.374098
1    -0.333957    -0.308295    -0.277532
2     0.302764     1.419263     0.271161
3    -0.230878    -0.249474    -0.203311
4    -0.073605    -0.284840    -0.175776
