# Ultimate Optimized NN - Target: R² > 0.82

## Implementierte Verbesserungen:
1. **Hyperparameter Optimization** (Optuna)
2. **LightGBM + NN Hybrid Ensemble**
3. **Stacked Meta-Ensemble**
4. **5-Fold Cross-Validation**
5. **Advanced Feature Engineering**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import Ridge
from sklearn.cluster import KMeans
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import lightgbm as lgb
import optuna
import warnings
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Pick the compute backend: Apple MPS first, then CUDA, otherwise the CPU.
# The availability probes are evaluated lazily and in this order, so CUDA is
# only queried when MPS is not available.
_backend_checks = (
    ('mps', torch.backends.mps.is_available),
    ('cuda', torch.cuda.is_available),
)
device = torch.device(
    next((name for name, is_available in _backend_checks if is_available()), 'cpu')
)

print(f"Device: {device}")

Device: mps


In [2]:
# ===== ADVANCED FEATURE ENGINEERING =====
def create_advanced_features(df, fit_kmeans=None):
    """Add the engineered feature columns to a copy of ``df``.

    The input frame is never modified. New columns are appended in a fixed
    order (ratios, polynomial terms, interactions, geospatial terms, the
    ``geo_cluster`` label and two log transforms).

    Args:
        df: Frame providing ``total_rooms``, ``total_bedrooms``, ``households``,
            ``population``, ``median_income``, ``housing_median_age``,
            ``longitude`` and ``latitude``.
        fit_kmeans: Already-fitted clusterer exposing ``predict``. When ``None``
            a new 10-cluster ``KMeans`` is fitted on the coordinates of ``df``.

    Returns:
        Tuple ``(features, kmeans)``: the enriched copy and the clusterer that
        produced ``geo_cluster`` (the newly fitted one, or ``fit_kmeans``).
    """
    out = df.copy()

    # Short aliases for the raw columns used repeatedly below.
    income = out['median_income']
    age = out['housing_median_age']
    rooms = out['total_rooms']
    bedrooms = out['total_bedrooms']
    people = out['population']
    households = out['households']
    lon, lat = out['longitude'], out['latitude']

    # --- Ratio features; the "+ 1" keeps every denominator non-zero ---
    out['rooms_per_household'] = rooms / (households + 1)
    out['bedrooms_ratio'] = bedrooms / (rooms + 1)
    out['population_density'] = people / (households + 1)
    out['income_per_bedroom'] = income / (bedrooms + 1)
    out['rooms_per_person'] = rooms / (people + 1)
    out['bedrooms_per_person'] = bedrooms / (people + 1)

    # --- Polynomial features (the most important ones) ---
    out['income_squared'] = income ** 2
    out['income_cubed'] = income ** 3
    out['income_sqrt'] = np.sqrt(income)
    out['age_squared'] = age ** 2

    # --- Interaction features ---
    out['income_x_rooms'] = income * out['rooms_per_household']
    out['income_x_age'] = income * age
    out['income_x_density'] = income * out['population_density']
    out['age_x_rooms'] = age * out['rooms_per_household']

    # --- Geospatial features ---
    out['location_interaction'] = lon * lat
    # Euclidean norm of the raw (longitude, latitude) pair.
    out['distance_to_center'] = np.sqrt(lon**2 + lat**2)

    # --- Geospatial cluster label ---
    coords = out[['longitude', 'latitude']]
    if fit_kmeans is not None:
        # Reuse the caller's clusterer so labels match the frame it was fitted on.
        kmeans = fit_kmeans
        out['geo_cluster'] = kmeans.predict(coords)
    else:
        kmeans = KMeans(n_clusters=10, random_state=42, n_init=10)
        out['geo_cluster'] = kmeans.fit_predict(coords)

    # --- Log transforms of skewed count features ---
    out['log_population'] = np.log1p(people)
    out['log_total_rooms'] = np.log1p(rooms)

    return out, kmeans

def preprocess_data(train_df, test_df):
    """Turn the raw train/test frames into aligned, fully numeric matrices.

    Steps, in order: split off the target, drop training rows whose target is
    below the 1st percentile, add engineered features (the geo clusterer is
    fitted on the training rows and reused for the test rows), drop ``id``,
    one-hot encode, align the test columns to the training columns and
    median-impute using statistics from the training data.

    Args:
        train_df: Training frame including the ``median_house_value`` column.
        test_df: Test frame with the same raw feature columns.

    Returns:
        Tuple ``(X_train, y_train, X_test)``. Both feature frames share the
        same columns in the same order; ``y_train`` is a NumPy array.
    """
    target_col = 'median_house_value'
    y_train = train_df[target_col].values
    X_train = train_df.drop(columns=[target_col])

    # Outlier removal: keep only rows at or above the 1st target percentile.
    keep_rows = y_train >= np.percentile(y_train, 1)
    X_train, y_train = X_train[keep_rows], y_train[keep_rows]

    # Feature engineering; the clusterer fitted on train labels the test rows.
    X_train, geo_model = create_advanced_features(X_train)
    X_test, _ = create_advanced_features(test_df.copy(), fit_kmeans=geo_model)

    # Identifier columns carry no signal (absent columns are ignored).
    X_train = X_train.drop(columns=['id'], errors='ignore')
    X_test = X_test.drop(columns=['id'], errors='ignore')

    # One-hot encode every object-typed column plus the cluster label.
    cat_cols = [*X_train.select_dtypes(include=['object']).columns, 'geo_cluster']
    X_train = pd.get_dummies(X_train, columns=cat_cols, drop_first=False)
    X_test = pd.get_dummies(X_test, columns=cat_cols, drop_first=False)

    # Align test to train: add dummy columns the test set never produced,
    # then select exactly the training columns (extras are dropped).
    absent_in_test = [name for name in X_train.columns if name not in X_test.columns]
    for name in absent_in_test:
        X_test[name] = 0
    X_test = X_test[X_train.columns]

    # Median imputation, fitted on the training frame only.
    imputer = SimpleImputer(strategy='median')
    train_filled = imputer.fit_transform(X_train)
    test_filled = imputer.transform(X_test)
    X_train = pd.DataFrame(train_filled, columns=X_train.columns)
    X_test = pd.DataFrame(test_filled, columns=X_test.columns)

    return X_train, y_train, X_test

# Read the train/test CSVs located one directory above the working directory
# and run them through the shared preprocessing pipeline.
print("Loading and preprocessing data...")
train_df = pd.read_csv('../train.csv')
test_df = pd.read_csv('../test.csv')
# X_full / y_full: engineered, imputed training features and the
# outlier-filtered target; X_test: test features aligned to X_full's columns.
X_full, y_full, X_test = preprocess_data(train_df, test_df)
print(f"Features: {X_full.shape[1]} | Samples: {len(X_full)}")

Loading and preprocessing data...
Features: 41 | Samples: 16346


In [3]:
# ===== DATA AUGMENTATION =====
def augment_data(X, y, noise_level=0.025, augment_factor=4, random_state=None):
    """Enlarge a regression dataset with noisy copies of every sample.

    The original rows come first in the output, followed by
    ``augment_factor - 1`` jittered copies. Feature noise is Gaussian with a
    per-column standard deviation of ``noise_level * std(column)``; targets
    are multiplied by a factor drawn from ``N(1.0, 0.008)``.

    Args:
        X: 2-D NumPy array of features.
        y: 1-D NumPy array of targets, one per row of ``X``.
        noise_level: Feature-noise scale relative to each column's std.
        augment_factor: Total size multiplier. Values ``<= 1`` return the
            data unchanged (no noisy copies are generated).
        random_state: ``None`` draws from NumPy's global RNG (so a preceding
            ``np.random.seed`` call still controls the result). An int seeds
            a private ``RandomState``; an object that already has a
            ``normal`` method (``RandomState`` / ``Generator``) is used as is.

    Returns:
        Tuple ``(X_aug, y_aug)`` of vertically / horizontally stacked arrays.
    """
    if random_state is None:
        rng = np.random
    elif hasattr(random_state, 'normal'):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    feature_std = np.std(X, axis=0)
    X_parts, y_parts = [X], [y]

    for _ in range(augment_factor - 1):
        # Draw order (features first, then targets) is kept stable so that a
        # given seed always yields the same augmented dataset.
        X_parts.append(X + rng.normal(0, noise_level, X.shape) * feature_std)
        y_parts.append(y * rng.normal(1.0, 0.008, y.shape))

    return np.vstack(X_parts), np.hstack(y_parts)

In [4]:
# ===== OPTIMIZED RESIDUAL NETWORK =====
class ResidualBlock(nn.Module):
    """Width-preserving residual unit.

    Computes ``Dropout(LeakyReLU(BatchNorm(Linear(x)))) + x``; input and
    output both have ``dim`` features.
    """

    def __init__(self, dim, dropout_rate=0.2):
        """Build the block.

        Args:
            dim: Feature width of the input and of the output.
            dropout_rate: Dropout probability applied after the activation.
        """
        super().__init__()
        # Attribute names double as state_dict keys, so they stay as they are.
        self.fc = nn.Linear(dim, dim)
        self.bn = nn.BatchNorm1d(dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.activation = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Apply the transform branch and add the skip connection."""
        branch = self.dropout(self.activation(self.bn(self.fc(x))))
        return branch + x

class OptimizedResidualNet(nn.Module):
    """MLP regressor: encoder -> residual blocks -> tapering decoder.

    The encoder projects the input to ``hidden_dim``; ``n_residual_blocks``
    ``ResidualBlock`` layers keep that width; the decoder narrows to
    ``hidden_dim // 2``, then ``hidden_dim // 4``, then a single output unit.
    """

    def __init__(self, input_dim, hidden_dim=128, n_residual_blocks=2, dropout_rate=0.25):
        """Build the network.

        Args:
            input_dim: Number of input features.
            hidden_dim: Width of the encoder output and of every residual block.
            n_residual_blocks: Number of stacked ``ResidualBlock`` layers.
            dropout_rate: Dropout probability in the encoder and residual
                blocks; the two decoder stages use 0.7x and 0.5x this value.
        """
        super().__init__()

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.LeakyReLU(0.1),
            nn.Dropout(dropout_rate)
        )

        # Residual Blocks
        self.res_blocks = nn.ModuleList([
            ResidualBlock(hidden_dim, dropout_rate) for _ in range(n_residual_blocks)
        ])

        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.LeakyReLU(0.1),
            nn.Dropout(dropout_rate * 0.7),

            nn.Linear(hidden_dim // 2, hidden_dim // 4),
            nn.BatchNorm1d(hidden_dim // 4),
            nn.LeakyReLU(0.1),
            nn.Dropout(dropout_rate * 0.5),

            nn.Linear(hidden_dim // 4, 1)
        )

    def forward(self, x):
        """Return one prediction per row of a ``(batch, input_dim)`` input."""
        x = self.encoder(x)
        for res_block in self.res_blocks:
            x = res_block(x)
        x = self.decoder(x)
        # Drop only the trailing unit axis. A bare squeeze() would also remove
        # the batch axis for a batch of one, yielding a 0-d tensor whose shape
        # no longer matches the target tensor in the loss.
        return x.squeeze(-1)

In [5]:
# ===== HYPERPARAMETER OPTIMIZATION WITH OPTUNA =====
def objective(trial, X_train, y_train, X_val, y_val):
    """Optuna objective: train one candidate network and score it.

    Samples learning rate, weight decay, dropout, hidden width, number of
    residual blocks and batch size from ``trial``, trains an
    ``OptimizedResidualNet`` for at most 200 epochs with early stopping
    (patience 20) and reports the validation loss.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X_train: Scaled training features (array-like accepted by
            ``torch.FloatTensor``).
        y_train: Scaled training targets.
        X_val: Scaled validation features.
        y_val: Scaled validation targets.

    Returns:
        The best validation Huber loss seen, averaged over validation
        batches. A new best is only recorded when it beats the previous one
        by more than ``1e-4``.
    """
    # suggest_float replaces suggest_loguniform / suggest_uniform, which are
    # deprecated since Optuna 3.0. Parameter names and ranges are unchanged,
    # so study.best_params keeps the same keys.
    lr = trial.suggest_float('lr', 1e-4, 5e-3, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-3, 5e-2, log=True)
    dropout = trial.suggest_float('dropout', 0.15, 0.35)
    hidden_dim = trial.suggest_categorical('hidden_dim', [96, 128, 160, 192])
    n_res_blocks = trial.suggest_int('n_res_blocks', 1, 3)
    batch_size = trial.suggest_categorical('batch_size', [64, 128, 256])

    # Move the full arrays to the device once; the loaders then slice
    # device-resident tensors.
    X_train_t = torch.FloatTensor(X_train).to(device)
    y_train_t = torch.FloatTensor(y_train).to(device)
    X_val_t = torch.FloatTensor(X_val).to(device)
    y_val_t = torch.FloatTensor(y_val).to(device)

    train_loader = DataLoader(
        TensorDataset(X_train_t, y_train_t),
        batch_size=batch_size,
        shuffle=True
    )
    val_loader = DataLoader(
        TensorDataset(X_val_t, y_val_t),
        batch_size=batch_size,
        shuffle=False
    )

    # Model
    model = OptimizedResidualNet(
        input_dim=X_train.shape[1],
        hidden_dim=hidden_dim,
        n_residual_blocks=n_res_blocks,
        dropout_rate=dropout
    ).to(device)

    criterion = nn.HuberLoss(delta=1.0)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=30)

    # Training (fewer epochs and shorter patience than the final training run,
    # to keep each trial fast)
    best_val_loss = float('inf')
    patience = 20
    patience_counter = 0

    for epoch in range(200):
        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            pred = model(X_batch)
            loss = criterion(pred, y_batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                pred = model(X_batch)
                val_loss += criterion(pred, y_batch).item()
        val_loss /= len(val_loader)

        # The scheduler is stepped once per epoch.
        scheduler.step()

        # Early stopping: improvements smaller than 1e-4 count as stagnation.
        if val_loss < best_val_loss - 1e-4:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break

    return best_val_loss

# Step 1 driver: build a dedicated train/validation split, prepare the data
# the same way the final training does, and let Optuna search the network
# hyperparameters.
print("\n" + "="*60)
print("STEP 1: HYPERPARAMETER OPTIMIZATION")
print("="*60)

# Dedicated 80/20 split used only for the hyperparameter search
X_train_hp, X_val_hp, y_train_hp, y_val_hp = train_test_split(
    X_full, y_full, test_size=0.2, random_state=42
)

# Augment the training part only; the validation part stays untouched.
# augment_data draws from NumPy's global RNG and no seed is set before this
# call in the cells shown, so the noisy copies differ between runs.
X_train_aug_hp, y_train_aug_hp = augment_data(X_train_hp.values, y_train_hp, augment_factor=4)

# Target transform: log1p, then standardisation fitted on the (augmented)
# training targets and applied to the validation targets.
y_train_log_hp = np.log1p(y_train_aug_hp)
y_val_log_hp = np.log1p(y_val_hp)
y_scaler_hp = StandardScaler()
y_train_scaled_hp = y_scaler_hp.fit_transform(y_train_log_hp.reshape(-1, 1)).flatten()
y_val_scaled_hp = y_scaler_hp.transform(y_val_log_hp.reshape(-1, 1)).flatten()

# Feature standardisation, likewise fitted on the augmented training features.
scaler_hp = StandardScaler()
X_train_scaled_hp = scaler_hp.fit_transform(X_train_aug_hp)
X_val_scaled_hp = scaler_hp.transform(X_val_hp)

# Run Optuna: 50 trials minimising the value returned by objective().
# NOTE(review): no sampler seed is passed to create_study, so the search is
# presumably not reproducible between runs - confirm if that matters.
study = optuna.create_study(direction='minimize')
study.optimize(
    lambda trial: objective(trial, X_train_scaled_hp, y_train_scaled_hp, X_val_scaled_hp, y_val_scaled_hp),
    n_trials=50,
    show_progress_bar=True
)

# best_params is consumed by the ensemble training in the next step.
best_params = study.best_params
print(f"\nBest Hyperparameters:")
for k, v in best_params.items():
    print(f"  {k}: {v}")
print(f"Best Val Loss: {study.best_value:.4f}")


STEP 1: HYPERPARAMETER OPTIMIZATION


  0%|          | 0/50 [00:00<?, ?it/s]

[W 2025-11-20 13:52:13,313] Trial 10 failed with parameters: {'lr': 0.0014213417885061066, 'weight_decay': 0.001148776094824234, 'dropout': 0.19362508558795238, 'hidden_dim': 192, 'n_res_blocks': 2, 'batch_size': 64} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/cedricstillecke/Documents/CloudExplain/DataScienceTutorial/.venv/lib/python3.13/site-packages/optuna/study/_optimize.py", line 205, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/10/t6g6ht253j50pfcx5b145zm40000gn/T/ipykernel_46354/1431759198.py", line 102, in <lambda>
    lambda trial: objective(trial, X_train_scaled_hp, y_train_scaled_hp, X_val_scaled_hp, y_val_scaled_hp),
                  ~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/10/t6g6ht253j50pfcx5b145zm40000gn/T/ipykernel_46354/1431759198.py", line 53, in objective
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm

KeyboardInterrupt: 

In [None]:
# ===== TRAIN NEURAL NETWORK ENSEMBLE WITH BEST PARAMS =====
# Step banner only; the training helper and the ensemble loop follow below.
print("\n" + "="*60)
print("STEP 2: TRAINING OPTIMIZED NN ENSEMBLE")
print("="*60)

def train_optimized_model(X_train, y_train, X_val, y_val, params, seed=42):
    """Train one ``OptimizedResidualNet`` and return it at its best epoch.

    Trains for at most 500 epochs with AdamW, Huber loss, gradient clipping
    and cosine warm restarts. After every epoch the mean per-batch validation
    loss is computed; the weights of the best epoch are snapshotted and
    restored before returning. Training stops after 50 epochs without an
    improvement of more than ``1e-4``.

    Args:
        X_train: Scaled training features (array-like accepted by
            ``torch.FloatTensor``).
        y_train: Scaled training targets.
        X_val: Scaled validation features.
        y_val: Scaled validation targets.
        params: Mapping with the keys ``batch_size``, ``hidden_dim``,
            ``n_res_blocks``, ``dropout``, ``lr`` and ``weight_decay``.
        seed: Seed applied to NumPy's and PyTorch's global RNGs.

    Returns:
        The trained model with the best-epoch weights loaded.

    Raises:
        RuntimeError: If the validation loss never improved on its initial
            value (e.g. it was NaN in every epoch), so no checkpoint exists.
    """
    np.random.seed(seed)
    torch.manual_seed(seed)

    X_train_t = torch.FloatTensor(X_train).to(device)
    y_train_t = torch.FloatTensor(y_train).to(device)
    X_val_t = torch.FloatTensor(X_val).to(device)
    y_val_t = torch.FloatTensor(y_val).to(device)

    train_loader = DataLoader(
        TensorDataset(X_train_t, y_train_t),
        batch_size=params['batch_size'],
        shuffle=True
    )
    val_loader = DataLoader(
        TensorDataset(X_val_t, y_val_t),
        batch_size=params['batch_size'],
        shuffle=False
    )

    model = OptimizedResidualNet(
        input_dim=X_train.shape[1],
        hidden_dim=params['hidden_dim'],
        n_residual_blocks=params['n_res_blocks'],
        dropout_rate=params['dropout']
    ).to(device)

    criterion = nn.HuberLoss(delta=1.0)
    optimizer = optim.AdamW(
        model.parameters(),
        lr=params['lr'],
        weight_decay=params['weight_decay']
    )
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=50, T_mult=2)

    best_loss = float('inf')
    best_state = None
    patience = 50
    patience_counter = 0

    for epoch in range(500):
        model.train()
        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            pred = model(X_batch)
            loss = criterion(pred, y_batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                pred = model(X_batch)
                val_loss += criterion(pred, y_batch).item()
        val_loss /= len(val_loader)

        scheduler.step()

        if val_loss < best_loss - 1e-4:
            best_loss = val_loss
            # state_dict() returns references to the live parameter tensors and
            # dict.copy() is shallow, so the optimizer would keep overwriting
            # the "snapshot" in place. Clone every tensor to freeze this epoch.
            best_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break

    if best_state is None:
        raise RuntimeError(
            "Validation loss never improved; no checkpoint to restore "
            "(check for NaN/inf losses)."
        )

    model.load_state_dict(best_state)
    return model

# Train/validation split for the final models (85/15). This differs from the
# 80/20 split used during the hyperparameter search.
X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full, test_size=0.15, random_state=42
)

# Augmentation is applied to the training part only.
X_train_aug, y_train_aug = augment_data(X_train.values, y_train, augment_factor=4)

# Target transform: log1p followed by standardisation fitted on the augmented
# training targets. y_scaler is reused later to map network outputs back.
y_train_log = np.log1p(y_train_aug)
y_val_log = np.log1p(y_val)
y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train_log.reshape(-1, 1)).flatten()
y_val_scaled = y_scaler.transform(y_val_log.reshape(-1, 1)).flatten()

# Feature standardisation fitted on the augmented training features and
# applied unchanged to the validation and test features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_aug)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Train the ensemble: one network per seed, all sharing best_params from the
# Optuna study. The same seeds list is reused by the LightGBM step.
nn_models = []
seeds = [42, 123, 456, 789, 999]

for i, seed in enumerate(seeds, 1):
    # The "/5" in the message is hard-coded and matches len(seeds) above.
    print(f"Training NN model {i}/5 (seed={seed})...")
    model = train_optimized_model(
        X_train_scaled, y_train_scaled,
        X_val_scaled, y_val_scaled,
        best_params, seed
    )
    nn_models.append(model)

print("NN Ensemble Complete!")

In [None]:
# ===== TRAIN LIGHTGBM MODELS =====
# Trains one gradient-boosted model per entry in `seeds`. Unlike the neural
# networks, these models are fitted on the unscaled, non-augmented X_train
# and on the raw (not log-transformed) target y_train.
print("\n" + "="*60)
print("STEP 3: TRAINING LIGHTGBM ENSEMBLE")
print("="*60)

lgb_models = []
# Shared booster parameters; only 'random_state' changes between models.
lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'max_depth': 8,
    'min_child_samples': 20,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1
}

for i, seed in enumerate(seeds, 1):
    print(f"Training LightGBM model {i}/5 (seed={seed})...")
    # The shared dict is mutated in place, so after the loop it holds the
    # last seed.
    lgb_params['random_state'] = seed
    
    # Datasets are rebuilt for every model; the validation set references the
    # training set.
    train_data = lgb.Dataset(X_train, y_train)
    val_data = lgb.Dataset(X_val, y_val, reference=train_data)
    
    # Up to 2000 boosting rounds, stopped early after 50 rounds without
    # improvement on the validation set; per-round logging is disabled.
    model = lgb.train(
        lgb_params,
        train_data,
        num_boost_round=2000,
        valid_sets=[val_data],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )
    lgb_models.append(model)

print("LightGBM Ensemble Complete!")

In [None]:
# ===== STACKED META-ENSEMBLE =====
# Combines the base models with a Ridge meta-model whose inputs are the base
# models' validation predictions (one column per model, NN columns first).
print("\n" + "="*60)
print("STEP 4: STACKED META-ENSEMBLE")
print("="*60)

# Neural-network predictions on the validation set, mapped back to the
# original target scale (undo standardisation, then undo log1p).
X_val_tensor = torch.FloatTensor(X_val_scaled).to(device)
nn_val_preds = []
for model in nn_models:
    model.eval()
    with torch.no_grad():
        pred_scaled = model(X_val_tensor).cpu().numpy()
    pred_log = y_scaler.inverse_transform(pred_scaled.reshape(-1, 1)).flatten()
    nn_val_preds.append(np.expm1(pred_log))

# LightGBM predictions at each model's early-stopped best iteration.
lgb_val_preds = []
for model in lgb_models:
    lgb_val_preds.append(model.predict(X_val, num_iteration=model.best_iteration))

# Stack predictions as meta-features
meta_features_val = np.column_stack(nn_val_preds + lgb_val_preds)
y_val_array = np.asarray(y_val)

# Out-of-fold stacked predictions. Fitting the meta-model on the validation
# rows and then scoring it on those same rows would report in-sample (inflated)
# metrics, so every row is predicted by a Ridge model that never saw it.
# The base models were still early-stopped on this validation set, so the
# resulting scores remain somewhat optimistic.
val_predictions_stacked = np.empty(len(y_val_array), dtype=float)
oof_splitter = KFold(n_splits=5, shuffle=True, random_state=42)
for fit_idx, holdout_idx in oof_splitter.split(meta_features_val):
    fold_model = Ridge(alpha=1.0)
    fold_model.fit(meta_features_val[fit_idx], y_val_array[fit_idx])
    val_predictions_stacked[holdout_idx] = fold_model.predict(meta_features_val[holdout_idx])

# Final meta-model (used for the test predictions): fitted on all validation rows.
meta_model = Ridge(alpha=1.0)
meta_model.fit(meta_features_val, y_val)

print(f"Meta-model weights:")
weights = meta_model.coef_
n_nn_models = len(nn_models)
for i, w in enumerate(weights):
    # Columns are ordered NN first, then LightGBM; number each family from 1
    # even when the two ensembles differ in size.
    if i < n_nn_models:
        label = f"NN-{i + 1}"
    else:
        label = f"LGB-{i - n_nn_models + 1}"
    print(f"  {label}: {w:.4f}")

In [None]:
# ===== FINAL VALIDATION METRICS =====
# Scores the stacked predictions produced by the stacking step against the
# validation targets, on the original (untransformed) target scale.
rmse = np.sqrt(mean_squared_error(y_val, val_predictions_stacked))
mae = mean_absolute_error(y_val, val_predictions_stacked)
r2 = r2_score(y_val, val_predictions_stacked)
# MAPE divides by y_val, so it is only defined when no validation target is zero.
mape = np.mean(np.abs((y_val - val_predictions_stacked) / y_val)) * 100

# The "5xNN + 5xLGB" label below is hard-coded rather than derived from the
# model lists.
print(f"\n{'='*60}")
print(f"FINAL VALIDATION METRICS (Stacked Ensemble: 5xNN + 5xLGB)")
print(f"{'='*60}")
print(f"RMSE:  ${rmse:,.2f}")
print(f"MAE:   ${mae:,.2f}")
print(f"R²:    {r2:.4f}")
print(f"MAPE:  {mape:.2f}%")
print(f"{'='*60}")

In [None]:
# ===== TEST PREDICTIONS =====
# Runs every base model on the test features, stacks the predictions in the
# same column order used to fit the meta-model (NN first, then LightGBM) and
# writes the meta-model's output to a CSV file.
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)

# NN predictions, mapped back to the original target scale
# (undo standardisation, then undo log1p).
nn_test_preds = []
for model in nn_models:
    model.eval()
    with torch.no_grad():
        pred_scaled = model(X_test_tensor).cpu().numpy()
        pred_log = y_scaler.inverse_transform(pred_scaled.reshape(-1, 1)).flatten()
        pred = np.expm1(pred_log)
        nn_test_preds.append(pred)

# LightGBM predictions at each model's early-stopped best iteration
lgb_test_preds = []
for model in lgb_models:
    pred = model.predict(X_test, num_iteration=model.best_iteration)
    lgb_test_preds.append(pred)

# Stack and predict
meta_features_test = np.column_stack(nn_test_preds + lgb_test_preds)
test_predictions = meta_model.predict(meta_features_test)

# Submission
# NOTE(review): 'Id' is a synthetic 0-based row counter. preprocess_data drops
# an optional 'id' column, so if test.csv carries real ids they are not used
# here - confirm which identifier the submission format expects.
submission = pd.DataFrame({
    'Id': range(len(test_predictions)),
    'Predicted': test_predictions
})

submission.to_csv('submission_ultimate.csv', index=False)
print(f"\n✓ Submission saved: submission_ultimate.csv")