In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from scipy.stats import uniform, randint
import shap
from xgboost import XGBRegressor


In [None]:
# Read the training CSV and the regression test CSV.
df = pd.read_csv("AppML_InitialProject_train.csv")
real_data = pd.read_csv("AppML_InitialProject_test_regression.csv")

target_column = 'p_Truth_Energy'

# Candidate features: every column except the regression target and the
# "p_Truth_isElectron" column.
excluded_columns = {target_column, "p_Truth_isElectron"}
all_features = [col for col in df.columns if col not in excluded_columns]

# Standardise all candidate features; this scaler is only used for the
# feature-selection stage below.
scaler_fs = StandardScaler()
X_normalized = scaler_fs.fit_transform(df[all_features])
X_normalized_df = pd.DataFrame(data=X_normalized, columns=all_features)


In [None]:
# Stage 1 of feature selection: rank features by Random Forest importance.
X_temp = X_normalized
y_temp = df[target_column]
X_temp_train, X_temp_test, y_temp_train, y_temp_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.3,
    random_state=42,
)

# fit() returns the estimator itself, so construction and fitting can be chained.
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    random_state=42,
    n_jobs=-1,
).fit(X_temp_train, y_temp_train)

rf_importance = rf_model.feature_importances_
# argsort is ascending, so the last 70 positions hold the 70 largest importances.
top_rf_indices = np.argsort(rf_importance)[-70:]
rf_features = [all_features[idx] for idx in top_rf_indices]
print(f"Features after Random Forest analysis: {len(rf_features)}")


In [None]:
# Stage 2 of feature selection: rank the Random Forest survivors by mean |SHAP|.
X_rf = X_normalized_df[rf_features]

xgb_settings = dict(
    n_estimators=100,
    max_depth=6,
    random_state=42,
    device='cuda',
    n_jobs=-1,
)
xgb_model = XGBRegressor(**xgb_settings)
xgb_model.fit(X_rf, df[target_column])

# SHAP values are computed on the first 1000 rows only.
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_rf.iloc[:1000])
# Per-feature importance = mean absolute SHAP value over those rows.
shap_importance = np.mean(np.abs(shap_values), axis=0)

# argsort is ascending, so the last 20 positions hold the 20 most important features.
top_shap_indices = np.argsort(shap_importance)[-20:]
selected_columns = [rf_features[idx] for idx in top_shap_indices]
print(f"Final selected features: {len(selected_columns)}")
print("Selected features:", selected_columns)

In [None]:
# Build the modelling matrices from the selected features only.
X_df = df[selected_columns]

# This scaler is fitted on the selected columns and reused later to transform real_data.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_df)

X = np.array(X_scaled)
y = np.array(df[target_column])

# 80/20 hold-out split shared by the models below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# XGBoost solution

In [None]:
# Native XGBoost containers for the training split, the held-out split and the full data set.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
dall = xgb.DMatrix(X, label=y)

print("Performing Hyperparameter Optimization")

# Number of random parameter combinations to evaluate.
N_TRIALS = 10

# Dedicated, seeded generator so the sampled search space is identical on every
# Restart & Run All (the global np.random state is not seeded anywhere above).
search_rng = np.random.default_rng(42)

# Random search space: each entry is one full parameter set for xgb.cv.
param_grid = [
    {
        'max_depth': int(search_rng.integers(3, 15)),  # integers in [3, 15)
        'learning_rate': search_rng.uniform(0.01, 0.21),
        'subsample': search_rng.uniform(0.6, 1.0),
        'colsample_bytree': search_rng.uniform(0.6, 1.0),
        'reg_alpha': search_rng.uniform(0, 1),
        'reg_lambda': search_rng.uniform(0, 1),
        'objective': 'reg:squarederror',
        'device': 'cuda',
        'random_state': 42
    }
    for _ in range(N_TRIALS)
]

best_rmse = float('inf')
best_params = None
best_cv_results = None

for i, params in enumerate(param_grid):
    print(f"Testing parameter combination {i+1}/{len(param_grid)}...")

    # Cross-validate on the training split only: running the search on the full
    # data set would let the held-out test rows influence model selection and
    # make the later test metrics optimistic.
    cv_results = xgb.cv(
        params=params,
        dtrain=dtrain,
        num_boost_round=500,
        nfold=3,
        metrics=['rmse'],
        early_stopping_rounds=50,
        seed=42,
        verbose_eval=False
    )

    # With early stopping the returned history ends at the best round, so the
    # last row is the best score and its length is the number of rounds to use.
    final_rmse = cv_results['test-rmse-mean'].iloc[-1]

    if final_rmse < best_rmse:
        best_rmse = final_rmse
        best_params = params.copy()
        best_params['n_estimators'] = len(cv_results)
        best_cv_results = cv_results

    print(f"  RMSE: {final_rmse:.4f}, n_estimators: {len(cv_results)}")

# Fail with a clear message instead of an AttributeError on None below.
if best_params is None:
    raise RuntimeError("No parameter combination produced a finite cross-validation RMSE")

print(f"\nBest parameters found:")
for param, value in best_params.items():
    if param != 'n_estimators':
        print(f"  {param}: {value}")
print(f"  n_estimators: {best_params['n_estimators']}")
print(f"Best cross-validation RMSE: {best_rmse:.4f}")


In [None]:
# 'n_estimators' is not passed as a booster parameter; it is supplied as num_boost_round.
booster_params = {name: value for name, value in best_params.items() if name != 'n_estimators'}
n_boost_rounds = best_params['n_estimators']

# Re-run 3-fold CV with the winning configuration, this time tracking RMSE and MAE.
final_cv_results = xgb.cv(
    params=dict(booster_params),  # separate dict per call, as in the two-call original
    dtrain=dall,
    num_boost_round=n_boost_rounds,
    nfold=3,
    metrics=['rmse', 'mae'],
    seed=42,
    verbose_eval=False,
)

# The last row of the CV history holds the scores after the final boosting round.
last_round = final_cv_results.iloc[-1]
final_rmse = last_round['test-rmse-mean']
final_rmse_std = last_round['test-rmse-std']
final_mae = last_round['test-mae-mean']
final_mae_std = last_round['test-mae-std']

print("Cross Validation Results:")
print(f"RMSE - Mean: {final_rmse:.4f}, Std: {final_rmse_std:.4f}")
print(f"MAE - Mean: {final_mae:.4f}, Std: {final_mae_std:.4f}")
# Reported as the square of the mean RMSE.
print(f"MSE - Mean: {final_rmse**2:.4f}")

print("Training final model")
final_model = xgb.train(
    params=dict(booster_params),
    dtrain=dtrain,
    num_boost_round=n_boost_rounds,
)

In [None]:
from sklearn.metrics import r2_score

# Predictions for the held-out split.
y_pred = final_model.predict(dtest)

# Predictions for the real test data: same selected columns, same fitted scaler.
real_data_select = real_data[selected_columns]
real_data_scaled = scaler.transform(real_data_select)
dreal = xgb.DMatrix(real_data_scaled)
y_pred_XGB = final_model.predict(dreal)

# Regression metrics on the held-out split.
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
metric_report = (
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("Mean Absolute Error (MAE)", mae),
    ("R² Score", r2),
)
for metric_label, metric_value in metric_report:
    print(f"{metric_label}: {metric_value:.4f}")

# Neural network solution

In [None]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Activation, BatchNormalization, Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.utils import set_random_seed
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# Seed the Python, NumPy and backend RNGs so weight initialisation, dropout and
# batch shuffling are as repeatable as the backend allows.
set_random_seed(42)

# Separate scalers for inputs and target, both fitted on the training split only.
scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_train_scaled = scaler_X.fit_transform(X_train)
X_test_scaled = scaler_X.transform(X_test)
# StandardScaler expects 2-D input, hence the reshape; flatten back to 1-D for Keras.
y_train_scaled = scaler_y.fit_transform(y_train.reshape(-1, 1)).flatten()
y_test_scaled = scaler_y.transform(y_test.reshape(-1, 1)).flatten()

# (units, dropout rate) for each Dense -> BatchNorm -> ReLU -> Dropout stage.
hidden_layers = [(512, 0.3), (256, 0.3), (128, 0.2), (64, 0.2)]

model = Sequential()
for layer_idx, (units, dropout_rate) in enumerate(hidden_layers):
    if layer_idx == 0:
        # Only the first layer declares the input width.
        model.add(Dense(units, input_shape=(X_train_scaled.shape[1],)))
    else:
        model.add(Dense(units))
    model.add(BatchNormalization())
    # All layers come from the same `keras` namespace; mixing in tf.keras layers
    # can break when the two packages resolve to different Keras versions.
    model.add(Activation('relu'))
    model.add(Dropout(dropout_rate))

# Output layer: a single linear unit for regression.
model.add(Dense(1))

optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)
model.compile(
    optimizer=optimizer,
    loss='huber',  # Quadratic for small errors, linear for large ones: less outlier-sensitive than MSE
    metrics=['mae', 'mse']
)

# Stop once val_loss has not improved for 15 epochs and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
    verbose=1
)

# Halve the learning rate after 8 epochs without val_loss improvement.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=8,
    min_lr=1e-7,
    verbose=1
)

print("Training improved model...")
history = model.fit(
    X_train_scaled, y_train_scaled,
    epochs=200,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

# Predict in scaled target space, then map back to the original target units.
y_pred_scaled = model.predict(X_test_scaled)
y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()

# Metrics are computed against the unscaled targets.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'\n=== Model Performance ===')
print(f'Final MAE: {mae:.2f}')
print(f'RMSE: {rmse:.2f}')
print(f'R² Score: {r2:.4f}')
print(f'Mean target value: {np.mean(y_test):.2f}')
print(f'MAE as % of mean: {(mae/np.mean(y_test)*100):.1f}%')