In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score, mean_absolute_error
from xgboost import XGBRegressor

# Load and preprocess data
train_data = pd.read_csv('../main_dataset.csv')
# Floor 'Id' at 1e-18 so that log10 is never taken of zero or a negative value.
train_data['Id'] = np.where(train_data['Id'] < 1e-18, 1e-18, train_data['Id'])
train_data['Log_Id'] = np.log10(train_data['Id'])
X = train_data[['Tox', 'Nd', 'Ns', 'Vds', 'Vgs']]
y = train_data['Log_Id']

# Split BEFORE fitting any transformer: fitting the scaler on all rows would
# let the mean/std of the held-out rows leak into the training features.
# The row partition depends only on the number of rows, test_size and
# random_state, so it matches a split performed after transforming.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit the degree-3 polynomial expansion and the standardisation on the
# training rows only, then apply the already-fitted transforms to the test rows.
poly = PolynomialFeatures(degree=3, include_bias=False)
scaler = StandardScaler()
X_train = scaler.fit_transform(poly.fit_transform(X_train_raw))
X_test = scaler.transform(poly.transform(X_test_raw))

# Transformed view of the full dataset, kept under the original names in case
# later cells use them; the scaling statistics come from the training rows.
X_poly = poly.transform(X)
X_scaled = scaler.transform(X_poly)

# Reference XGBoost configuration; each sweep overrides exactly one of these
# entries and leaves the rest untouched.
baseline_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    subsample=1.0,
    colsample_bytree=1.0,
    reg_alpha=0,
    reg_lambda=1,
    gamma=0,
    random_state=42,
)

# Candidate values to try for each hyperparameter, one parameter at a time.
param_ranges = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.5, 0.7, 1.0],
    reg_alpha=[0, 0.5, 1],
    reg_lambda=[1, 1.5, 2],
    gamma=[0, 0.1, 0.3],
)

# Sweep a single hyperparameter while all others stay at their baseline value
def evaluate_parameter(param_name, param_values):
    """Fit one XGBRegressor per candidate value and score it on the test split.

    For every entry of ``param_values`` a model is built from
    ``baseline_params`` with ``param_name`` overridden, fitted on
    ``X_train``/``y_train`` and used to predict ``X_test``. One summary line
    per candidate is printed.

    Args:
        param_name: Keyword argument of ``XGBRegressor`` to override.
        param_values: Iterable of candidate values for that argument.

    Returns:
        A list of ``(value, r2, mae, variance)`` tuples in the order of
        ``param_values``; ``variance`` is ``np.var`` of the predictions.
    """
    scores = []
    for value in param_values:
        # Baseline settings with only the swept parameter replaced
        settings = {**baseline_params, param_name: value}

        regressor = XGBRegressor(**settings)
        regressor.fit(X_train, y_train)
        predictions = regressor.predict(X_test)

        # Test-split metrics for this candidate
        r2 = r2_score(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)
        variance = np.var(predictions)

        scores.append((value, r2, mae, variance))
        print(f"{param_name}={value} | R2: {r2:.10f}, MAE: {mae:.10f}, Variance: {variance:.10f}")

    return scores

# Run the one-at-a-time sweep for every hyperparameter listed in param_ranges
param_results = {}
for param_name in param_ranges:
    values = param_ranges[param_name]
    print(f"\nEvaluating {param_name}...")
    param_results[param_name] = evaluate_parameter(param_name, values)

# param_results maps each parameter name to the list returned by
# evaluate_parameter, e.g. param_results['n_estimators'] holds one
# (value, r2, mae, variance) tuple per tested setting of 'n_estimators'.



Evaluating n_estimators...
n_estimators=50 | R2: 0.9952869758, MAE: 0.2118820036, Variance: 26.4361362457
n_estimators=100 | R2: 0.9968729272, MAE: 0.1596426447, Variance: 26.7963771820
n_estimators=200 | R2: 0.9977208017, MAE: 0.1353399777, Variance: 26.8512115479

Evaluating learning_rate...
learning_rate=0.01 | R2: -0.1245403212, MAE: 5.1380219632, Variance: 10.5974187851
learning_rate=0.05 | R2: 0.9952429894, MAE: 0.2158108914, Variance: 26.3934841156
learning_rate=0.1 | R2: 0.9968729272, MAE: 0.1596426447, Variance: 26.7963771820

Evaluating max_depth...
max_depth=3 | R2: 0.9914134669, MAE: 0.2872589179, Variance: 26.5399761200
max_depth=5 | R2: 0.9968729272, MAE: 0.1596426447, Variance: 26.7963771820
max_depth=7 | R2: 0.9983933589, MAE: 0.1076105797, Variance: 26.8718338013

Evaluating subsample...
subsample=0.6 | R2: 0.9968445839, MAE: 0.1613944706, Variance: 26.7991600037
subsample=0.8 | R2: 0.9969224391, MAE: 0.1589753326, Variance: 26.8051319122
subsample=1.0 | R2: 0.9968729