In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.tree import DecisionTreeRegressor

# Load and preprocess data
train_data = pd.read_csv('../main_dataset.csv')
train_data['Id'] = np.where(train_data['Id'] < 1e-18, 1e-18, train_data['Id'])
train_data['Log_Id'] = np.log10(train_data['Id'])
X = train_data[['Tox', 'Nd', 'Ns', 'Vds', 'Vgs']]
y = train_data['Log_Id']

# Polynomial feature transformation and scaling
poly = PolynomialFeatures(degree=3, include_bias=False)
X_poly = poly.fit_transform(X)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_poly)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Baseline model parameters for DecisionTreeRegressor
baseline_params = {
    'max_depth': None,
    'min_samples_split': 2,
    'min_samples_leaf': 1,
    'max_features': None,
    'max_leaf_nodes': None,
    'ccp_alpha': 0.0
}

# Define parameter ranges for individual tuning
param_ranges = {
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10],
    'max_features': [None, 'sqrt', 'log2'],
    'max_leaf_nodes': [None, 10, 20],
    'ccp_alpha': [0.0, 0.01, 0.1]
}

# Function to evaluate performance of varying parameters
def evaluate_parameter(param_name, param_values):
    results = []
    for value in param_values:
        # Update the specific parameter value
        params = baseline_params.copy()
        params[param_name] = value
        
        # Train the model
        model = DecisionTreeRegressor(**params)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        r2 = r2_score(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        variance = mean_squared_error(y_pred, y_test)
        
        # Store the results
        results.append((value, r2, mae, variance))
        print(f"{param_name}={value} | R2: {r2:.10f}, MAE: {mae:.10f}, Variance: {variance:.10f}")
    
    return results

# Analyze each parameter's effect
param_results = {}
for param_name, values in param_ranges.items():
    print(f"\nEvaluating {param_name}...")
    param_results[param_name] = evaluate_parameter(param_name, values)

# Access results for a specific parameter if needed
# Example: param_results['max_depth'] contains results for each setting of 'max_depth'



Evaluating max_depth...
max_depth=3 | R2: 0.9529171539, MAE: 0.7282927728, Variance: 1.2690365462
max_depth=5 | R2: 0.9793855020, MAE: 0.4966516070, Variance: 0.5556280802
max_depth=7 | R2: 0.9913800619, MAE: 0.2825527731, Variance: 0.2323354983
max_depth=None | R2: 0.9987510184, MAE: 0.0553037253, Variance: 0.0336641346

Evaluating min_samples_split...
min_samples_split=2 | R2: 0.9987594831, MAE: 0.0553229829, Variance: 0.0334359833
min_samples_split=5 | R2: 0.9988055794, MAE: 0.0591222635, Variance: 0.0321935371
min_samples_split=10 | R2: 0.9988479842, MAE: 0.0624188919, Variance: 0.0310505893

Evaluating min_samples_leaf...
min_samples_leaf=1 | R2: 0.9987634885, MAE: 0.0551795695, Variance: 0.0333280265
min_samples_leaf=5 | R2: 0.9989190260, MAE: 0.0643222021, Variance: 0.0291357817
min_samples_leaf=10 | R2: 0.9988210740, MAE: 0.0692469683, Variance: 0.0317759069

Evaluating max_features...
max_features=None | R2: 0.9987673858, MAE: 0.0551919107, Variance: 0.0332229802
max_features