In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV, PredefinedSplit
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error as mse
import pandas as pd
import xgboost as xg
import numpy as np
import torch
import random

In [2]:
def set_seed(seed=42):
    """Set all random seeds for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)  

Load Dataset

In [3]:
dataset = pd.read_csv('dataset.csv')
dataset.head()

Unnamed: 0,X,Y,Z,A0,A1,A2,A5,A6,A7
0,0.0,-0.759297,0.419074,3.31,2.01,0.0,1.94,1.58,1.65
1,0.058824,-0.759205,0.418863,3.39,2.23,0.0,2.09,1.73,1.8
2,0.117647,-0.759242,0.419042,3.46,2.16,0.0,2.09,1.8,1.87
3,0.176471,-0.759302,0.419248,3.67,2.3,0.0,2.23,1.87,1.94
4,0.235294,-0.759177,0.41897,3.82,2.3,0.0,2.16,1.87,1.94


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10708 entries, 0 to 10707
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       10708 non-null  float64
 1   Y       10708 non-null  float64
 2   Z       10708 non-null  float64
 3   A0      10708 non-null  float64
 4   A1      10708 non-null  float64
 5   A2      10708 non-null  float64
 6   A5      10708 non-null  float64
 7   A6      10708 non-null  float64
 8   A7      10708 non-null  float64
dtypes: float64(9)
memory usage: 753.0 KB


In [5]:
X, y = dataset[["X", "Y", "Z"]], dataset[['A0', 'A1', 'A2', 'A5', 'A6', 'A7']]

x_mean = X.iloc[:, 0].mean()

x_std = X.iloc[:, 0].std()
y_mean = X.iloc[:, 1].mean()
y_std = X.iloc[:, 1].std()
z_mean = X.iloc[:, 2].mean()
z_std = X.iloc[:, 2].std()

# Normalize each column
X_norm = np.copy(X)
X_norm[:, 0] = (X.iloc[:, 0] - x_mean) / x_std
X_norm[:, 1] = (X.iloc[:, 1] - y_mean) / y_std
X_norm[:, 2] = (X.iloc[:, 2] - z_mean) / z_std

y_norm = y / 73.8

In [6]:
X_norm = torch.tensor(pd.DataFrame(X_norm).values, dtype=torch.float32)
y_norm = torch.tensor(pd.DataFrame(y_norm).values, dtype=torch.float32)

X_train, X_temp, y_train, y_temp = train_test_split(X_norm, y_norm, test_size=0.2, shuffle=True, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [7]:
X_train

tensor([[-0.9425,  0.6090,  1.6928],
        [-0.7765,  1.6047,  0.8203],
        [ 1.5648, -0.5537,  0.8535],
        ...,
        [-0.1568, -0.2147, -1.0535],
        [-1.2321, -1.6867,  0.6647],
        [ 0.6153,  0.3961,  0.7643]])

Starting with Decision tree Regressor 

In [8]:
split_index = [-1] * len(X_train) + [0] * len(X_val)
X_combined = np.vstack((X_train, X_val))
y_combined = np.concatenate((y_train, y_val))
ps = PredefinedSplit(test_fold=split_index)

In [9]:
decision_tree_regressor = DecisionTreeRegressor(random_state=42)
parameters_dt = {"splitter":["best","random"],
             "max_depth" : [1,3,5,7,9,11,12],
           "min_samples_leaf": [1,2,3,4,5,6,7,8,9,10],
           "min_weight_fraction_leaf": [0.1,0.2,0.3,0.4,0.5],
           "max_features": ["log2","sqrt",None],
           "max_leaf_nodes": [None,10,20,30,40,50,60,70,80,90] }

In [10]:
tuning_model=GridSearchCV(decision_tree_regressor, param_grid=parameters_dt, scoring='neg_mean_squared_error', cv=ps, verbose=1, n_jobs=-1)
tuning_model.fit(X_combined, y_combined)

Fitting 1 folds for each of 21000 candidates, totalling 21000 fits


  _data = np.array(data, dtype=dtype, copy=copy,


In [11]:
best_model = tuning_model.best_estimator_

# Evaluate on test set
y_pred_decision_tree = best_model.predict(X_test)
y_pred_decision_tree = torch.tensor(y_pred_decision_tree, dtype=torch.float32)
y_pred_scales_decision_tree = y_pred_decision_tree * 73.8  # Scale back to original units

y_test_scaled = y_test * 73.8  # Scale back to original units
test_mse = mse(y_test_scaled, y_pred_scales_decision_tree)  
print(f"Best Model Test MSE: {test_mse:.4f}")
print(f"Best Parameters: {tuning_model.best_params_}")

y_pred_scales_decision_tree = torch.round(y_pred_scales_decision_tree, decimals=1)

Best Model Test MSE: 339.0545
Best Parameters: {'max_depth': 5, 'max_features': None, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.1, 'splitter': 'best'}


In [12]:
y_pred_scales_decision_tree 

tensor([[73.7000, 69.5000, 73.6000, 18.0000, 49.1000, 31.2000],
        [ 5.0000, 18.1000, 35.3000,  5.8000, 31.0000, 17.7000],
        [63.4000, 54.0000,  0.2000, 39.0000, 51.3000, 47.7000],
        ...,
        [73.5000, 73.6000, 73.7000, 53.6000,  7.3000,  3.5000],
        [73.7000, 69.5000, 73.6000, 18.0000, 49.1000, 31.2000],
        [63.4000, 54.0000,  0.2000, 39.0000, 51.3000, 47.7000]])

In [13]:
y_test_scaled 

tensor([[73.4300, 73.3600, 73.2900,  0.0000,  0.0000, 69.4700],
        [ 0.0000,  0.0000, 57.5600,  0.0000, 48.9800,  0.0000],
        [49.2700, 28.4200,  0.0000, 16.8800, 24.0200, 23.3700],
        ...,
        [73.8000, 73.7200, 73.8000, 54.3900,  0.0000,  0.0000],
        [73.8000, 73.8000, 73.8000,  0.0000, 73.8000, 54.6100],
        [73.5800, 71.1300,  0.0000, 44.6500, 67.7400, 56.7000]])

Now we will continue with Random Forest Regressor

In [14]:
random_forest_regressor = RandomForestRegressor(random_state=42)

parameters_rf = {
    "n_estimators": [50, 100, 150],  # More trees for better stability
    "max_depth": [None, 10, 20, 30],  # None for unlimited depth
    "min_samples_split": [2, 5, 10],  # Higher values prevent overfitting
    "min_samples_leaf": [1, 2, 4],  # Controls leaf node size
    "max_features": ["sqrt", "log2", None]  # More options for feature sampling
}

tuning_model_rf = GridSearchCV(random_forest_regressor, param_grid=parameters_rf, scoring='neg_mean_squared_error', cv=ps, verbose=1, n_jobs=-1)
tuning_model_rf.fit(X_combined, y_combined)

Fitting 1 folds for each of 324 candidates, totalling 324 fits


In [15]:
best_model_rf = tuning_model_rf.best_estimator_

# Evaluate on test set
y_pred_random_forest = best_model_rf.predict(X_test)
y_pred_random_forest = torch.tensor(y_pred_random_forest, dtype=torch.float32)
y_pred_scaled_random_forest = y_pred_random_forest * 73.8  # Scale back to original units

test_mse_rf = mse(y_test_scaled, y_pred_scaled_random_forest)  
print("Random Forest Model Evaluation:")
print(f"Best Model Test MSE: {test_mse_rf:.4f}")
print(f"Best Parameters: {tuning_model_rf.best_params_}")

y_pred_scaled_random_forest = torch.round(y_pred_scaled_random_forest, decimals=1)

Random Forest Model Evaluation:
Best Model Test MSE: 0.2418
Best Parameters: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}


In [16]:
y_pred_scaled_random_forest

tensor([[73.7000, 73.8000, 73.7000,  0.0000,  0.0000, 70.0000],
        [ 0.0000,  0.0000, 59.0000,  0.0000, 49.9000,  0.0000],
        [49.2000, 29.3000,  0.0000, 17.3000, 24.6000, 23.9000],
        ...,
        [73.6000, 73.6000, 73.6000, 54.1000,  0.0000,  0.0000],
        [73.8000, 73.7000, 73.6000,  0.0000, 73.5000, 53.5000],
        [73.7000, 69.6000,  0.0000, 43.6000, 66.3000, 55.7000]])

In [17]:
y_test_scaled

tensor([[73.4300, 73.3600, 73.2900,  0.0000,  0.0000, 69.4700],
        [ 0.0000,  0.0000, 57.5600,  0.0000, 48.9800,  0.0000],
        [49.2700, 28.4200,  0.0000, 16.8800, 24.0200, 23.3700],
        ...,
        [73.8000, 73.7200, 73.8000, 54.3900,  0.0000,  0.0000],
        [73.8000, 73.8000, 73.8000,  0.0000, 73.8000, 54.6100],
        [73.5800, 71.1300,  0.0000, 44.6500, 67.7400, 56.7000]])

Now XGBoost model

In [18]:
xgboost_regressor = xg.XGBRegressor(objective ='reg:linear', random_state=42)
parameters_gb = {
    "n_estimators": [50, 100, 200],          # Number of boosting stages
    "learning_rate": [0.01, 0.05, 0.1],      # Shrinkage factor (lower = more robust)
    "max_depth": [3, 4, 5],                    # Depth of individual trees
    "min_samples_split": [2, 5, 10],              # Minimum samples to split a node
    "min_samples_leaf": [1, 2, 4],                # Minimum samples at a leaf node
    "max_features": ["sqrt", "log2", None],       # Features considered for splits  
}

tuning_model_gb = GridSearchCV(xgboost_regressor, param_grid=parameters_gb, scoring='neg_mean_squared_error', cv=ps, verbose=1, n_jobs=-1)
tuning_model_gb.fit(X_combined, y_combined)

Fitting 1 folds for each of 729 candidates, totalling 729 fits


Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.



In [19]:
best_model_gb = tuning_model_gb.best_estimator_

# Evaluate on test set
y_pred_gb = best_model_gb.predict(X_test)
y_pred_gb = np.clip(y_pred_gb, 0, 73.8)  # Ensure predictions are non-negative
y_pred_gb = torch.tensor(y_pred_gb, dtype=torch.float32)
y_pred_scaled_gb = y_pred_gb * 73.8  # Scale back to original units

test_mse_gb = mse(y_test_scaled, y_pred_scaled_gb)  
print("Random Forest Model Evaluation:")
print(f"Best Model Test MSE: {test_mse_gb:.4f}")
print(f"Best Parameters: {tuning_model_gb.best_params_}")

y_pred_scaled_gb = torch.round(y_pred_scaled_gb, decimals=1)

Random Forest Model Evaluation:
Best Model Test MSE: 0.7647
Best Parameters: {'learning_rate': 0.1, 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [20]:
y_pred_scaled_gb

tensor([[73.7000, 73.7000, 73.7000,  0.0000,  0.0000, 69.5000],
        [ 0.0000,  0.0000, 59.7000,  0.0000, 50.5000,  0.0000],
        [49.6000, 29.3000,  0.0000, 17.2000, 24.7000, 23.6000],
        ...,
        [73.5000, 73.6000, 73.7000, 54.9000,  0.0000,  0.0000],
        [73.8000, 73.8000, 73.7000,  0.0000, 73.5000, 53.0000],
        [73.7000, 70.4000,  0.0000, 44.4000, 67.3000, 56.3000]])

In [21]:
y_test_scaled

tensor([[73.4300, 73.3600, 73.2900,  0.0000,  0.0000, 69.4700],
        [ 0.0000,  0.0000, 57.5600,  0.0000, 48.9800,  0.0000],
        [49.2700, 28.4200,  0.0000, 16.8800, 24.0200, 23.3700],
        ...,
        [73.8000, 73.7200, 73.8000, 54.3900,  0.0000,  0.0000],
        [73.8000, 73.8000, 73.8000,  0.0000, 73.8000, 54.6100],
        [73.5800, 71.1300,  0.0000, 44.6500, 67.7400, 56.7000]])