In [23]:
from sklearn.model_selection import train_test_split, GridSearchCV, PredefinedSplit
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error as mae
import pandas as pd
import xgboost as xg
import numpy as np
import torch
import random

In [2]:
def set_seed(seed=42):
    """Set all random seeds for reproducibility"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)  

Load Dataset

In [5]:
dataset = pd.read_csv('19JuneDataset.csv')
dataset.head()

Unnamed: 0,time,A0,A1,A2,A5,A6,A7,X,Y,Z
0,0.0,3.02,1.87,0.28,2.45,0.14,0.0,-0.629333,0.844733,1.169218
1,0.046863,2.95,1.8,0.0,1.87,0.0,0.0,-0.629225,0.844531,1.169302
2,0.078058,2.95,1.8,0.07,2.16,0.0,0.0,-0.629148,0.844372,1.169372
3,0.109227,3.02,1.65,0.0,2.01,0.0,0.0,-0.629085,0.844232,1.169426
4,0.140454,3.17,1.8,0.07,2.3,0.07,0.0,-0.629038,0.844166,1.169454


In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18165 entries, 0 to 18164
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   time    18165 non-null  float64
 1   A0      18165 non-null  float64
 2   A1      18165 non-null  float64
 3   A2      18165 non-null  float64
 4   A5      18165 non-null  float64
 5   A6      18165 non-null  float64
 6   A7      18165 non-null  float64
 7   X       18165 non-null  float64
 8   Y       18165 non-null  float64
 9   Z       18165 non-null  float64
dtypes: float64(10)
memory usage: 1.4 MB


In [14]:
print("Data types:\n", dataset.dtypes)
print("\nMissing values:\n", dataset.isnull().sum())

Data types:
 time    float64
A0      float64
A1      float64
A2      float64
A5      float64
A6      float64
A7      float64
X       float64
Y       float64
Z       float64
dtype: object

Missing values:
 time    0
A0      0
A1      0
A2      0
A5      0
A6      0
A7      0
X       0
Y       0
Z       0
dtype: int64


In [7]:
X, y = dataset[["X", "Y", "Z"]], dataset[['A0', 'A1', 'A2', 'A5', 'A6', 'A7']]

x_mean = X.iloc[:, 0].mean()

x_std = X.iloc[:, 0].std()
y_mean = X.iloc[:, 1].mean()
y_std = X.iloc[:, 1].std()
z_mean = X.iloc[:, 2].mean()
z_std = X.iloc[:, 2].std()

# Normalize each column
X_norm = np.copy(X)
X_norm[:, 0] = (X.iloc[:, 0] - x_mean) / x_std
X_norm[:, 1] = (X.iloc[:, 1] - y_mean) / y_std
X_norm[:, 2] = (X.iloc[:, 2] - z_mean) / z_std

y_norm = y / 73.8

In [16]:
print(x_std, y_std, z_std)

0.03492840222683053 0.0342168269256335 0.01756469685175674


In [20]:
print(X_norm.min(), X_norm.max())

tensor(-4.3411) tensor(2.1788)


In [8]:
X_norm = torch.tensor(pd.DataFrame(X_norm).values, dtype=torch.float32)
y_norm = torch.tensor(pd.DataFrame(y_norm).values, dtype=torch.float32)

X_train, X_temp, y_train, y_temp = train_test_split(X_norm, y_norm, test_size=0.2, shuffle=True, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [9]:
X_train

tensor([[ 0.2064,  0.8040,  0.0440],
        [-2.1598,  1.2789,  0.7274],
        [ 0.4554,  0.7665,  0.8691],
        ...,
        [ 0.3993, -0.9969,  0.6394],
        [-1.5094,  0.7090, -0.9503],
        [-0.1380,  0.6910,  0.3788]])

Starting with Decision tree Regressor 

In [10]:
split_index = [-1] * len(X_train) + [0] * len(X_val)
X_combined = np.vstack((X_train, X_val))
y_combined = np.concatenate((y_train, y_val))
ps = PredefinedSplit(test_fold=split_index)

In [11]:
decision_tree_regressor = DecisionTreeRegressor(random_state=42)
parameters_dt = {"splitter":["best","random"],
             "max_depth" : [1,3,5,7,9,11,12],
           "min_samples_leaf": [1,2,3,4,5,6,7,8,9,10],
           "min_weight_fraction_leaf": [0.1,0.2,0.3,0.4,0.5],
           "max_features": ["log2","sqrt",None],
           "max_leaf_nodes": [None,10,20,30,40,50,60,70,80,90] }

In [28]:
tuning_model=GridSearchCV(decision_tree_regressor, param_grid=parameters_dt, scoring='neg_mean_absolute_error', cv=ps, verbose=1, n_jobs=-1)
tuning_model.fit(X_combined, y_combined)

Fitting 1 folds for each of 21000 candidates, totalling 21000 fits


  _data = np.array(data, dtype=dtype, copy=copy,


In [24]:
best_model = tuning_model.best_estimator_

# Evaluate on test set
y_pred_decision_tree = best_model.predict(X_test)
y_pred_decision_tree = torch.tensor(y_pred_decision_tree, dtype=torch.float32)
y_pred_scales_decision_tree = y_pred_decision_tree * 73.8  # Scale back to original units

y_test_scaled = y_test * 73.8  # Scale back to original units
test_mse = mae(y_test_scaled, y_pred_scales_decision_tree)  
print(f"Best Model Test MSE: {test_mse:.4f}")
print(f"Best Parameters: {tuning_model.best_params_}")

y_pred_scales_decision_tree = torch.round(y_pred_scales_decision_tree, decimals=1)

Best Model Test MSE: 10.5491
Best Parameters: {'max_depth': 5, 'max_features': None, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_weight_fraction_leaf': 0.1, 'splitter': 'best'}


In [25]:
y_pred_scales_decision_tree 

tensor([[72.6000, 55.3000, 70.5000, 40.5000, 71.2000, 40.9000],
        [37.0000, 69.7000, 62.6000, 60.6000, 66.7000, 73.4000],
        [72.6000, 55.3000, 70.5000, 40.5000, 71.2000, 40.9000],
        ...,
        [73.5000, 62.9000, 73.6000, 14.3000, 39.9000,  6.3000],
        [73.4000, 73.6000, 73.7000, 62.8000,  6.2000,  0.0000],
        [73.4000, 73.6000, 73.7000, 62.8000,  6.2000,  0.0000]])

In [26]:
y_test_scaled 

tensor([[73.8000, 73.8000, 73.8000,  0.1400, 73.2900, 55.1800],
        [13.4900, 73.8000, 73.8000, 73.2200, 73.8000, 73.8000],
        [73.8000, 73.8000, 73.8000, 73.2200, 67.5900,  0.0000],
        ...,
        [73.8000, 73.8000, 73.8000, 36.3500,  0.0000,  0.0000],
        [73.5800, 73.5100, 73.6500, 72.5700,  4.3200,  0.0000],
        [73.8000, 73.8000, 73.8000, 63.2600,  0.0000,  0.0000]])

Now we will continue with Random Forest Regressor

In [29]:
random_forest_regressor = RandomForestRegressor(random_state=42)

parameters_rf = {
    "n_estimators": [50, 100, 150],  # More trees for better stability
    "max_depth": [None, 10, 20, 30],  # None for unlimited depth
    "min_samples_split": [2, 5, 10],  # Higher values prevent overfitting
    "min_samples_leaf": [1, 2, 4],  # Controls leaf node size
    "max_features": ["sqrt", "log2", None]  # More options for feature sampling
}

tuning_model_rf = GridSearchCV(random_forest_regressor, param_grid=parameters_rf, scoring='neg_mean_absolute_error', cv=ps, verbose=1, n_jobs=-1)
tuning_model_rf.fit(X_combined, y_combined)

Fitting 1 folds for each of 324 candidates, totalling 324 fits


In [30]:
best_model_rf = tuning_model_rf.best_estimator_

# Evaluate on test set
y_pred_random_forest = best_model_rf.predict(X_test)
y_pred_random_forest = torch.tensor(y_pred_random_forest, dtype=torch.float32)
y_pred_scaled_random_forest = y_pred_random_forest * 73.8  # Scale back to original units

test_mse_rf = mae(y_test_scaled, y_pred_scaled_random_forest)  
print("Random Forest Model Evaluation:")
print(f"Best Model Test MSE: {test_mse_rf:.4f}")
print(f"Best Parameters: {tuning_model_rf.best_params_}")

y_pred_scaled_random_forest = torch.round(y_pred_scaled_random_forest, decimals=1)

Random Forest Model Evaluation:
Best Model Test MSE: 0.2890
Best Parameters: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 150}


In [31]:
y_pred_scaled_random_forest

tensor([[73.7000, 73.8000, 73.8000,  0.1000, 73.4000, 55.4000],
        [13.5000, 73.5000, 73.3000, 72.8000, 73.3000, 73.6000],
        [73.5000, 73.5000, 73.6000, 73.2000, 67.6000,  0.0000],
        ...,
        [73.3000, 73.5000, 73.7000, 36.1000,  0.0000,  0.0000],
        [73.5000, 73.5000, 73.6000, 72.7000,  4.5000,  0.0000],
        [73.1000, 73.4000, 73.7000, 63.1000,  0.0000,  0.0000]])

In [32]:
y_test_scaled

tensor([[73.8000, 73.8000, 73.8000,  0.1400, 73.2900, 55.1800],
        [13.4900, 73.8000, 73.8000, 73.2200, 73.8000, 73.8000],
        [73.8000, 73.8000, 73.8000, 73.2200, 67.5900,  0.0000],
        ...,
        [73.8000, 73.8000, 73.8000, 36.3500,  0.0000,  0.0000],
        [73.5800, 73.5100, 73.6500, 72.5700,  4.3200,  0.0000],
        [73.8000, 73.8000, 73.8000, 63.2600,  0.0000,  0.0000]])

Now XGBoost model

In [33]:
xgboost_regressor = xg.XGBRegressor(objective ='reg:linear', random_state=42)
parameters_gb = {
    "n_estimators": [50, 100, 200],          # Number of boosting stages
    "learning_rate": [0.01, 0.05, 0.1],      # Shrinkage factor (lower = more robust)
    "max_depth": [3, 4, 5],                    # Depth of individual trees
    "min_samples_split": [2, 5, 10],              # Minimum samples to split a node
    "min_samples_leaf": [1, 2, 4],                # Minimum samples at a leaf node
    "max_features": ["sqrt", "log2", None],       # Features considered for splits  
}

tuning_model_gb = GridSearchCV(xgboost_regressor, param_grid=parameters_gb, scoring='neg_mean_squared_error', cv=ps, verbose=1, n_jobs=-1)
tuning_model_gb.fit(X_combined, y_combined)

Fitting 1 folds for each of 729 candidates, totalling 729 fits


Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.



In [37]:
best_model_gb = tuning_model_gb.best_estimator_

# Evaluate on test set
y_pred_gb = best_model_gb.predict(X_test)
y_pred_gb = np.clip(y_pred_gb, 0, 73.8)  # Ensure predictions are non-negative
y_pred_gb = torch.tensor(y_pred_gb, dtype=torch.float32)
y_pred_scaled_gb = y_pred_gb * 73.8  # Scale back to original units

test_mse_gb = mae(y_test_scaled, y_pred_scaled_gb)  
print("XGBoost Forest Model Evaluation:")
print(f"Best Model Test MSE: {test_mse_gb:.4f}")
print(f"Best Parameters: {tuning_model_gb.best_params_}")

y_pred_scaled_gb = torch.round(y_pred_scaled_gb, decimals=1)

XGBoost Forest Model Evaluation:
Best Model Test MSE: 0.3845
Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [35]:
y_pred_scaled_gb

tensor([[73.7000, 73.6000, 73.9000,  0.2000, 73.6000, 55.8000],
        [13.8000, 73.7000, 73.0000, 73.1000, 73.4000, 73.6000],
        [73.6000, 73.6000, 73.8000, 73.5000, 67.4000,  0.0000],
        ...,
        [73.5000, 73.6000, 73.7000, 36.6000,  0.0000,  0.0000],
        [73.5000, 73.5000, 73.7000, 72.8000,  4.3000,  0.0000],
        [73.4000, 73.5000, 73.7000, 62.9000,  0.0000,  0.0000]])

In [36]:
y_test_scaled

tensor([[73.8000, 73.8000, 73.8000,  0.1400, 73.2900, 55.1800],
        [13.4900, 73.8000, 73.8000, 73.2200, 73.8000, 73.8000],
        [73.8000, 73.8000, 73.8000, 73.2200, 67.5900,  0.0000],
        ...,
        [73.8000, 73.8000, 73.8000, 36.3500,  0.0000,  0.0000],
        [73.5800, 73.5100, 73.6500, 72.5700,  4.3200,  0.0000],
        [73.8000, 73.8000, 73.8000, 63.2600,  0.0000,  0.0000]])