<a href="https://colab.research.google.com/github/BEMiklos/PlantTraits2024/blob/main/XGB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q kaggle

In [4]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [5]:
!kaggle competitions download -c planttraits2024

Downloading planttraits2024.zip to /content
 99% 3.15G/3.17G [00:27<00:00, 301MB/s]
100% 3.17G/3.17G [00:27<00:00, 126MB/s]


In [6]:
!unzip -q '/content/planttraits2024.zip'

In [67]:
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import albumentations as A
import cv2

import torch
from torch import nn
import torch.nn.functional as F
from torchvision.io import read_image
from torch.utils.data import Dataset, ConcatDataset, DataLoader, Subset, random_split
from torchvision import transforms

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import r2_score
from sklearn.preprocessing import PolynomialFeatures

import xgboost as xgb

import re

In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
random_state = 7

np.random.seed(0)

random.seed(0)

torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
TRAIN_IMAGES_PATH = '/content/train_images'
TEST_IMAGES_PATH = '/content/test_images'

### Untility functions

In [4]:
mean_pattern = r'X\d+_mean'
sd_pattern = r'X\d+_sd'

In [5]:
def find_matching_elements(lst,pattern):
    matching_elements = [element for element in lst if re.search(pattern, element)]
    return matching_elements

In [6]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [7]:
train_df.sample(10)

Unnamed: 0,id,WORLDCLIM_BIO1_annual_mean_temperature,WORLDCLIM_BIO12_annual_precipitation,WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month,WORLDCLIM_BIO15_precipitation_seasonality,WORLDCLIM_BIO4_temperature_seasonality,WORLDCLIM_BIO7_temperature_annual_range,SOIL_bdod_0.5cm_mean_0.01_deg,SOIL_bdod_100.200cm_mean_0.01_deg,SOIL_bdod_15.30cm_mean_0.01_deg,...,X18_mean,X26_mean,X50_mean,X3112_mean,X4_sd,X11_sd,X18_sd,X26_sd,X50_sd,X3112_sd
24886,184873835,-1.032917,574.295227,90.25238,62.482738,1227.480225,42.762859,87,144,125,...,0.174163,0.645791,0.433834,615.749773,,,,,,
31965,190494649,0.440794,589.538086,48.0,33.024284,1090.136475,39.804287,54,149,118,...,0.554565,1.121493,1.130741,725.767936,0.002047,0.909884,0.073212,0.059998,0.042375,30.941051
28421,195642066,17.284445,341.909515,65.457146,89.490135,544.396545,31.776667,138,163,151,...,0.439074,0.601914,1.030841,304.82168,0.003821,0.326769,0.027901,0.153138,0.013564,16.485387
12674,164426751,12.075297,166.142853,24.5,49.969826,902.658264,38.064285,129,148,141,...,0.199941,3.176854,2.202036,509.618373,,,,,,
31158,186882920,17.013542,1030.795898,266.556122,91.027039,177.292847,14.822959,107,117,112,...,0.231805,0.051018,1.190324,89.976644,0.005543,0.221099,0.025756,0.024287,0.015604,5.421096
45443,195637432,-0.526925,317.861908,87.438095,113.449104,1096.622192,40.898571,108,146,128,...,0.359241,1.062527,1.332752,3322.765956,,,,,,
19727,194871319,20.958969,2068.119141,264.828583,53.143101,488.990204,19.647144,115,144,128,...,0.232182,0.041521,2.149017,669.891966,,,,,,
20600,186386731,18.402203,563.590454,81.157143,64.466095,437.827026,24.329046,121,136,131,...,0.230897,0.500619,0.942541,608.056653,,,,,,
24052,164361896,17.402231,870.571411,82.178574,35.694508,825.165405,35.850002,148,174,162,...,0.283748,0.163265,1.672265,62.625917,,,,,,
35949,195065314,11.712444,437.18222,34.533333,28.75639,632.724487,29.993334,127,155,137,...,0.009491,0.147462,1.895993,9.095304,0.005892,0.749986,0.000848,0.008639,0.137162,1.257774


In [8]:
len(train_df.columns), len(test_df.columns)

(176, 164)

In [9]:
len(train_df), len(test_df)

(55489, 6545)

In [10]:
targets = [col for col in train_df.columns if col not in test_df.columns]
targets = targets[:6]

In [11]:
metadata_columns = [col for col in train_df.columns if col in test_df.columns and col != 'id']

 TODO: hiányzó értékek kezelésének pontosítása

In [12]:
# Check for missing values in the train DataFrame
train_missing_values = train_df.isnull().sum()
print("Missing values in Train DataFrame:")
print(train_missing_values)

Missing values in Train DataFrame:
id                                                                           0
WORLDCLIM_BIO1_annual_mean_temperature                                       0
WORLDCLIM_BIO12_annual_precipitation                                         0
WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month        0
WORLDCLIM_BIO15_precipitation_seasonality                                    0
                                                                         ...  
X11_sd                                                                   16341
X18_sd                                                                   16341
X26_sd                                                                   16341
X50_sd                                                                   16341
X3112_sd                                                                 16341
Length: 176, dtype: int64


In [13]:
train_df.fillna(train_df.mean(), inplace=True)

In [14]:
columns_to_log = ['X11_mean', 'X18_mean', 'X26_mean', 'X50_mean', 'X3112_mean',
                  'X4_sd', 'X11_sd', 'X18_sd', 'X26_sd', 'X50_sd', 'X3112_sd']

# Apply log10 transformation to the specified columns
for column in columns_to_log:
    train_df[column] = np.log10(train_df[column] + 1)  # Adding 1 to avoid log(0)


In [15]:
def split_dataframe(train_df, test_df, train_size=0.8, val_size=0.2):
    if train_size + val_size != 1:
        raise ValueError("train_size, val_size, and test_size must sum to 1")

    train_df, val_df = train_test_split(train_df, train_size=train_size, random_state=random_state)
    return train_df, val_df, test_df

In [16]:
def r2_loss(output, target):
    target = target
    output = output
    target_mean = torch.mean(target)
    ss_tot = torch.sum((target - target_mean) ** 2)
    ss_res = torch.sum((target - output) ** 2)
    r2 = 1 - ss_res / ss_tot
    return 1 - r2

In [24]:
!pip -q install torchmetrics

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/841.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/841.5 kB[0m [31m1.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/841.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m286.7/841.5 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━[0m [32m655.4/841.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m841.5/841.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [32]:
from torchmetrics.regression import R2Score

# Data preparation

In [19]:
bemeno_valtozok = metadata_columns

In [20]:
celvaltozok = targets[:6]
print("Targets:",celvaltozok)

Targets: ['X4_mean', 'X11_mean', 'X18_mean', 'X26_mean', 'X50_mean', 'X3112_mean']


In [21]:
train_df, val_df, test_df = split_dataframe(train_df.sample(frac=1, random_state=1), test_df.sample(frac=1, random_state=1))

In [22]:
train_df

Unnamed: 0,id,WORLDCLIM_BIO1_annual_mean_temperature,WORLDCLIM_BIO12_annual_precipitation,WORLDCLIM_BIO13.BIO14_delta_precipitation_of_wettest_and_dryest_month,WORLDCLIM_BIO15_precipitation_seasonality,WORLDCLIM_BIO4_temperature_seasonality,WORLDCLIM_BIO7_temperature_annual_range,SOIL_bdod_0.5cm_mean_0.01_deg,SOIL_bdod_100.200cm_mean_0.01_deg,SOIL_bdod_15.30cm_mean_0.01_deg,...,X18_mean,X26_mean,X50_mean,X3112_mean,X4_sd,X11_sd,X18_sd,X26_sd,X50_sd,X3112_sd
24239,171376117,15.120000,1123.524414,179.373337,75.942123,307.151367,19.777777,117,132,124,...,0.150808,0.460390,0.365825,3.030315,0.003389,0.316719,1.230538,2.048182,0.031452,2.657072
26870,196019212,13.562074,1100.364502,180.542221,73.934456,289.352539,19.765333,120,129,122,...,0.110277,0.018063,0.380435,1.104905,0.004645,0.209981,0.071754,0.003870,0.014029,0.808458
40848,173458431,26.131250,3183.500000,528.928589,65.707703,76.344696,12.764285,95,99,98,...,0.910721,1.381843,0.415153,4.168933,0.003189,0.525816,0.354643,0.560645,0.058388,3.775916
51793,176675400,2.116071,887.904785,151.476196,68.991066,99.114906,16.292856,112,126,120,...,0.152621,0.226221,0.540500,2.839951,0.002170,0.017644,0.010979,0.003070,0.001258,1.205834
21466,74535495,21.885370,331.084442,64.617775,90.718796,418.202576,25.793777,153,153,153,...,0.517726,0.743853,0.435185,3.350601,0.003967,0.242263,0.316392,0.243510,0.020678,2.887234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25516,188291299,12.671389,816.938110,141.033340,72.865730,190.145630,20.977144,114,125,124,...,0.247571,0.817857,0.357862,3.414937,0.003389,0.316719,1.230538,2.048182,0.031452,2.657072
47484,141070257,19.723888,114.900002,34.566666,114.762772,240.889420,20.880001,140,143,141,...,0.182551,0.263291,1.338501,2.799447,0.004340,0.004888,0.034858,0.024770,0.150192,2.293097
20133,195683798,10.255437,1059.966675,45.538094,16.683119,899.692749,36.213810,138,166,151,...,1.222907,3.721420,0.342839,3.440436,0.003787,0.231735,0.255957,2.906730,0.014705,2.113903
39678,194874328,19.927977,3078.928467,269.892853,38.449482,467.681274,18.314285,109,135,117,...,0.234801,0.648415,0.366869,4.081805,0.003389,0.316719,1.230538,2.048182,0.031452,2.657072


# Feature engineering

In [36]:
worldclim = train_df.columns[1:7]
soil = train_df.columns[7:68]
modis = train_df.columns[68:128]
vod = train_df.columns[128:164]

In [37]:
targets = ['X4_mean', 'X11_mean', 'X18_mean', 'X26_mean', 'X50_mean','X3112_mean', 'X4_sd',
 'X11_sd',
 'X18_sd',
 'X26_sd',
 'X50_sd',
 'X3112_sd']

In [38]:
feature_cols = [col for col in train_df.columns if col not in targets]
features = train_df[feature_cols]

In [46]:
targets = ['X4_mean', 'X11_mean', 'X18_mean', 'X26_mean', 'X50_mean','X3112_mean']

In [39]:
imputer = SimpleImputer(strategy='median')

features_imputed = imputer.fit_transform(features)
features_imputed = pd.DataFrame(features_imputed, columns=features.columns)

In [40]:
scaler = StandardScaler()

features_scaled = scaler.fit_transform(features_imputed)
features_scaled = pd.DataFrame(features_scaled, columns=features.columns)

In [47]:
y_multi = train_df[targets]
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    features_scaled[feature_cols], y_multi, test_size=0.2, random_state=42)

multi_xgb_model = MultiOutputRegressor(xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42))

multi_xgb_model.fit(X_train_multi, y_train_multi)

y_pred_multi = multi_xgb_model.predict(X_test_multi)

r2_scores = r2_score(y_test_multi, y_pred_multi, multioutput='raw_values')
print("R² scores for each target (on scaled features):", r2_scores)

R² scores for each target (on scaled features): [ 0.06519863  0.16375705  0.3071952   0.23258292 -0.24942554  0.25492991]


## Dimension reduction based on importance

In [48]:
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(features_scaled, train_df[targets], test_size=0.2, random_state=42)

xgb_model_full = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
xgb_model_full.fit(X_train_full, y_train_full)

importances = xgb_model_full.feature_importances_

feature_names = features_scaled.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)
print(importance_df.head(20))

                                              Feature  Importance
0              WORLDCLIM_BIO1_annual_mean_temperature    0.053923
1             WORLDCLIM_BIO7_temperature_annual_range    0.032605
2              WORLDCLIM_BIO4_temperature_seasonality    0.019557
3   MODIS_2000.2020_monthly_mean_surface_reflectan...    0.015389
4                     SOIL_ocd_60.100cm_mean_0.01_deg    0.010839
5   MODIS_2000.2020_monthly_mean_surface_reflectan...    0.009633
6                    SOIL_sand_60.100cm_mean_0.01_deg    0.009385
7                WORLDCLIM_BIO12_annual_precipitation    0.008858
8   WORLDCLIM_BIO13.BIO14_delta_precipitation_of_w...    0.008784
9   MODIS_2000.2020_monthly_mean_surface_reflectan...    0.008703
10                   SOIL_phh2o_30.60cm_mean_0.01_deg    0.008532
11                VOD_Ku_1987_2017_multiyear_mean_m08    0.008341
12                    SOIL_silt_30.60cm_mean_0.01_deg    0.008260
13                 VOD_X_1997_2018_multiyear_mean_m10    0.008218
14        

In [49]:
top_features = importance_df['Feature'].head(30).tolist()
y_multi = train_df[targets]
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    features_scaled[top_features], y_multi, test_size=0.2, random_state=42)

multi_xgb_model = MultiOutputRegressor(xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42))

multi_xgb_model.fit(X_train_multi, y_train_multi)

y_pred_multi = multi_xgb_model.predict(X_test_multi)

r2_scores = r2_score(y_test_multi, y_pred_multi, multioutput='raw_values')
print("R² scores for each target (with the top 30 important features):", r2_scores)

R² scores for each target (with the top 30 important features): [ 0.093593    0.15704444  0.303049    0.228631   -0.18055592  0.24359068]


R² scores for each target (on scaled features): [ 0.06519863,  0.16375705,  0.3071952,   0.23258292, -0.24942554,  0.25492991]

R² scores for each target (with the top 30 important features): [ 0.093593,    0.15704444,  0.303049,    0.228631,   -0.18055592,  0.24359068]

# Dimension reduction with PCA

In [50]:
pca = PCA(n_components=min(features_scaled.shape))

pca.fit(features_scaled)

cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)

In [51]:
n_components_95 = np.where(cumulative_variance_ratio >= 0.95)[0][0] + 1

pca_95 = PCA(n_components=n_components_95)

features_pca_95 = pca_95.fit_transform(features_scaled)

features_pca_95_df = pd.DataFrame(features_pca_95, columns=[f'PC{i+1}' for i in range(n_components_95)])
features_pca_95_df.head(), n_components_95

(        PC1       PC2       PC3       PC4       PC5       PC6       PC7  \
 0  1.752592 -4.705826 -1.658646 -0.237254  0.793511 -2.338007 -1.751561   
 1  2.015580 -2.177639 -2.931064  0.877646  2.630513 -0.920006 -1.026357   
 2  6.143819 -6.758618  4.730867  0.514096 -1.498595  0.882158 -4.123164   
 3  2.972285  1.678727  2.701243 -2.987341 -0.353421  4.852251 -1.072083   
 4 -7.082553 -2.660474 -5.109291 -4.465095  1.849223 -1.732129 -0.115992   
 
         PC8       PC9      PC10      PC11      PC12      PC13      PC14  \
 0  2.650341  0.879054  0.370322 -1.133138 -0.110596 -0.455378  0.193836   
 1  0.542654  1.081605 -0.052364 -0.253646 -0.305663 -1.651285  0.337962   
 2  5.063336 -0.348512  3.381313 -1.122087  1.348846  0.416042 -2.509848   
 3 -0.430468 -3.790803  1.163000 -1.283125 -0.621630 -0.566913  0.409377   
 4 -0.483845  1.874738  0.996085 -0.396926 -0.226993 -0.830693  1.196308   
 
        PC15      PC16      PC17      PC18      PC19  
 0 -1.147980 -0.223689 -0.908

In [54]:
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    features_pca_95_df, y_multi, test_size=0.2, random_state=42)

multi_xgb_model = MultiOutputRegressor(xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42))

multi_xgb_model.fit(X_train_multi, y_train_multi)

y_pred_multi = multi_xgb_model.predict(X_test_multi)

r2_scores = r2_score(y_test_multi, y_pred_multi, multioutput='raw_values')
print("R² scores for each target (PCA):", r2_scores)

R² scores for each target (PCA): [ 0.0397597   0.15494675  0.28841653  0.21596206 -0.2079703   0.24695846]


R² scores for each target (on scaled features): [ 0.06519863, 0.16375705, 0.3071952, 0.23258292, -0.24942554, 0.25492991]

R² scores for each target (with the top 30 important features): [ 0.093593, 0.15704444, 0.303049, 0.228631, -0.18055592, 0.24359068]

R² scores for each target (PCA): [ 0.0397597,   0.15494675,  0.28841653,  0.21596206, -0.2079703,   0.24695846]

In [65]:
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(features_pca_95_df, train_df[targets], test_size=0.2, random_state=42)

xgb_model_full = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
xgb_model_full.fit(X_train_full, y_train_full)

importances = xgb_model_full.feature_importances_

feature_names = features_pca_95_df.columns
importance_df_pca = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df_pca = importance_df.sort_values(by='Importance', ascending=False).reset_index(drop=True)
print(importance_df_pca.head(20))

                                              Feature  Importance
0              WORLDCLIM_BIO1_annual_mean_temperature    0.053923
1             WORLDCLIM_BIO7_temperature_annual_range    0.032605
2              WORLDCLIM_BIO4_temperature_seasonality    0.019557
3   MODIS_2000.2020_monthly_mean_surface_reflectan...    0.015389
4                     SOIL_ocd_60.100cm_mean_0.01_deg    0.010839
5   MODIS_2000.2020_monthly_mean_surface_reflectan...    0.009633
6                    SOIL_sand_60.100cm_mean_0.01_deg    0.009385
7                WORLDCLIM_BIO12_annual_precipitation    0.008858
8   WORLDCLIM_BIO13.BIO14_delta_precipitation_of_w...    0.008784
9   MODIS_2000.2020_monthly_mean_surface_reflectan...    0.008703
10                   SOIL_phh2o_30.60cm_mean_0.01_deg    0.008532
11                VOD_Ku_1987_2017_multiyear_mean_m08    0.008341
12                    SOIL_silt_30.60cm_mean_0.01_deg    0.008260
13                 VOD_X_1997_2018_multiyear_mean_m10    0.008218
14        

In [66]:
top_features = importance_df_pca['Feature'].head(30).tolist()
y_multi = train_df[targets]
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    features_scaled[top_features], y_multi, test_size=0.2, random_state=42)

multi_xgb_model = MultiOutputRegressor(xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42))

multi_xgb_model.fit(X_train_multi, y_train_multi)

y_pred_multi = multi_xgb_model.predict(X_test_multi)

r2_scores = r2_score(y_test_multi, y_pred_multi, multioutput='raw_values')
print("R² scores for each target (PCA with the top 30 important features):", r2_scores)

R² scores for each target (PCA with the top 30 important features): [ 0.093593    0.15704444  0.303049    0.228631   -0.18055592  0.24359068]


R² scores for each target (on scaled features): [ 0.06519863, 0.16375705, 0.3071952, 0.23258292, -0.24942554, 0.25492991]

R² scores for each target (with the top 30 important features): [ 0.093593, 0.15704444, 0.303049, 0.228631, -0.18055592, 0.24359068]

R² scores for each target (PCA): [ 0.0397597, 0.15494675, 0.28841653, 0.21596206, -0.2079703, 0.24695846]

R² scores for each target (PCA with the top 30 important features): [ 0.093593,    0.15704444,  0.303049,    0.228631,   -0.18055592,  0.24359068]

In [74]:
from sklearn.model_selection import GridSearchCV

In [75]:
top_features = importance_df_pca['Feature'].head(30).tolist()
targets = ['X4_mean', 'X11_mean', 'X18_mean', 'X26_mean', 'X50_mean','X3112_mean']
y_multi = train_df[targets]
X = features_scaled[top_features]

X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    X, y_multi, test_size=0.2, random_state=42)

xgb_base = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

multi_xgb = MultiOutputRegressor(xgb_base)

param_grid = {
    'estimator__n_estimators': [50, 100, 200],
    'estimator__max_depth': [3, 5, 7],
    'estimator__learning_rate': [0.01, 0.1, 0.2]
}

grid_search = GridSearchCV(estimator=multi_xgb, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2, scoring='r2')

grid_search.fit(X_train_multi, y_train_multi)

print("Best parameters found: ", grid_search.best_params_)
print("Best R² score found: ", grid_search.best_score_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


  pid = os.fork()


Best parameters found:  {'estimator__learning_rate': 0.1, 'estimator__max_depth': 5, 'estimator__n_estimators': 100}
Best R² score found:  0.1551971370767798


In [77]:
top_features = importance_df_pca['Feature'].head(30).tolist()
targets = ['X4_mean', 'X11_mean', 'X18_mean', 'X26_mean', 'X50_mean','X3112_mean']
y_multi = train_df[targets]
X_train_multi, X_test_multi, y_train_multi, y_test_multi = train_test_split(
    features_scaled[top_features], y_multi, test_size=0.2, random_state=42)

multi_xgb_model = MultiOutputRegressor(xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42, learning_rate=0.1, max_depth=5))

multi_xgb_model.fit(X_train_multi, y_train_multi)

y_pred_multi = multi_xgb_model.predict(X_test_multi)

r2_scores = r2_score(y_test_multi, y_pred_multi, multioutput='raw_values')
print("R² scores for each target (PCA with the top 30 important features):", r2_scores)

R² scores for each target (PCA with the top 30 important features): [0.12358511 0.17924043 0.31857293 0.24550138 0.01399558 0.26230538]


R² scores for each target (PCA with the top 30 important features optimized): [0.12358511, 0.17924043, 0.31857293, 0.24550138, 0.01399558, 0.26230538]

 R² scores for each target (PCA with the top 30 important features): [ 0.093593, 0.15704444, 0.303049, 0.228631, -0.18055592, 0.24359068]

