In [None]:


# -*- coding: utf-8 -*-
"""
Created on Wed Oct 29 15:40:04 2025

@author: zemsk


Bleaching Presence Detection
Target variable: Percent_Bleaching

"""

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import os


# Location of the CSV export that this cell loads.
path = "./coral-reef-global-bleaching"
filename_read = os.path.join(path, "coral_whole.csv")

# Feature set picked under the assumption that the relationships are linear;
# the last entry, Percent_Bleaching, is the regression target.
#
# Notes carried over from earlier experiments in this cell:
#   * An alternative set tried under a non-linear assumption was
#     Distance_to_Shore, Temperature_Mean, Turbidity, TSA, Depth_m.
#   * Percent_Cover was judged a weak predictor by the author and reported as
#     ~30% null, so it is left out to keep more rows after dropping nulls.
#   * Location/label columns (Ocean_Name, Country_Name, Sample_ID, Date_Year,
#     Bleaching_Level, Realm_Name, Exposure) are not part of this feature set.
selected_columns = [
    'Cyclone_Frequency',
    'Depth_m',
    'ClimSST',
    'Distance_to_Shore',
    'Turbidity',
    'TSA',
    'Temperature_Mean',
    'Percent_Bleaching',
]

# Build the modelling frame in one chained expression instead of mutating a
# column subset in place: `dropna(inplace=True)` on a frame obtained by column
# selection triggers SettingWithCopyWarning on pandas < 3, and the chained form
# makes the cell safe to re-run.
df = pd.read_csv(filename_read)[selected_columns].dropna()
df.info()



In [None]:
from sklearn.model_selection import train_test_split

# Separate the regression target from the predictors, then hold out 20% of the
# rows for evaluation (fixed seed so the split is reproducible).
y = df['Percent_Bleaching']
X = df.drop(columns=['Percent_Bleaching'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
import cuml
import cudf
from cuml.ensemble import RandomForestRegressor as cuRF
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# Copy the pandas features/target to the GPU.
# `cudf.from_pandas` is the converter for pandas objects; `DataFrame.from_records`
# (used previously) expects a structured/record ndarray, not a DataFrame.
X_cudf = cudf.from_pandas(X)
y_cudf = cudf.from_pandas(y)

# Split the GPU data into training and testing sets.
# The results get their own `_gpu` names so that the pandas-based
# X_train / X_test / y_train / y_test are not overwritten with cuDF objects,
# which the scikit-learn estimators in other cells cannot consume.
X_train_gpu, X_test_gpu, y_train_gpu, y_test_gpu = train_test_split(
    X_cudf, y_cudf, test_size=0.2, random_state=42
)

# Create and train the cuML RandomForestRegressor (on GPU)
rf_model = cuRF(n_estimators=600, max_depth=30, random_state=42)
rf_model.fit(X_train_gpu, y_train_gpu)

# Make predictions on the held-out GPU split
y_pred = rf_model.predict(X_test_gpu)

# Calculate RMSE on the host. `Series.to_array()` no longer exists in current
# cuDF releases; `to_numpy()` is the supported device-to-host copy.
# NOTE(review): assumes predict() returns a cuDF Series for cuDF input - confirm
# against the installed cuML output-type setting.
rmse = np.sqrt(mean_squared_error(y_test_gpu.to_numpy(), y_pred.to_numpy()))
print(f'RMSE: {rmse}')


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on the selected feature set.
#
# bootstrap=True is required for the ensemble to be a real forest here:
# `max_features` is left at its default (all features considered at every
# split), so with bootstrap=False every one of the 600 trees would be grown on
# the same rows with the same candidate features and the trees would be
# near-identical copies - 600x the cost of a single tree with none of the
# variance reduction. Resampling the rows per tree restores diversity.
# (With bootstrap=True, `oob_score=True` can additionally be passed if an
# out-of-bag estimate is wanted.)
model = RandomForestRegressor(
    n_estimators=600,
    max_depth=30,
    criterion="friedman_mse",
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.,
    bootstrap=True,
    random_state=42,   # fixed seed for reproducible trees
    n_jobs=-1)         # use all available CPU cores

model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error

# Score the fitted forest on the held-out rows.
y_pred = model.predict(X_test)
score = r2_score(y_test, y_pred)

# First line of output: R^2; second line: mean absolute error.
# (The forest's out-of-bag score is not reported here.)
print(score, mean_absolute_error(y_test, y_pred), sep="\n")


I have a feeling that ~0.65 (max 0.687) is the best performance for RandomForest, no matter how I tune its parameters or change the feature set. Results tend to improve slightly when more features are included, despite strong multicollinearity. Maybe we could look for some more features in the big file.
We could also try taking the big file, applying PCA, and seeing what happens.

In [None]:
import xgboost as xgb
import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split, GridSearchCV, PredefinedSplit
from sklearn.metrics import mean_squared_error

# Split into train and test sets. The test set is reserved for the final score
# at the bottom of this cell and is not used for hyperparameter selection.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# DMatrix versions of the split, for use with the native `xgb.train` API.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Define parameter grid for GridSearchCV
params = {
    "max_depth": [12],  # Depth of trees
    "min_child_weight": [1, 3, 6],  # Minimum sum of instance weight (hessian) in a child
    "gamma": [0.3],  # Minimum loss reduction to make a further partition
    "subsample": [0.7, 0.8, 1.0],  # Fraction of samples to grow each tree
    "colsample_bytree": [0.5, 0.7, 1.0],  # Fraction of features to consider for each tree
    "reg_alpha": [0, 0.1, 1],  # L1 regularization
    "reg_lambda": [1, 10, 50],  # L2 regularization
    "learning_rate": [0.05, 0.1],  # Step size shrinkage
}

# GPU-specific settings, expressed as single-value grid entries
params_gpu = {
    'tree_method': ['hist'],  # Histogram-based tree construction
    'device': ['cuda'],  # Run on the CUDA device
}

# Combine the two parameter grids into one
combined_params = {**params, **params_gpu}

# Single predefined train/validation split *inside the training data*.
# `test_fold` must have exactly one entry per row passed to `gs.fit`:
#   -1 -> row is always used for fitting, 0 -> row belongs to validation fold 0.
# Previously the array also covered the test rows while only X_train was
# passed to fit, so the validation indices pointed past the end of the data.
_, val_rows = train_test_split(
    np.arange(len(X_train)), test_size=0.2, random_state=42
)
test_fold = np.full(len(X_train), -1, dtype=int)
test_fold[val_rows] = 0

ps = PredefinedSplit(test_fold)

# Create a GridSearchCV object with the combined parameters
# NOTE(review): n_jobs=-1 launches many fits in parallel; with device='cuda'
# they presumably share one GPU - confirm this fits in GPU memory.
gs = GridSearchCV(
    xgb.XGBRegressor(),
    combined_params,
    cv=ps,
    scoring="neg_mean_absolute_error",
    verbose=1,
    n_jobs=-1
)

# Run the search; with the default refit the best configuration is afterwards
# refitted on all of X_train.
gs.fit(X_train, y_train)

# Print the best parameters from GridSearchCV
print("Best parameters from GridSearchCV:", gs.best_params_)

# Get the best model from GridSearchCV
best_model = gs.best_estimator_

# Make predictions on the untouched test set using the best model
y_pred = best_model.predict(X_test)

# Calculate RMSE (Root Mean Squared Error)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'Root Mean Squared Error: {rmse}')


In [None]:
num_round = 50  # Number of boosting rounds

# `xgb.train` needs one scalar value per parameter. `params` holds the search
# grid (lists of candidates), so use the combination selected by the grid
# search instead.
booster_params = dict(gs.best_params_)

# Build the validation set that the evaluation list refers to by holding out
# 20% of the training rows; the booster is then fitted on the remaining rows so
# the validation metric is measured on data it has not seen.
fit_rows, val_rows = train_test_split(
    np.arange(dtrain.num_row()), test_size=0.2, random_state=42
)
dfit = dtrain.slice(fit_rows)
dvalidation = dtrain.slice(val_rows)

# Metrics for both sets are reported during boosting.
evallist = [(dvalidation, 'validation'), (dfit, 'train')]

bst = xgb.train(booster_params, dfit, num_round, evallist)
print("XGBoost model training complete on GPU.")

In [None]:
import xgboost as xgb
 
 

# One concrete hyperparameter combination (scalar values, not a search grid).
params = {
    'colsample_bytree': 1.0,
    'device': 'cuda',
    'gamma': 0.3,
    'learning_rate': 0.1,
    'max_depth': 12,
    'min_child_weight': 6,
    'reg_alpha': 0.1,
    'reg_lambda': 1,
    'subsample': 0.7,
    'tree_method': 'hist',
    'objective': 'reg:squarederror',  # For regression task
    'eval_metric': 'mae',
}

# scikit-learn style estimator built from the settings above.
# No n_estimators is given, so the library default number of trees is used.
xgb_model = xgb.XGBRegressor(**params)

xgb_model.fit(X_train, y_train)

# Native-API booster trained with the same settings for 600 rounds.
# The scalar `params` dict is passed here; `params_gpu` is a grid-search
# fragment whose values are lists and is not valid input for `xgb.train`.
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=600,
    evals=[(dtrain, 'train')],  # report the training metric each round
    verbose_eval=True
)

In [None]:
from sklearn.model_selection import GridSearchCV, PredefinedSplit

# DMatrix versions of the current split, for use with the native xgboost API.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# GPU settings as single-value grid entries.
params_gpu = {
    'tree_method': ['hist'],
    'device': ['cuda'],
}

# Candidate values explored by the grid search.
params = {
    "max_depth": [12],
    "min_child_weight": [1, 3, 6],
    "gamma": [0.3],
    "subsample": [0.7, 0.8, 1.0],
    "colsample_bytree": [0.5, 0.7, 1.0],
    "reg_alpha": [0, 0.1, 1],
    "reg_lambda": [1, 10, 50],
    "learning_rate": [0.05, 0.1],
}

combined_params = {**params, **params_gpu}

# Single predefined train/validation split taken from the TRAINING rows only.
# Using the test set as the validation fold (and fitting on train + test) would
# tune the hyperparameters on the test data and let the refitted best estimator
# see it, so any later test score would be optimistic.
#   -1 -> row is always used for fitting, 0 -> row belongs to validation fold 0.
n_train_rows = len(X_train)
n_val_rows = max(1, int(0.2 * n_train_rows))
val_rows = np.random.default_rng(42).choice(
    n_train_rows, size=n_val_rows, replace=False
)
test_fold = np.full(n_train_rows, -1, dtype=int)
test_fold[val_rows] = 0
ps = PredefinedSplit(test_fold)

gs = GridSearchCV(
    xgb.XGBRegressor(),
    combined_params,
    cv=ps,
    scoring="neg_mean_absolute_error",
    verbose=1,
    n_jobs=-1
)

# `test_fold` has one entry per training row, matching what is passed to fit.
gs.fit(X_train, y_train)
print(gs.best_params_)


In [None]:
# Score the fitted XGBoost regressor on the held-out rows.
y_pred = xgb_model.predict(X_test)
score = r2_score(y_test, y_pred)

# Output: R^2 on the first line, mean absolute error on the second.
print(score, mean_absolute_error(y_test, y_pred), sep="\n")


Compared to the Random Forest model, XGBoost improved predictive performance by 39%. Since I didn't observe strong nonlinear patterns during preprocessing, this gain likely comes from XGBoost's ability to capture more complex feature interactions rather than nonlinear effects in individual variables. This is consistent with the correlation analysis: the Spearman matrix indicates that changes in some features influence others quite noticeably, while the mutual information matrix shows little evidence of direct nonlinear relationships. Taken together, this suggests that the dataset's complexity comes primarily from interactions between features rather than from standalone nonlinearities.

In [None]:
from matplotlib.pyplot import imread

# -*- coding: utf-8 -*-
"""
Created on Saturday Nov 15 15:34 2025

@author: 100yearsahead


Bleaching Presence Detection
Target variable: Percent_Bleaching

"""

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import os


# Location of the CSV export that this cell loads.
path = "./coral-reef-global-bleaching"
filename_read = os.path.join(path, "coral_whole.csv")

# This variant keeps every column of the file (no manual feature selection,
# no label encoding): rows containing any null are removed first, and the
# Sample_ID column is discarded afterwards.
df = (
    pd.read_csv(filename_read)
    .dropna()
    .drop(columns=['Sample_ID'])
)

# Column/dtype summary followed by a plain-text dump of the frame.
df.info()
print(df)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Hold out 20% of the rows before fitting any transformer, so the encoder and
# scaler below are fitted on training rows only.
y = df['Percent_Bleaching']
X = df.drop(columns=['Percent_Bleaching'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Columns treated as categorical; every remaining predictor is treated as numeric.
cat_cols = ['Realm_Name','Ocean_Name','Country_Name','Exposure','Bleaching_Level']
num_cols = X.columns[~X.columns.isin(cat_cols)].tolist()

# One-hot encode the categorical columns. Categories that appear only in the
# test rows are ignored rather than raising; output is a dense array.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_cat = ohe.fit_transform(X_train[cat_cols])
X_test_cat = ohe.transform(X_test[cat_cols])

# Standardise the numeric columns using statistics from the training rows.
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[num_cols])
X_test_num = scaler.transform(X_test[num_cols])

# Final design matrices: numeric block first, one-hot block second.
X_train_processed = np.concatenate([X_train_num, X_train_cat], axis=1)
X_test_processed = np.concatenate([X_test_num, X_test_cat], axis=1)






In [None]:
import xgboost as xgb

# Hyperparameters for the XGBoost regressor trained on the encoded + scaled
# feature matrix.
xgb_settings = dict(
    n_estimators=500,
    max_depth=9,
    learning_rate=0.3,
    min_child_weight=1,
    gamma=0,  # no split penalty; the author judged the sample count large enough
    subsample=1,
    colsample_bytree=1,
    reg_alpha=0,
    reg_lambda=1,
    objective='reg:squarederror',
)

xgb_model = xgb.XGBRegressor(**xgb_settings)

xgb_model.fit(X_train_processed, y_train)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error

# Score the XGBoost regressor on the processed held-out rows.
y_pred = xgb_model.predict(X_test_processed)
score = r2_score(y_test, y_pred)

# Output: R^2 on the first line, mean squared error on the second.
print(score, mean_squared_error(y_test, y_pred), sep="\n")


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on the encoded + scaled feature matrix.
# Rows are not resampled per tree (bootstrap=False); each split instead
# considers half of the features (max_features=0.5).
forest_settings = dict(
    n_estimators=600,
    max_depth=30,
    criterion="friedman_mse",
    max_features=0.5,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.,
    bootstrap=False,
    random_state=42,
    n_jobs=-1,
)

model = RandomForestRegressor(**forest_settings)

model.fit(X_train_processed, y_train)

In [None]:
from sklearn.metrics import r2_score, mean_absolute_error

# Score the random forest on the processed held-out rows.
y_pred = model.predict(X_test_processed)
score = r2_score(y_test, y_pred)

# Output: R^2 on the first line, mean absolute error on the second.
# (The forest's out-of-bag score is not reported here.)
print(score, mean_absolute_error(y_test, y_pred), sep="\n")


The good performance of the Random Forest model on high-dimensional feature sets can be attributed to its ensemble nature and its ability to aggregate a large number of weak but diverse predictors.

In contrast, XGBoost relies on sequential learning, where each new tree attempts to correct the residual errors of the previous ones. When the dataset contains only a few strong predictors, XGBoost can explore these features more deeply and model complex interactions, often achieving superior predictive performance.

Thus, Random Forests may outperform boosting methods in scenarios with large numbers of weak predictors, whereas XGBoost is more effective when the dataset contains fewer but more influential features.