In [39]:
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 29 15:40:04 2025

@author: zemsk


Bleaching Presence Detection
Target variable: Percent_Bleaching

"""

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import os


path = "./coral-reef-global-bleaching"
filename_read = os.path.join(path, "coral_whole.csv")

df = pd.read_csv(filename_read)

# Removed only locations and labels, no real data touched
# df.drop("Ocean_Name", axis=1, inplace=True)
# df.drop("Country_Name", axis=1, inplace=True)
# df.drop("Sample_ID", axis=1, inplace=True)
# df.drop("Date_Year", axis=1, inplace=True)
# df.drop("Bleaching_Level", axis=1, inplace=True)
# df.drop("Realm_Name", axis=1, inplace=True)
# # #Percent_Cover is not a best predictor and also contain 30% of its fields as null.
# # # For the sake of bigger dataset this feature is dropped
# df.drop("Percent_Cover", axis=1, inplace=True)
# # df.drop("ClimSST", inplace=True, axis=1)
# df.drop("Exposure", inplace=True, axis=1)
# # df.drop("Temperature_Maximum", inplace=True, axis=1)

# label_encoder = LabelEncoder()
# df["Exposure"] = label_encoder.fit_transform(df["Exposure"])

# These features were taken into account that data is nonlinear
# df = df[["Distance_to_Shore", "Temperature_Mean", "Turbidity", "TSA", "Depth_m", "Percent_Bleaching"]]

# These features were taken into account that data is linear
df = df[['Cyclone_Frequency', 'Depth_m', 'ClimSST', 'Distance_to_Shore', 'Turbidity', 'TSA', 'Temperature_Mean', 'Percent_Bleaching']]

df.dropna(inplace=True)
df.info()



<class 'pandas.core.frame.DataFrame'>
Index: 32714 entries, 0 to 35044
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Cyclone_Frequency  32714 non-null  float64
 1   Depth_m            32714 non-null  float64
 2   ClimSST            32714 non-null  float64
 3   Distance_to_Shore  32714 non-null  float64
 4   Turbidity          32714 non-null  float64
 5   TSA                32714 non-null  float64
 6   Temperature_Mean   32714 non-null  float64
 7   Percent_Bleaching  32714 non-null  float64
dtypes: float64(8)
memory usage: 2.2 MB


In [40]:
from sklearn.model_selection import train_test_split

X = df.drop(columns=['Percent_Bleaching'])
y = df['Percent_Bleaching']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [43]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    n_estimators=600,
    max_depth=30,
    criterion="friedman_mse",
    max_features=0.3,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.,
    bootstrap=False,

    random_state=42,
    n_jobs=-1)

model.fit(X_train, y_train)

In [44]:
from sklearn.metrics import r2_score, mean_squared_error

y_pred = model.predict(X_test)

score = r2_score(y_test, y_pred)

print(score)
# print(model.oob_score_)
print(mean_squared_error(y_test, y_pred))


0.6916481848959288
99.43951094606886


I have a feeling that ~0.65 (max 0.687) is the best performance for RandomForest. No matte how I would tackle its params or change feature set. Results tent to improve slightly, when include more features, despite big multicollinearity. Mb we could look for some more features from big file.
Could also try to take big file and apply PCA and see what happens

In [112]:
import xgboost as xgb

xgb_model = xgb.XGBRegressor(
    n_estimators=500,
    max_depth=9,
    learning_rate=0.3,
    min_child_weight=1,
    gamma=0, #we dont need to add penalty since there`s enough samples I guess
    subsample=1,
    colsample_bytree=1,
    reg_alpha=0,
    reg_lambda=1,
    objective='reg:squarederror',
)

xgb_model.fit(X, y)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,1
,device,
,early_stopping_rounds,
,enable_categorical,False


In [113]:
y_pred = xgb_model.predict(X_test)

score = r2_score(y_test, y_pred)

print(score)
# print(model.oob_score_)
print(mean_squared_error(y_test, y_pred))


0.9772885931484977
7.3241378178635275


Compared to the Random Forest model, XGBoost improved predictive performance by 39%. Since I didnt observe strong nonlinear patterns during preprocessing, this gain likely comes from XGBoost ability to capture more complex feature interactions rather than nonlinear effects in individual variables. This is consistent with the correlation analysis: the Spearman matrix indicates that changes in some features influence others quite noticeably, while the mutual information matrix shows little evidence of direct nonlinear relationships. Taken together, it suggests that the dataset’s complexity comes primarily from interactions between features rather than from standalone nonlinearities.

In [3]:
from matplotlib.pyplot import imread

# -*- coding: utf-8 -*-
"""
Created on Saturday Nov 15 15:34 2025

@author: 100yearsahead


Bleaching Presence Detection
Target variable: Percent_Bleaching

"""

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import os


path = "./coral-reef-global-bleaching"
filename_read = os.path.join(path, "coral_whole.csv")

df = pd.read_csv(filename_read)

# Removed only locations and labels, no real data touched
# df.drop("Ocean_Name", axis=1, inplace=True)
# df.drop("Country_Name", axis=1, inplace=True)
# df.drop("Sample_ID", axis=1, inplace=True)
# df.drop("Date_Year", axis=1, inplace=True)
# df.drop("Bleaching_Level", axis=1, inplace=True)
# df.drop("Realm_Name", axis=1, inplace=True)
# # #Percent_Cover is not a best predictor and also contain 30% of its fields as null.
# # # For the sake of bigger dataset this feature is dropped
# df.drop("Percent_Cover", axis=1, inplace=True)
# # df.drop("ClimSST", inplace=True, axis=1)
# df.drop("Exposure", inplace=True, axis=1)
# # df.drop("Temperature_Maximum", inplace=True, axis=1)

# label_encoder = LabelEncoder()
# df["Exposure"] = label_encoder.fit_transform(df["Exposure"])

# These features were taken into account that data is nonlinear
# df = df[["Distance_to_Shore", "Temperature_Mean", "Turbidity", "TSA", "Depth_m", "Percent_Bleaching"]]

# These features were taken into account that data is linear
#df = df[['Cyclone_Frequency', 'Depth_m', 'ClimSST', 'Distance_to_Shore', 'Turbidity', 'TSA', 'Temperature_Mean', 'Percent_Bleaching']]

df.dropna(inplace=True)
df  = df.drop(columns=['Sample_ID'])
df.info()
print(df)


<class 'pandas.core.frame.DataFrame'>
Index: 22561 entries, 6981 to 35042
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Cyclone_Frequency    22561 non-null  float64
 1   Depth_m              22561 non-null  float64
 2   ClimSST              22561 non-null  float64
 3   Ocean_Name           22561 non-null  object 
 4   Country_Name         22561 non-null  object 
 5   Distance_to_Shore    22561 non-null  float64
 6   Exposure             22561 non-null  object 
 7   Turbidity            22561 non-null  float64
 8   Date_Year            22561 non-null  int64  
 9   Bleaching_Level      22561 non-null  object 
 10  Temperature_Maximum  22561 non-null  float64
 11  SSTA                 22561 non-null  float64
 12  TSA                  22561 non-null  float64
 13  Percent_Bleaching    22561 non-null  float64
 14  Temperature_Mean     22561 non-null  float64
 15  Realm_Name           22561 non-null  o

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Split first (no leakage)
X = df.drop(columns=['Percent_Bleaching'])
y = df['Percent_Bleaching']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Since we have categorical variables we need to seperate the numeric and the categorical variables
cat_cols = ['Realm_Name','Ocean_Name','Country_Name','Exposure','Bleaching_Level']
num_cols = [col for col in X.columns if col not in cat_cols]



# We one_hot_encode the categorical features
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

X_train_cat = ohe.fit_transform(X_train[cat_cols])
X_test_cat  = ohe.transform(X_test[cat_cols])


# We scale the numeric features
scaler = StandardScaler()

X_train_num = scaler.fit_transform(X_train[num_cols])
X_test_num  = scaler.transform(X_test[num_cols])


# Combine the categorical and numerical features
X_train_processed = np.hstack([X_train_num, X_train_cat])
X_test_processed  = np.hstack([X_test_num, X_test_cat])






In [21]:
import xgboost as xgb

xgb_model = xgb.XGBRegressor(
    n_estimators=500,
    max_depth=9,
    learning_rate=0.3,
    min_child_weight=1,
    gamma=0, #we dont need to add penalty since there`s enough samples I guess
    subsample=1,
    colsample_bytree=1,
    reg_alpha=0,
    reg_lambda=1,
    objective='reg:squarederror',
)

xgb_model.fit(X_train_processed, y_train)

In [22]:
from sklearn.metrics import r2_score, mean_squared_error

y_pred = xgb_model.predict(X_test_processed)

score = r2_score(y_test, y_pred)

print(score)
# print(model.oob_score_)
print(mean_squared_error(y_test, y_pred))


0.8060004661717783
22.81571698634007


In [None]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(
    n_estimators=600,
    max_depth=30,
    criterion="friedman_mse",
    max_features=0.5,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.,
    bootstrap=False,
    random_state=42,
    n_jobs=-1)

model.fit(X_train_processed, y_train)

In [32]:
from sklearn.metrics import r2_score, mean_squared_error

y_pred = model.predict(X_test_processed)

score = r2_score(y_test, y_pred)

print(score)
# print(model.oob_score_)
print(mean_squared_error(y_test, y_pred))


0.8422120119061569
18.55698314914926


The good performance of the Random Forest model on high-dimensional feature sets can be attributed to its ensemble nature and its ability to aggregate a large number of weak but diverse predictors.

In contrast, XGBoost rely on sequential learning, where each new tree attempts to correct the residual errors of the previous ones. When the dataset contains only a few strong predictors, XGBoost can explore these features more deeply and model complex interactions, often achieving superior predictive performance.

Thus, Random Forests may outperform boosting methods in scenarios with large numbers of weak predictors, whereas XGBoost is more effective when the dataset contains fewer but more influential features.