In [47]:
from matplotlib.pyplot import imread

# -*- coding: utf-8 -*-
"""
Created on Saturday Nov 15 15:34 2025

@author: 100yearsahead


Bleaching Presence Detection
Target variable: Percent_Bleaching

"""

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import os


path = "../../coral-reef-global-bleaching"
filename_read = os.path.join(path, "coral_whole.csv")

df = pd.read_csv(filename_read)

# Removed only locations and labels, no real data touched
# df.drop("Ocean_Name", axis=1, inplace=True)
# df.drop("Country_Name", axis=1, inplace=True)
# df.drop("Sample_ID", axis=1, inplace=True)
# df.drop("Date_Year", axis=1, inplace=True)
# df.drop("Bleaching_Level", axis=1, inplace=True)
# df.drop("Realm_Name", axis=1, inplace=True)
# # #Percent_Cover is not a best predictor and also contain 30% of its fields as null.
# # # For the sake of bigger dataset this feature is dropped
# df.drop("Percent_Cover", axis=1, inplace=True)
# # df.drop("ClimSST", inplace=True, axis=1)
# df.drop("Exposure", inplace=True, axis=1)
# # df.drop("Temperature_Maximum", inplace=True, axis=1)

# label_encoder = LabelEncoder()
# df["Exposure"] = label_encoder.fit_transform(df["Exposure"])

# These features were taken into account that data is nonlinear
# df = df[["Distance_to_Shore", "Temperature_Mean", "Turbidity", "TSA", "Depth_m", "Percent_Bleaching"]]

# These features were taken into account that data is linear
#df = df[['Cyclone_Frequency', 'Depth_m', 'ClimSST', 'Distance_to_Shore', 'Turbidity', 'TSA', 'Temperature_Mean', 'Percent_Bleaching']]

# df  = df.drop(columns=['Sample_ID', 'Percent_Cover', 'Date_Year' ])
selected_features = ['Country_Name', 'Bleaching_Level', 'TSA', 'Turbidity', 'Realm_Name', 'Ocean_Name', 'ClimSST', 'SSTA', 'Percent_Bleaching']
df = df[selected_features]
df.dropna(inplace=True)

df.info()
print(df)


<class 'pandas.core.frame.DataFrame'>
Index: 34393 entries, 0 to 35044
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Country_Name       34393 non-null  object 
 1   Bleaching_Level    34393 non-null  object 
 2   TSA                34393 non-null  float64
 3   Turbidity          34393 non-null  float64
 4   Realm_Name         34393 non-null  object 
 5   Ocean_Name         34393 non-null  object 
 6   ClimSST            34393 non-null  float64
 7   SSTA               34393 non-null  float64
 8   Percent_Bleaching  34393 non-null  float64
dtypes: float64(5), object(4)
memory usage: 2.6+ MB
           Country_Name Bleaching_Level   TSA  Turbidity  \
0                  Cuba          Colony -0.80     0.0287   
1      French Polynesia          Colony  1.29     0.0262   
2        United Kingdom          Colony -2.64     0.0429   
3         United States          Colony -2.27     0.0424   
4         United 

In [48]:
import pandas as pd
import numpy as np
from scipy.stats import skew, boxcox
from sklearn.preprocessing import PowerTransformer


numeric_cols = df.select_dtypes(include=[np.number]).columns
skewness_results = {}

for col in numeric_cols:
    clean_data = df[col].dropna()
    if len(clean_data) > 0:
        skew_val = skew(clean_data)
        skewness_results[col] = {
            'skewness': skew_val,
            'skew_type': 'Right (Positive)' if skew_val > 0.5 else 'Left (Negative)' if skew_val < -0.5 else 'Approximately Symmetric',

        }

right_skewed = {col: data for col, data in skewness_results.items() if data['skewness'] > 0.5}
left_skewed = {col: data for col, data in skewness_results.items() if data['skewness'] < -0.5}

print(left_skewed.keys())
print(right_skewed.keys())
for col_right in right_skewed.keys():
    df[col_right] = df[col_right].replace(0, 1e-6)
    df[col_right], lam = boxcox(df[col_right])

for col_left in left_skewed.keys():
    pt = PowerTransformer(method='yeo-johnson')
    df[[col_left]] = pt.fit_transform(df[[col_left]])

skewness_df = pd.DataFrame(skewness_results).T
skewness_df = skewness_df.sort_values('skewness', key=abs, ascending=False)
skewness_df[:-1]

dict_keys(['TSA', 'ClimSST'])
dict_keys(['Turbidity', 'Percent_Bleaching'])


Unnamed: 0,skewness,skew_type
Turbidity,3.865916,Right (Positive)
Percent_Bleaching,2.587238,Right (Positive)
ClimSST,-1.676739,Left (Negative)
TSA,-1.006318,Left (Negative)


Not only solves skewed data problem, but also removes outliers

In [50]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder

df_edit = df.drop(columns=[ ], axis=1)

# Split first (no leakage)
X = df_edit.drop(columns=['Percent_Bleaching'])
y = df_edit['Percent_Bleaching']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Since we have categorical variables we need to seperate the numeric and the categorical variables
cat_cols = ['Bleaching_Level', 'Ocean_Name', 'Country_Name', 'Realm_Name']
num_cols = [col for col in X.columns if col not in cat_cols]



# We one_hot_encode the categorical features
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)


X_train_cat = ohe.fit_transform(X_train[cat_cols])
X_test_cat  = ohe.transform(X_test[cat_cols])

ohe_feature_names = ohe.get_feature_names_out(cat_cols)



# We scale the numeric features
scaler = StandardScaler()

# Even after normalization its still crucial to scale-up data
X_train_num = scaler.fit_transform(X_train[num_cols])
X_test_num  = scaler.transform(X_test[num_cols])


# Combine the categorical and numerical features
X_train_processed = np.hstack([X_train_num, X_train_cat])
X_test_processed  = np.hstack([X_test_num, X_test_cat])

processed_feature_names = np.concatenate([num_cols, ohe_feature_names])

In [51]:

import tensorflow as ts
from tensorflow import keras

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dropout


nodes_number = int((X_train_processed.shape[1] + 1) * 2/3)
# nodes_number = 74
model = Sequential()
model.add(Input(shape= (X_train_processed.shape[1],)))
model.add(Dense(nodes_number, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(nodes_number, activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(1))
model.summary()



Install cuda toolkit for GPU on desktop

Tensorflow GPU version (maybe try conda env)

In [52]:
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(
    patience=10,
    restore_best_weights=True,
    monitor='loss'
)

model.compile(optimizer="adam",
              loss="mse",
              metrics=['mae'],
               )

model.fit(X_train_processed, y_train, epochs=200, verbose=2, callbacks=[early_stop])

model.summary()

Epoch 1/200
860/860 - 1s - 2ms/step - loss: 40.5910 - mae: 5.2711
Epoch 2/200
860/860 - 1s - 929us/step - loss: 35.6798 - mae: 4.8064
Epoch 3/200
860/860 - 1s - 910us/step - loss: 34.7696 - mae: 4.7129
Epoch 4/200
860/860 - 1s - 921us/step - loss: 34.2780 - mae: 4.6497
Epoch 5/200
860/860 - 1s - 925us/step - loss: 33.9116 - mae: 4.6110
Epoch 6/200
860/860 - 1s - 996us/step - loss: 33.7056 - mae: 4.5999
Epoch 7/200
860/860 - 1s - 933us/step - loss: 33.4445 - mae: 4.5633
Epoch 8/200
860/860 - 1s - 921us/step - loss: 33.2314 - mae: 4.5376
Epoch 9/200
860/860 - 1s - 976us/step - loss: 33.1906 - mae: 4.5317
Epoch 10/200
860/860 - 1s - 1ms/step - loss: 32.9112 - mae: 4.5140
Epoch 11/200
860/860 - 1s - 983us/step - loss: 32.7570 - mae: 4.4891
Epoch 12/200
860/860 - 1s - 1ms/step - loss: 32.4891 - mae: 4.4607
Epoch 13/200
860/860 - 1s - 995us/step - loss: 32.4664 - mae: 4.4674
Epoch 14/200
860/860 - 1s - 990us/step - loss: 32.2407 - mae: 4.4350
Epoch 15/200
860/860 - 1s - 1ms/step - loss: 32.1

In [53]:
from sklearn.metrics import r2_score, mean_squared_error

y_pred = model.predict(X_test_processed)

score = r2_score(y_test, y_pred)

print(score)
print(mean_squared_error(y_test, y_pred))

[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 699us/step
0.568509304397958
26.816403670905462


it takes absurd amount of time to make a very poor predict prediction, probably need to preprocess data again

Best score with such settings

nodes_number = int((X.shape[1] + 1) * 2/3)

model = Sequential()

model.add(Input(shape= (X.shape[1],)))

model.add(Dense(nodes_number, activation='relu'))

model.add(Dense(nodes_number, activation='relu'))

model.add(Dense(1))


with standard scaler data applied to both X and y

0.21329008825613271

0.7689129542072395

Probably it worth to try use 62 features with location and timestamps excluded. Could possibly apply PCA


In [30]:
from keras import Sequential

# -*- coding: utf-8 -*-
"""
Created on Wed Oct 29 15:40:04 2025

@author: zemsk


Bleaching Presence Detection
Target variable: Percent_Bleaching

"""

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, RobustScaler, StandardScaler
import os


path = "../../coral-reef-global-bleaching"
filename_read = os.path.join(path, "coral.csv")

df = pd.read_csv(filename_read)

# # Removed only locations and labels, no real data touched
# df.drop("Ocean_Name", axis=1, inplace=True)
# df.drop("Country_Name", axis=1, inplace=True)
# df.drop("Sample_ID", axis=1, inplace=True)
# df.drop("Date_Year", axis=1, inplace=True)
# df.drop("Bleaching_Level", axis=1, inplace=True)
# df.drop("Realm_Name", axis=1, inplace=True)
#
# cols_to_drop = [
#     'Site_ID', 'Data_Source', 'Latitude_Degrees', 'Longitude_Degrees',
#     'Reef_ID', 'Ecoregion_Name', 'State_Island_Province_Name',
#     'City_Town_Name', 'Site_Name', 'Date', 'Date_Day', 'Date_Month',
#     'Site_Comments', 'Sample_Comments', 'Bleaching_Comments'
# ]
#
# df.drop(cols_to_drop, axis=1, inplace=True)
# #Percent_Cover is not a best predictor and also contain 30% of its fields as null.
# # For the sake of bigger dataset this feature is dropped
# df.drop("Percent_Cover", axis=1, inplace=True)
# df.drop("ClimSST", inplace=True, axis=1)
# df.drop("Exposure", inplace=True, axis=1)
# df.drop("Temperature_Maximum", inplace=True, axis=1)
#
# label_encoder = LabelEncoder()
# df["Exposure"] = label_encoder.fit_transform(df["Exposure"])

# These features were taken into account that data is nonlinear
#df = df[["Distance_to_Shore", "Temperature_Mean", "Turbidity", "TSA", "Depth_m", "Percent_Bleaching"]]

# These features were taken into account that data is linear
# df = df[['Cyclone_Frequency', 'Depth_m', 'ClimSST', 'Distance_to_Shore', 'Turbidity', 'TSA', 'Temperature_Mean', 'Percent_Bleaching']]

df.dropna(inplace=True)
df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41361 entries, 0 to 41360
Data columns (total 62 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Site_ID                                41361 non-null  int64  
 1   Sample_ID                              41361 non-null  int64  
 2   Data_Source                            41361 non-null  object 
 3   Latitude_Degrees                       41361 non-null  float64
 4   Longitude_Degrees                      41361 non-null  float64
 5   Ocean_Name                             41361 non-null  object 
 6   Reef_ID                                41361 non-null  object 
 7   Realm_Name                             41361 non-null  object 
 8   Ecoregion_Name                         41361 non-null  object 
 9   Country_Name                           41361 non-null  object 
 10  State_Island_Province_Name             41361 non-null  object 
 11  Ci

  df = pd.read_csv(filename_read)


In [54]:
from sklearn.inspection import permutation_importance

result = permutation_importance(
    model,
    X_test_processed,
    y_test,
    n_repeats=10,
    random_state=42,
    scoring="neg_mean_absolute_error"
)



[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 564us/step
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 574us/step
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 560us/step
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 583us/step
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 563us/step
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 564us/step
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 553us/step
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 576us/step
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 556us/step
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 580us/step
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 553us/step
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 547us/step
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 557us/step

In [55]:
importance = result.importances_mean

# Feature importance
importance_df = pd.DataFrame({
    "feature": processed_feature_names,
    "importance": importance
}).sort_values("importance", ascending=False)

def get_base_feature(feature_name):
    for col in num_cols + cat_cols :
        if feature_name.startswith(col):
            return col
    return feature_name.split("_")[0]

importance_df["base_feature"] = importance_df["feature"].apply(get_base_feature)

grouped_importance = (
    importance_df
    .groupby("base_feature")["importance"]
    .sum()
    .sort_values(ascending=False)
)

grouped_importance = grouped_importance / grouped_importance.sum()

threshold = 0.048
selected_features_tree = [
    f for f, s in zip(grouped_importance.index, grouped_importance.values)
    if s > threshold  # likely you want important features
]
print(f"Selected features for trees: {selected_features_tree}")
grouped_importance

Selected features for trees: ['Bleaching_Level', 'Country_Name', 'TSA', 'SSTA', 'ClimSST', 'Turbidity', 'Realm_Name', 'Ocean_Name']


base_feature
Bleaching_Level    0.270012
Country_Name       0.266821
TSA                0.118131
SSTA               0.090863
ClimSST            0.067220
Turbidity          0.066223
Realm_Name         0.061697
Ocean_Name         0.059033
Name: importance, dtype: float64