<a href="https://colab.research.google.com/github/CiaraFarrellSETU/phd/blob/main/rf%2Bderivedfeatures%2BXGBoostw_earlystopping__2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import rasterio
from rasterio import features
import geopandas as gpd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb

In [None]:
# Load the raster once and keep everything later cells need: the pixel array,
# the georeferencing transform, the grid size and the full dataset profile.
with rasterio.open("/content/RGB.tif") as src:
    H, W = src.height, src.width
    transform = src.transform
    profile = src.profile
    rgb_clipped = src.read()   # array laid out as (bands, H, W)

In [None]:
# Habitat polygons used as training labels.
habitat_shapefile = "/content/Ballymore_habitat_map2018_ITM.shp"
habitat_gdf = gpd.read_file(habitat_shapefile)

In [None]:
# Bring the habitat polygons into the raster's CRS when the two differ.
# The raster CRS is taken from `profile`, which was captured while the dataset
# was still open, instead of from the already-closed `src` handle.
raster_crs = profile["crs"]
if habitat_gdf.crs != raster_crs:
    habitat_gdf = habitat_gdf.to_crs(raster_crs)
    print('CRS differed: habitat polygons reprojected to', raster_crs)
else:
    print('CRS already match: no reprojection needed')

not match


In [None]:
# Reload the habitat polygons from disk (this replaces any earlier in-memory
# version of `habitat_gdf`).
habitat_gdf = gpd.read_file("/content/Ballymore_habitat_map2018_ITM.shp")

# Re-read the raster's georeferencing while the dataset is open.
with rasterio.open("/content/RGB.tif") as src:
    H, W = src.height, src.width
    transform = src.transform
    raster_crs = src.crs

# Show both coordinate reference systems side by side.
print("Raster CRS:", raster_crs)
print("Shapefile CRS:", habitat_gdf.crs)

Raster CRS: EPSG:32629
Shapefile CRS: EPSG:2157


In [None]:
# Reproject the polygons only when their CRS is not already the raster's.
crs_differs = habitat_gdf.crs != raster_crs
if crs_differs:
    habitat_gdf = habitat_gdf.to_crs(raster_crs)

In [None]:
# Keep only rows that have a non-null, non-empty geometry. The explicit
# .copy() yields an independent frame, so the column assignment below does not
# write into a filtered slice of the previous frame (pandas chained-assignment
# hazard / SettingWithCopyWarning).
has_geometry = habitat_gdf.geometry.notnull() & ~habitat_gdf.geometry.is_empty
habitat_gdf = habitat_gdf[has_geometry].copy()

# Fix invalid geometries (common trick: buffer(0))
habitat_gdf["geometry"] = habitat_gdf.geometry.buffer(0)

# Check again: the valid count should equal the total count.
print("Valid geometries:", habitat_gdf.geometry.is_valid.sum())
print("Total geometries:", len(habitat_gdf))
print("Bounds:", habitat_gdf.total_bounds)

Valid geometries: 17
Total geometries: 17
Bounds: [ 590081.50819806 5927821.7540698   590694.89681513 5928459.75002118]


In [None]:
# List the attribute columns available on the habitat polygons.
column_index = habitat_gdf.columns
print(column_index)

Index(['Id', 'FossittTyp', 'FossittCod', 'EUType', 'EUCode', 'Habitat',
       'geometry'],
      dtype='object')


In [None]:
# Create a mapping from habitat names to unique integer IDs.
# IDs follow first-appearance order and start at 1; the rasterisation step
# later in this notebook uses 0 as the background fill value.
habitat_types = habitat_gdf['Habitat'].unique()
habitat_id_map = {habitat: class_id for class_id, habitat in enumerate(habitat_types, start=1)}

# Add a new column with the class IDs
habitat_gdf['class_id'] = habitat_gdf['Habitat'].map(habitat_id_map)

# Display the mapping and the last rows of the updated DataFrame
# (the label now says "tail" to match the .tail() call).
print("Habitat to Class ID Mapping:")
print(habitat_id_map)
print("\nUpdated DataFrame tail with 'class_id':")
display(habitat_gdf.tail())

Habitat to Class ID Mapping:
{'Carex-Menyanthes transition mire': 1, 'Juncus subnodulosus fen': 2, 'Mosaic Schoenus-Carex fen/Menyanthes pool': 3, 'Calluna - Eriophorum bog': 4, 'Filipendula-Holcus community': 5, 'Mosaic scrub/Dry grassland': 6, 'Mosaic Carex-Menyanthes transition mire/Filipendula-Holcus community': 7, 'Mosaic Ulex scrub/Molinia cutaway': 8}

Updated DataFrame head with 'class_id':


Unnamed: 0,Id,FossittTyp,FossittCod,EUType,EUCode,Habitat,geometry,class_id
12,0,Rich fen and flush,PF1,Alkaline fen,7230.0,Juncus subnodulosus fen,"POLYGON ((590412.216 5928078.959, 590410.899 5...",2
13,0,Scrub/Cutover bog,WS1/PB4,,,Mosaic Ulex scrub/Molinia cutaway,"POLYGON ((590451.24 5928078.807, 590454.23 592...",8
14,0,Rich fen and flush,PF1,Alkaline fen,7230.0,Juncus subnodulosus fen,"POLYGON ((590445.628 5928162.677, 590440.598 5...",2
15,0,Transition mire and quaking bog,PF3,Transition mires and quaking bogs,7140.0,Carex-Menyanthes transition mire,"POLYGON ((590419.287 5928229.774, 590417.387 5...",1
16,0,Transition mire and quaking bog,PF3,Transition mires and quaking bogs,7140.0,Carex-Menyanthes transition mire,"POLYGON ((590587.384 5928006.935, 590587.853 5...",1


In [None]:
# Inspect the distinct habitat names and the dtype of the column holding them.
habitat_names = habitat_gdf['Habitat']
print(habitat_names.unique())
print(habitat_names.dtype)

['Carex-Menyanthes transition mire' 'Juncus subnodulosus fen'
 'Mosaic Schoenus-Carex fen/Menyanthes pool' 'Calluna - Eriophorum bog'
 'Filipendula-Holcus community' 'Mosaic scrub/Dry grassland'
 'Mosaic Carex-Menyanthes transition mire/Filipendula-Holcus community'
 'Mosaic Ulex scrub/Molinia cutaway']
object


In [None]:
# Inspect the distinct class IDs and the dtype of the new column.
class_ids = habitat_gdf['class_id']
print(class_ids.unique())
print(class_ids.dtype)

[1 2 3 4 5 6 7 8]
int64


In [None]:
# Pair every polygon with its class ID; rasterize consumes (geometry, value) tuples.
shapes = zip(habitat_gdf.geometry, habitat_gdf["class_id"])

# Burn the class IDs onto a grid with the raster's size and transform.
# Pixels not covered by any polygon receive the fill value 0 (background).
habitat_map = features.rasterize(
    shapes=shapes,
    out_shape=(H, W),
    fill=0,
    transform=transform,
    dtype="int32",
)

In [None]:
# Split the first four bands into float arrays, in the order R, G, B, NIR.
R, G, B, NIR = (rgb_clipped[band].astype(float) for band in range(4))

In [None]:
# Small constant added to every denominator to avoid division by zero.
eps = 1e-6

# Derived per-pixel features: mean visible brightness, two band ratios and NDVI.
brightness = (R + G + B) / 3
RG_ratio = np.divide(R, G + eps)
GB_ratio = np.divide(G, B + eps)
NDVI = (NIR - R) / (NIR + R + eps)

# Stack raw bands and derived features along a leading feature axis.
features_stack = np.stack(
    (R, G, B, NIR, RG_ratio, GB_ratio, brightness, NDVI),
    axis=0,
)

In [None]:
# Flatten the label grid to one entry per pixel, and the feature stack to one
# row per pixel with one column per feature.
y = habitat_map.ravel()
X = features_stack.reshape(len(features_stack), -1).transpose()

# Keep only labelled pixels (class ID > 0); 0 is the rasterized background.
mask = y > 0
X, y = X[mask], y[mask]

In [None]:
# Stratified 80/20 split over individual pixels, seeded for reproducibility.
# NOTE(review): the split is per pixel, so pixels from the same polygon can end
# up in both subsets — confirm this is acceptable for the reported scores.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

In [None]:
# Train a 400-tree random forest on all available cores (fit returns the
# fitted estimator itself), then predict the validation pixels.
rf = RandomForestClassifier(
    n_estimators=400,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)

In [None]:
# Per-class precision / recall / F1 for the random forest on the validation set.
rf_report = classification_report(y_val, y_pred_rf)
print("Random Forest Results:", rf_report, sep="\n")

Random Forest Results:
              precision    recall  f1-score   support

           1       0.27      0.30      0.28      3105
           2       0.18      0.16      0.17      1436
           3       0.39      0.54      0.46      4161
           4       0.51      0.45      0.48      2857
           5       0.31      0.30      0.30      2460
           6       0.42      0.47      0.44      1542
           7       0.21      0.13      0.16      1931
           8       0.13      0.05      0.07      1322

    accuracy                           0.34     18814
   macro avg       0.30      0.30      0.30     18814
weighted avg       0.33      0.34      0.33     18814



In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Re-encode the masked 1D label array `y` as consecutive integers starting at 0
# and derive the number of classes from the fitted encoder.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)
num_classes = len(le.classes_)

# Same split settings as before, but on the encoded labels.
# This overwrites X_train / X_val / y_train / y_val from the earlier split.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

# Booster hyper-parameters: multiclass softmax, evaluated with multiclass log-loss.
params = {
    "objective": "multi:softmax",
    "num_class": num_classes,
    "eval_metric": "mlogloss",
    "eta": 0.1,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42
}

# Wrap both subsets in XGBoost's native data container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Both sets are monitored during training.
evals = [(dtrain, "train"), (dval, "val")]

# Up to 500 boosting rounds, with early stopping after 20 rounds without
# improvement; metrics are logged every 50 rounds.
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=500,
    evals=evals,
    early_stopping_rounds=20,
    verbose_eval=50,
)

[0]	train-mlogloss:2.00657	val-mlogloss:2.00924
[50]	train-mlogloss:1.58215	val-mlogloss:1.63591
[100]	train-mlogloss:1.55038	val-mlogloss:1.62878
[123]	train-mlogloss:1.54080	val-mlogloss:1.62896


In [None]:
# Training used early stopping, so the booster may contain rounds added after
# the best validation score. Predict with the trees up to and including the
# best iteration when it is recorded; (0, 0) means "use all trees".
best_iteration = getattr(xgb_model, "best_iteration", None)
iteration_range = (0, best_iteration + 1) if best_iteration is not None else (0, 0)

# multi:softmax returns class indices as floats; cast them to integers so they
# have the same type as the encoded labels in y_val.
y_pred_xgb = xgb_model.predict(dval, iteration_range=iteration_range).astype(int)
print("XGBoost Accuracy:", accuracy_score(y_val, y_pred_xgb))

XGBoost Accuracy: 0.37402997767619856
