<a href="https://colab.research.google.com/github/CiaraFarrellSETU/phd/blob/main/Pollardstown_RF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import rasterio
import geopandas as gpd
import numpy as np
from rasterio.mask import mask
from rasterio.warp import reproject, Resampling
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

In [None]:
rgb_path = "/content/pollardstown_ortho.tif"
ndvi_path = "/content/Pollardstown_Summer_NDVI.tif"

rgb = rasterio.open(rgb_path)
ndvi = rasterio.open(ndvi_path)

In [None]:
factor = 4
new_height = rgb.height // factor
new_width = rgb.width // factor

# Downsample RGB (only first 3 bands)
rgb_data = rgb.read([1, 2, 3],
                    out_shape=(3, new_height, new_width))

# Downsample NDVI to same shape
ndvi_data = ndvi.read(1,
                      out_shape=(new_height, new_width))

In [None]:
rows, cols = new_height, new_width
stack = np.zeros((rows, cols, 4), dtype=np.float32)
stack[:, :, 0:3] = np.transpose(rgb_data, (1, 2, 0))
stack[:, :, 3] = ndvi_data

print("Downsampled stack shape:", stack.shape)

Downsampled stack shape: (10000, 10000, 4)


In [None]:
shapefile_path = "/content/Pollardstown_clappied.shp"
gdf = gpd.read_file(shapefile_path)

# Match CRS
if gdf.crs != rgb.crs:
    gdf = gdf.to_crs(rgb.crs)

print("Columns in shapefile:", gdf.columns.tolist())

# Choose the column that represents habitat class
label_column = "PRIMARY_FO"   # <-- change if needed

Columns in shapefile: ['Id', 'NFS_Code', 'NFS_Name', 'County', 'Co_Code', 'Area_sqm', 'Area_ha', 'Poly_Num', '7140', '7210', '7230', '6410', '6430', '91E0', 'OtherAnnex', 'PF1', 'PF2', 'PF3', 'FS1', 'FS2', 'GS1', 'GS2', 'GS3', 'GS4', 'GM1', 'GA1', 'WS1', 'WN2', 'WN4', 'WN6', 'WN7', 'WL1', 'WL2', 'PB1', 'PB2', 'PB3', 'PB4', 'PB5', 'FW4', 'HH3', 'OtherFossi', 'SumAnnex', 'SumFossit', 'DataQual', 'Anx_Mapped', 'Non_Anx_PF', 'PRIMARY_FO', 'Anx_perc', 'Anx_perc_l', 'geometry']


In [None]:
X, y = [], []

for idx, row in gdf.iterrows():
    geom = [row['geometry']]

    # Mask RGB and NDVI rasters by polygon
    out_rgb, _ = mask(rgb, geom, crop=True)
    out_ndvi, _ = mask(ndvi, geom, crop=True)

    # Align sizes (take intersection)
    min_rows = min(out_rgb.shape[1], out_ndvi.shape[0])
    min_cols = min(out_rgb.shape[2], out_ndvi.shape[1])

    out_rgb = out_rgb[:, :min_rows, :min_cols]
    out_ndvi = out_ndvi[:min_rows, :min_cols]

    # Flatten pixels
    r = out_rgb[0].ravel()
    g = out_rgb[1].ravel()
    b = out_rgb[2].ravel()
    n = out_ndvi.ravel()

    # Ensure equal length before stacking
    min_len = min(len(r), len(g), len(b), len(n))
    r, g, b, n = r[:min_len], g[:min_len], b[:min_len], n[:min_len]

    features = np.column_stack([r, g, b, n])

    # Remove nodata
    features = features[~np.isnan(features).any(axis=1)]

    # Append with class label
    class_value = row[label_column]
    y.append(np.full(features.shape[0], class_value))
    X.append(features)

X = np.vstack(X)
y = np.hstack(y)

print("Training samples shape:", X.shape)
print("Labels shape:", y.shape)
print("Unique classes:", np.unique(y))


Training samples shape: (86074, 4)
Labels shape: (86074,)
Unique classes: ['FS1' 'FS1/FS2' 'FS1/PF1' 'GA1' 'GS4' 'GS4/FS1' 'GS4/FS2' 'GS4/WS1' 'PF1'
 'WL2' 'WN6' 'WN6/FS1' 'WS1/WL2']


86,074 training samples (pixels Ã— 4 features: R, G, B, NDVI)

Labels shape matches (one class per pixel)

Unique classes: 13 distinct habitat codes (e.g. FS1, GA1, GS4, PF1, etc.)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

print(classification_report(y_test, rf.predict(X_test)))

              precision    recall  f1-score   support

         FS1       0.00      0.00      0.00      5732
     FS1/FS2       0.00      0.00      0.00       288
     FS1/PF1       0.00      0.00      0.00       999
         GA1       0.00      0.00      0.00        66
         GS4       0.00      0.00      0.00      4730
     GS4/FS1       0.00      0.00      0.00      1039
     GS4/FS2       0.00      0.00      0.00      1498
     GS4/WS1       0.00      0.00      0.00       681
         PF1       0.27      1.00      0.43      6975
         WL2       0.67      0.00      0.00      1073
         WN6       0.00      0.00      0.00       817
     WN6/FS1       0.00      0.00      0.00      1461
     WS1/WL2       0.00      0.00      0.00       464

    accuracy                           0.27     25823
   macro avg       0.07      0.08      0.03     25823
weighted avg       0.10      0.27      0.12     25823



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


severe class imbalance  

PF1 has ~7,000 samples, while some classes like GA1 only have 66.

Random Forest tends to predict the majority class, ignoring rare ones.

In [None]:
#simplify labels/ group classes
label_column = "PRIMARY_FO"

simplify_map = {
    'FS1/FS2': 'FS1',
    'FS1/PF1': 'FS1',
    'GS4/FS1': 'GS4',
    'GS4/FS2': 'GS4',
    'GS4/WS1': 'GS4',
    'WN6/FS1': 'WN6',
    'WS1/WL2': 'WS1'
}

gdf[label_column] = gdf[label_column].replace(simplify_map)

In [None]:
X, y = [], []

for idx, row in gdf.iterrows():
    geom = [row['geometry']]
    out_rgb, _ = mask(rgb, geom, crop=True)
    out_ndvi, _ = mask(ndvi, geom, crop=True)

    min_rows = min(out_rgb.shape[1], out_ndvi.shape[0])
    min_cols = min(out_rgb.shape[2], out_ndvi.shape[1])

    out_rgb = out_rgb[:, :min_rows, :min_cols]
    out_ndvi = out_ndvi[:min_rows, :min_cols]

    r = out_rgb[0].ravel()
    g = out_rgb[1].ravel()
    b = out_rgb[2].ravel()
    n = out_ndvi.ravel()

    min_len = min(len(r), len(g), len(b), len(n))
    r, g, b, n = r[:min_len], g[:min_len], b[:min_len], n[:min_len]

    features = np.column_stack([r, g, b, n])
    features = features[~np.isnan(features).any(axis=1)]

    class_value = row[label_column]
    y.append(np.full(features.shape[0], class_value))
    X.append(features)

X = np.vstack(X)
y = np.hstack(y)

print("Training samples shape:", X.shape)
print("Labels shape:", y.shape)
print("Unique classes:", np.unique(y))



Training samples shape: (86074, 4)
Labels shape: (86074,)
Unique classes: ['FS1' 'GA1' 'GS4' 'PF1' 'WL2' 'WN6' 'WS1']


In [None]:
#balance weights
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",   # <-- balances rare classes
    random_state=42
)
rf.fit(X_train, y_train)

print(classification_report(y_test, rf.predict(X_test)))

              precision    recall  f1-score   support

         FS1       0.00      0.00      0.00      7019
         GA1       0.00      0.00      0.00        66
         GS4       0.17      0.00      0.00      7948
         PF1       1.00      0.00      0.00      6975
         WL2       0.67      0.00      0.00      1073
         WN6       0.00      0.00      0.00      2278
         WS1       0.02      1.00      0.04       464

    accuracy                           0.02     25823
   macro avg       0.26      0.14      0.01     25823
weighted avg       0.35      0.02      0.00     25823



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


2 classes

In [None]:
label_column = 'PRIMARY_FO'
group_map = {
    # Wetland habitats
    'PF1': 'Wetland',
    'WL2': 'Wetland',
    'WN6': 'Wetland',
    'WS1': 'Wetland',
    'WS1/WL2': 'Wetland',
    'WN6/FS1': 'Wetland',
    'GS4/WS1': 'Wetland',

    # Grassland/Other habitats
    'FS1': 'Grassland',
    'FS1/FS2': 'Grassland',
    'FS1/PF1': 'Grassland',
    'GA1': 'Grassland',
    'GS4': 'Grassland',
    'GS4/FS1': 'Grassland',
    'GS4/FS2': 'Grassland'
}
gdf['GroupedClass'] = gdf[label_column].replace(group_map)

In [None]:
X, y = [], []

for idx, row in gdf.iterrows():
    geom = [row['geometry']]
    out_rgb, _ = mask(rgb, geom, crop=True)
    out_ndvi, _ = mask(ndvi, geom, crop=True)

    min_rows = min(out_rgb.shape[1], out_ndvi.shape[0])
    min_cols = min(out_rgb.shape[2], out_ndvi.shape[1])

    out_rgb = out_rgb[:, :min_rows, :min_cols]
    out_ndvi = out_ndvi[:min_rows, :min_cols]

    r = out_rgb[0].ravel()
    g = out_rgb[1].ravel()
    b = out_rgb[2].ravel()
    n = out_ndvi.ravel()

    min_len = min(len(r), len(g), len(b), len(n))
    r, g, b, n = r[:min_len], g[:min_len], b[:min_len], n[:min_len]

    features = np.column_stack([r, g, b, n])
    features = features[~np.isnan(features).any(axis=1)]

    class_value = row['GroupedClass']
    y.append(np.full(features.shape[0], class_value))
    X.append(features)

X = np.vstack(X)
y = np.hstack(y)

print("Training samples shape:", X.shape)
print("Labels shape:", y.shape)
print("Unique grouped classes:", np.unique(y))

Training samples shape: (86074, 4)
Labels shape: (86074,)
Unique grouped classes: ['Grassland' 'Wetland']


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

rf = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",
    random_state=42
)
rf.fit(X_train, y_train)

print(classification_report(y_test, rf.predict(X_test)))

              precision    recall  f1-score   support

   Grassland       0.14      0.00      0.00     15033
     Wetland       0.42      1.00      0.59     10790

    accuracy                           0.42     25823
   macro avg       0.28      0.50      0.29     25823
weighted avg       0.26      0.42      0.25     25823



3 classes


In [None]:
label_column = "PRIMARY_FO"

group_map = {
    # Fen/Peatland
    'PF1': 'Fen',
    'FS1': 'Fen',
    'FS1/FS2': 'Fen',
    'FS1/PF1': 'Fen',

    # Grassland
    'GA1': 'Grassland',
    'GS4': 'Grassland',
    'GS4/FS1': 'Grassland',
    'GS4/FS2': 'Grassland',

    # Wetland
    'WL2': 'Wetland',
    'WN6': 'Wetland',
    'WN6/FS1': 'Wetland',
    'WS1': 'Wetland',
    'WS1/WL2': 'Wetland',
    'GS4/WS1': 'Wetland'}
gdf['GroupedClass'] = gdf[label_column].map(group_map).fillna("Other")

In [None]:
X, y = [], []

for idx, row in gdf.iterrows():
    geom = [row['geometry']]
    out_rgb, _ = mask(rgb, geom, crop=True)
    out_ndvi, _ = mask(ndvi, geom, crop=True)

    min_rows = min(out_rgb.shape[1], out_ndvi.shape[0])
    min_cols = min(out_rgb.shape[2], out_ndvi.shape[1])

    out_rgb = out_rgb[:, :min_rows, :min_cols]
    out_ndvi = out_ndvi[:min_rows, :min_cols]

    r = out_rgb[0].ravel()
    g = out_rgb[1].ravel()
    b = out_rgb[2].ravel()
    n = out_ndvi.ravel()

    min_len = min(len(r), len(g), len(b), len(n))
    r, g, b, n = r[:min_len], g[:min_len], b[:min_len], n[:min_len]

    features = np.column_stack([r, g, b, n])
    features = features[~np.isnan(features).any(axis=1)]

    class_value = row['GroupedClass']
    y.append(np.full(features.shape[0], class_value))
    X.append(features)

X = np.vstack(X)
y = np.hstack(y)

print("Training samples shape:", X.shape)
print("Labels shape:", y.shape)
print("Unique grouped classes:", np.unique(y))

Training samples shape: (86074, 4)
Labels shape: (86074,)
Unique grouped classes: ['Fen' 'Grassland' 'Wetland']


In [None]:
import pandas as pd

df = pd.DataFrame(X, columns=["R","G","B","NDVI"])
df["label"] = y

# Sample equal number per class (e.g. 3000 pixels each)
balanced = df.groupby("label").apply(lambda x: x.sample(min(len(x), 3000), random_state=42))
X_bal = balanced[["R","G","B","NDVI"]].values
y_bal = balanced["label"].values

print("Balanced samples shape:", X_bal.shape)
print("Balanced labels:", np.unique(y_bal, return_counts=True))

Balanced samples shape: (9000, 4)
Balanced labels: (array(['Fen', 'Grassland', 'Wetland'], dtype=object), array([3000, 3000, 3000]))


  balanced = df.groupby("label").apply(lambda x: x.sample(min(len(x), 3000), random_state=42))


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_bal, y_bal, test_size=0.3, random_state=42)

rf = RandomForestClassifier(
    n_estimators=200,
    random_state=42
)
rf.fit(X_train, y_train)

print(classification_report(y_test, rf.predict(X_test)))

              precision    recall  f1-score   support

         Fen       0.00      0.00      0.00       913
   Grassland       0.33      1.00      0.49       882
     Wetland       1.00      0.00      0.00       905

    accuracy                           0.33      2700
   macro avg       0.44      0.33      0.16      2700
weighted avg       0.44      0.33      0.16      2700

