# Step 2 â€” Train Random Forest crop classifier

**Input:** merged table from `data/prepared/`  
**What it does:**
- filters to crop-only Level-2 classes
- selects VV/VH time-window features
- runs RandomizedSearchCV for Random Forest hyperparameters
- saves best model + feature list into `models/`

In [None]:
import pandas as pd
import numpy as np
import os
import pickle

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from pathlib import Path

REPO_ROOT = Path.cwd().resolve()
if REPO_ROOT.name == "notebooks":
    REPO_ROOT = REPO_ROOT.parent

DATA_PREP = REPO_ROOT / "data" / "prepared"
MODELS_DIR = REPO_ROOT / "models"
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Filenames (edit ONLY these if your filenames differ)
data_file = DATA_PREP / "S1_point_all_10d_10m_20180101-20180731_Stratum1_VV-VH.csv"
classes_file = DATA_PREP / "LUCAS_2018_Copernicus_attributes_cropmap_level1-2_FROM_EXPORTS.csv"

# Friendly errors if files are missing
assert data_file.exists(), f"Missing dataset file: {data_file}"
assert classes_file.exists(), f"Missing classes file: {classes_file}"

print("Using data_file:", data_file)
print("Using classes_file:", classes_file)
print("Saving outputs to:", MODELS_DIR)

In [None]:
class_table = pd.read_csv(classes_file)
print(class_table)


       POINT_ID  stratum  LC1   LU1  level_1  level_2
0      47242864        1  B11  U111      200      211
1      47322804        1  B11  U111      200      211
2      47602810        1  B11  U111      200      211
3      47622814        1  B11  U111      200      211
4      47642818        1  B11  U111      200      211
...         ...      ...  ...   ...      ...      ...
30929  36503184        1  E20  U111      500      500
30930  36643156        1  E20  U111      500      500
30931  36703150        1  E20  U111      500      500
30932  36503170        1  F40  U111      200      290
30933  36883144        1  F40  U112      200      290

[30934 rows x 6 columns]


In [None]:
# Load class legend table 

classes_L1 = class_table["level_1"].dropna().unique().tolist()
classes_L2 = class_table["level_2"].dropna().unique().tolist()

df = pd.read_csv(data_file, dtype={'level_1': int, 'level_2': int})
print(f"Loaded dataset with shape {df.shape}")


Loaded dataset with shape (1743815, 46)


In [15]:
#official Level-2 set:
L2_official = {211,212,213,214,215,216,217,218,219,221,222,223,230,231,232,233,240,250,290}



# sremove non crop labled data
bad_L1 = {100,300,500,600}
classes_L2 = [c for c in classes_L2 if c in L2_official]

print(f"Classes in level_1: {classes_L1}")
print(f"Classes in level_2: {classes_L2}")
print(df.head())

Classes in level_1: [200, 500, 300, 600, 100]
Classes in level_2: [211, 212, 213, 214, 215, 216, 218, 219, 221, 222, 223, 231, 232, 233, 230, 240, 250, 290]
   POINT_ID  stratum  level_1  level_2  VH_20180101  VH_20180111  VH_20180121  \
0  47242864        1      200      211   -17.729420   -20.325294   -19.684908   
1  47242864        1      200      211   -17.629759   -20.395664   -19.362911   
2  47322804        1      200      211   -16.761300   -16.439291   -19.003990   
3  47322804        1      200      211   -16.949911   -17.447950   -18.359556   
4  47322804        1      200      211   -16.443756   -16.525919   -18.132175   

   VH_20180131  VH_20180210  VH_20180220  ...  VV_20180421  VV_20180501  \
0   -20.850082   -20.764990   -23.271540  ...   -15.297538   -14.691077   
1   -20.440153   -21.169271   -23.260570  ...   -13.002155   -14.203595   
2   -20.409580   -22.091795   -21.210240  ...   -16.568722   -17.411484   
3   -20.564657   -23.349674   -21.620611  ...   -16.0513

In [16]:
df['Classif'] = df['level_2']  # working label for classification (detailed crop types)
if classes_L2:
    df = df[df['Classif'].isin(classes_L2)]
print(f"Data after filtering to crop classes: {df.shape}")



Data after filtering to crop classes: (604610, 47)


In [None]:

feature_regex = r'(((?<![\w\d])VH_)|((?<![\w\d])VV_))'  
feature_regex += r'(2018(0[1-7]))'  

X = df.filter(regex=feature_regex)
y = df['Classif']

print(f"Selected features matrix X shape: {X.shape}")
print(f"Selected target vector y shape: {y.shape}")
# Check a quick summary of class distribution
print("Class distribution in y:")
print(y.value_counts())
#  feature names used in training
import json, os

feature_names = list(X.columns)
feat_path = MODELS_DIR / "RF_feature_names.json"
with open(feat_path, "w") as f:
    json.dump(feature_names, f, indent=2)
print(f"Saved training feature list to {feat_path}")


Selected features matrix X shape: (604610, 42)
Selected target vector y shape: (604610,)
Class distribution in y:
Classif
211    195821
213     89215
216     69089
232     53663
214     31153
290     24290
215     24001
250     23366
222     22901
240     18351
218     16124
221     14620
230      6360
212      4711
223      3780
219      3446
231      1975
233      1744
Name: count, dtype: int64
Saved training feature list to /home/jupyter-amantay/EU/model/RF_feature_names.json


In [18]:
pipeline = Pipeline([('RFclf', RandomForestClassifier(n_jobs=-1, random_state=42))])

# Define the hyperparameter search space for the Random Forest
param_dist = {
    'RFclf__n_estimators': [300, 500, 800, 1000],        # number of trees
    'RFclf__max_features': ['sqrt', 'log2', None],       # features to consider at split ('None' = all features)
    'RFclf__max_depth': [10, 20, 30, None],              # maximum tree depth (None means no limit)
    'RFclf__min_samples_split': [2, 5, 10],              # minimum samples required to split an internal node
    'RFclf__min_samples_leaf': [1, 2, 5, 10],            # minimum samples required in a leaf node
    'RFclf__bootstrap': [True, False],                   # whether to bootstrap samples (True is default in RF)
    'RFclf__criterion': ['gini', 'entropy']              # splitting criterion
}


random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=50,            # number of parameter settings to sample (adjust for thoroughness vs speed)
    cv=3,                 # 3-fold cross-validation
    random_state=42,      # for reproducibility of the search
    n_jobs=-1,            # parallelize across cores
    verbose=1             # print progress messages
)

# light version
# param_dist = {
#     'RFclf__n_estimators': [200, 300, 400],
#     'RFclf__max_features': ['sqrt', 'log2'],
#     'RFclf__max_depth': [10, 20, None],
#     'RFclf__min_samples_split': [2, 5, 10],
#     'RFclf__min_samples_leaf': [1, 2, 4],
#     'RFclf__bootstrap': [True],
#     'RFclf__max_samples': [0.6, 0.7, 0.8, 0.9],
#     'RFclf__criterion': ['gini'],
#     'RFclf__n_jobs': [-1],
# }

# random_search = RandomizedSearchCV(
#     estimator=pipeline,
#     param_distributions=param_dist,
#     n_iter=25,
#     cv=3,
#     random_state=42,
#     n_jobs=1,                  
#     pre_dispatch='2*n_jobs',
#     return_train_score=False,
#     verbose=1
# )
print("Starting RandomizedSearchCV for hyperparameter tuning...")
random_search.fit(X, y)
print(f"Hyperparameter tuning completed. Best score: {random_search.best_score_:.4f}")
print("Best parameters found:", random_search.best_params_)


Starting RandomizedSearchCV for hyperparameter tuning...
Fitting 3 folds for each of 50 candidates, totalling 150 fits




TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}
Detailed tracebacks of the workers should have been printed to stderr in the executor process if faulthandler was not disabled.

In [None]:
# Save 
search_results_path = MODELS_DIR / "RF_random_search_results_bigger.pkl"
with open(search_results_path, "wb") as f:
    pickle.dump(random_search, f)
print(f"Saved RandomizedSearchCV object to {search_results_path}")

best_model = random_search.best_estimator_
best_model_path = MODELS_DIR / "RF_best_model_bigger.pkl"
with open(best_model_path, "wb") as f:
    pickle.dump(best_model, f)
print(f"Saved best Random Forest model to {best_model_path}")


In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
model = best_model.named_steps['RFclf'] if isinstance(best_model, Pipeline) else best_model
y_pred = model.predict(X_val)
print("Accuracy on validation set: {:.2f}%".format(100 * (y_pred == y_val).mean()))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))


In [None]:
rf_model = model  # the RandomForestClassifier from the pipeline
importances = rf_model.feature_importances_
feature_names = X.columns
top_indices = np.argsort(importances)[::-1][:10]  # indices of top 10 important features
print("Top 10 important features:")
for idx in top_indices:
    print(f"{feature_names[idx]}: {importances[idx]:.4f}")


In [None]:
pip install ee

In [None]:
import ee