## Random Forest: [conifers, broadleaf] for all weather

### Imports

In [1]:
import importlib
import os
import re

import joblib
import numpy as np
import pandas as pd
import yaml

import sklearn.ensemble
import sklearn.inspection
import sklearn.metrics
import sklearn.model_selection

import findatree.io as io
import findatree.descriptions as descriptions

# Lookup dictionaries provided by findatree.descriptions:
# species id -> species name, and species name -> species id.
# NOTE(review): the species id appears to be what the `ba` column holds
# (df.ba is passed as species ids further below) -- confirm against findatree.
species_id_to_name = descriptions.species_id_to_name()
species_name_to_id = descriptions.species_name_to_id()

### Definitions

In [23]:
# Directory containing the processed hdf5 files (tnr%.hdf5) that the crown features are read from
dir_hdf5 = r"C:\Data\lwf\processed\2021\hdf5"

# Path to the digitized flight log (csv); it is read with ';' as separator in the load cell
path_log = r"C:\Data\lwf\Flugbuch_WZE-2021_digitalisiert.csv"

# Directory in which parameters, dataset and the fitted grid search are saved
dir_sklearn = r"C:\Data\lwf\analysis\221029_random_forest\sklearn\v01\2021"

# File names used when saving into dir_sklearn
save_name_params = 'params.yaml'
save_name_gridcv = 'grid.joblib'
save_name_dataset = 'dataset.joblib'
# File names for the permutation results. They are only referenced by the
# commented-out permutation cells, hence disabled here as well.
# save_name_permutation_test_score = 'permutation_test_score.joblib'
# save_name_permutation_feature_importance = 'permutation_feature_importance.joblib'

### Load

#### Load features and logbook

In [24]:
# Re-import findatree.io so that edits to the module are picked up without a kernel restart
importlib.reload(io)

# Crown features of all hdf5 files in dir_hdf5 (crowns_type 'crowns_human'),
# returned together with the accompanying parameters
df_original, params_df = io.allhdf5s_crowns_features_to_dataframe(
    dir_hdf5,
    crowns_type='crowns_human',
)

# Flight logbook: semicolon separated csv with the column names in the first row
log = pd.read_csv(
    path_log,
    sep=';',
    header=0,
)

#### Assign weather conditions

In [25]:
# Build the lookup tnr -> weather code from the logbook columns
tnrs = log.Traktnummer.values
weathers = log['Wetter-Code'].values
tnr_to_weather = dict(zip(tnrs, weathers))

# Attach the weather code of its tnr to every crown.
# A tnr that is missing in the logbook raises a KeyError here.
df_original['weather'] = [tnr_to_weather[crown_tnr] for crown_tnr in df_original.tnr]

# Weather codes and the names used in the summary below
weather_code_names = ((0, 'sunny'), (1, 'cloudy'), (2, 'mixed'))

# Number of distinct tnrs per weather condition
for weather_code, weather_name in weather_code_names:
    tnrs_of_weather = df_original[df_original.weather == weather_code].tnr
    print(f"#tnrs = {len(np.unique(tnrs_of_weather))} -> {weather_name}")
print()

# Number of crowns per weather condition
for weather_code, weather_name in weather_code_names:
    print(f"#crowns = {np.sum(df_original.weather == weather_code)} -> {weather_name}")


#tnrs = 46 -> sunny
#tnrs = 75 -> cloudy
#tnrs = 21 -> mixed

#crowns = 1415 -> sunny
#crowns = 1942 -> cloudy
#crowns = 651 -> mixed


#### Clean-up of features
* dtype and NaNs
* Value related

In [26]:
# Work on a copy so that df_original stays untouched and this cell can be re-run
df = df_original.copy()
print(f"{r'#crowns'} = {len(df)}, original")

#### dtype and NaNs
# Coerce bhd_2020 and bk to numeric values; entries that cannot be parsed become NaN.
# bhd_2020 can be cast to float32 right away since floats can represent NaN.
df = df.assign(
    bhd_2020=pd.to_numeric(df.bhd_2020, errors='coerce').astype(np.float32),
    bk=pd.to_numeric(df.bk, errors='coerce'),
)

# Drop NaN containing rows
df = df.dropna(axis=0, how='any')
print(f"{r'#crowns'} = {len(df)}, after removal (NaNs, i.e. completely shadowed or dead)")

# Cast bk to int32 only now: an integer column cannot hold NaN, so casting
# before the NaN removal fails as soon as a single bk entry was coerced to NaN.
df = df.astype({'bk': np.int32})


#### Value related
# Each query states which crowns are KEPT; they are applied one after another.
queries = [
    # Keep only crowns whose bk code is one of the listed values
    'bk in [0, 1, 210, 320, 330, 340]',
    # Keep kkl up to and including 3
    'kkl <= 3',
    # Keep crowns with area_bright / 0.2**2 above 10
    # NOTE(review): 0.2 is presumably the pixel size, i.e. more than 10 bright pixels -- confirm
    'area_bright / 0.2**2 > 10',
    # Keep perc5_ndre above -0.1 (note: the threshold is -0.1, not 0)
    'perc5_ndre > -1e-1',
]

for query_str in queries:
    df = df.query(query_str)
    print(f"{r'#crowns'} = {len(df)}, after query ({query_str})")

#crowns = 4008, original
#crowns = 3967, after removal (NaNs, i.e. completely shadowed or dead)
#crowns = 3908, after query (bk in [0, 1, 210, 320, 330, 340])
#crowns = 3907, after query (kkl <= 3)
#crowns = 3876, after query (area_bright / 0.2**2 > 10)
#crowns = 3876, after query (perc5_ndre > -1e-1)


#### Assign classes: Conifers and Broadleaf

In [27]:
# Pick up changes to findatree.descriptions without restarting the kernel
importlib.reload(descriptions)

# Family names and the species-name patterns defining them (same order in both lists)
family_names = ['conifers', 'broadleaf']
family_patterns = [
    'kiefer|fichte|tanne|douglasie|lärche',
    'buche|eiche|ahorn|erle|birke|esche',
]

# Group species into families and translate each crown's species id (ba) into a family id
families = descriptions.species_groupby_families(family_patterns, family_names)
family_ids = descriptions.species_id_to_family_id(df.ba.values, families)

# Store the family id of each crown as its class in the column `label`
df = df.assign(label=family_ids)

# Mapping label -> label name, saved later as part of the parameters
labels = {
    family_id: family['family_name']
    for family_id, family in families.items()
}

# Tabular summary: number of crowns per label
header = f"{'label':<15}|{'label_name':<15}|{'label_count'}"
print(header)
print('-' * 50)
for label, label_name in labels.items():
    label_count = np.sum(df.label == label)
    print(f"{label:<15}|{label_name:<15}|{label_count}")

label          |label_name     |label_count
--------------------------------------------------
0              |conifers       |2921
1              |broadleaf      |916


### Infos

#### Search: Pattern in column names

In [None]:
# Case-insensitive regular expression matched against the start of each column name
pattern = '^x_|^y'

# Collect the matching column names first, then print them one per line
cols = list(df.columns)
matching_cols = [name for name in cols if re.search(pattern, name, re.IGNORECASE) is not None]
for col in matching_cols:
    print(col)

#### Info: Dataset

In [28]:
# Pick up changes to findatree.descriptions without restarting the kernel
importlib.reload(descriptions)

# Print the dataset summary from the tnr, species id (ba) and label of every crown
# together with the family definitions
descriptions.print_summary(df.tnr.values, df.ba.values, df.label.values, families)

Total number of crowns        : 3876
Mean number of crowns per tnr : 27.3
__________________________________________________

species_id| species_name                  | count
--------------------------------------------------
       134| Gemeine Kiefer                : 1391
       118| Gemeine Fichte                : 1185
        20| Rotbuche                      : 443
       100| Weißtanne                     : 179
        48| Traubeneiche                  : 174
        51| Stieleiche                    : 129
       116| Europäische Lärche            : 121
        10| Gemeine Birke                 : 49
        22| Gemeine Esche                 : 44
       136| Douglasie                     : 35
         7| Schwarzerle                   : 28
         5| Bergahorn                     : 18
        13| Hainbuche                     : 17
        36| Kirsche                       : 14
        35| Aspe                          : 9
        53| Roteiche                      : 9
       129| Sc

### Classification

#### Overall parameters

In [29]:
# Parameters shared by the cells below; saved alongside the results for provenance.
params = {
    # Mapping label -> label name
    'labels': labels,
    # Fraction of samples held out, used for the train-test split and for each CV split
    'test_size': 0.25,
    # Number of cross-validation splits in the grid search
    'cv_splits': 5,
    # Scoring used to select the best estimator in the grid search
    'scoring': 'balanced_accuracy',
    # Number of permutations for the permutation test score
    'n_permutations': 20,
    # Number of repeats and sample fraction for the permutation feature importance
    'n_repeats': 10,
    'max_samples': 0.5,
    # Seed for all random steps, recorded so that results can be reproduced
    'random_state': 42,
}

# Make sure the output directory exists before anything is written into it
os.makedirs(dir_sklearn, exist_ok=True)

# Save parameters
io.list_of_dicts_to_yaml(os.path.join(dir_sklearn, save_name_params), [params])

#### Define: Features and labels
* Define feature columns: Exclude terrestrial, identifier and coordinate columns
* Define samples: Exclude unassigned samples, based on labels
* Define features, labels and extended labels

Create extended labels `y_extend` with information about `['label', 'tnr', 'id', 'ba', 'sst', 'nbv', 'weather']`, to check later whether the classification depends on these.

In [30]:
#### Define feature columns

# Terrestrial feature names to be excluded (taken from the first entry of params_df)
first_params_key = list(params_df.keys())[0]
terr_names = list(params_df[first_params_key]['features_terrestrial_names'])

# Identifier names to be excluded
ident_names = ['label', 'tnr', 'id', 'weather']

# Pattern of coordinate columns to be excluded
coordinate_pattern = '^x_|^y_'

# Keep every dataset column that is neither terrestrial, nor an identifier, nor a coordinate
x_names = [
    col for col in df.columns
    if not (
        col in terr_names
        or col in ident_names
        or re.search(coordinate_pattern, col) is not None
    )
]


#### Define samples: Exclude unassigned samples, based on labels
samples_include = df.label >= 0


#### Define features, labels and extended labels
# Names of the label column and of the extended label columns
y_names = ['label']
y_extend_names = ['label', 'tnr', 'id', 'ba', 'sst', 'nbv', 'weather']

# Features -> x, labels -> y, extended labels -> y_extend (included samples only)
x = df.loc[samples_include, x_names].values
y = df.loc[samples_include, y_names[0]].values
y_extend = df.loc[samples_include, y_extend_names].values


#### Print infos about features, labels and extended labels
print(f"x.shape: {x.shape}")
print(f"y.shape: {y.shape}")
print(f"y_extend.shape: {y_extend.shape}")
print(f"Label ratios in y:")
# Every label count is given relative to the count of label 0
count_label_0 = np.sum(y == 0)
for i in labels:
    print(f"{' '*4}label[{i}]/label[0] = {np.sum(y == i) / count_label_0:.2f}")

x.shape: (3837, 107)
y.shape: (3837,)
y_extend.shape: (3837, 7)
Label ratios in y:
    label[0]/label[0] = 1.00
    label[1]/label[0] = 0.31


#### Train-test split

In [31]:
# Seed of the split: without it every run of this cell produces a different
# train/test partition and the saved dataset cannot be reproduced.
# Falls back to 42 if params carries no 'random_state' entry.
split_seed = params.get('random_state', 42)

# Stratified, shuffled split of features, labels and extended labels (same rows in all three)
x_train, x_test, y_train, y_test, y_extend_train, y_extend_test = sklearn.model_selection.train_test_split(
    x,
    y,
    y_extend,
    test_size=params['test_size'],
    shuffle=True,
    stratify=y,
    random_state=split_seed,
)

# Everything needed to reload exactly this split later on
dataset = {
    'x_names': x_names,
    'y_names': y_names,
    'y_extend_names': y_extend_names,
    'x_train': x_train,
    'x_test': x_test,
    'y_train': y_train,
    'y_test': y_test,
    'y_extend_train': y_extend_train,
    'y_extend_test': y_extend_test,
    'random_state': split_seed,
}

# Make sure the output directory exists before writing
os.makedirs(dir_sklearn, exist_ok=True)

# Save dataset
joblib.dump(dataset, os.path.join(dir_sklearn, save_name_dataset))

['C:\\Data\\lwf\\analysis\\221029_random_forest\\sklearn\\v01\\2021\\dataset.joblib']

#### Grid Search

In [32]:
# Seed shared by the random forest and the CV splitter so that the grid search
# (and hence the selected best estimator) is reproducible.
# Falls back to 42 if params carries no 'random_state' entry.
seed = params.get('random_state', 42)

# Hyperparameter grid of the random forest
params_grd = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [50, 100, 200],
    'min_samples_leaf': [1, 3, 5],
    'max_samples': [0.75, 0.5]
}

# Exhaustive search over the grid using stratified shuffle splits of the training data
grd = sklearn.model_selection.GridSearchCV(
    sklearn.ensemble.RandomForestClassifier(n_jobs=-1, random_state=seed),
    param_grid=params_grd,
    scoring=params['scoring'],
    cv=sklearn.model_selection.StratifiedShuffleSplit(
        n_splits=params['cv_splits'],
        test_size=params['test_size'],
        random_state=seed,
    ),
)

grd.fit(x_train, y_train)

# Make sure the output directory exists before writing
os.makedirs(dir_sklearn, exist_ok=True)

# Save grid
joblib.dump(grd, os.path.join(dir_sklearn, save_name_gridcv))

#### Info about grid search

# Get best estimator prediction for x_test (held-out data, not seen during the search)
y_test_pred = grd.best_estimator_.predict(x_test)

# Best estimator
print(f"Params: {grd.best_params_}")
print(f"Scoring: {grd.scoring}")
print(f"Test score: {sklearn.metrics.accuracy_score(y_test, y_test_pred):.2f} (accuracy)")
print(f"Test score: {sklearn.metrics.balanced_accuracy_score(y_test, y_test_pred):.2f} (balanced_accuracy)")

Params: {'criterion': 'entropy', 'max_samples': 0.75, 'min_samples_leaf': 1, 'n_estimators': 200}
Scoring: balanced_accuracy
Test score: 0.97 (accuracy)
Test score: 0.94 (balanced_accuracy)


#### Best Estimator: Permutation Test Score

In [11]:
# NOTE(review): This cell is disabled; everything below is commented out.
# Before re-enabling it:
#   * `params` as defined in this notebook has no 'shuffle' key, so
#     `params['shuffle']` would raise a KeyError.
#   * `save_name_permutation_test_score` is commented out in the definitions cell.
# score, permutation_score, pvalue = sklearn.model_selection.permutation_test_score(
#     grd.best_estimator_,
#     x_train,
#     y_train,
#     cv=sklearn.model_selection.KFold(n_splits=params['cv_splits'], shuffle=params['shuffle']),
#     n_permutations=params['n_permutations'],
#     n_jobs=-1,
# )

# # Add result to params
# permutation_test_score = {
#     'permutation_scores':  permutation_score,
#     'test_score': score,
#     'pvalue': pvalue,
# }

# # Save permutation_test_score
# joblib.dump(permutation_test_score, os.path.join(dir_sklearn, save_name_permutation_test_score))

: 

: 

#### Best Estimator: Permutation Feature Importance

In [61]:
# NOTE(review): This cell is disabled; everything below is commented out.
# Before re-enabling it:
#   * The result is named `perm_imp_train` although it is computed on
#     x_test / y_test (and stored with 'on': 'test') -- the name is misleading.
#   * `save_name_permutation_feature_importance` is commented out in the definitions cell.
# perm_imp_train = sklearn.inspection.permutation_importance(
#     grd.best_estimator_,
#     x_test,
#     y_test,
#     n_repeats=params['n_repeats'],
#     max_samples=params['max_samples'],
# )

# # Add result to params
# permutation_feature_importance = {
#     'on': 'test',
#     'importances_mean': perm_imp_train['importances_mean'],
#     'importances_std': perm_imp_train['importances_std'],
#     'importances': perm_imp_train['importances'],
# }

# # Save permutation feature importances
# joblib.dump(permutation_feature_importance, os.path.join(dir_sklearn, save_name_permutation_feature_importance))

['C:\\Data\\lwf\\analysis\\220830_random-forrest\\sklearn\\v01\\permutation_feature_importance.joblib']

### Load: Previous results

In [62]:
# NOTE(review): Disabled helper cell. Uncomment to restore the fitted grid search
# and the parameters from dir_sklearn instead of recomputing them.
# joblib.load unpickles the file, so only load files from a trusted source.
# load_name = save_name_gridcv
# grd = joblib.load(os.path.join(dir_sklearn, load_name))

# load_name = save_name_params
# with open(os.path.join(dir_sklearn, load_name), "r") as f:
#     params = yaml.safe_load(f)