### Imports

In [1]:
import os
import joblib
import numpy as np
import pandas as pd
import importlib
import re
import yaml

import sklearn.ensemble 
import sklearn.model_selection
import sklearn.inspection

import findatree.io as io
import findatree.descriptions as descriptions

# Lookup tables between species id (the values of column `ba`) and species name, in both directions
species_id_to_name = descriptions.species_id_to_name()
species_name_to_id = descriptions.species_name_to_id()

### Definitions

In [4]:
# Directory containing the processed `tnr*.hdf5` files
dir_hdf5 = r"C:\Data\lwf\processed\2020\hdf5"

# Path to the digitized flight log (CSV, read with ';' as separator)
path_log = r"C:\Data\lwf\Flugbuch_WZE-2020_digitalisiert.csv"

# Directory into which all sklearn results of this analysis are written
dir_sklearn = r"C:\Data\lwf\analysis\221029_random_forest\sklearn\v01"

# File names (joined with `dir_sklearn`) under which the individual results are saved
save_name_params = 'params.yaml'
save_name_gridcv = 'grid.joblib'
save_name_dataset = 'dataset.joblib'
save_name_permutation_test_score = 'permutation_test_score.joblib'
save_name_permutation_feature_importance = 'permutation_feature_importance.joblib'

### Load

#### Load  features and logbook

In [5]:
# Reload the module so that edits to findatree.io are picked up without restarting the kernel
importlib.reload(io)

# Load the features of the 'crowns_human' crowns from the HDF5 files in `dir_hdf5`.
# `params_df` is returned alongside the DataFrame; its contents are not visible here (TODO confirm).
df_original, params_df = io.allhdf5s_crowns_features_to_dataframe(dir_hdf5, crowns_type='crowns_human')

# Load the flight logbook; the CSV is ';'-separated and its first row holds the column names
log = pd.read_csv(path_log, sep=';', header=0)

####  Clean-up of features

In [7]:
# Work on a copy so that `df_original` stays untouched and this cell can be re-run
df = df_original.copy()
print(f"{r'#crowns'} = {len(df)} (original)")

# Coerce bhd_2020 and bk to numeric; entries that cannot be parsed become NaN.
# bk is deliberately NOT cast to an integer dtype yet: an integer column cannot
# hold NaN, so casting before the NaN rows are dropped would raise.
df = df.assign(
    bhd_2020=pd.to_numeric(df.bhd_2020, errors='coerce').astype(np.float32),
    bk=pd.to_numeric(df.bk, errors='coerce'),
)

# Drop every row that contains at least one NaN
df = df.dropna(axis=0, how='any')
print(f"{r'#crowns'} = {len(df)} (after removal: NaNs,  i.e. completely shadowed or dead)")

# Convert bk column to int32 dtype (safe now, no NaNs are left)
df = df.astype({'bk': np.int32})

# Keep only crowns with bk <= 1
df = df[df.bk <= 1]
print(f"{r'#crowns'} = {len(df)} (after removal: bk > 1)")

# Keep only crowns with kkl <= 3
df = df[df.kkl <= 3]
print(f"{r'#crowns'} = {len(df)} (after removal: kkl > 3)")

# Keep only crowns with more than 10 bright pixels.
# area_bright / 0.2**2 turns the bright area into a pixel count
# (presumably 0.2 is the pixel edge length -- TODO confirm).
df = df[df.area_bright / 0.2**2 > 10]
print(f"{r'#crowns'} = {len(df)} (after removal: #(bright pixels) <= 10)")

# Keep only crowns with perc5_ndre > -0.1, i.e. drop perc5_ndre <= -0.1.
# NOTE: the printed label below says "< 1e-1"; it is left unchanged so the
# log stays comparable with earlier runs, the threshold actually applied is -1e-1.
df = df[df.perc5_ndre > -1e-1]
print(f"{r'#crowns'} = {len(df)} (after removal: perc5_ndre < 1e-1)")


#crowns = 4254 (original)
#crowns = 4158 (after removal: NaNs,  i.e. completely shadowed or dead)
#crowns = 4127 (after removal: bk > 1)
#crowns = 4127 (after removal: kkl > 3)
#crowns = 4089 (after removal: #(bright pixels) <= 10)
#crowns = 4067 (after removal: perc5_ndre < 1e-1)


#### Assign weather conditions

In [8]:
# Map every tnr listed in the flight log to its weather code
tnrs = log.Traktnummer.values
weathers = log['Wetter-Code'].values
tnr_to_weather = dict(zip(tnrs, weathers))

# Attach the weather code of its tnr to every crown (raises KeyError for a tnr missing in the log)
df['weather'] = [tnr_to_weather[tnr] for tnr in df.tnr]

# Weather codes and the labels used in the report below
weather_labels = ((0, 'sunny'), (1, 'cloudy'), (2, 'mixed'))

# Number of distinct tnrs per weather condition
for code, label in weather_labels:
    tnrs_with_code = df[df.weather == code].tnr
    print(f"#tnrs = {len(np.unique(tnrs_with_code))} -> {label}")
print()

# Number of crowns per weather condition
for code, label in weather_labels:
    print(f"#crowns = {np.sum(df.weather == code)} -> {label}")


#tnrs = 81 -> sunny
#tnrs = 47 -> cloudy
#tnrs = 22 -> mixed

#crowns = 2125 -> sunny
#crowns = 1293 -> cloudy
#crowns = 649 -> mixed


#### Assign families: Conifers and Broadleaf

In [9]:
# Reload the module so that edits to findatree.descriptions are picked up
importlib.reload(descriptions)

# Family name -> regex pattern selecting the species that belong to it
family_definitions = {
    'conifers': 'kiefer|fichte|tanne|douglasie|lärche',
    'broadleaf': 'buche|eiche|ahorn|erle|birke|esche',
}
family_patterns = list(family_definitions.values())
family_names = list(family_definitions.keys())

# Group the species into families and translate each crown's species id (`ba`) to a family id
families = descriptions.species_groupby_families(family_patterns, family_names)
family_ids = descriptions.species_id_to_family_id(df.ba.values, families)

# Store the family id as a new column
df = df.assign(family=family_ids)

### Infos

#### Search: Pattern in column names

In [None]:
# Print every column name that matches `pattern` (case-insensitive)
pattern = '^x_|^y'
column_regex = re.compile(pattern, re.IGNORECASE)

cols = list(df.columns)
for col in cols:
    if column_regex.search(col) is not None:
        print(col)

#### Info: Dataset

In [10]:
# Reload the module so that edits to findatree.descriptions are picked up
importlib.reload(descriptions)

# Per-crown tnr, species id and family id, plus the family definitions
summary_inputs = (
    df.tnr.values,
    df.ba.values,
    df.family.values,
    families,
)
descriptions.print_summary(*summary_inputs)

Total number of crowns        : 4067
Mean number of crowns per tnr : 27.1
__________________________________________________

species_id| species_name                  | count
--------------------------------------------------
       134| Gemeine Kiefer                : 1424
       118| Gemeine Fichte                : 1215
        20| Rotbuche                      : 489
        48| Traubeneiche                  : 205
       100| Weißtanne                     : 192
        51| Stieleiche                    : 131
       116| Europäische Lärche            : 115
        22| Gemeine Esche                 : 53
        10| Gemeine Birke                 : 45
       136| Douglasie                     : 35
         5| Bergahorn                     : 32
         7| Schwarzerle                   : 27
       129| Schwarzkiefer                 : 22
        13| Hainbuche                     : 18
        36| Kirsche                       : 15
       117| Japanische Lärche             : 10
        53| 

### Classification

#### Overall parameters

In [12]:
# Parameters shared by the split, the grid search and the permutation analyses
params = dict(
    classes=families,
    test_size=0.25,
    cv_splits=5,
    shuffle=True,
    scoring='accuracy',
    n_permutations=20,
    n_repeats=10,
    max_samples=0.5,
)

# Persist the parameters next to the other results
path_params = os.path.join(dir_sklearn, save_name_params)
io.list_of_dicts_to_yaml(path_params, [params])

#### Define: Features and labels
* Exclude terrestrial features
* Exclude coordinates
* Exclude non-family members

Create extended labels `y_extend` with information about `['family', 'tnr', 'id', 'ba', 'sst', 'nbv']`, to check later whether the classification depends on these.

In [14]:
# Terrestrial and identification columns that must not be used as features
terr_names = [
    'id', 'tnr', 'family',
    'enr', 'bnr', 'ba', 'bhd_2020',
    'alter_2020', 'bk', 'kkl', 'nbv',
    'sst', 'gilb', 'kommentar', 'sicherheit',
]

# Regex matching the coordinate columns, which are excluded as well
coordinate_pattern = '^x_|^y_'

# Feature names: every column that is neither terrestrial/identification nor a coordinate
x_names = [
    col
    for col in df.columns
    if col not in terr_names and re.search(coordinate_pattern, col) is None
]

# Only crowns with a family id >= 0 take part in the classification
is_family_member = df.family >= 0

# Features -> x
x = df.loc[is_family_member, x_names].values

# Labels -> y
y_names = ['family']
y = df.loc[is_family_member, y_names[0]].values

# Extended labels -> y_extend
y_extend_names = ['family', 'tnr', 'id', 'ba', 'sst', 'nbv']
y_extend = df.loc[is_family_member, y_extend_names].values

# Report the shapes and the class balance
n_class0 = np.sum(y == 0)
n_class1 = np.sum(y == 1)
print(f"x.shape: {x.shape}")
print(f"y.shape: {y.shape}")
print(f"y_extend.shape: {y_extend.shape}")
print(f"Ratio in y: [#class=0]/[#class=1]: {n_class0 / n_class1:.1f}")

x.shape: (4028, 108)
y.shape: (4028,)
y_extend.shape: (4028, 6)
Ratio in y: [#class=0]/[#class=1]: 3.0


#### Train-test split

In [15]:
# Seed of the split. Without a fixed seed every run produces a different split,
# so the saved dataset would no longer match previously saved models.
# Taken from params['random_state'] when present, otherwise a fixed default.
split_seed = params.get('random_state', 42)

# Stratified split: features, labels and extended labels are split consistently
x_train, x_test, y_train, y_test, y_extend_train, y_extend_test = sklearn.model_selection.train_test_split(
    x,
    y,
    y_extend,
    test_size=params['test_size'],
    shuffle=True,
    stratify=y,
    random_state=split_seed,
)

# Bundle names and arrays so that later analyses can be run from the saved file alone
dataset = {
    'x_names': x_names,
    'y_names': y_names,
    'y_extend_names': y_extend_names,
    'x_train': x_train,
    'x_test': x_test,
    'y_train': y_train,
    'y_test': y_test,
    'y_extend_train': y_extend_train,
    'y_extend_test': y_extend_test,
}

# Save dataset
joblib.dump(dataset, os.path.join(dir_sklearn, save_name_dataset))

['C:\\Data\\lwf\\analysis\\221029_random_forest\\sklearn\\v01\\dataset.joblib']

#### Grid Search

In [16]:
# Hyper-parameter grid of the random forest
params_grd = {
    'criterion': ['gini', 'entropy'],
    'n_estimators': [50, 100, 200],
    'min_samples_leaf': [1, 3, 5],
    'max_samples': [0.75, 0.5]
}

# Cross-validated grid search on the training set.
# refit=True (the default, spelled out here) retrains the best parameter
# combination on the complete training set, so no manual re-fit is needed.
grd = sklearn.model_selection.GridSearchCV(
    sklearn.ensemble.RandomForestClassifier(n_jobs=-1),
    param_grid=params_grd,
    scoring=params['scoring'],
    cv=sklearn.model_selection.KFold(n_splits=params['cv_splits'], shuffle=params['shuffle']),
    refit=True,
)
grd.fit(x_train, y_train)

# Best estimator, already fitted on the complete training set by the refit step
rfc = grd.best_estimator_

# Save gridcv
joblib.dump(grd, os.path.join(dir_sklearn, save_name_gridcv))

# Report the best parameters and the score of the best estimator on the held-out test set
print(f"grd.best_params_: {grd.best_params_}")
print(f"grd.best_estimator_.score(x_test, y_test): {grd.best_estimator_.score(x_test, y_test):.3f}")

grd.best_params_: {'criterion': 'entropy', 'max_samples': 0.75, 'min_samples_leaf': 1, 'n_estimators': 100}
grd.best_estimator_.score(x_test, y_test): 0.930


#### Best Estimator: Permutation Test Score

In [11]:
# Permutation test of the best estimator on the training set.
# `scoring` is passed explicitly so that the test uses the same metric as the
# grid search; otherwise it silently falls back to the estimator's default scorer.
score, permutation_score, pvalue = sklearn.model_selection.permutation_test_score(
    grd.best_estimator_,
    x_train,
    y_train,
    scoring=params['scoring'],
    cv=sklearn.model_selection.KFold(n_splits=params['cv_splits'], shuffle=params['shuffle']),
    n_permutations=params['n_permutations'],
    n_jobs=-1,
)

# Collect the results: scores on permuted labels, score on true labels, and the p-value
permutation_test_score = {
    'permutation_scores':  permutation_score,
    'test_score': score,
    'pvalue': pvalue,
}

# Save permutation_test_score
joblib.dump(permutation_test_score, os.path.join(dir_sklearn, save_name_permutation_test_score))

: 

: 

#### Best Estimator: Permutation Feature Importance

In [61]:
# Permutation feature importance of the best estimator.
# NOTE: despite its name, `perm_imp_train` is computed on the TEST split
# (see 'on': 'test' below); the name is kept so that later cells keep working.
# `scoring` is passed explicitly so that the importance is measured with the
# same metric as the grid search instead of the estimator's default scorer.
perm_imp_train = sklearn.inspection.permutation_importance(
    grd.best_estimator_,
    x_test,
    y_test,
    scoring=params['scoring'],
    n_repeats=params['n_repeats'],
    max_samples=params['max_samples'],
)

# Collect the results together with the split they were computed on
permutation_feature_importance = {
    'on': 'test',
    'importances_mean': perm_imp_train['importances_mean'],
    'importances_std': perm_imp_train['importances_std'],
    'importances': perm_imp_train['importances'],
}

# Save permutation feature importances
joblib.dump(permutation_feature_importance, os.path.join(dir_sklearn, save_name_permutation_feature_importance))

['C:\\Data\\lwf\\analysis\\220830_random-forrest\\sklearn\\v01\\permutation_feature_importance.joblib']

### Load: Previous results

In [62]:
# Restore the saved grid search object
load_name = save_name_gridcv
path_gridcv = os.path.join(dir_sklearn, load_name)
grd = joblib.load(path_gridcv)

# Restore the saved parameters.
# NOTE(review): the file was written from a one-element list of dicts -- confirm
# whether `params` comes back as a list or as a dict.
load_name = save_name_params
path_params = os.path.join(dir_sklearn, load_name)
with open(path_params, "r") as f:
    params = yaml.safe_load(f)