In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
from collections import namedtuple
import pickle
from decimal import Decimal as D
from decimal import getcontext, ROUND_HALF_UP
import datetime
from time import time
import uuid
import re

import tensorflow as tf

from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from lightgbm import LGBMRegressor

from boruta import BorutaPy

from util_paths import SHARED_FOLDER_PATH
from util_paths import DATASETS_PATH
from util_paths import FEATURES_SELECTED_PATH
from util_paths import EXPERIMENTS_PATH

from models import *
from utils_fvs_sampling_save import *

# ANSI escape sequences used to colour/emphasise the progress output printed below.
# Put one (or several) in front of the text and finish with END to reset the styling.
PURPLE = '\033[95m'
CYAN = '\033[96m'
DARKCYAN = '\033[36m'
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
END = '\033[0m'  # resets all attributes set by the codes above

### Run experiments

In [2]:
def class_metrics(y_test, y_hat, err_tol=0.5):
    """Score regression predictions as a classification problem with an error tolerance.

    The true values are rounded to the nearest integer to obtain the true classes.
    Each prediction is then mapped to a class:

    * within ``err_tol`` of the true value -> the true class (a hit);
    * otherwise -> its own rounded value. If that rounded value coincides with
      the true class although the prediction is outside the tolerance, it is
      moved one class away (up when the prediction is above the true value,
      down otherwise) so that it is still counted as a miss.

    Args:
        y_test: Array-like of true target values.
        y_hat: Array-like of predictions, positionally aligned with ``y_test``.
        err_tol: Largest absolute error for which a prediction counts as correct.

    Returns:
        Tuple ``(accuracy, weighted F1)`` computed with ``sklearn.metrics``.
    """
    # Flatten both inputs so that column-shaped predictions, e.g. (n, 1), are
    # compared element by element instead of being broadcast against y_test.
    y_test = np.asarray(y_test).reshape(-1)
    y_hat = np.asarray(y_hat).reshape(-1)

    y_test_class = y_test.round().astype(int)

    within_tol = np.abs(y_test - y_hat) <= err_tol
    rounded_hat = np.round(y_hat)

    # Predictions outside the tolerance that nevertheless round to the true class.
    false_hit = ~within_tol & (rounded_hat == y_test_class)
    push = np.where(y_test < y_hat, 1, -1)

    y_hat_class = np.where(
        within_tol,
        y_test_class,
        rounded_hat + false_hit * push,
    ).astype(float)

    acc = metrics.accuracy_score(y_test_class, y_hat_class)
    f1 = metrics.f1_score(y_test_class, y_hat_class, average='weighted')

    return acc, f1
    

In [8]:
def get_X_y(year, dataset, fv_type, selected_features):
    """Split ``dataset`` into the feature columns for ``fv_type`` and the target.

    Args:
        year: Key into ``selected_features``; only read for
            ``FeatureVector.FeatureImportance``.
        dataset: DataFrame holding the feature columns and a ``harvest`` column.
        fv_type: ``FeatureVector`` member choosing which columns are kept.
        selected_features: Mapping ``year -> {"all": [column, ...]}``; only read
            for ``FeatureVector.FeatureImportance``.

    Returns:
        Tuple ``(X, y_reg)``: the selected columns and ``dataset.harvest``.

    Raises:
        ValueError: If ``fv_type`` is none of the handled feature vector types.
    """
    def matching(candidates, predicate):
        # Keep the candidates accepted by the predicate, preserving their order.
        return [name for name in candidates if predicate(name)]

    if fv_type == FeatureVector.All:
        cols = matching(dataset.columns, is_in_all)
    elif fv_type == FeatureVector.FeatureImportance:
        # Uses the "all" list of the selection for this year.
        cols = matching(selected_features[year]["all"], is_in_all)
    elif fv_type == FeatureVector.VH_VV:
        cols = matching(dataset.columns, is_vh_vv)
    elif fv_type == FeatureVector.Weather:
        cols = matching(dataset.columns, is_weather)
    elif fv_type == FeatureVector.Topology:
        cols = matching(dataset.columns, is_topology)
    elif fv_type == FeatureVector.Height:
        cols = matching(dataset.columns, is_height)
    else:
        raise ValueError("Unknown Feature vector type!")

    features = dataset.loc[:, cols]
    target = dataset.harvest
    return features, target


import warnings
# Intended to hide a false-positive SettingWithCopyWarning (the dataframe in question
# does get updated). Note that this filter ignores EVERY warning issued from modules
# whose name starts with "pandas", not only that one.
warnings.filterwarnings("ignore", module="pandas")

def evaluate(
    year, 
    data,
    model_type,
    fv_type,
    sampling,
    n_folds=5,
    save_path="",
    selected_features=None,
    err_tol=0.5
):
    """Run shuffled k-fold cross-validation for one year, model and feature vector.

    Args:
        year: Year whose rows are selected from ``data``.
        data: DataFrame with a ``year`` column plus everything ``get_X_y`` needs.
        model_type: Model identifier passed to ``get_model``.
        fv_type: ``FeatureVector`` member passed to ``get_X_y``.
        sampling: Sampling whose ``name`` is used in the saved prediction file name.
        n_folds: Number of cross-validation folds.
        save_path: When non-empty, the predictions of the first fold are saved
            there, but only for LGBM regression on ``FeatureVector.All``.
        selected_features: Forwarded to ``get_X_y``.
        err_tol: Error tolerance forwarded to ``class_metrics``.

    Returns:
        ``Result`` holding the per-fold RMSE, accuracy and F1 arrays together
        with ``model_type``, ``fv_type`` and ``year``.
    """
    data = data[data["year"] == year].drop(["year"], axis=1)
    
    X, y_reg = get_X_y(year, data, fv_type, selected_features)
    
    # The split is shuffled without a fixed seed, so scores vary between runs.
    kf = KFold(n_splits=n_folds, shuffle=True)
    
    rmses = np.full(n_folds, np.inf)
    accs = np.zeros(n_folds)
    f1s = np.zeros(n_folds)
    
    print("Runnning kfold ...")
    start = time()
    have_saved = False
    
    for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
        # Explicit copies: normalize() is called for its side effect on these
        # frames (its return value is ignored), so they must not alias X.
        X_train, y_train = X.iloc[train_idx].copy(), y_reg.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx].copy(), y_reg.iloc[test_idx]
        
        assert not np.any(np.isnan(X_train)), "NaNs in X_train BEFORE normalization"
        assert not np.any(np.isnan(X_test)), "NaNs in X_test BEFORE normalization"
        
        normalize(X_train, X_test)

        assert not np.any(np.isnan(X_train)), "NaNs in X_train AFTER normalization"
        assert not np.any(np.isnan(X_test)), "NaNs in X_test AFTER normalization"

        model = get_model(X_train, y_train, model_type)
        y_hat = model.predict(X_test)
        # RMSE as sqrt(MSE): the `squared=False` flag is deprecated/removed in
        # recent scikit-learn releases, this form works on every version.
        rmses[fold] = np.sqrt(metrics.mean_squared_error(y_test, y_hat))
        
        if (
            save_path != "" 
            and model_type == ModelType.LGBM_Regression 
            and fv_type == FeatureVector.All 
            and not have_saved
        ):
            # Only the first fold's predictions are written.
            save(
                y_hat,
                save_path,
                f"y_hat_{year}_{sampling.name}",
                "npy"
            )
            have_saved = True
        
        # Convert to classification problem
        acc, f1 = class_metrics(y_test, y_hat, err_tol)
        accs[fold] = acc
        f1s[fold] = f1
        
    end = time()
    print(f"Elapsed time: {end-start}")
    return Result(rmses, accs, f1s, model_type, fv_type, year)
        
def finalize_result(res: Result):
    """Print the mean RMSE, accuracy and F1 of ``res`` and return them as ``AvgScore``.

    Args:
        res: Object exposing the per-fold ``rmses``, ``accs`` and ``f1s``.

    Returns:
        ``AvgScore(mean rmse, mean accuracy, mean f1)``.
    """
    print("Resutls")
    print("Means")

    rmse_mean = np.mean(res.rmses)
    print(f"{rmse_mean}, RMSE")
    acc_mean = np.mean(res.accs)
    print(f"{acc_mean}, ACCURACY")
    f1_mean = np.mean(res.f1s)
    print(f"{f1_mean}, F1-SCORE")

    return AvgScore(rmse_mean, acc_mean, f1_mean)

In [54]:
def run(data, models, fvs, sampling, run_name="", save_path="", selected_features=None, err_tol=0.5):
    """Evaluate every (year, model, feature vector) combination.

    Iterates over the module-level ``YEARS``. For each year the averaged scores
    are collected as ``{model: {feature_vector: score}}`` and, when ``save_path``
    is not the empty string, saved as ``res_<year>_<run_name>``.

    Args:
        data: Dataset forwarded to ``evaluate``.
        models: Iterable of model types.
        fvs: Iterable of feature vector types.
        sampling: Sampling forwarded to ``evaluate`` and printed in the header.
        run_name: Suffix of the per-year result file name.
        save_path: Output folder; ``""`` disables saving.
        selected_features: Forwarded to ``evaluate``.
        err_tol: Forwarded to ``evaluate``.

    Returns:
        ``{year: {model: {feature_vector: score}}}``.
    """
    indent = " " * 6
    scores_by_year = {}

    for year in YEARS:
        print(f"{RED}{BOLD}{UNDERLINE}{year}{END}: {sampling}")

        scores_by_model = {}
        for model in models:
            print(f"{indent}{DARKCYAN}{BOLD}{model}{END}")
            scores_by_fv = scores_by_model[model] = {}

            for fv in fvs:
                print(f"{indent*2}{BOLD}{fv}{END}")
                fold_scores = evaluate(
                    year,
                    data,
                    model,
                    fv,
                    sampling,
                    save_path=save_path,
                    selected_features=selected_features,
                    err_tol=err_tol,
                )
                scores_by_fv[fv] = finalize_result(fold_scores)
                print("-"*20)

        if save_path != "":
            save(scores_by_model, save_path, f"res_{year}_{run_name}", "pkl")

        scores_by_year[year] = scores_by_model

    return scores_by_year

def get_selected_features(sampling: Sampling):
    """Load the pickled feature selection belonging to ``sampling``.

    Only ``Sampling.Grid_3x3_50m`` and ``Sampling.Nearest_50m`` have a selection
    file under ``FEATURES_SELECTED_PATH``.

    Returns:
        The unpickled object, or ``None`` for any other sampling.
    """
    if sampling == Sampling.Grid_3x3_50m:
        relative = ("feature_selection.pkl",)
    elif sampling == Sampling.Nearest_50m:
        relative = ("nearest", "feature_selection_nearest.pkl")
    else:
        return None

    # NOTE: pickle.load can execute code embedded in the file; only load trusted files.
    with open(os.path.join(FEATURES_SELECTED_PATH, *relative), "rb") as file:
        return pickle.load(file)
    
def create_root_folder(sampling):
    """Create and return a new, uniquely named experiment folder.

    The folder is created under ``EXPERIMENTS_PATH`` and named
    ``<Mon>_<day>_<HHMM>_<sampling.name>``. If that name is already taken (for
    example two runs started within the same minute), ``_2``, ``_3``, ... is
    appended until a free name is found.

    Args:
        sampling: Object whose ``name`` attribute is used in the folder name.

    Returns:
        Path of the newly created folder.
    """
    stamp = datetime.datetime.now().strftime('%h_%d_%H%M')
    base = os.path.join(EXPERIMENTS_PATH, f"{stamp}_{sampling.name}")

    root_folder = base
    attempt = 1
    while True:
        try:
            # Creating the directory is the existence check: makedirs raises
            # FileExistsError when the leaf already exists, so two concurrent
            # runs can never end up sharing a folder.
            os.makedirs(root_folder)
            return root_folder
        except FileExistsError:
            attempt += 1
            root_folder = f"{base}_{attempt}"
    

def run_all(
    models=None, 
    fvs=None, 
    samplings=None, 
    is_saving=True, 
    err_tol=0.5, 
    additional_name=None
):
    """Run the full experiment grid for every requested sampling.

    Args:
        models: Model types to evaluate; when empty/None both the LGBM and the
            FNN regression models are used.
        fvs: Feature vector types; when None, ``get_fvs(sampling)`` decides.
        samplings: Samplings to process; when empty/None every ``Sampling`` member.
        is_saving: Whether a result folder is created and results are saved.
        err_tol: Error tolerance forwarded to ``run``; values above 0.5 are
            recorded in the result file name.
        additional_name: Optional suffix for the result file name.
    """
    models = models or [ModelType.LGBM_Regression, ModelType.FNN_Regression]
    samplings = samplings or list(Sampling)

    for sampling in samplings:
        dataset_info = sampling_2_datasetpath_and_name[sampling]

        data = pd.read_feather(dataset_info["path"])
        if sampling == Sampling.Nearest_50m:
            # Keep only the nearest-sampling columns plus the year.
            kept = [col for col in data.columns if is_nearest_sampling(col) or col == "year"]
            data = data.loc[:, kept]

        dataset_name = dataset_info["name"]

        selected_features = get_selected_features(sampling)
        save_path = create_root_folder(sampling) if is_saving else ""
        print(save_path)

        feature_vectors = get_fvs(sampling) if fvs is None else fvs
        all_res = run(
            data,
            models,
            feature_vectors,
            sampling,
            run_name=f"{sampling.name}_{dataset_name}",
            save_path=save_path,
            selected_features=selected_features,
            err_tol=err_tol,
        )

        if not is_saving:
            continue

        stamp = datetime.datetime.now().strftime('%h_%d_%H:%M')
        name = f"results_{stamp}_{sampling.name}_{dataset_name}"
        if err_tol > 0.5:
            name += f"_error_allowance_{err_tol}"
        if additional_name is not None:
            name += "_" + additional_name

        save(all_res, save_path, name, "pkl")

In [61]:
# Years to evaluate. run() and the table helpers below read this module-level name.
YEARS = [2017,2018,2019,2020]

# LightGBM regression on the full feature vector for the 3x3 grid sampling.
# err_tol=2.0: a prediction within +-2.0 of the true value counts as correct
# when the scores are converted to classification metrics.
run_all(
    samplings=[Sampling.Grid_3x3_50m],
    models=[ModelType.LGBM_Regression],
    fvs=[FeatureVector.All],
    err_tol=2.0
)


/mimer/NOBACKUP/groups/snic2022-23-428/Oliver/experiments/Dec_05_1830_Grid_3x3_50m
[91m[1m[4m2017[0m: Sampling.Grid_3x3_50m
      [36m[1mModelType.LGBM_Regression[0m
            [1mFeatureVector.All[0m
Runnning kfold ...
Elapsed time: 4.738386154174805
Resutls
Means
0.8066106112524137, RMSE
0.9688929524924111, ACCURACY
0.9670425696860983, F1-SCORE
--------------------
[91m[1m[4m2018[0m: Sampling.Grid_3x3_50m
      [36m[1mModelType.LGBM_Regression[0m
            [1mFeatureVector.All[0m
Runnning kfold ...
Elapsed time: 4.55283784866333
Resutls
Means
0.7536440796019913, RMSE
0.9795419847328244, ACCURACY
0.9786463390021061, F1-SCORE
--------------------
[91m[1m[4m2019[0m: Sampling.Grid_3x3_50m
      [36m[1mModelType.LGBM_Regression[0m
            [1mFeatureVector.All[0m
Runnning kfold ...
Elapsed time: 6.4360129833221436
Resutls
Means
0.922127830556305, RMSE
0.9636467236467237, ACCURACY
0.9619791693727014, F1-SCORE
--------------------
[91m[1m[4m2020[0m: Samp

## Create tables for report

In [50]:
def res_paths():
    """Return the result files under ``EXPERIMENTS_PATH/Important/*/``, sorted.

    Returns:
        Sorted list of paths matching ``results_*`` one level below the
        ``Important`` folder.
    """
    pattern = os.path.join(EXPERIMENTS_PATH, "Important", "*", "results_*")
    # glob returns matches in arbitrary order; sorting keeps positional indexes
    # into the returned list stable between runs.
    return sorted(glob.glob(pattern))

# All result files found under the "Important" experiments folder.
paths = res_paths()

In [51]:
# List every result file name next to its index in `paths`.
for position, result_path in enumerate(paths):
    file_name = result_path.split("/")[-1]
    print(position, file_name, "", sep="\n")

0
results_Dec_05_16:06_Grid_3x3_50m_alot2.pkl

1
results_Dec_05_16:43_Nearest_22m_hd_22m.pkl

2
results_Dec_05_16:13_Nearest_50m_alot2.pkl

3
results_Dec_05_16:15_Rfi_Grid_50m_rfi_filtered.pkl

4
results_Dec_05_17:46_Nearest_12m_hd_12m.pkl



In [47]:
# Pick one result file by its position in `paths`.
# NOTE(review): the index depends on the order of `paths` at the time this cell is
# run, so check the printed path before trusting the tables below.
res_path = paths[2]
print(res_path)

# NOTE: pickle.load can execute code embedded in the file; only load trusted results.
with open(res_path, "rb") as file:
    all_res = pickle.load(file)

/mimer/NOBACKUP/groups/snic2022-23-428/Oliver/experiments/Dec_05_1643_nearest_12/results_Dec_05_17:46_Nearest_12m_hd_12m.pkl


In [48]:
def get_sampling_from(path):
    """Infer the ``Sampling`` member from a result file path.

    The path is searched for a set of known tokens in a fixed order and the
    first token found decides the result.

    Args:
        path: Path (or file name) of a result file.

    Returns:
        The matching ``Sampling`` member.

    Raises:
        ValueError: If none of the known tokens occurs in ``path``.
    """
    # (token searched for in the path, name of the Sampling member), in priority order.
    token_to_member = (
        ("Rfi_Grid", "Rfi_Grid_50m"),
        ("Grid_3x3_50m", "Grid_3x3_50m"),
        ("Nearest_50m", "Nearest_50m"),
        ("Nearest_22m", "Nearest_22m"),
        ("Nearest_12m", "Nearest_12m"),
    )

    for token, member_name in token_to_member:
        if token in path:
            return getattr(Sampling, member_name)

    raise ValueError(f"Unknown sampling for res path: {path}")

# Derive the sampling from the name of the loaded result file and display it.
sampling = get_sampling_from(res_path)
sampling

<Sampling.Nearest_12m: 5>

In [49]:
def latex_entry(results, year, sampling):
    """Print one LaTeX table row per feature vector for the given year.

    Each row starts with the feature vector's display name, followed by
    ``rmse & accuracy & f1`` (rounded with ``my_round``) for every ``ModelType``
    member that has results for ``year``.

    Args:
        results: ``{year: {model: {feature_vector: score}}}`` where each score
            exposes ``rmse_mean``, ``accs_mean`` and ``f1s_mean``.
        year: Year to print.
        sampling: Sampling passed to ``get_fvs`` to decide which rows to print.
    """
    display_names = {
        FeatureVector.All: "All",
        FeatureVector.FeatureImportance: "Feature importance",
        FeatureVector.VH_VV: r"VH \& VV",
        FeatureVector.Weather: "Weather",
        FeatureVector.Topology: "Topology",
        FeatureVector.Height: "Elevation",
    }

    for fv in get_fvs(sampling):
        cells = [display_names[fv]]

        for model in ModelType:
            # Models without results for this year are left out of the row.
            if model in results[year]:
                score = results[year][model][fv]
                cells.append(
                    f"{my_round(score.rmse_mean)} & {my_round(score.accs_mean)} & {my_round(score.f1s_mean)}"
                )

        print(" & ".join(cells) + r" \\ ")
        
print(sampling)

# One block of LaTeX rows per year, separated by an empty line.
for year in YEARS:
    latex_entry(all_res, year, sampling)
    print()

    

Sampling.Nearest_12m
All & 1.31 & 0.43 & 0.41 & 1.49 & 0.37 & 0.36 \\ 
VH \& VV & 1.34 & 0.42 & 0.40 & 1.53 & 0.36 & 0.34 \\ 
Weather & 1.56 & 0.35 & 0.32 & 1.60 & 0.33 & 0.29 \\ 
Topology & 1.84 & 0.25 & 0.16 & 1.86 & 0.26 & 0.17 \\ 
Elevation & 1.72 & 0.30 & 0.25 & 1.80 & 0.27 & 0.19 \\ 

All & 1.00 & 0.50 & 0.49 & 1.14 & 0.42 & 0.41 \\ 
VH \& VV & 1.02 & 0.49 & 0.48 & 1.11 & 0.45 & 0.44 \\ 
Weather & 1.43 & 0.32 & 0.27 & 1.47 & 0.31 & 0.24 \\ 
Topology & 1.74 & 0.26 & 0.17 & 1.76 & 0.26 & 0.18 \\ 
Elevation & 1.57 & 0.30 & 0.24 & 1.64 & 0.28 & 0.22 \\ 

All & 1.37 & 0.39 & 0.38 & 1.50 & 0.35 & 0.34 \\ 
VH \& VV & 1.42 & 0.38 & 0.36 & 1.52 & 0.33 & 0.32 \\ 
Weather & 1.75 & 0.26 & 0.23 & 1.86 & 0.24 & 0.21 \\ 
Topology & 2.15 & 0.17 & 0.08 & 2.17 & 0.16 & 0.08 \\ 
Elevation & 2.05 & 0.19 & 0.14 & 2.09 & 0.17 & 0.11 \\ 

All & 1.53 & 0.36 & 0.36 & 1.81 & 0.31 & 0.30 \\ 
VH \& VV & 1.63 & 0.34 & 0.33 & 1.92 & 0.31 & 0.30 \\ 
Weather & 2.10 & 0.22 & 0.20 & 2.45 & 0.21 & 0.18 \\ 
Topolog

### Varying error tolerance

In [134]:
def latex_entry_error_allowance():
    """Print one LaTeX row per error tolerance with accuracy and F1 for every year.

    Result files are read from ``EXPERIMENTS_PATH/Important``:

    * ``*_err_*/results_*`` files whose path contains
      ``error_allowance_<number>`` provide the results for tolerances above 0.5;
    * the first ``*_grid_50/results_*`` file provides the 0.5 baseline.

    Rows are printed in ascending numeric order of the tolerance and contain,
    for each year in the module-level ``YEARS``, the LGBM regression scores on
    the full feature vector.

    Raises:
        FileNotFoundError: If no baseline result file is found.
    """
    # An integer or a decimal number. The dot is escaped and must be followed by
    # digits, so the "." of the file extension is never taken as part of the value.
    pattern = re.compile(r"error_allowance_(\d+(?:\.\d+)?)")

    # Fetch results with tolerance > 0.5
    paths = glob.glob(os.path.join(EXPERIMENTS_PATH, "Important", "*_err_*", "results_*"))
    all_res = dict()

    for path in sorted(paths):
        match = pattern.search(path)
        if match is None:
            # No tolerance encoded in the name: not one of the files we want.
            continue

        # NOTE: pickle.load can execute code embedded in the file; only load trusted results.
        with open(path, "rb") as file:
            all_res[match.group(1)] = pickle.load(file)

    # Fetch results with tolerance 0.5, the baseline
    baseline_paths = glob.glob(os.path.join(EXPERIMENTS_PATH, "Important", "*_grid_50", "results_*"))
    if not baseline_paths:
        raise FileNotFoundError("No baseline results found under Important/*_grid_50/results_*")
    with open(baseline_paths[0], "rb") as file:
        all_res["0.5"] = pickle.load(file)

    # Sort numerically: as strings "10.0" would be ordered before "2.0".
    for err_tol in sorted(all_res, key=float):
        t_data = err_tol

        for year in YEARS:
            res = all_res[err_tol][year][ModelType.LGBM_Regression][FeatureVector.All]
            t_data += f" & {my_round(res.accs_mean)} & {my_round(res.f1s_mean)}"
        t_data += r" \\ "
        print(t_data)


In [135]:
# Print the accuracy / F1 table rows for every stored error tolerance.
latex_entry_error_allowance()

0.5 & 0.56 & 0.55 & 0.60 & 0.59 & 0.50 & 0.50 & 0.44 & 0.44 \\ 
0.75 & 0.72 & 0.71 & 0.76 & 0.76 & 0.67 & 0.67 & 0.60 & 0.60 \\ 
1. & 0.83 & 0.82 & 0.86 & 0.86 & 0.79 & 0.78 & 0.72 & 0.72 \\ 
1.25 & 0.90 & 0.89 & 0.91 & 0.91 & 0.86 & 0.86 & 0.80 & 0.79 \\ 
1.5 & 0.94 & 0.93 & 0.95 & 0.94 & 0.91 & 0.91 & 0.86 & 0.86 \\ 
1.75 & 0.96 & 0.95 & 0.97 & 0.96 & 0.94 & 0.94 & 0.90 & 0.90 \\ 
2.0 & 0.97 & 0.97 & 0.98 & 0.98 & 0.96 & 0.96 & 0.93 & 0.93 \\ 


In [123]:
# NOTE(review): leftover inspection cell. It needs a global `all_res` keyed by error
# tolerance and a `year` left over from an earlier loop; the visible cells only build
# such a dict locally inside latex_entry_error_allowance() -- confirm before re-running.
all_res["0.75"][year][ModelType.LGBM_Regression][FeatureVector.All]

AvgScore(rmse_mean=1.0824188819998612, accs_mean=0.6045091172849799, f1s_mean=0.6023207487537051)

In [None]:
def latex_entry_error_allowance_old(all_res, years, errors):
    """Print one LaTeX row per error value with accuracy and F1 for every year.

    Args:
        all_res: ``{year: {model: {feature_vector: {error: score}}}}`` where each
            score exposes ``accs_mean`` and ``f1s_mean``.
        years: Years to include, in column order.
        errors: Error values, one printed row each.
    """
    for err in errors:
        cells = [f"{err}"]

        for year in years:
            score = all_res[year][ModelType.LGBM_Regression][FeatureVector.All][err]
            cells.append(f"{my_round(score.accs_mean)} & {my_round(score.f1s_mean)}")

        print(" & ".join(cells) + r" \\ ")

### Transfer learning 

In [None]:

# Evaluates every (year, model, feature vector, error) combination and stores the
# averaged scores in all_res[year][model][fv][err].
# NOTE(review): this cell was never executed in the saved notebook. `models`, `fvs`,
# `errors`, `spacer`, `data` and `evaluate2` are not defined at module level in the
# visible cells -- confirm where they are meant to come from before running it.
for year in YEARS:
    print(f"{RED}{BOLD}{UNDERLINE}{year}{END}: {sampling}")

    all_res[year] = dict()
    for model in models:
        print(f"{spacer}{DARKCYAN}{BOLD}{model}{END}")
        all_res[year][model] = dict()

        for fv in fvs:
            print(f"{spacer*2}{BOLD}{fv}{END}")
            all_res[year][model][fv] = dict()

            for err in errors:
                print(f"{spacer*3}{BOLD}{PURPLE}Error allowed: +-{err}{END}")
                result = evaluate2(year, data, model, fv, sampling, err_allowance=err)
                all_res[year][model][fv][err] = finalize_result(result)


In [137]:
# Scores of one transfer experiment (train on one year, test on another).
TransferRes = namedtuple("TransferRes", ["rmse", "acc", "f1"])

import warnings
# Intended to hide a false-positive SettingWithCopyWarning; note that it ignores
# every warning issued from modules whose name starts with "pandas".
warnings.filterwarnings("ignore", module="pandas")

# Used by transfer() to draw the random test subset.
import random

def transfer():
    """Train on one year, test on each of the other years, and save the scores.

    For every ordered pair of distinct years ``(year_x, year_y)``:

    1. a random 90 % of the ``year_x`` rows is used to train an LGBM regressor;
    2. a random subset of the ``year_y`` rows is used as the test set, with as
       many rows as the held-out 10 % of ``year_x`` (fewer if ``year_y`` has
       fewer rows);
    3. RMSE, accuracy and weighted F1 are stored in a ``TransferRes``.

    Neither the split nor the test subset is seeded, so results vary between runs.

    Returns:
        ``{year_x: {year_y: TransferRes}}``; the same dict is saved under
        ``EXPERIMENTS_PATH``.
    """
    # Local on purpose: the experiment always uses these four years,
    # independent of the module-level YEARS.
    YEARS = [2017, 2018, 2019, 2020]
    
    file = "dataset_y_2017_2020_alot2.feather"
    path = os.path.join(DATASETS_PATH, file)
    data = pd.read_feather(path)
    
    dataset_name = "alot2"
    fv = FeatureVector.All
    sampling = Sampling.Grid_3x3_50m
    
    results = dict()
    spacer = " "*3
    
    for year_x in YEARS:
        results[year_x] = dict()
        print(f"{RED}{BOLD}{UNDERLINE}Training year {year_x}{END}")

        # The training-year features do not depend on the test year.
        dataset_x = data[data["year"] == year_x].drop(["year"], axis=1)
        X_x, y_x = get_X_y(year_x, dataset_x, fv, None)

        for year_y in YEARS:
            if year_x == year_y:
                continue
            
            print(f"{spacer}{DARKCYAN}{BOLD}Test year {year_y}{END}")

            # A fresh random split per pair; the held-out part is only used to
            # decide how many test rows are drawn from the other year.
            X_train, _, y_train, y_holdout = train_test_split(X_x, y_x, test_size=0.1)
            size_test = len(y_holdout)

            dataset_y = data[data["year"] == year_y].drop(["year"], axis=1)
            X_y, y_y = get_X_y(year_y, dataset_y, fv, None)

            idxs = list(range(len(X_y)))
            random.shuffle(idxs)
            test_idxs = idxs[:size_test]

            X_test = X_y.iloc[test_idxs]
            y_test = y_y.iloc[test_idxs]

            model = get_lgbm_regressor(X_train, y_train)
            y_hat = model.predict(X_test)
            # RMSE as sqrt(MSE): the `squared=False` flag is deprecated/removed
            # in recent scikit-learn releases.
            rmse = np.sqrt(metrics.mean_squared_error(y_test, y_hat))
            
            # Convert to classification problem
            acc, f1 = class_metrics(y_test, y_hat)

            results[year_x][year_y] = TransferRes(rmse, acc, f1)
            print(f"{results[year_x][year_y]}")
            
        print("-"*10)
    
    name = f"trans_results_{datetime.date.today().strftime('%m_%d')}_{dataset_name}_{fv.name}_{sampling.name}"
    save(
        results,
        EXPERIMENTS_PATH,
        name,
        "pkl",
    )
    
    return results

trans_res = transfer()

[91m[1m[4mTraining year 2017[0m
   [36m[1mTest year 2018[0m
TransferRes(rmse=4.443351692480902, acc=0.016216216216216217, f1=0.01104951104951105)
   [36m[1mTest year 2019[0m
TransferRes(rmse=1.7305867660831005, acc=0.1918918918918919, f1=0.1677038036900035)
   [36m[1mTest year 2020[0m
TransferRes(rmse=2.300130924300765, acc=0.16486486486486487, f1=0.13567802275667445)
----------
[91m[1m[4mTraining year 2018[0m
   [36m[1mTest year 2017[0m
TransferRes(rmse=3.9083783425043137, acc=0.03048780487804878, f1=0.018768873403019748)
   [36m[1mTest year 2019[0m
TransferRes(rmse=3.0712976897372446, acc=0.06707317073170732, f1=0.05521045810455782)
   [36m[1mTest year 2020[0m
TransferRes(rmse=3.1402514535478034, acc=0.08536585365853659, f1=0.04884064781927147)
----------
[91m[1m[4mTraining year 2019[0m
   [36m[1mTest year 2017[0m
TransferRes(rmse=1.9873761741026732, acc=0.1560364464692483, f1=0.15642369468042697)
   [36m[1mTest year 2018[0m
TransferRes(rmse=3.5257

In [147]:
def latex_entry_trans(trans_res, years):
    """Print the transfer-learning results as LaTeX table rows.

    One row is printed per ordered pair of distinct years (training year, test
    year). A ``\\midrule`` is appended to the last row of every training-year
    group except the final one.

    Args:
        trans_res: ``{train_year: {test_year: result}}`` where each result
            exposes ``rmse``, ``acc`` and ``f1``.
        years: Years to tabulate, in row order.
    """
    years = list(years)
    last_group = len(years) - 1

    for group, year_x in enumerate(years):
        test_years = [year_y for year_y in years if year_y != year_x]

        for row, year_y in enumerate(test_years):
            res = trans_res[year_x][year_y]

            t_data = f"{year_x} & {year_y} "
            t_data += f" & {my_round(res.rmse)} & {my_round(res.acc)} & {my_round(res.f1)}"
            t_data += r" \\ "

            # Separate the training-year groups, but do not end the table with a rule.
            if row == len(test_years) - 1 and group != last_group:
                t_data += r"\midrule "
            print(t_data)

In [148]:
# Print the transfer-learning table rows for the years in YEARS.
latex_entry_trans(trans_res, YEARS)

2017 & 2018  & 4.44 & 0.02 & 0.01 \\ 
2017 & 2019  & 1.73 & 0.19 & 0.17 \\ 
2017 & 2020  & 2.30 & 0.16 & 0.14 \\ \midrule 
2018 & 2017  & 3.91 & 0.03 & 0.02 \\ 
2018 & 2019  & 3.07 & 0.07 & 0.06 \\ 
2018 & 2020  & 3.14 & 0.09 & 0.05 \\ \midrule 
2019 & 2017  & 1.99 & 0.16 & 0.16 \\ 
2019 & 2018  & 3.53 & 0.03 & 0.01 \\ 
2019 & 2020  & 2.04 & 0.18 & 0.17 \\ \midrule 
2020 & 2017  & 1.69 & 0.22 & 0.23 \\ 
2020 & 2018  & 3.75 & 0.02 & 0.01 \\ 
2020 & 2019  & 2.16 & 0.18 & 0.19 \\ 


In [140]:
# Scratch cell: quick check of the modulo operator (evaluates to 3).
3%44

3

In [None]:
def old():
    """Container for an earlier version of the classification scoring.

    Calling this function only defines the nested helpers; nothing is executed
    with them and ``None`` is returned.
    """
    def reg2class(y_test, y_hat):
        # Round both the true values and the predictions to integer classes.
        true_classes = y_test.round().astype(int)
        predicted_classes = y_hat.round().astype(int)
        return true_classes, predicted_classes

    def class_metrics(y_test, y_hat, err_tol):
        # Accuracy and weighted F1 on the rounded classes, optionally forgiving
        # predictions that are at most err_tol classes away.
        y_test, y_hat = reg2class(y_test, y_hat)
        if err_tol > 0.0:
            y_hat = allow_error(y_test, y_hat, err_tol)

        accuracy = metrics.accuracy_score(y_test, y_hat)
        weighted_f1 = metrics.f1_score(y_test, y_hat, average='weighted')
        return accuracy, weighted_f1

    def allow_error(y_test, y_hat, err_tol=1):
        # Does not support confusion matrix
        y_test = np.array(y_test)
        y_hat = np.array(y_hat)

        # Overwrite every prediction that is within the tolerance with the true class.
        for idx, truth in enumerate(y_test):
            if np.abs(truth - y_hat[idx]) <= err_tol:
                y_hat[idx] = truth
        return y_hat