# Pipeline for estimating lake area

### 0. Import libraries and load data

In [None]:
%reload_ext autoreload
%matplotlib inline
%autoreload 2

from src.model_trainer import *
from src.data_loader import *
from src.lake_analyzer import *

import warnings
import numpy as np
import pandas as pd
from tqdm import tqdm
warnings.filterwarnings("ignore")

# names of the lakes passed to load_lakes to build the training data
TRAIN_LAKES = ["george", "walker", "melincue"]
# names of the lakes passed to load_lakes to build the held-out test data
TEST_LAKES = ["mono"]
# number of percentile points (candidate thresholds) tried in the threshold search
THRESHOLDS_NUMBER = 10

In [None]:
# load the training lakes; the second return value (data_indices) is handed to the
# cross-validation helper in the threshold search
train_data, data_indices = load_lakes(TRAIN_LAKES)
# load the test lakes; the second return value is not used for them and is discarded
test_data, _ = load_lakes(TEST_LAKES)

### 1. Train and optimize lake detector with cross-validation

In [None]:
# Search for the detection threshold: score every candidate threshold on the
# training data and rank the candidates by their F1 score.

# fetch the water index values once; they do not depend on the percentile, so
# there is no need to recompute them for every candidate
# (the percentiles are taken over element [2] of the helper's return value)
water_index_values = get_water_index_values(train_data)[2]

# define the space for thresholds: THRESHOLDS_NUMBER evenly spaced percentiles,
# from the 20th up to the 100th (i.e. the maximum value)
percentiles = np.linspace(20, 100, THRESHOLDS_NUMBER)
thresholds = list(np.percentile(water_index_values, percentiles))

f1_scores = []
kappa_scores = []

# go over each threshold
# NOTE(review): this cell calls compute_cross_validation_scores while the test
# cell calls compute_cross_validation_scores_th -- confirm that both helpers
# exist and that this is the intended one.
for threshold in tqdm(thresholds):
    # compute the average metric for the current threshold
    avg_f1, avg_kappa = compute_cross_validation_scores(train_data, data_indices, threshold)

    f1_scores.append(avg_f1)
    kappa_scores.append(avg_kappa)

# display a data frame that contains the results, best F1 score first
threshold_scores = pd.DataFrame(
    {"threshold": thresholds, "f1_score": f1_scores, "kappa_score": kappa_scores}
).sort_values("f1_score", ascending=False)
threshold_scores

In [None]:
# get predictions for the train dataset
# NOTE(review): the threshold is pinned to a literal, and the expression that
# would take the top-ranked row of threshold_scores is left commented out on the
# same line. Presumably the literal is a rounded value from an earlier
# cross-validation run -- TODO confirm, and decide whether the computed value
# should be used so this cell follows the search results.
best_threshold = -0.316582 # threshold_scores["threshold"].iloc[0]
train_detections = predict_th(train_data, best_threshold)

### 2. Test lake detector

In [None]:
# run the detector on the test dataset with the selected threshold
test_detections = predict_th(test_data, best_threshold)

# score the test dataset as one single fold that holds every test index
all_test_indices = list(range(len(test_data)))
avg_f1, avg_kappa = compute_cross_validation_scores_th(
    test_data,
    [all_test_indices],
    best_threshold,
    test=True,
)

# report both test metrics
print(f"Test F1: {avg_f1}\nTest kappa: {avg_kappa}")


### 3. Analyze lake evolution

In [None]:
# analyze one of the training lakes ("melincue" is listed in TRAIN_LAKES), passing
# the training data together with the detections predicted for it
analyze_lake("melincue", train_data, train_detections)
