## 1. Read preprocessed data

In [1]:
import pandas as pd
import numpy as np
import torch
from xgboost import XGBClassifier

# Import class for dataset creating
from data_creator.create_dataset_for_one_model_with_neighbours import AllPointsDatasetCreator

# Import csv -> torch converter
from data_creator.utils import create_celled_data

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# dataset_name from ../data/preprocessed/
dataset_name = "missouri_pdsi.csv"
torch_data = create_celled_data(dataset_name)


file ../data/celled/missouri_pdsi.csv already exists


In [4]:
numpy_data = torch_data.numpy()

In [5]:
numpy_data.shape

(752, 138, 104)

## 2. Preparing data

In [7]:
hist_len = 1
num_of_future_indexes = 12
x_min = 1
x_max = numpy_data.shape[1]
y_min = 1
y_max = numpy_data.shape[2]
pdsi_threshold = [-2]
filter_size = (1,1)
data_creator = AllPointsDatasetCreator(numpy_data, history_len = hist_len, 
                                       num_of_future_indexes = num_of_future_indexes, 
                                       time_border = int(0.7 * numpy_data.shape[0]),
                                       x_min = x_min, x_max = x_max, y_min = y_min, y_max = y_max,
                                       filter_size = filter_size,
                                       pdsi_threshold = pdsi_threshold)

In [8]:
# the most time-consuming procedure
data_creator.create_train_and_test_datasets()

In [9]:
train = data_creator.get_train_array()

In [10]:
test_by_point = data_creator.get_test_array_by_points()

In [11]:
# Separate data and target from each other.
trainX, trainy = train[:, :-num_of_future_indexes], train[:, -num_of_future_indexes:]

In [12]:
trainX.shape

(7422386, 1)

## 3. Fitting and testing Linear classifier

In [13]:
from sklearn.metrics import roc_auc_score, f1_score, average_precision_score, accuracy_score
from sklearn.linear_model import LogisticRegression

In [14]:
# list of forecasting horizons
forecast_hors = list(range(num_of_future_indexes))

# list of final metric values
metric_list = []

In [15]:
# For each horizon we train own linear classifier
for horizon in forecast_hors:
    
    ### Train linear classifier
    model = LogisticRegression()
    model.fit(trainX, trainy[:, horizon])
    
    
    tmp_metric_list = []
    
    for ind in range((x_max - x_min)*(y_max - y_min)):
        testX, testy = test_by_point[ind][:, :-num_of_future_indexes], test_by_point[ind][:, -num_of_future_indexes:]
        pred = model.predict(testX)
        metric = None
        try:
            metric = roc_auc_score(testy[:, horizon], pred)
        except ValueError:
            metric = 0
        tmp_metric_list.append(metric)
    
    median_metric = np.median(tmp_metric_list)
    metric_list.append(median_metric)

In [16]:
print(f"ROC AUC for {np.arange(1, num_of_future_indexes+1)} months forecast")
print(np.round(metric_list, 4))

ROC AUC for [ 1  2  3  4  5  6  7  8  9 10 11 12] months forecast
[0.9003 0.8371 0.7885 0.7444 0.7015 0.6655 0.6233 0.5788 0.5328 0.5
 0.5    0.5   ]
