## 1. Read raw data

In [1]:
import pandas as pd
import numpy as np
import torch
from xgboost import XGBClassifier

# Import class for dataset creating
from data_creator.create_dataset_for_one_model_with_neighbours import AllPointsDatasetCreator

# Import csv -> torch converter
from data_creator.utils import create_celled_data

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# dataset_name from ../data/preprocessed/
dataset_name = "missouri_pdsi.csv"
torch_data = create_celled_data(dataset_name)

file ../data/celled/missouri_pdsi.csv already exists


In [3]:
numpy_data = torch_data.numpy()

In [4]:
numpy_data.shape

(752, 138, 104)

## 2. Preparing data

In [5]:
hist_len = 2
num_of_future_indexes = 12
x_min = 1
x_max = numpy_data.shape[1]
y_min = 1
y_max = numpy_data.shape[2]
pdsi_threshold = -2
filter_size = (1,1)
data_creator = AllPointsDatasetCreator(numpy_data, history_len = hist_len, 
                                       num_of_future_indexes = num_of_future_indexes, 
                                       time_border = int(0.7 * numpy_data.shape[0]),
                                       x_min = x_min, x_max = x_max, y_min = y_min, y_max = y_max,
                                       filter_size = filter_size,
                                       pdsi_threshold = pdsi_threshold)

In [6]:
data_creator.create_train_and_test_datasets()

In [7]:
train = data_creator.get_train_array()

In [8]:
test_by_point = data_creator.get_test_array_by_points()

In [9]:
# Separate data and target from each other.
trainX, trainy = train[:, :-num_of_future_indexes], train[:, -num_of_future_indexes:]

In [10]:
trainX.shape

(7408275, 2)

In [11]:
trainy.shape

(7408275, 12)

## 3. Fitting XGBoost

In [12]:
# Train XGBoost
model = XGBClassifier()
model.fit(trainX, trainy)

## 4. Testing model

In [13]:
from sklearn.metrics import roc_auc_score, accuracy_score

In [14]:
roc_auc_list = []

In [15]:
forecast_hors = list(range(num_of_future_indexes))

In [16]:
for horizon in forecast_hors:
    tmp_roc_auc_list = []
    
    for ind in range((x_max - x_min)*(y_max - y_min)):
        testX, testy = test_by_point[ind][:, :-num_of_future_indexes], test_by_point[ind][:, -num_of_future_indexes:]
        pred = model.predict(testX)    
        roc_auc = roc_auc_score(testy[:, horizon], pred[:, horizon])
        tmp_roc_auc_list.append(roc_auc)
    
    median_roc_auc = np.median(tmp_roc_auc_list)
    roc_auc_list.append(median_roc_auc)

In [17]:
print(f"ROC AUC for {np.arange(1, num_of_future_indexes+1)} months forecast")
print(np.round(roc_auc_list, 4))

ROC AUC for [ 1  2  3  4  5  6  7  8  9 10 11 12] months forecast
[0.9081 0.8399 0.7897 0.7549 0.7171 0.6795 0.6495 0.6187 0.5845 0.5399
 0.5128 0.5079]
