## Imports, data load, metric function definition

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import BallTree
import os

In [2]:

# All inputs live in the `.data` folder under the directory the notebook runs from.
my_path = os.getcwd()

# np.load on a .npz file returns a dict-like archive; arrays are fetched by key.
X_train = np.load(f'{my_path}/.data/X_train_surge_new.npz')
X_test = np.load(f'{my_path}/.data/X_test_surge_new.npz')

# Training targets are stored as a CSV table.
Y_train = pd.read_csv(f'{my_path}/.data/Y_train_surge.csv')


In [3]:
# Training targets as a plain NumPy array: every column of Y_train except the
# first one (presumably the sequence id — confirm against the CSV header).
surge_train = np.array(Y_train)[:,1:]

In [4]:
def surge_prediction_metric(dataframe_y_true, dataframe_y_pred):
    """Time-weighted mean squared error, summed over the two surge series.

    Both arguments must expose the columns ``surge1_t0`` .. ``surge1_t9`` and
    ``surge2_t0`` .. ``surge2_t9``. The frames are compared positionally
    (their ``.values`` arrays are subtracted), so their indexes are not
    aligned and only need to have the same number of rows.

    The squared error of time step ``t`` is multiplied by a weight that falls
    linearly from 1 (``t0``) to 0.1 (``t9``); the weighted errors of each
    series are averaged over all rows and steps, and the two averages are
    added.
    """
    # Shape (1, 10) so that the weights broadcast over the rows.
    step_weights = np.linspace(1, 0.1, 10)[np.newaxis]

    def weighted_mse(columns):
        # Weighted mean squared error restricted to one group of columns.
        error = dataframe_y_true[columns].values - dataframe_y_pred[columns].values
        return (step_weights * error**2).mean()

    first_series = weighted_mse([
        'surge1_t0', 'surge1_t1', 'surge1_t2', 'surge1_t3', 'surge1_t4',
        'surge1_t5', 'surge1_t6', 'surge1_t7', 'surge1_t8', 'surge1_t9'])
    second_series = weighted_mse([
        'surge2_t0', 'surge2_t1', 'surge2_t2', 'surge2_t3', 'surge2_t4',
        'surge2_t5', 'surge2_t6', 'surge2_t7', 'surge2_t8', 'surge2_t9'])

    return first_series + second_series

## Benchmark
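Predict the surge with a k-nearest-neighbours (kNN) benchmark: each sample is described by its pressure (`slp`) fields at two instants in time (the last one and the one 8 time steps earlier), and its prediction is the mean surge of the 40 nearest training samples.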

Old (non-generic) version, kept for reference; the `kNNpred` class below wraps the same steps:
```python
# Number of pressure fields stacked per sample, and spacing (in time indices)
# between two stacked fields.
nfields = 2; time_step_slp = 8
slp_train = []
slp_all = X_train['slp']
# 5559 is the hard-coded number of training samples processed.
for i in range(5559):
    # Start from the flattened last field of sample i ...
    slp_train.append(np.ndarray.flatten(slp_all[i,-1]))
    for j in range(1,nfields):
        # ... and append the flattened field taken j*time_step_slp indices earlier.
        slp_train[-1] = np.concatenate( ( slp_train[-1], np.ndarray.flatten(slp_all[i,-1-j*time_step_slp]) ) )
slp_train = np.array(slp_train)

# Same feature construction for the test set (509 is the hard-coded sample count).
slp_test = []
slp_all_test = X_test['slp']
for i in range(509):
    slp_test.append(np.ndarray.flatten(slp_all_test[i,-1]))
    for j in range(1,nfields):
        slp_test[-1] = np.concatenate( ( slp_test[-1], np.ndarray.flatten(slp_all_test[i,-1-j*time_step_slp]) ) )
slp_test = np.array(slp_test)

# Index the training feature vectors for nearest-neighbour search.
tree = BallTree(slp_train)

# For each test sample, average the target rows of its k nearest training samples.
surge_test_benchmark = []; k = 40
for i in range(509):
    dist, ind = tree.query([slp_test[i]], k=k)
    surge_test_benchmark.append(np.mean(surge_train[ind[0]], axis=0))
surge_test_benchmark = np.array(surge_test_benchmark)
```

In [5]:
class kNNpred:
    """k-nearest-neighbour surge predictor working on stacked pressure fields.

    Each sample is turned into one feature vector by flattening ``nfields``
    pressure fields: the last one along the time axis and the ones located
    ``time_step_slp`` indices apart before it. A prediction is the mean of the
    target rows of the ``k`` training samples closest to the query sample.

    Training state is stored per instance, so several predictors can be
    created and ``train`` can be called repeatedly without the feature rows
    of one fit leaking into another. The number of samples is taken from the
    input arrays instead of being hard-coded.
    """

    nfields = 2          # number of pressure fields stacked per sample
    time_step_slp = 8    # spacing, in time indices, between two stacked fields
    k = 40               # number of neighbours averaged for a prediction

    def __init__(self):
        # Per-instance state: a class-level list would be shared (and grown)
        # by every instance.
        self.slp_train = []
        self.y_train = None

    def _stack_fields(self, slp_all):
        """Return one flattened feature row per sample of ``slp_all``.

        ``slp_all`` is indexed as ``[sample, time, ...]``; the selected fields
        are flattened in C order and concatenated, most recent field first.
        """
        slp_all = np.asarray(slp_all)
        n_samples = len(slp_all)
        if n_samples == 0:
            raise ValueError("kNNpred needs at least one sample")
        fields = [
            slp_all[:, -1 - j * self.time_step_slp].reshape(n_samples, -1)
            for j in range(self.nfields)
        ]
        return np.concatenate(fields, axis=1)

    def train(self, trainX, trainY=None):
        """Store the feature vectors (and optionally the targets) to search in.

        Parameters
        ----------
        trainX : array indexed as ``[sample, time, ...]``
            Pressure fields of the training samples.
        trainY : array-like, optional
            Target rows aligned with ``trainX``. When omitted, ``predict``
            falls back to the notebook-level ``surge_train`` array.

        Raises
        ------
        ValueError
            If ``trainX`` is empty or ``trainY`` has a different number of rows.
        """
        features = self._stack_fields(trainX)
        if trainY is not None:
            trainY = np.asarray(trainY)
            if len(trainY) != len(features):
                raise ValueError(
                    "trainY has %d rows but trainX has %d samples"
                    % (len(trainY), len(features)))
        # Assign (rather than append) so that re-training replaces the old fit.
        self.slp_train = features
        self.y_train = trainY

    def predict(self, testX):
        """Predict the targets of every sample in ``testX``.

        Returns an array with one row per test sample, each row being the
        mean target of its ``k`` nearest training samples.

        Raises
        ------
        RuntimeError
            If ``train`` has not been called yet.
        """
        train_features = np.asarray(self.slp_train)
        if train_features.size == 0:
            raise RuntimeError("train() must be called before predict()")

        slp_test = self._stack_fields(testX)

        # Targets given to train() take precedence; otherwise use the
        # module-level `surge_train`, whose rows must line up with trainX.
        if self.y_train is not None:
            targets = self.y_train
        else:
            targets = np.asarray(surge_train)

        tree = BallTree(train_features)
        # One batch query: `ind` has shape (n_test, k).
        _, ind = tree.query(slp_test, k=self.k)
        # targets[ind] has shape (n_test, k, n_targets); average the neighbours.
        return targets[ind].mean(axis=1)

In [6]:
# Fit the kNN benchmark on the training pressure fields, then predict the
# surge for the test samples.
res = kNNpred()
res.train(X_train['slp'])
surge_test_benchmark = res.predict(X_test['slp'])

## Store output

In [7]:
# Column order of the prediction array: ten steps of surge1, then ten of surge2.
y_columns = [
    *(f'surge1_t{i}' for i in range(10)),
    *(f'surge2_t{i}' for i in range(10)),
]

# Label each prediction row with the id of its test sequence and write the table.
y_test_benchmark = pd.DataFrame(
    surge_test_benchmark,
    index=X_test['id_sequence'],
    columns=y_columns,
)
y_test_benchmark.to_csv('Y_test_benchmark.csv', sep=',', index_label='id_sequence')

## Alternate version: compute a score on the training dataset

In [None]:
# Read the training pressure fields from the .npz archive once: every
# `X_train['slp']` access loads the array from the file again.
slp_train_all = X_train['slp']

# Fit on the training fields and predict the first 509 training samples.
# These samples belong to the fitted set, so each one is among its own
# nearest neighbours and the resulting score is optimistic.
aaa = kNNpred()
aaa.train(slp_train_all)
surge_train_pred = aaa.predict(slp_train_all[:509])

In [8]:
# Same column layout as the submission file: surge1 steps first, then surge2.
y_columns = [
    *(f'surge1_t{i}' for i in range(10)),
    *(f'surge2_t{i}' for i in range(10)),
]

# Wrap the predictions for the first 509 training samples in a labelled frame.
Y_train_pred = pd.DataFrame(
    surge_train_pred,
    index=X_train['id_sequence'][:509],
    columns=y_columns,
)

In [10]:
# Compare the first 509 target rows with the 509 predicted rows.
training_score = surge_prediction_metric(Y_train[:509], Y_train_pred)
print(f"Score on training: {training_score}")

Score on training: 0.6197614176878359
