In [1]:
#############################################################
# 1. Libraries

import os
import datetime
from tqdm.notebook import tqdm

import numpy as np
import pandas as pd
import scipy.signal

import matplotlib
import matplotlib.pyplot as plt
pd.options.display.max_columns = None    # disp all columns

from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold

import gc
import pickle

import lightgbm as lgb
from pytorch_tabnet.tab_model import TabNetRegressor
import torch
from pytorch_tabnet.metrics import Metric

#############################################################

In [2]:
#############################################################
# 2. Paths & Global Variables

## 2.1 Paths

path = '../../01_Data/'
path_sequences = path + '01_GeneratedSequences/'
path_spectograms = path + '04_GeneratedSpectogramsSTFT/'


df_train = pd.read_csv(path + 'train.csv')
df_sample_submission = pd.read_csv(path + 'sample_submission.csv') 

## 2.2 Global Variables

unique_segments_id_train = set(df_train['segment_id'])
unique_segments_id_test = set(df_sample_submission['segment_id'])

SEQ_LENGTH = 60_001

#############################################################

In [3]:
#############################################################
# 3. Preprocess

dict_segments_paths_train = {
    segment : path + 'train/' + str(segment) + '.csv' for segment in unique_segments_id_train
}

dict_segments_paths_test = {
    segment : path + 'test/' + str(segment) + '.csv' for segment in unique_segments_id_test
}

###

dict_segments_sequences_paths_train = {
    segment : path_sequences + 'train/' + str(segment) + '.npy' for segment in unique_segments_id_train
}

dict_segments_sequences_paths_test = {
    segment : path_sequences + 'test/' + str(segment) + '.npy' for segment in unique_segments_id_test
}


###

df_train['time_to_eruption'] = df_train['time_to_eruption']/(10**6)

dict_labels = {
    segment : df_train['time_to_eruption'][df_train['segment_id']==segment].values.flatten()
    
    for segment in unique_segments_id_train
}


#############################################################

In [4]:
#############################################################
# 5. Build Dataset

fs = 100                # sampling frequency 
n = 256                 # FFT segment size
max_f = 20              # ～20Hz

delta_f = fs / n        # 0.39Hz
delta_t = n / fs / 2    # 1.28s

def generateFeaturesTimeDomain(dict_paths):
    feature_set = []
    for segment_id in tqdm(dict_paths, total=len(dict_paths), position=0):
        data = np.load(dict_paths[segment_id])
        segment = [segment_id]
        
        # mean
        segment += data.mean(axis=0).tolist()
        # std
        segment += data.std(axis=0).tolist()
        # min
        segment += data.min(axis=0).tolist()
        # max
        segment += data.max(axis=0).tolist()
        # 5 percentile
        segment += np.quantile(data, 0.05, axis=0).tolist()
        # 10 percentile
        segment += np.quantile(data, 0.1, axis=0).tolist()
        # 20 percentile
        segment += np.quantile(data, 0.2, axis=0).tolist()
        # 40 percentile
        segment += np.quantile(data, 0.4, axis=0).tolist()
        # 50 percentile
        segment += np.quantile(data, 0.5, axis=0).tolist()
        # 60 percentile
        segment += np.quantile(data, 0.6, axis=0).tolist()
        # 80 percentile
        segment += np.quantile(data, 0.8, axis=0).tolist()
        # 90 percentile
        segment += np.quantile(data, 0.9, axis=0).tolist()
        
        # shift
        d = pd.DataFrame(data)
        d.columns = [f'sensor_{i+1}' for i in range(10)]
        for col in d:
            d[col+'_5000'] = d[col].shift(5000).fillna(0)
            d[col+'_10000'] = d[col].shift(10000).fillna(0)
            d[col+'_20000'] = d[col].shift(20000).fillna(0)
            d[col+'_30000'] = d[col].shift(30000).fillna(0)

        # +5000 / +10000 / +20000 / +30000 self-corr
        for col in d.columns[:10]:
            col1 = col+'_5000'
            col2 = col+'_10000'
            col3 = col+'_20000'
            col4 = col+'_30000'
            tmp1 = d.loc[:, [col, col1]].dropna().fillna(0)
            tmp2 = d.loc[:, [col, col2]].dropna().fillna(0)
            tmp3 = d.loc[:, [col, col3]].dropna().fillna(0)
            tmp4 = d.loc[:, [col, col4]].dropna().fillna(0)
            segment += [tmp1[col].corr(tmp1[col1]), 
                        tmp2[col].corr(tmp2[col2]), 
                        tmp3[col].corr(tmp3[col3]),
                        tmp4[col].corr(tmp4[col4])]

        feature_set.append(segment)
        
    base_colname = ['sensor_'+str(i) for i in range(1, 11)]
    fea_colname = ['segment_id'] + [j + '_mean' for j in base_colname] + [j + '_std' for j in base_colname] + \
                    [j + '_min' for j in base_colname] + [j + '_max' for j in base_colname] + \
                        [j + '_5_quant' for j in base_colname] + [j + '_10_quant' for j in base_colname] + \
                            [j + '_20_quant' for j in base_colname] + [j + '_40_quant' for j in base_colname] + \
                            [j + '_50_quant' for j in base_colname] + [j + '_60_quant' for j in base_colname] + \
                            [j + '_80_quant' for j in base_colname] + [j + '_90_quant' for j in base_colname] + \
                        [j + i for j in base_colname for i in ['_5000_self_corr', '_10000_self_corr', 
                                                               '_20000_self_corr', '_30000_self_corr']]    
    feature_df = pd.DataFrame(feature_set, columns=fea_colname)
    feature_df['segment_id'] = feature_df['segment_id'].astype('int')
    
    return feature_df


def generateFeaturesFreqDomain(dict_paths):
    feature_set = []
    for segment_id in tqdm(dict_paths, total=len(dict_paths), position=0):
        data = np.load(dict_paths[segment_id])
        segment = [segment_id]
        for sensor in range(10):
            x = data[:, sensor]
            f, t, Z = scipy.signal.stft(x, fs = fs, window = 'hann', nperseg = n)
            f = f[:round(max_f/delta_f)+1]
            
            # Time domain
            Z_half = np.abs(Z[:round(Z.shape[0]//2)+1]).T
            min_ = Z_half.min(axis=0).mean()
            max_ = Z_half.max(axis=0).mean()
            std_ = Z_half.std(axis=0).mean()
            mean_ = Z_half.mean(axis=0).mean()
            p25 = np.quantile(Z_half, 0.25, axis=0).mean()
            p50 = np.quantile(Z_half, 0.5, axis=0).mean()
            p75 = np.quantile(Z_half, 0.75, axis=0).mean()
            p90 = np.quantile(Z_half, 0.9, axis=0).mean()
            p95 = np.quantile(Z_half, 0.95, axis=0).mean()
            segment += [min_, max_, std_, mean_, p25, p50, p75, p90, p95]
            
            # Freq domain
            Z = np.abs(Z[:round(max_f/delta_f)+1]).T
            th = Z.mean() * 1 
            Z_pow = Z.copy()
            Z_pow[Z < th] = 0
            Z_num = Z_pow.copy()
            Z_num[Z >= th] = 1

            Z_pow_sum = Z_pow.sum(axis = 0)
            Z_num_sum = Z_num.sum(axis = 0)

            A_pow = Z_pow_sum[round(10/delta_f):].sum()
            A_num = Z_num_sum[round(10/delta_f):].sum()
            BH_pow = Z_pow_sum[round(5/delta_f):round(8/delta_f)].sum()
            BH_num = Z_num_sum[round(5/delta_f):round(8/delta_f)].sum()
            BL_pow = Z_pow_sum[round(1.5/delta_f):round(2.5/delta_f)].sum()
            BL_num = Z_num_sum[round(1.5/delta_f):round(2.5/delta_f)].sum()
            C_pow = Z_pow_sum[round(0.6/delta_f):round(1.2/delta_f)].sum()
            C_num = Z_num_sum[round(0.6/delta_f):round(1.2/delta_f)].sum()
            D_pow = Z_pow_sum[round(2/delta_f):round(4/delta_f)].sum()
            D_num = Z_num_sum[round(2/delta_f):round(4/delta_f)].sum()
            segment += [A_pow, A_num, BH_pow, BH_num, BL_pow, BL_num, C_pow, C_num, D_pow, D_num]

        feature_set.append(segment)

    #Cols Names
    cols = ['segment_id']
    for i in range(10):
        for j in ['min', 'max', 'std', 'mean', 'p25', 'p50', 'p75', 'p90', 'p95']:
            cols += [f's{i+1}_{j}']
        for j in ['A_pow', 'A_num','BH_pow', 'BH_num','BL_pow', 'BL_num','C_pow', 'C_num','D_pow', 'D_num']:
            cols += [f's{i+1}_{j}']
    feature_df = pd.DataFrame(feature_set, columns = cols)
    feature_df['segment_id'] = feature_df['segment_id'].astype('int')
    
    return feature_df


def buildDataset(dict_paths):
    #df_time = generateFeaturesTimeDomain(dict_paths)
    df = generateFeaturesFreqDomain(dict_paths)
    
    #df = pd.merge(df_time, df_freq, how='inner', on='segment_id')
    
    return df


# 5.2 Build Dataframes - 4mins

# df_time_X_train = generateFeaturesTimedomain(dict_segments_sequences_paths_train)
# df_freq_X_train = generateFeaturesFreqDomain(dict_segments_sequences_paths_train)


df_X_train = buildDataset(dict_segments_sequences_paths_train)

df_X_train = df_X_train.merge(df_train[['segment_id', 'time_to_eruption']], how='inner')
features = [col for col in df_X_train.columns.tolist() if col not in ['segment_id', 'time_to_eruption']]
    
#############################################################

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4431.0), HTML(value='')))




In [5]:
#############################################################
# 6. Models

def buildLGBModel(X_train, y_train, X_val, y_val, features, verbose=10, early_stopping_rounds=200):
    cat_features = {}
    params = {'objective': 'rmse',
              'metric': 'rmse',
              'max_depth':14,
              'min_data_in_leaf':5,         # = min_child_samples
              'num_leaves': 2**7 - 1,
              'learning_rate': 0.05,
              'feature_fraction': 0.7,      # = colsample_bytree
              'bagging_fraction': 0.5,      # = subsample
              'bagging_freq': 5,
              'lambda_l1':80,               # = reg_alpha
              'num_iterations': 10000,      # = n_estimators
              'seed': 42,
              'verbose': -1
             }

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

    evals_result = {}
    
    model = lgb.train(
        params,
        lgb_train,
        valid_sets = (lgb_train, lgb_eval),
        feature_name = features,
        categorical_feature = cat_features,
        verbose_eval = 100,
        evals_result = evals_result,
        early_stopping_rounds = 200
    )

#     model = lgb.LGBMRegressor(random_state = 12,
#                                 max_depth = 8,
#                                 num_leaves=28,
#                                 n_estimators = 250,
#                                 reg_lambda=1.5,
#                                 learning_rate = 0.05)
    
#     model.fit(X_train, y_train)
    return model

    
#############################################################

In [6]:
class TabnetMAE(Metric):
    def __init__(self):
        self._name = "mae"
        self._maximize = False

    def __call__(self, y_true, y_score):
        mae = mean_absolute_error(y_true, y_score[:, 1])
        return mae

In [7]:
#############################################################
# 7. Training

list_segments_train = list(unique_segments_id_train)

kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=12)
list_history, list_models = [], []

for num_fold, (train_index, val_index) in enumerate(kf.split(list_segments_train,
                                                             np.zeros(len(list_segments_train)))):
    segments_train_fold = np.asarray(list_segments_train)[train_index]
    segments_val_fold = np.asarray(list_segments_train)[val_index]

    print(f'Num Fold: {num_fold + 1}')
    print(f'Train segments: {len(train_index)} Val segments: {len(val_index)}')
    
    X_train = df_X_train[features][df_X_train['segment_id'].isin(list(segments_train_fold))]
    y_train = df_X_train['time_to_eruption'][df_X_train['segment_id'].isin(list(segments_train_fold))]
    X_val = df_X_train[features][df_X_train['segment_id'].isin(list(segments_val_fold))]
    y_val = df_X_train['time_to_eruption'][df_X_train['segment_id'].isin(list(segments_val_fold))]
    
    #model = buildLGBModel(X_train, y_train, X_val, y_val, features, verbose=10, early_stopping_rounds=200)
    
    model_tabnet = TabNetRegressor()
    model_tabnet.fit(
      X_train.values, 
      np.expand_dims(y_train.values, -1),
      eval_set=[(X_val.values, np.expand_dims(y_val.values, -1))],
      patience=50,
      max_epochs=500,
      eval_metric=[TabnetMAE],
      loss_fn=torch.nn.L1Loss()
    )
    
    #list_models.append(model)

    y_pred_val = model_tabnet.predict(X_val.values)
    mae = np.abs(y_val - y_pred_val.squeeze())

    #pickle.dump(model, open(f'./models/model_tabular_STFT_{num_fold}.pickle', 'wb'))
    torch.save(model_tabnet, f'./models/model_tabular_STFT_{num_fold}')
    
    print('***'*20)
    print(f'Prediction MAE: {mae.mean()}')
    print('***'*20)

#############################################################

Num Fold: 1
Train segments: 3544 Val segments: 887
Device used : cuda
epoch 0  | loss: 22.56065| val_0_mae: 24.54005|  0:00:01s
epoch 1  | loss: 21.41019| val_0_mae: 16.58556|  0:00:01s
epoch 2  | loss: 20.09935| val_0_mae: 19.31124|  0:00:01s
epoch 3  | loss: 18.98425| val_0_mae: 23.43033|  0:00:02s
epoch 4  | loss: 17.70642| val_0_mae: 17.69521|  0:00:02s
epoch 5  | loss: 15.83465| val_0_mae: 17.10458|  0:00:02s
epoch 6  | loss: 14.20443| val_0_mae: 17.94293|  0:00:02s
epoch 7  | loss: 12.60815| val_0_mae: 17.958  |  0:00:02s
epoch 8  | loss: 11.6459 | val_0_mae: 15.88384|  0:00:03s
epoch 9  | loss: 11.17378| val_0_mae: 14.49778|  0:00:03s
epoch 10 | loss: 10.72015| val_0_mae: 16.93676|  0:00:03s
epoch 11 | loss: 10.00044| val_0_mae: 21.2035 |  0:00:03s
epoch 12 | loss: 9.76214 | val_0_mae: 21.09859|  0:00:03s
epoch 13 | loss: 9.56314 | val_0_mae: 27.42346|  0:00:04s
epoch 14 | loss: 9.50574 | val_0_mae: 23.05602|  0:00:04s
epoch 15 | loss: 9.32078 | val_0_mae: 19.14741|  0:00:04s
ep

epoch 141| loss: 3.14675 | val_0_mae: 3.11168 |  0:00:30s
epoch 142| loss: 3.01358 | val_0_mae: 3.34154 |  0:00:30s
epoch 143| loss: 2.99612 | val_0_mae: 3.11405 |  0:00:30s
epoch 144| loss: 3.06688 | val_0_mae: 3.18058 |  0:00:30s
epoch 145| loss: 2.98266 | val_0_mae: 3.13389 |  0:00:30s
epoch 146| loss: 3.13368 | val_0_mae: 3.18809 |  0:00:31s
epoch 147| loss: 3.08331 | val_0_mae: 3.1195  |  0:00:31s
epoch 148| loss: 3.05242 | val_0_mae: 3.06779 |  0:00:31s
epoch 149| loss: 3.05825 | val_0_mae: 3.1345  |  0:00:31s
epoch 150| loss: 3.18234 | val_0_mae: 3.22607 |  0:00:31s
epoch 151| loss: 3.1031  | val_0_mae: 3.02819 |  0:00:32s
epoch 152| loss: 3.05459 | val_0_mae: 3.17012 |  0:00:32s
epoch 153| loss: 2.99837 | val_0_mae: 3.21181 |  0:00:32s
epoch 154| loss: 3.13558 | val_0_mae: 2.98464 |  0:00:32s
epoch 155| loss: 3.05971 | val_0_mae: 3.03645 |  0:00:32s
epoch 156| loss: 2.98554 | val_0_mae: 3.08833 |  0:00:33s
epoch 157| loss: 2.96808 | val_0_mae: 3.04589 |  0:00:33s
epoch 158| los

epoch 283| loss: 2.32002 | val_0_mae: 2.47993 |  0:00:59s
epoch 284| loss: 2.2377  | val_0_mae: 2.27407 |  0:00:59s
epoch 285| loss: 2.21499 | val_0_mae: 2.52463 |  0:00:59s
epoch 286| loss: 2.18671 | val_0_mae: 2.45175 |  0:00:59s
epoch 287| loss: 2.23122 | val_0_mae: 2.42717 |  0:00:59s
epoch 288| loss: 2.3124  | val_0_mae: 2.49553 |  0:01:00s
epoch 289| loss: 2.29697 | val_0_mae: 2.38011 |  0:01:00s
epoch 290| loss: 2.34708 | val_0_mae: 2.48589 |  0:01:00s
epoch 291| loss: 2.28587 | val_0_mae: 2.53793 |  0:01:00s
epoch 292| loss: 2.19697 | val_0_mae: 2.46179 |  0:01:00s
epoch 293| loss: 2.19084 | val_0_mae: 2.38852 |  0:01:01s
epoch 294| loss: 2.24508 | val_0_mae: 2.38202 |  0:01:01s
epoch 295| loss: 2.15357 | val_0_mae: 2.43556 |  0:01:01s
epoch 296| loss: 2.15945 | val_0_mae: 2.52988 |  0:01:01s
epoch 297| loss: 2.27082 | val_0_mae: 2.37289 |  0:01:01s
epoch 298| loss: 2.12971 | val_0_mae: 2.37265 |  0:01:02s
epoch 299| loss: 2.18263 | val_0_mae: 2.37608 |  0:01:02s
epoch 300| los

epoch 0  | loss: 22.77497| val_0_mae: 20.98271|  0:00:00s
epoch 1  | loss: 21.76208| val_0_mae: 19.42258|  0:00:00s
epoch 2  | loss: 20.6102 | val_0_mae: 17.81593|  0:00:00s
epoch 3  | loss: 19.44037| val_0_mae: 16.37098|  0:00:00s
epoch 4  | loss: 18.11905| val_0_mae: 17.93274|  0:00:01s
epoch 5  | loss: 16.67839| val_0_mae: 14.8733 |  0:00:01s
epoch 6  | loss: 14.75837| val_0_mae: 20.23816|  0:00:01s
epoch 7  | loss: 12.96708| val_0_mae: 33.80388|  0:00:01s
epoch 8  | loss: 11.59465| val_0_mae: 23.79183|  0:00:01s
epoch 9  | loss: 10.79066| val_0_mae: 39.15563|  0:00:02s
epoch 10 | loss: 10.74422| val_0_mae: 20.06341|  0:00:02s
epoch 11 | loss: 10.49788| val_0_mae: 17.47858|  0:00:02s
epoch 12 | loss: 10.00242| val_0_mae: 15.60929|  0:00:02s
epoch 13 | loss: 9.87989 | val_0_mae: 17.17432|  0:00:02s
epoch 14 | loss: 9.3721  | val_0_mae: 19.36864|  0:00:03s
epoch 15 | loss: 9.13405 | val_0_mae: 19.83064|  0:00:03s
epoch 16 | loss: 8.72662 | val_0_mae: 22.67545|  0:00:03s
epoch 17 | los

epoch 142| loss: 2.91484 | val_0_mae: 3.19997 |  0:00:29s
epoch 143| loss: 2.85075 | val_0_mae: 3.19608 |  0:00:29s
epoch 144| loss: 2.92827 | val_0_mae: 3.11277 |  0:00:29s
epoch 145| loss: 2.99761 | val_0_mae: 3.04572 |  0:00:30s
epoch 146| loss: 2.86994 | val_0_mae: 3.0807  |  0:00:30s
epoch 147| loss: 2.83859 | val_0_mae: 3.27905 |  0:00:30s
epoch 148| loss: 2.85878 | val_0_mae: 3.16903 |  0:00:30s
epoch 149| loss: 2.81806 | val_0_mae: 3.03324 |  0:00:30s
epoch 150| loss: 2.78023 | val_0_mae: 3.03059 |  0:00:31s
epoch 151| loss: 2.79092 | val_0_mae: 3.01903 |  0:00:31s
epoch 152| loss: 2.82515 | val_0_mae: 2.98498 |  0:00:31s
epoch 153| loss: 2.77635 | val_0_mae: 3.09189 |  0:00:31s
epoch 154| loss: 2.82973 | val_0_mae: 3.07922 |  0:00:31s
epoch 155| loss: 2.78578 | val_0_mae: 3.12648 |  0:00:32s
epoch 156| loss: 2.83798 | val_0_mae: 3.03888 |  0:00:32s
epoch 157| loss: 2.80688 | val_0_mae: 3.01796 |  0:00:32s
epoch 158| loss: 2.7275  | val_0_mae: 2.97352 |  0:00:32s
epoch 159| los

epoch 425| loss: 1.72246 | val_0_mae: 2.01231 |  0:01:27s
epoch 426| loss: 1.82657 | val_0_mae: 2.16003 |  0:01:27s
epoch 427| loss: 1.80361 | val_0_mae: 2.11396 |  0:01:27s
epoch 428| loss: 1.75058 | val_0_mae: 2.09966 |  0:01:28s
epoch 429| loss: 1.70495 | val_0_mae: 2.04808 |  0:01:28s
epoch 430| loss: 1.7619  | val_0_mae: 1.93585 |  0:01:28s
epoch 431| loss: 1.6853  | val_0_mae: 1.86025 |  0:01:28s
epoch 432| loss: 1.72008 | val_0_mae: 2.02025 |  0:01:29s
epoch 433| loss: 1.68399 | val_0_mae: 1.98021 |  0:01:29s
epoch 434| loss: 1.71572 | val_0_mae: 1.89275 |  0:01:29s
epoch 435| loss: 1.67268 | val_0_mae: 1.81226 |  0:01:29s
epoch 436| loss: 1.68629 | val_0_mae: 1.88545 |  0:01:29s
epoch 437| loss: 1.61695 | val_0_mae: 1.90285 |  0:01:30s
epoch 438| loss: 1.69476 | val_0_mae: 1.97009 |  0:01:30s
epoch 439| loss: 1.74199 | val_0_mae: 1.97797 |  0:01:30s
epoch 440| loss: 1.66568 | val_0_mae: 1.97393 |  0:01:30s
epoch 441| loss: 1.6976  | val_0_mae: 1.96474 |  0:01:30s
epoch 442| los

epoch 74 | loss: 3.90167 | val_0_mae: 4.83929 |  0:00:15s
epoch 75 | loss: 3.87164 | val_0_mae: 4.70772 |  0:00:15s
epoch 76 | loss: 3.89715 | val_0_mae: 4.59451 |  0:00:15s
epoch 77 | loss: 3.80078 | val_0_mae: 4.41228 |  0:00:15s
epoch 78 | loss: 3.79327 | val_0_mae: 4.89145 |  0:00:16s
epoch 79 | loss: 3.81153 | val_0_mae: 4.32774 |  0:00:16s
epoch 80 | loss: 3.88968 | val_0_mae: 4.21167 |  0:00:16s
epoch 81 | loss: 3.77981 | val_0_mae: 4.39359 |  0:00:16s
epoch 82 | loss: 3.87955 | val_0_mae: 4.71159 |  0:00:16s
epoch 83 | loss: 3.79659 | val_0_mae: 4.41697 |  0:00:17s
epoch 84 | loss: 3.69343 | val_0_mae: 4.35362 |  0:00:17s
epoch 85 | loss: 3.67261 | val_0_mae: 4.40769 |  0:00:17s
epoch 86 | loss: 3.90541 | val_0_mae: 3.89535 |  0:00:17s
epoch 87 | loss: 3.63427 | val_0_mae: 3.69098 |  0:00:17s
epoch 88 | loss: 3.65268 | val_0_mae: 4.28106 |  0:00:18s
epoch 89 | loss: 3.66897 | val_0_mae: 3.9792  |  0:00:18s
epoch 90 | loss: 3.68345 | val_0_mae: 3.6851  |  0:00:18s
epoch 91 | los

epoch 216| loss: 2.50238 | val_0_mae: 2.51662 |  0:00:44s
epoch 217| loss: 2.32082 | val_0_mae: 2.57196 |  0:00:44s
epoch 218| loss: 2.40058 | val_0_mae: 2.54107 |  0:00:44s
epoch 219| loss: 2.36319 | val_0_mae: 2.55166 |  0:00:44s
epoch 220| loss: 2.38322 | val_0_mae: 2.51402 |  0:00:45s
epoch 221| loss: 2.45438 | val_0_mae: 2.61987 |  0:00:45s
epoch 222| loss: 2.36559 | val_0_mae: 2.55152 |  0:00:45s
epoch 223| loss: 2.33813 | val_0_mae: 2.54487 |  0:00:45s
epoch 224| loss: 2.30297 | val_0_mae: 2.46283 |  0:00:45s
epoch 225| loss: 2.31989 | val_0_mae: 2.58274 |  0:00:46s
epoch 226| loss: 2.38366 | val_0_mae: 2.44873 |  0:00:46s
epoch 227| loss: 2.22987 | val_0_mae: 2.59604 |  0:00:46s
epoch 228| loss: 2.36564 | val_0_mae: 2.51976 |  0:00:46s
epoch 229| loss: 2.36582 | val_0_mae: 2.39713 |  0:00:46s
epoch 230| loss: 2.32853 | val_0_mae: 2.48962 |  0:00:47s
epoch 231| loss: 2.21035 | val_0_mae: 2.4633  |  0:00:47s
epoch 232| loss: 2.32631 | val_0_mae: 2.3997  |  0:00:47s
epoch 233| los

epoch 358| loss: 1.98048 | val_0_mae: 2.20964 |  0:01:13s
epoch 359| loss: 1.94997 | val_0_mae: 2.26698 |  0:01:13s
epoch 360| loss: 1.97406 | val_0_mae: 2.24784 |  0:01:13s
epoch 361| loss: 2.01014 | val_0_mae: 2.26407 |  0:01:13s
epoch 362| loss: 2.1033  | val_0_mae: 2.19519 |  0:01:13s
epoch 363| loss: 2.12277 | val_0_mae: 2.42315 |  0:01:14s
epoch 364| loss: 2.18934 | val_0_mae: 2.45922 |  0:01:14s
epoch 365| loss: 2.06811 | val_0_mae: 2.42718 |  0:01:14s
epoch 366| loss: 2.0609  | val_0_mae: 2.39239 |  0:01:14s
epoch 367| loss: 1.92086 | val_0_mae: 2.34748 |  0:01:14s
epoch 368| loss: 1.98775 | val_0_mae: 2.26451 |  0:01:15s
epoch 369| loss: 2.01433 | val_0_mae: 2.22848 |  0:01:15s
epoch 370| loss: 2.0305  | val_0_mae: 2.33813 |  0:01:15s
epoch 371| loss: 2.07442 | val_0_mae: 2.21809 |  0:01:15s

Early stopping occured at epoch 371 with best_epoch = 321 and best_val_0_mae = 2.17045
Best weights from best epoch are automatically used!
***********************************************

epoch 262| loss: 3.44542 | val_0_mae: 4.08527 |  0:00:53s
epoch 263| loss: 3.51116 | val_0_mae: 4.08286 |  0:00:53s
epoch 264| loss: 3.34049 | val_0_mae: 3.8352  |  0:00:54s
epoch 265| loss: 3.50042 | val_0_mae: 3.90573 |  0:00:54s
epoch 266| loss: 3.60183 | val_0_mae: 3.92441 |  0:00:54s
epoch 267| loss: 3.48295 | val_0_mae: 3.93441 |  0:00:54s
epoch 268| loss: 3.4562  | val_0_mae: 3.84515 |  0:00:54s
epoch 269| loss: 3.48119 | val_0_mae: 3.8124  |  0:00:55s
epoch 270| loss: 3.37337 | val_0_mae: 3.88227 |  0:00:55s
epoch 271| loss: 3.41693 | val_0_mae: 3.84305 |  0:00:55s
epoch 272| loss: 3.39385 | val_0_mae: 3.82717 |  0:00:55s
epoch 273| loss: 3.51624 | val_0_mae: 3.90832 |  0:00:55s
epoch 274| loss: 3.34951 | val_0_mae: 3.86662 |  0:00:56s
epoch 275| loss: 3.3177  | val_0_mae: 3.85511 |  0:00:56s
epoch 276| loss: 3.35393 | val_0_mae: 4.00935 |  0:00:56s
epoch 277| loss: 3.41844 | val_0_mae: 3.99497 |  0:00:56s
epoch 278| loss: 3.3371  | val_0_mae: 3.8556  |  0:00:56s
epoch 279| los

epoch 18 | loss: 9.70833 | val_0_mae: 16.113  |  0:00:03s
epoch 19 | loss: 9.69189 | val_0_mae: 12.77069|  0:00:04s
epoch 20 | loss: 9.40808 | val_0_mae: 12.92609|  0:00:04s
epoch 21 | loss: 9.26006 | val_0_mae: 12.93619|  0:00:04s
epoch 22 | loss: 8.91305 | val_0_mae: 14.05018|  0:00:04s
epoch 23 | loss: 8.65776 | val_0_mae: 14.94566|  0:00:04s
epoch 24 | loss: 8.45294 | val_0_mae: 15.07042|  0:00:05s
epoch 25 | loss: 8.4317  | val_0_mae: 11.99316|  0:00:05s
epoch 26 | loss: 8.28163 | val_0_mae: 11.53717|  0:00:05s
epoch 27 | loss: 8.06708 | val_0_mae: 11.16522|  0:00:05s
epoch 28 | loss: 8.17058 | val_0_mae: 11.46524|  0:00:05s
epoch 29 | loss: 7.87876 | val_0_mae: 11.77731|  0:00:06s
epoch 30 | loss: 7.81562 | val_0_mae: 10.79226|  0:00:06s
epoch 31 | loss: 7.63065 | val_0_mae: 11.24876|  0:00:06s
epoch 32 | loss: 7.6971  | val_0_mae: 11.08721|  0:00:06s
epoch 33 | loss: 7.55773 | val_0_mae: 10.73396|  0:00:06s
epoch 34 | loss: 7.49315 | val_0_mae: 10.04249|  0:00:07s
epoch 35 | los

epoch 301| loss: 2.38602 | val_0_mae: 2.71245 |  0:01:02s
epoch 302| loss: 2.48446 | val_0_mae: 2.6503  |  0:01:02s
epoch 303| loss: 2.46352 | val_0_mae: 2.73737 |  0:01:02s
epoch 304| loss: 2.45057 | val_0_mae: 2.80323 |  0:01:02s
epoch 305| loss: 2.4901  | val_0_mae: 2.7241  |  0:01:03s
epoch 306| loss: 2.36038 | val_0_mae: 2.67748 |  0:01:03s
epoch 307| loss: 2.36804 | val_0_mae: 2.65879 |  0:01:03s
epoch 308| loss: 2.42602 | val_0_mae: 2.50107 |  0:01:03s
epoch 309| loss: 2.46105 | val_0_mae: 2.50373 |  0:01:03s
epoch 310| loss: 2.49714 | val_0_mae: 2.61921 |  0:01:04s
epoch 311| loss: 2.38887 | val_0_mae: 2.6743  |  0:01:04s
epoch 312| loss: 2.39986 | val_0_mae: 2.46542 |  0:01:04s
epoch 313| loss: 2.29762 | val_0_mae: 2.47637 |  0:01:04s
epoch 314| loss: 2.42905 | val_0_mae: 2.52347 |  0:01:04s
epoch 315| loss: 2.36564 | val_0_mae: 2.49261 |  0:01:05s
epoch 316| loss: 2.52432 | val_0_mae: 2.59441 |  0:01:05s
epoch 317| loss: 2.54252 | val_0_mae: 2.59303 |  0:01:05s
epoch 318| los

In [8]:
#############################################################
# 8. Cross Val Score

list_segments_train = list(unique_segments_id_train)
batch_size = 8

kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=12)
df_val_all = pd.DataFrame()

for num_fold, (train_index, val_index) in tqdm(enumerate(kf.split(list_segments_train,
                                                             np.zeros(len(list_segments_train)))), 
                                               total=5, position=0):
    
    
    segments_train_fold = np.asarray(list_segments_train)[train_index]
    segments_val_fold = np.asarray(list_segments_train)[val_index]

    model = torch.load(f'./models/model_tabular_STFT_{num_fold}')

    y_pred_val = model.predict(df_X_train[features][df_X_train['segment_id'].isin(list(segments_val_fold))].values).squeeze()
    y_true_val = df_X_train['time_to_eruption'][df_X_train['segment_id'].isin(list(segments_val_fold))]

    df_tmp = pd.DataFrame({
            'pred' :  np.abs(y_pred_val)*(10**6),
            'y_true' : y_true_val*(10**6)
    })

    df_val_all = pd.concat([df_val_all, df_tmp], axis=0)

print('***'*20)
print(np.mean(np.abs(df_tmp['y_true'] - df_tmp['pred'])))
print('***'*20)


#############################################################

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5.0), HTML(value='')))


************************************************************
2233462.564347889
************************************************************


In [9]:
#############################################################
# 9. Inference

df_X_test = buildDataset(dict_segments_sequences_paths_test)
# del X_val_sequences, y_val_target, list_cv_pred, y_pred_cv, y_cv_target, df_cv, X_cv_sequences, y_cv_target
gc.collect()

list_models = [torch.load(f'./models/model_tabular_STFT_{num_fold}') for num_fold in range(5)]
y_test_pred = np.mean([model.predict(df_X_test[features].values).squeeze()], axis=0)
list_test_segments = df_X_test['segment_id']

df_submission = pd.DataFrame({
    'segment_id' : list_test_segments,
    'time_to_eruption' : np.abs(y_test_pred*(10**6))#np.clip(y_test_pred*(10**6), 6_000, np.inf)
})

df_submission.to_csv('./FinalSubmissions/' + 'submission_tabular_stft.csv', index=False)
df_submission.describe()

#############################################################

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4520.0), HTML(value='')))




Unnamed: 0,segment_id,time_to_eruption
count,4520.0,4520.0
mean,1066993000.0,25103340.0
std,616290400.0,12930760.0
min,860288.0,37234.31
25%,545899500.0,15330570.0
50%,1060695000.0,24668410.0
75%,1599284000.0,36704530.0
max,2147116000.0,48354460.0
