In [1]:
#############################################################
# 1. Libraries

import pandas as pd
import numpy as np 
import os
import glob
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold, train_test_split
import tensorflow as tf
tf.keras.backend.clear_session()

physical_devices = tf.config.list_physical_devices('GPU')

# Ask TensorFlow to grow GPU memory on demand for the first listed GPU.
# This is best-effort: if it cannot be applied the notebook keeps running with
# TensorFlow's default allocation policy, but the reason is now reported and
# unrelated errors (e.g. KeyboardInterrupt) are no longer swallowed.
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
except (IndexError, ValueError, RuntimeError) as err:
    # IndexError: no GPU was listed. ValueError / RuntimeError: TensorFlow
    # rejected the device or the runtime was already initialised.
    print(f'Invalid device or cannot modify virtual devices once initialized. ({type(err).__name__}: {err})')

from tensorflow.keras import models, layers, regularizers, metrics, losses, optimizers
from tensorflow.keras.utils import Sequence

import matplotlib.pyplot as plt
import gc

import xgboost as xgb
import pickle
import scipy.signal
import torch
import albumentations

#############################################################

In [2]:
#############################################################
# 2. Paths & Global Variables

## 2.1 Paths

# Root data folder, given relative to the notebook's working directory.
path = '../../../01_Data/'
# Sub-folders holding the pre-generated model inputs (one per representation).
path_sequences = path + '01_GeneratedSequences/'
path_spectograms_tiny = path + '02_GeneratedSpectograms_MelFeatures/'
path_spectograms_big = path + '03_GeneratedSpectograms_Scipy/'
path_spectograms_stft = path + '04_GeneratedSpectogramsSTFT/'

# One model path per fold (folds 0..4) for each model family.
path_models_mha = [f'../models/model_mha_{num_fold}' for num_fold in range(5)]
path_models_spectogram_tiny = [f'../models/model_cnn2d_Tiny_{num_fold}' for num_fold in range(5)]
path_models_spectogram_big = [f'../models/model_cnn2d_Big_{num_fold}' for num_fold in range(5)]
path_models_spectogram_stft = [f'../models/model_cnn2d_STFT_{num_fold}' for num_fold in range(5)]
path_models_lgbm_stft = [f'../models/model_lgbm_STFT_{num_fold}' for num_fold in range(5)]


# Label table (train) and submission template (test); both carry a `segment_id` column.
df_train = pd.read_csv(path + 'train.csv')
df_sample_submission = pd.read_csv(path + 'sample_submission.csv') 

# Every entry found directly under the raw train/ and test/ folders.
train_paths = glob.glob(path + 'train/*')
test_paths = glob.glob(path + 'test/*')

# Distinct segment ids per split. These are sets, so they have no positional order.
unique_segments_id_train = set(df_train['segment_id'])
unique_segments_id_test = set(df_sample_submission['segment_id'])

# segment id -> running integer, and the inverse mapping. Both comprehensions
# enumerate the same, unmodified set, so they are inverses of each other.
# NOTE(review): the numbering follows set iteration order — do not assume it is
# sorted or stable across environments; confirm before persisting these indices.
dict_unique_segments_id = { v : k for k, v in enumerate(unique_segments_id_train)}
dict_unique_segments_id_inv = { k : v for k, v in enumerate(unique_segments_id_train)}

## 2.2 Global Variables

# Number of trailing rows kept from each segment (used as `[-SEQ_LENGTH:]`).
SEQ_LENGTH = 60_001

# Spectrogram image sizes; the augmentation cell reads index 0 as height and
# index 1 as width.
IMG_SIZE_TINY = (40, 118)
IMG_SIZE_BIG = (128, 235)
IMG_SIZE_STFT = (128, 469)

#############################################################

In [3]:
#############################################################
# 3. Global Functions

def buildSequences(df, dict_segment_paths, training=True, mask_value=-1.0):
    """Load every segment CSV into one dense array of fixed-length sequences.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with `segment_id` and `time_to_eruption` columns. Only read when
        `training` is True.
    dict_segment_paths : dict
        Maps segment id -> path of the CSV file holding that segment.
    training : bool
        When True the label of every segment is collected and returned too.
    mask_value : float
        Value written in place of missing (NaN) readings.

    Returns
    -------
    x : numpy.ndarray of shape (n_segments, SEQ_LENGTH, 10)
        The last SEQ_LENGTH rows of each CSV, in `dict_segment_paths` order.
    y : numpy.ndarray of shape (n_segments,)
        Only when `training` is True; returned as the tuple `(x, y)`.

    Raises
    ------
    ValueError
        If a CSV holds fewer than SEQ_LENGTH rows (previously this surfaced as
        an opaque broadcast error, or was silently broadcast for 1-row files).
    IndexError
        If `training` is True and a segment has no row in `df`.
    """
    num_segments = len(dict_segment_paths)
    x = np.zeros((num_segments, SEQ_LENGTH, 10))

    if training:
        y = np.zeros(num_segments)
        # Build the id -> label lookup once instead of scanning the whole
        # column for every segment. `setdefault` keeps the FIRST label of a
        # duplicated id, matching the former `.values[0]` behaviour.
        first_label = {}
        for segment_id, label in zip(df['segment_id'].values, df['time_to_eruption'].values):
            first_label.setdefault(segment_id, label)

    for i, segment in enumerate(tqdm(dict_segment_paths, total=num_segments, position=0)):
        df_tmp = pd.read_csv(dict_segment_paths[segment]).fillna(mask_value)
        readings = df_tmp.values[-SEQ_LENGTH:]
        if readings.shape[0] != SEQ_LENGTH:
            raise ValueError(
                f'Segment {segment} has {readings.shape[0]} rows, expected at least {SEQ_LENGTH}.'
            )
        x[i] = readings
        if training:
            if segment not in first_label:
                # Same exception type the former `.values[0]` lookup raised.
                raise IndexError(f'No time_to_eruption label found for segment {segment}.')
            y[i] = first_label[segment]

    if training:
        return x, y
    return x


def scale(x, mean_, std_):
    """Standardise `x`: subtract `mean_`, then divide the result by `std_`."""
    centered = x - mean_
    return centered / std_


def unscale(x, mean_, std_):
    """Undo `scale`: multiply `x` by `std_`, then add `mean_` back."""
    stretched = x * std_
    return stretched + mean_


def monoToColor(X, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6):
    """Standardise an array and rescale it to 8-bit intensities in [0, 255].

    Despite the name, no channel stacking happens here: the result has the same
    shape as `X` (the `[X, X, X]` stacking step is disabled).

    Parameters
    ----------
    X : numpy.ndarray
        Input values.
    mean, std : float, optional
        Statistics used for standardisation. When None they are computed from
        `X`. An explicit 0 / 0.0 is honoured (it used to be treated as "not
        given" because of an `or` fallback).
    norm_max, norm_min : float, optional
        Clipping bounds applied to the standardised values before rescaling.
        When None the observed max / min are used. An explicit 0 is honoured.
    eps : float
        Added to `std` to avoid division by zero; also the threshold below
        which the standardised range is considered flat.

    Returns
    -------
    numpy.ndarray of dtype uint8, same shape as `X`. All zeros when the
    standardised values span no more than `eps`.
    """
    # `is None` rather than `or`: 0.0 is a legitimate mean / bound.
    if mean is None:
        mean = X.mean()
    if std is None:
        std = X.std()
    Xstd = (X - mean) / (std + eps)

    _min, _max = Xstd.min(), Xstd.max()
    if norm_max is None:
        norm_max = _max
    if norm_min is None:
        norm_min = _min

    if (_max - _min) > eps:
        # Clip to [norm_min, norm_max], then map that interval onto [0, 255].
        V = Xstd
        V[V < norm_min] = norm_min
        V[V > norm_max] = norm_max
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Flat input: nothing to rescale, return a black image.
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V


# 5.2 Augmentations

# def noiseInjection(batch_sequences, noise_factor=0.075):
#     noise = np.random.randn(batch_sequences.shape[0], batch_sequences.shape[1], batch_sequences.shape[2])
#     augmented_data = batch_sequences + noise_factor * noise
#     return augmented_data


# def timeShifting(batch_sequences, shift_max):
#     shift = np.random.randint(shift_max)
#     for sensor in range(10):
#         batch_sequences[:, :, sensor] = np.roll(batch_sequences[:, :, sensor], shift)
#     return batch_sequences

# def makeAugmentations(list_segments, dict_path_data, seq_length):
    
#     batch_sequences = np.asarray([np.load(dict_path_data[segment]) for segment in list_segments], dtype=np.float32)
    
#     list_augmentations = [0, 1, 2, 3]
#     current_augmentations = list(np.random.choice(list_augmentations, size=np.random.randint(1, 4) ,replace=False))
      
#     # Add random noise
#     if 0 in current_augmentations:
#         batch_sequences = noiseInjection(batch_sequences, noise_factor=0.05)   

#     # Time shifting
#     if 1 in current_augmentations:
#         batch_sequences = timeShifting(batch_sequences, shift_max=6_000) 

#     # Random batch sequence sensors slices to null
#     if 2 in current_augmentations:
#         num_random_sensors = np.random.choice([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
#         random_sensors = list(set(np.random.randint(0, 9, size=num_random_sensors)))
#         random_ini_position = np.random.randint(0, seq_length, size=num_random_sensors)
#         random_length = np.random.randint(random_ini_position, random_ini_position+6_000, size=num_random_sensors)
#         random_length -= random_ini_position

#         if num_random_sensors!=0:
#             for i, sensor in enumerate(random_sensors):
#                 batch_sequences[:, random_ini_position[i]:random_ini_position[i]+random_length[i], sensor] = 0.0

#     # Shut-down sensor
#     if 3 in current_augmentations:
#         sensor = np.random.randint(0, 9)
#         batch_sequences[:, :, sensor] = 0.0
        
#     return batch_sequences.astype(np.float32)


#############################################################

In [4]:
#############################################################
# 4. Preprocess

# Mha
# segment id -> path of that segment's saved sequence array (.npy), per split.
dict_segments_sequences_paths_train = {
    segment : path_sequences + 'train/' + str(segment) + '.npy' for segment in unique_segments_id_train
}

dict_segments_sequences_paths_test = {
    segment : path_sequences + 'test/' + str(segment) + '.npy' for segment in unique_segments_id_test
}

# Spectogram - Tiny
# segment id -> folder of that segment's spectrogram files (note the trailing '/').
dict_segments_spectograms_tiny_paths_train = {
    segment : path_spectograms_tiny + 'train/' + str(segment) + '/' for segment in unique_segments_id_train
}

dict_segments_spectograms_tiny_paths_test = {
    segment : path_spectograms_tiny + 'test/' + str(segment) + '/' for segment in unique_segments_id_test
}

# Spectogram - Big

dict_segments_spectograms_big_paths_train = {
    segment : path_spectograms_big + 'train/' + str(segment) + '/' for segment in unique_segments_id_train
}

dict_segments_spectograms_big_paths_test = {
    segment : path_spectograms_big + 'test/' + str(segment) + '/' for segment in unique_segments_id_test
}

# Spectogram - STFT

dict_segments_spectograms_stft_paths_train = {
    segment : path_spectograms_stft + 'train/' + str(segment) + '/' for segment in unique_segments_id_train
}

dict_segments_spectograms_stft_paths_test = {
    segment : path_spectograms_stft + 'test/' + str(segment) + '/' for segment in unique_segments_id_test
}

######

# segment id -> position of that id in the key order of the train sequence-path dict.
dict_positions_segments = {k : i for i, k in enumerate(dict_segments_sequences_paths_train.keys())}

# Rescale the target by a factor of 10**6.
# NOTE(review): this overwrites the column in place, so re-running this cell
# without reloading df_train divides the labels again — confirm the cell is
# only ever executed once per kernel session.
df_train['time_to_eruption'] = df_train['time_to_eruption']/(10**6)

# segment id -> 1-D array with the (rescaled) label(s) of every df_train row
# matching that id. Each lookup scans the full column with a boolean mask.
dict_labels = {
    segment : df_train['time_to_eruption'][df_train['segment_id']==segment].values.flatten()
    for segment in unique_segments_id_train
}

###

# dict_nans_train = getdDictsSpectoGramsNulls(dict_segment_paths_train)
# dict_nans_test = getdDictsSpectoGramsNulls(dict_segment_paths_test)

# np.save(path + 'dict_nans_train.npy', dict_nans_train)
# np.save(path + 'dict_nans_test.npy', dict_nans_test)

# dict_nans_train = np.load(path + 'dict_nans_train.npy', allow_pickle=True).flatten()[0]
# dict_nans_test = np.load(path + 'dict_nans_test.npy', allow_pickle=True).flatten()[0]


#############################################################

In [5]:
#############################################################
# 5. Augmentation Pipelines

def getTinyTransforms():
    """Build the augmentation pipeline used for the small spectrograms.

    The pipeline holds a single `OneOf` group (p=0.3) that picks between
    Gaussian noise and a 4-hole cutout (holes up to 6x6, filled with 0).
    """
    gauss_noise = albumentations.GaussNoise(p=0.2)
    cutout = albumentations.Cutout(num_holes=4, max_h_size=6, max_w_size=6, fill_value=0, p=0.2)
    noise_or_cutout = albumentations.OneOf([gauss_noise, cutout], p=0.3)
    return albumentations.Compose([noise_or_cutout])

def _getLargeSpectrogramTransforms(img_size):
    """Build the augmentation pipeline shared by the big and STFT spectrograms.

    Parameters
    ----------
    img_size : tuple
        `(height, width)` of the input images; used for the crop and pad sizes.

    Returns
    -------
    albumentations.Compose applying, in order: noise-or-cutout (p=0.3),
    optical distortion (p=0.3), a small shift/scale/rotate (p=0.5), a random
    crop 10 pixels smaller in each dimension (p=0.5), and finally a pad back
    up to `img_size` so the output size is constant whether or not the crop
    fired.
    """
    height, width = img_size[0], img_size[1]
    return albumentations.Compose([
            albumentations.OneOf([
                albumentations.GaussNoise(p=0.2),
                albumentations.Cutout(num_holes=8, max_h_size=12, max_w_size=12, fill_value=0, p=0.2),
            ], p=0.3),
        albumentations.OpticalDistortion(p=0.3),
        albumentations.ShiftScaleRotate(shift_limit=0.05, rotate_limit=1, p=0.5),
        albumentations.RandomCrop(height-10, width-10, p=0.5),
        albumentations.PadIfNeeded(min_height=height, min_width=width, value=0, p=1.0)
    ])


def getBigTransforms():
    """Augmentation pipeline for the big spectrograms (sized IMG_SIZE_BIG)."""
    return _getLargeSpectrogramTransforms(IMG_SIZE_BIG)


def getStftTransforms():
    """Augmentation pipeline for the STFT spectrograms (sized IMG_SIZE_STFT)."""
    return _getLargeSpectrogramTransforms(IMG_SIZE_STFT)

#############################################################

In [6]:
#############################################################
# 6. Generator

# 6.1 MHA Model Data Generator
class MHAVolcanoSequencesGenerator(Sequence):
    """Keras `Sequence` serving batches of standardised sensor sequences.

    Each batch item is loaded from a `.npy` file, trimmed to its last
    SEQ_LENGTH rows, optionally augmented, and then standardised per sensor
    with the hard-coded means / standard deviations stored on the instance.

    Parameters
    ----------
    segments : sequence
        Segment ids to iterate over (indexed by position).
    path_sequences : dict
        Maps segment id -> path of the `.npy` file for that segment.
    batch_size : int
        Number of segments per batch (the last batch may be smaller).
    dict_labels : dict
        Maps segment id -> label; only read when `training` is True.
    augmentations : bool
        When truthy, `augmentBatch` is applied before standardisation.
    shuffle : bool
        When True the segment order is reshuffled at the end of each epoch.
    training : bool
        When True `__getitem__` returns `(sequences, labels)`, otherwise only
        `sequences`.
    """

    # Number of sensor channels that are standardised / augmented.
    NUM_SENSORS = 10

    def __init__(self, segments, path_sequences, batch_size, dict_labels, augmentations, shuffle=False, training=True):
        super(MHAVolcanoSequencesGenerator, self).__init__()

        # Per-sensor standardisation constants, keyed by sensor index.
        # NOTE(review): presumably computed offline on the training set — confirm.
        self.dict_means = {0: 0.09421943291597953, 1: 0.9208114415834104, 2: -0.026617075839858038,
                           3: 0.09724443370400684, 4: 1.704695380910225, 5: -0.1180321202370159, 6: 0.7667902421713446,
                           7: 0.7804286101804458, 8: -0.2075797991904395, 9: 0.014516944212624944}

        self.dict_stds =  {0: 1820.6211174856987, 1: 1931.0901612736805, 2: 1738.1671740163413,
                           3: 1669.8837574619292, 4: 568.5221048211192, 5: 1848.4917466767877, 6: 1623.353060255481,
                           7: 1618.2714709240895, 8: 1590.9403316558762, 9: 1906.41447528788}

        self.segments = segments
        self.path_sequences = path_sequences
        self.batch_size = batch_size
        self.dict_labels = dict_labels
        self.augmentations = augmentations
        self.shuffle = shuffle
        self.training = training
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (also cached in `num_steps`)."""
        self.num_steps = int(np.ceil(len(self.segments) / self.batch_size))
        return self.num_steps

    def __getitem__(self, idx):
        """Build batch number `idx`.

        Returns `(sequences, labels)` when `training` is True, otherwise just
        `sequences`.
        """
        indexes = self.indexes[idx*self.batch_size:(idx+1)*self.batch_size]
        list_batch_segments = [self.segments[k] for k in indexes]

        # NOTE(review): allow_pickle=True lets np.load execute pickled objects;
        # only keep it if these files are trusted and really need it.
        array_sequences = np.asarray([np.load(self.path_sequences[segment], allow_pickle=True)[-SEQ_LENGTH:, :]
                                     for segment in list_batch_segments])

        # The standardised values are written back into this array, so it must
        # be floating point; an integer array would silently truncate them.
        if array_sequences.dtype.kind != 'f':
            array_sequences = array_sequences.astype(np.float32)

        if self.augmentations:
            array_sequences = self.augmentBatch(array_sequences)

        # Standardise all sensors at once; the constants broadcast over the
        # last axis. Matching the array dtype keeps the arithmetic precision
        # identical to the former per-sensor scalar version.
        sensors = range(self.NUM_SENSORS)
        means = np.asarray([self.dict_means[s] for s in sensors], dtype=array_sequences.dtype)
        stds = np.asarray([self.dict_stds[s] for s in sensors], dtype=array_sequences.dtype)
        array_sequences[:, :, :self.NUM_SENSORS] = scale(array_sequences[:, :, :self.NUM_SENSORS], means, stds)

        if self.training:
            array_labels = np.asarray([self.dict_labels[segment] for segment in list_batch_segments])
            return array_sequences, array_labels
        else:
            return array_sequences

    def noiseInjection(self, batch_sequences, noise_factor=0.075):
        """Return `batch_sequences` plus standard-normal noise scaled by `noise_factor`."""
        noise = np.random.randn(*batch_sequences.shape)
        return batch_sequences + noise_factor * noise

    def timeShifting(self, batch_sequences, shift_max):
        """Circularly shift every sequence along the time axis.

        One shift in `[0, shift_max)` is drawn and applied to the whole batch.
        The roll is done along axis 1 so each sample wraps onto itself;
        rolling without an axis flattens the (batch, time) slice and leaks
        the tail of one sample into the head of the next.
        """
        shift = np.random.randint(shift_max)
        return np.roll(batch_sequences, shift, axis=1)

    def augmentBatch(self, batch_sequences):
        """Apply each augmentation independently with probability 0.5.

        NOTE(review): this runs before standardisation, so the 0.005 noise
        factor is tiny compared with the ~1e3 standard deviations stored in
        `dict_stds` — confirm that this is the intended noise level.
        """
        # Add random noise
        if np.random.random() > 0.5:
            batch_sequences = self.noiseInjection(batch_sequences, noise_factor=0.005)

        # Time shifting
        if np.random.random() > 0.5:
            batch_sequences = self.timeShifting(batch_sequences, shift_max=600)

        # Shut-down sensor. randint's upper bound is exclusive, so NUM_SENSORS
        # is needed for the last sensor to be selectable.
        if np.random.random() > 0.5:
            sensor = np.random.randint(0, self.NUM_SENSORS)
            batch_sequences[:, :, sensor] = 0.0

        return batch_sequences

    def on_epoch_end(self):
        """Reset the segment order, reshuffling it when `shuffle` is True."""
        self.indexes = np.arange(len(self.segments))
        if self.shuffle:
            np.random.shuffle(self.indexes)
        
        

# 6.2 Spectrogram Model Data Generator
class SpectoGramVolcanoSequencesGenerator(Sequence):
    """Keras `Sequence` serving per-sensor spectrogram batches.

    For every segment in a batch, one `.npy` file per sensor is loaded from
    `<folder><segment>_<sensor>.npy`, cast to uint8, optionally augmented and
    divided by 255.

    Parameters
    ----------
    segments : sequence
        Segment ids to iterate over (indexed by position).
    path_spectograms : dict
        Maps segment id -> folder prefix holding that segment's files.
    batch_size : int
        Number of segments per batch (the last batch may be smaller).
    dict_labels : dict
        Maps segment id -> label; only read when `training` is True.
    transforms : callable or falsy
        When truthy, called as `transforms(image=img)['image']` on every
        single image (each sensor image is augmented independently).
    shuffle : bool
        When True the segment order is reshuffled at the end of each epoch.
    training : bool
        When True `__getitem__` returns `(batch, labels)`, otherwise
        `(batch, None)`.
    """

    # Number of per-sensor spectrogram files read for each segment.
    NUM_SENSORS = 10

    def __init__(self, segments, path_spectograms, batch_size, dict_labels, transforms, shuffle=False, training=True):
        super(SpectoGramVolcanoSequencesGenerator, self).__init__()
        self.segments = segments
        self.path_spectograms = path_spectograms
        self.batch_size = batch_size
        self.dict_labels = dict_labels
        self.shuffle = shuffle
        self.training = training
        self.transforms = transforms
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (also cached in `num_steps`)."""
        self.num_steps = int(np.ceil(len(self.segments) / self.batch_size))
        return self.num_steps

    def _loadSensorBatch(self, list_batch_segments, sensor):
        """Load, optionally augment, and rescale one sensor's images for a batch."""
        # NOTE(review): allow_pickle=True lets np.load execute pickled objects;
        # only keep it if these files are trusted and really need it.
        images = np.asarray([np.load(f'{self.path_spectograms[segment]}{segment}_{sensor}.npy',
                                     allow_pickle=True)
                             for segment in list_batch_segments]).astype(np.uint8)

        if self.transforms:
            images = np.stack([self.transforms(image=image)['image'] for image in images], axis=0)

        return images/255

    def __getitem__(self, idx):
        """Build batch number `idx` as a tuple with one array per sensor.

        Returns `(batch, labels)` when `training` is True, else `(batch, None)`.
        """
        indexes = self.indexes[idx*self.batch_size:(idx+1)*self.batch_size]
        list_batch_segments = [self.segments[k] for k in indexes]

        # Sensors are processed in index order, so the augmentation random
        # draws happen in the same order as before (sensor 0 first, ...).
        batch = tuple(self._loadSensorBatch(list_batch_segments, sensor)
                      for sensor in range(self.NUM_SENSORS))

        if self.training:
            array_labels = np.asarray([self.dict_labels[segment] for segment in list_batch_segments])
            return batch, array_labels
        else:
            return batch, None

    def on_epoch_end(self):
        """Reset the segment order, reshuffling it when `shuffle` is True."""
        self.indexes = np.arange(len(self.segments))
        if self.shuffle:
            np.random.shuffle(self.indexes)
        
    
        
        
# 6.3 STFT Model Data Generator
class STFTVolcanoSequencesGenerator(Sequence):
    """Keras `Sequence` serving per-sensor STFT spectrogram batches.

    For every segment in a batch, one `.npy` file per sensor is loaded from
    `<folder><segment>_<sensor>.npy`, transposed (its two axes are swapped),
    cast to uint8, optionally augmented and divided by 255.

    Parameters
    ----------
    segments : sequence
        Segment ids to iterate over (indexed by position).
    path_spectograms : dict
        Maps segment id -> folder prefix holding that segment's files.
    batch_size : int
        Number of segments per batch (the last batch may be smaller).
    dict_labels : dict
        Maps segment id -> label; only read when `training` is True.
    transforms : callable or falsy
        When truthy, called as `transforms(image=img)['image']` on every
        single image (each sensor image is augmented independently).
    shuffle : bool
        When True the segment order is reshuffled at the end of each epoch.
    training : bool
        When True `__getitem__` returns `(batch, labels)`, otherwise
        `(batch, None)`.
    """

    # Number of per-sensor spectrogram files read for each segment.
    NUM_SENSORS = 10

    def __init__(self, segments, path_spectograms, batch_size, dict_labels, transforms, shuffle=False, training=True):
        super(STFTVolcanoSequencesGenerator, self).__init__()

        self.segments = segments
        self.path_spectograms = path_spectograms
        self.batch_size = batch_size
        self.dict_labels = dict_labels
        self.transforms = transforms
        self.shuffle = shuffle
        self.training = training
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch (also cached in `num_steps`)."""
        self.num_steps = int(np.ceil(len(self.segments) / self.batch_size))
        return self.num_steps

    def _loadSensorBatch(self, list_batch_segments, sensor):
        """Load, transpose, optionally augment, and rescale one sensor's images."""
        # NOTE(review): allow_pickle=True lets np.load execute pickled objects;
        # only keep it if these files are trusted and really need it.
        images = np.asarray([np.load(f'{self.path_spectograms[segment]}{segment}_{sensor}.npy',
                                     allow_pickle=True).transpose(1, 0)
                             for segment in list_batch_segments]).astype(np.uint8)

        if self.transforms:
            images = np.stack([self.transforms(image=image)['image'] for image in images], axis=0)

        return images/255

    def __getitem__(self, idx):
        """Build batch number `idx` as a tuple with one array per sensor.

        Returns `(batch, labels)` when `training` is True, else `(batch, None)`.
        """
        indexes = self.indexes[idx*self.batch_size:(idx+1)*self.batch_size]
        list_batch_segments = [self.segments[k] for k in indexes]

        # Sensors are processed in index order, so the augmentation random
        # draws happen in the same order as before (sensor 0 first, ...).
        batch = tuple(self._loadSensorBatch(list_batch_segments, sensor)
                      for sensor in range(self.NUM_SENSORS))

        if self.training:
            array_labels = np.asarray([self.dict_labels[segment] for segment in list_batch_segments])
            return batch, array_labels
        else:
            return batch, None

    def on_epoch_end(self):
        """Reset the segment order, reshuffling it when `shuffle` is True."""
        self.indexes = np.arange(len(self.segments))
        if self.shuffle:
            np.random.shuffle(self.indexes)
        

# 6.4 STFT band-feature dataset builder
def buildDataset(dict_paths):
            
    fs = 100                # sampling frequency 
    n = 256                 # FFT segment size
    max_f = 20              # ～20Hz

    delta_f = fs / n        # 0.39Hz
    delta_t = n / fs / 2    # 1.28s

    feature_set = []
    for segment_id in tqdm(dict_paths, total=len(dict_paths), position=0):
        data = np.load(dict_paths[segment_id])
        segment = [segment_id]
        for sensor in range(10):
            x = data[:, sensor]
            f, t, Z = scipy.signal.stft(x, fs = fs, window = 'hann', nperseg = n)
            f = f[:round(max_f/delta_f)+1]
            Z = np.abs(Z[:round(max_f/delta_f)+1]).T

            th = Z.mean() * 1 
            Z_pow = Z.copy()
            Z_pow[Z < th] = 0
            Z_num = Z_pow.copy()
            Z_num[Z >= th] = 1

            Z_pow_sum = Z_pow.sum(axis = 0)
            Z_num_sum = Z_num.sum(axis = 0)

            A_pow = Z_pow_sum[round(10/delta_f):].sum()
            A_num = Z_num_sum[round(10/delta_f):].sum()
            BH_pow = Z_pow_sum[round(5/delta_f):round(8/delta_f)].sum()
            BH_num = Z_num_sum[round(5/delta_f):round(8/delta_f)].sum()
            BL_pow = Z_pow_sum[round(1.5/delta_f):round(2.5/delta_f)].sum()
            BL_num = Z_num_sum[round(1.5/delta_f):round(2.5/delta_f)].sum()
            C_pow = Z_pow_sum[round(0.6/delta_f):round(1.2/delta_f)].sum()
            C_num = Z_num_sum[round(0.6/delta_f):round(1.2/delta_f)].sum()
            D_pow = Z_pow_sum[round(2/delta_f):round(4/delta_f)].sum()
            D_num = Z_num_sum[round(2/delta_f):round(4/delta_f)].sum()
            segment += [A_pow, A_num, BH_pow, BH_num, BL_pow, BL_num, C_pow, C_num, D_pow, D_num]

        feature_set.append(segment)

    cols = ['segment_id']
    for i in range(10):
        for j in ['A_pow', 'A_num','BH_pow', 'BH_num','BL_pow', 'BL_num','C_pow', 'C_num','D_pow', 'D_num']:
            cols += [f's{i+1}_{j}']
    feature_df = pd.DataFrame(feature_set, columns = cols)
    feature_df['segment_id'] = feature_df['segment_id'].astype('int')
    
    return feature_df
    
#############################################################

In [7]:
#############################################################
# 8. Build Test Set
#
# For every level-1 model family (LightGBM-STFT, CNN2D-STFT, CNN2D-Big,
# CNN2D-Tiny, MHA): load its fold models, predict the test segments and write
# the fold-averaged result to ./df_test_l2_{type_model}.csv.

qt_augmentations = 10  # number of prediction passes over the test generator per model family
batch_size = 8

paths = (path_models_lgbm_stft, path_models_spectogram_stft,
         path_models_spectogram_big, path_models_spectogram_tiny, path_models_mha)

# The bar advances by 5 after each model family.
pbar = tqdm(total=(len(path_models_mha)*5), position=0)

# The loop variable is deliberately NOT called `path`: that name holds the data
# root defined in section 2 and must not be replaced by a list of model paths.
for model_paths in paths:
    # The family name is encoded in the model path:
    # '.../model_cnn2d_STFT_0' -> 'cnn2d_stft', '.../model_mha_0' -> 'mha'.
    name_tokens = model_paths[0].split('_')
    if name_tokens[-2].lower() == 'stft':
        type_model = '_'.join(name_tokens[-3:-1]).lower()
    else:
        type_model = name_tokens[-2].lower()

    if type_model in {'tiny', 'big', 'mha', 'cnn2d_stft'}:
        list_models = [models.load_model(sub_path, compile=False) for sub_path in model_paths]

    print('==='*30)
    print(type_model)
    print('==='*30)

    # `dict_test_paths` is the mapping whose iteration order defines the row
    # order of the predictions for this family.
    if type_model in {'tiny'}:
        dict_test_paths = dict_segments_spectograms_tiny_paths_test
        X_generator = SpectoGramVolcanoSequencesGenerator(list(dict_test_paths),
                                                          dict_test_paths,
                                                          batch_size=batch_size, dict_labels=dict_labels,
                                                          transforms=getTinyTransforms(),
                                                          training=False, shuffle=False)
    elif type_model in {'big'}:
        dict_test_paths = dict_segments_spectograms_big_paths_test
        X_generator = SpectoGramVolcanoSequencesGenerator(list(dict_test_paths),
                                                          dict_test_paths,
                                                          batch_size=batch_size, dict_labels=dict_labels,
                                                          transforms=getBigTransforms(),
                                                          training=False, shuffle=False)
    elif type_model in {'mha'}:
        dict_test_paths = dict_segments_sequences_paths_test
        X_generator = MHAVolcanoSequencesGenerator(list(dict_test_paths),
                                                   dict_test_paths,
                                                   batch_size=batch_size, dict_labels=dict_labels,
                                                   augmentations=True, training=False, shuffle=False)
    elif type_model in {'cnn2d_stft'}:
        dict_test_paths = dict_segments_spectograms_stft_paths_test
        X_generator = STFTVolcanoSequencesGenerator(list(dict_test_paths),
                                                    dict_test_paths,
                                                    batch_size=batch_size, dict_labels=dict_labels,
                                                    transforms=getStftTransforms(), training=False, shuffle=False)
    elif type_model in {'lgbm_stft'}:
        # No generator: features are built with buildDataset below.
        dict_test_paths = dict_segments_sequences_paths_test
    else:
        raise ValueError(f'Incorrect Type model, {type_model}')

    # Label the rows with the segments in the order they are actually predicted
    # (generators are created with shuffle=False; buildDataset iterates the
    # mapping in order). Previously the ids came from list(unique_segments_id_test),
    # which is only correct when that set happens to iterate in the same order.
    # NOTE(review): assumes the generators yield segments in the order of their
    # first argument - confirm against the generator classes.
    list_test_segments = list(dict_test_paths)
    dict_predictions = {'segment_id': list_test_segments}

    if type_model not in {'lgbm_stft'}:
        list_predictions, list_conf = [], []
        for num_pass in range(qt_augmentations):
            list_model_preds, list_model_confs = [], []
            for model in list_models:
                # Each network returns two outputs: [0] predictions, [1] confidence columns.
                preds = model.predict(X_generator)
                list_model_preds.append(preds[0])
                list_model_confs.append(preds[1])
            # Average over the fold models for this pass.
            preds = np.asarray(list_model_preds).mean(axis=0)
            confs = np.asarray(list_model_confs).mean(axis=0)
            list_predictions.append(preds)
            # Confidence feature = third minus first column of the second output.
            list_conf.append(confs[:, 2] - confs[:, 0])

        # Mean / std over the prediction passes.
        array_predictions = np.asarray(list_predictions).mean(axis=0).squeeze().astype(np.float32)
        array_confidence = np.asarray(list_conf).mean(axis=0).squeeze().astype(np.float32)
        array_predictions_std = np.asarray(list_predictions).std(axis=0).squeeze().astype(np.float32)
        array_confidence_std = np.asarray(list_conf).std(axis=0).squeeze().astype(np.float32)

        dict_predictions[type_model] = {
            'pred_mean': array_predictions, 'pred_std': array_predictions_std,
            'conf_mean' : array_confidence, 'conf_std' : array_confidence_std
        }

        dict_build_test_df = {
            'segment_id' : list_test_segments,
            f'{type_model}_pred_mean' : dict_predictions[type_model]['pred_mean'].squeeze(),
            f'{type_model}_pred_std' : dict_predictions[type_model]['pred_std'].squeeze(),
            f'{type_model}_conf_mean' : dict_predictions[type_model]['conf_mean'].squeeze(),
            f'{type_model}_conf_std' : dict_predictions[type_model]['conf_std'].squeeze()
        }

    else:
        df_X_test = buildDataset(dict_test_paths)
        features = [col for col in df_X_test.columns.tolist() if col not in ['segment_id', 'time_to_eruption']]
        X_test = df_X_test[features]

        list_models = []
        for fold_path in model_paths:
            # SECURITY: pickle can run arbitrary code while loading - only load trusted model files.
            with open(f'{fold_path}.pickle', 'rb') as model_file:
                list_models.append(pickle.load(model_file))

        array_predictions = np.mean([model.predict(X_test) for model in list_models], 0)
        # The tabular model has no confidence output; a constant 1.0 is stored instead.
        array_confidence = np.ones(len(list_test_segments), dtype=np.float32)

        dict_predictions[type_model] = {'pred' : array_predictions, 'conf' : array_confidence}
        dict_build_test_df = {
            'segment_id' : list_test_segments,
            f'{type_model}_pred' : dict_predictions[type_model]['pred'].squeeze(),
            f'{type_model}_conf' : dict_predictions[type_model]['conf'].squeeze()
        }

        del df_X_test, X_test
        gc.collect()

    df_test_tmp = pd.DataFrame(dict_build_test_df)
    df_test_tmp.to_csv(f'./df_test_l2_{type_model}.csv', index=False)

    # Free the models of this family before loading the next one.
    tf.keras.backend.clear_session()
    del list_models, array_predictions
    gc.collect()

    pbar.update(5)


pbar.close()


#############################################################

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=25.0), HTML(value='')))

lgbm_stft


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=4520.0), HTML(value='')))


cnn2d_stft
big
tiny
mha



In [11]:
# df_submission = pd.DataFrame({
#     'segment_id' : df_test_tmp['segment_id'],
#     'time_to_eruption' : df_test_tmp['mha_pred_mean']*(10**6)
# })

# df_submission.to_csv('./99_Submissions_tta/' + 'submission_tta_mha.csv', index=False)
# df_submission.describe()

Unnamed: 0,segment_id,time_to_eruption
count,4520.0,4520.0
mean,1066993000.0,23958770.0
std,616290400.0,11663020.0
min,860288.0,413535.8
25%,545899500.0,14699550.0
50%,1060695000.0,24488900.0
75%,1599284000.0,33475150.0
max,2147116000.0,47882980.0


In [16]:
#############################################################
# 9. Make inference
#
# Assemble the level-2 feature table from the per-family test predictions,
# run the five fold models of each stacked learner (XGBoost, CatBoost, TabNet)
# on it, and store features plus fold-averaged predictions in ./df_test_all.csv.

experiment_name = '1.2849_27-12-2020-17-15-23'
type_models = ['mha', 'big', 'tiny', 'cnn2d_stft', 'lgbm_stft']
list_dfs= [f'df_test_l2_{arch}.csv' for arch in type_models]


def _load_pickle(file_path):
    """Unpickle and return the object stored at `file_path`, closing the file afterwards."""
    # SECURITY: pickle can run arbitrary code while loading - only load trusted files.
    with open(file_path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# Inner-join the level-1 prediction files on segment_id.
df_test = None
for name_df_ in list_dfs:
    df_ = pd.read_csv(f'./{name_df_}', index_col=False)
    if df_test is None:
        df_test = df_.copy()
    else:
        df_test = pd.merge(df_test, df_, how='inner', on='segment_id')

#####

# Externally produced tabular predictions.
df_tabular_time = pd.read_csv('./df_time_tabular_predictions.csv')
df_tabular_stft2 = pd.read_csv('./df_stft_tabular_predictions.csv')

df_tabular_time.columns = ['segment_id', 'lgbm_time_pred_mean']
df_tabular_stft2.columns = ['segment_id', 'lgbm_stft_pred_mean']

# Divide by 1e6 here; the submission cell multiplies the final prediction by 1e6 again.
df_tabular_time['lgbm_time_pred_mean'] = df_tabular_time['lgbm_time_pred_mean']/(10**6)
df_tabular_stft2['lgbm_stft_pred_mean'] = df_tabular_stft2['lgbm_stft_pred_mean']/(10**6)

# The printed row counts show whether an inner join dropped any segment.
df_added = pd.merge(df_tabular_time, df_tabular_stft2, how='inner', on=['segment_id'])
print(df_added.shape[0], df_tabular_time.shape[0], df_tabular_stft2.shape[0])
df_all_test = pd.merge(df_test, df_added, how='inner', on=['segment_id'])
print(df_all_test.shape[0], df_added.shape[0], df_test.shape[0])

#####

# Rename the columns: 'cnn2d_*' -> 'cnn_*', 'tiny_*' / 'big_*' -> 'cnn_tiny_*' / 'cnn_big_*'.
list_test_columns = [col.replace('cnn2d', 'cnn') if col.split('_')[0]=='cnn2d' else col for col in list(df_all_test.columns)]
list_test_columns = ['cnn_' + col if col.split('_')[0] in ['tiny', 'big'] else col for col in list_test_columns]
df_all_test.columns =  list_test_columns

# Only the header of the train table is needed, so no data rows are parsed (nrows=0).
list_train_columns = [col for col in list(pd.read_csv('./df_all_train_l2.csv', index_col=False, nrows=0).columns) if col not in ['y_true']]
# .copy() so the prediction columns added below are written to an independent frame.
df_all_test = df_all_test[list_train_columns].copy()

test_columns = [col for col in df_all_test.columns if col not in ['segment_id']]
# Feature matrix captured before any prediction column is appended.
df_features_test = df_all_test[test_columns]

# XGBoost
list_models_xgb = [_load_pickle(f'./StackedModels/{experiment_name}/xgb_{fold}.pickle') for fold in range(5)]
y_test_pred = np.asarray([model_xgb.predict(df_features_test) for model_xgb in list_models_xgb])
df_all_test['xgb'] = y_test_pred.mean(axis=0)

# CatBoost
list_models_cat = [_load_pickle(f'./StackedModels/{experiment_name}/cat_{fold}.pickle') for fold in range(5)]
y_test_pred = np.asarray([model_cat.predict(df_features_test) for model_cat in list_models_cat])
df_all_test['cat'] = y_test_pred.mean(axis=0)

# TabNet (predicts from a plain array rather than a DataFrame)
list_models_tab = [torch.load(f'./StackedModels/{experiment_name}/tabnet_{fold}') for fold in range(5)]
y_test_pred = np.asarray([model_tab.predict(df_features_test.values) for model_tab in list_models_tab])
df_all_test['tab'] = y_test_pred.mean(axis=0)

df_all_test.to_csv('./df_test_all.csv', index=False)

#############################################################

4520 4520 4520
4520 4520 4520


In [8]:
#############################################################
# 10. Submission
#
# Final prediction = plain average of the three stacked learners, multiplied by 1e6.

df_all_test = pd.read_csv('./df_test_all.csv')

stacked_predictions = [df_all_test[learner] for learner in ('xgb', 'cat', 'tab')]
# Weighted alternative kept for reference: 0.2*xgb + 0.2*cat + 0.6*tab
time_to_eruption = np.mean(stacked_predictions, axis=0)*(10**6)

df_submission = pd.DataFrame({
    'segment_id' : df_all_test['segment_id'],
    'time_to_eruption' : time_to_eruption
})

sub_path = '../FinalSubmissions/' + 'submission_l2_final_2.csv'
df_submission.to_csv(sub_path, index=False)
df_submission.describe()

#############################################################

Unnamed: 0,segment_id,time_to_eruption
count,4520.0,4520.0
mean,1066993000.0,24470010.0
std,616290400.0,11389210.0
min,860288.0,657868.6
25%,545899500.0,17361050.0
50%,1060695000.0,25098990.0
75%,1599284000.0,32888640.0
max,2147116000.0,47598140.0


In [4]:
# check preds
# Sanity check: export the CNN-tiny level-1 predictions as a submission of their own.
# This cell uses its own variable names so that `df_submission` and `sub_path`
# from the submission cell keep pointing at the final submission.
# pandas is already imported in the first cell of the notebook.

df_check = pd.read_csv('./df_test_all.csv')

df_check_submission = pd.DataFrame({
    'segment_id' : df_check['segment_id'],
    'time_to_eruption' : df_check['cnn_tiny_pred_mean'] * (10**6)
})

check_path = '../FinalSubmissions/' + 'check_cnn_tiny.csv'
df_check_submission.to_csv(check_path, index=False)
df_check_submission.describe()

Unnamed: 0,segment_id,time_to_eruption
count,4520.0,4520.0
mean,1066993000.0,23442420.0
std,616290400.0,10509460.0
min,860288.0,1036858.0
25%,545899500.0,16185500.0
50%,1060695000.0,26532510.0
75%,1599284000.0,30908970.0
max,2147116000.0,47291450.0


In [None]:
# !kaggle competitions submit -c predict-volcanic-eruptions-ingv-oe -f {sub_path} -m "{experiment_name}"