In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['test', 'train.csv', 'sample_submission.csv']


In [None]:
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR
from sklearn.metrics import mean_absolute_error

In [None]:
# Read the whole training CSV as float32 and keep only the underlying NumPy
# array. Later cells index column 0 as the acoustic signal and column 1 as the
# time-to-failure target.
train = pd.read_csv('../input/train.csv', dtype={'acoustic_data': np.float32, 'time_to_failure': np.float32}).values

In [None]:
# Mean of column 0 over the first 150,000 rows.
train[:150000, 0].mean(axis=0)

4.8841133

In [None]:
# Raise pandas' display precision to 15 decimals so small float differences
# are not hidden when values are rendered.
pd.set_option('display.precision', 15)

In [None]:
# Number of complete 150,000-row windows that fit in the training array.
rows = 150_000
segments = int(train.shape[0] // rows)
print('train.shape', train.shape)
segments


train.shape (629145480, 2)


4194

# **Extracting Features**

In [None]:
def extract_features(z):
    """Stack four summary statistics of ``z`` side by side as columns.

    The mean, standard deviation, maximum and minimum are each reduced over
    axis 0 of ``z``; ``np.c_`` then joins the four results, in that order,
    along the last axis.
    """
    center = z.mean(axis=0)
    spread = z.std(axis=0)
    upper = z.max(axis=0)
    lower = z.min(axis=0)
    return np.c_[center, spread, upper, lower]

In [None]:
def create_X(x, last_index=None, n_steps=150, step_length=1000):
    """Build the feature matrix for the window of ``x`` that ends at ``last_index``.

    The ``n_steps * step_length`` values immediately before ``last_index`` are
    reshaped to ``(n_steps, step_length)``, cast to float32 and approximately
    standardized as ``(value - 5) / 3``. ``extract_features`` is applied to the
    full window and to two trailing slices of its rows, and the three results
    are joined column-wise.

    Parameters
    ----------
    x : 1-D array
        Signal values; must contain at least ``n_steps * step_length`` entries
        before ``last_index``.
    last_index : int or integer-valued float, optional
        Exclusive end of the window. Defaults to ``len(x)``.
    n_steps : int
        Number of rows the window is reshaped into.
    step_length : int
        Number of values per row.

    Returns
    -------
    numpy.ndarray
        With this file's ``extract_features`` (four statistics reduced over
        axis 0) the result has shape ``(step_length, 12)``.

    Raises
    ------
    AssertionError
        If fewer than ``n_steps * step_length`` values precede ``last_index``.
    """
    if last_index is None:
        last_index = len(x)
    # Callers pass float row offsets; convert once so the bounds check and the
    # slice below use exactly the same integer.
    last_index = int(last_index)
    window_size = n_steps * step_length

    assert last_index - window_size >= 0

    # Reshaping and approximate standardization with mean 5 and std 3.
    temp = (x[last_index - window_size:last_index]
            .reshape(n_steps, step_length)
            .astype(np.float32) - 5) / 3

    # Features of the whole window, of its trailing ceil(n_steps / 10) rows and
    # of its trailing ceil(n_steps / 100) rows (15 and 2 rows for n_steps=150).
    # NOTE(review): these slices select trailing *rows* (steps), not the last
    # values inside each step; changing that would alter the output shape that
    # the rest of the notebook is built around, so it is left as is.
    return np.c_[extract_features(temp),
                 extract_features(temp[-n_steps // 10:]),
                 extract_features(temp[-n_steps // 100:])]



# Creating Training Data

In [None]:
# Segment geometry: every sample covers n_steps * step_length consecutive readings.
n_steps = 150
step_length = 1000

# Query "create_X" on the first segment to figure out the number of features.
n_features = create_X(train[0:n_steps * step_length, 0],
                      n_steps=n_steps, step_length=step_length).shape[1]
print("Our RNN is based on %i features" % n_features)

maxsize = train.shape[0]
seg = int(np.floor(maxsize / (n_steps * step_length)))   # number of complete segments
batch_size = seg - 1                                      # number of usable segment end offsets
xx = 350                                                  # last end-offset index kept for validation

# rows_initialize[k] == k * segment length. Entry 0 would describe an empty
# window, so it is dropped and `rows` holds the end offset of every segment.
rows_initialize = np.arange(seg, dtype=float) * (n_steps * step_length)
print(rows_initialize.shape)

rows = np.delete(rows_initialize, 0)
print(rows.shape)

batch_size = batch_size - xx     # end offsets remaining once xx are set aside
split_point = xx
# NOTE(review): the name suggests this offset marks the second earthquake in
# the signal; nothing in this notebook reads it, so confirm before relying on it.
second_earthquake = rows[xx]

# Training set: every segment end after the split point. The slice is chosen
# explicitly rather than inferred from the magnitude of batch_size, so the
# split stays correct for any data size.
rows_1 = rows[split_point + 1:]

# create_X yields a (step_length, n_features) matrix per segment, hence the
# (n_segments, step_length, n_features) sample tensor.
samples_tr = np.zeros((rows_1.shape[0], step_length, n_features), dtype=float)
targets_tr = np.zeros(rows_1.shape[0], )

for j, row in enumerate(rows_1):
    samples_tr[j] = create_X(train[:, 0], last_index=row, n_steps=n_steps, step_length=step_length)
    # Target is the time to failure at the last reading of the segment.
    targets_tr[j] = train[int(row - 1), 1]

print('samples_tr shape', samples_tr.shape)
print('targets_tr shape', targets_tr.shape)




100%|██████████| 4193/4193 [00:00<00:00, 1146835.13it/s]

Our RNN is based on 12 features
(4194,)
(4193,)





samples_tr shape (3842, 1000, 12)
targets_tr shape (3842,)


# Creating Validation data

In [None]:
# Validation set: the first xx + 1 segment end offsets in `rows`; the training
# cell uses everything after them.
batch_size = xx
split_point = xx
# NOTE(review): the name suggests this offset marks the second earthquake in
# the signal; nothing in this notebook reads it, so confirm before relying on it.
second_earthquake = rows[xx]

# Select the validation slice explicitly rather than inferring it from the
# magnitude of batch_size, so changing xx cannot silently flip the split.
rows_1 = rows[:split_point + 1]

# create_X yields a (step_length, n_features) matrix per segment, hence the
# (n_segments, step_length, n_features) sample tensor.
samples_vd = np.zeros((rows_1.shape[0], step_length, n_features), dtype=float)
targets_vd = np.zeros(rows_1.shape[0], )

for j, row in enumerate(rows_1):
    samples_vd[j] = create_X(train[:, 0], last_index=row, n_steps=n_steps, step_length=step_length)
    # Target is the time to failure at the last reading of the segment.
    targets_vd[j] = train[int(row - 1), 1]

print('samples_vd shape', samples_vd.shape)
print('targets_vd shape', targets_vd.shape)
    

samples_vd shape (351, 1000, 12)
targets_vd shape (351,)


# Define the model

In [None]:
from keras.models import Sequential
from keras.layers import Dense, CuDNNGRU, SimpleRNN, LSTM ,  Dropout, Activation, Flatten, Input, Conv1D, MaxPooling1D
from keras.optimizers import adam
from keras.callbacks import ModelCheckpoint
from keras.optimizers import RMSprop
import datetime


Using TensorFlow backend.


In [None]:
import keras 

In [None]:
# Short alias for datetime.datetime.now; call now() to take a timestamp.
now = datetime.datetime.now

In [None]:
## CNN combined with LSTM model
# Take the per-sample input shape from the training tensor so the network
# always matches what the feature-extraction cells actually produced.
input_shape = samples_tr.shape[1:]

model = Sequential()
# Convolution + pooling shorten the sequence before the recurrent layers.
model.add(Conv1D(5, 3, activation='relu', input_shape=input_shape))
model.add(MaxPooling1D(2))
model.add(LSTM(50, return_sequences=True))
model.add(LSTM(10))
# Dense layers default to a linear activation; without a non-linearity this
# stack would collapse into a single linear map, so the hidden layers use ReLU.
model.add(Dense(240, activation='relu'))
model.add(Dense(120, activation='relu'))
model.add(Dense(60, activation='relu'))
model.add(Dense(30, activation='relu'))
# Linear output unit for the regression target.
model.add(Dense(1))



 

In [None]:
# Print each layer's output shape and parameter count.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_2 (Conv1D)            (None, 998, 5)            185       
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 499, 5)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 499, 50)           11200     
_________________________________________________________________
lstm_4 (LSTM)                (None, 10)                2440      
_________________________________________________________________
dense_2 (Dense)              (None, 240)               2640      
_________________________________________________________________
dense_3 (Dense)              (None, 120)               28920     
_________________________________________________________________
dense_4 (Dense)              (None, 60)                7260      
__________

# Compile and fit model

In [None]:
import keras
from keras.optimizers import RMSprop

# Use the canonical Adam class; the lowercase `adam` name is only a legacy alias.
opt = keras.optimizers.Adam(lr=.005)

# Mean absolute error is both the training loss and the reported metric.
model.compile(loss="mae",
              optimizer=opt, metrics=['mean_absolute_error'])

batch_size = 128  # samples per gradient update
epochs = 50
t = now()
history = model.fit(
    samples_tr, targets_tr,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(samples_vd, targets_vd))
print('Training time: %s' % (now() - t))

Instructions for updating:
Use tf.cast instead.
Train on 3842 samples, validate on 351 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50


Epoch 48/50
Epoch 49/50
Epoch 50/50
Training time: 0:42:45.084169


# Load submission file


In [None]:
# Load the sample submission, indexed by seg_id, as the frame that the
# prediction loop fills in.
submission = pd.read_csv('../input/sample_submission.csv', index_col='seg_id', dtype={"time_to_failure": np.float32})

## Prepare submission data
Load each test segment, build its feature matrix, and store the model's numeric prediction.

In [None]:
# Predict the time to failure for every test segment listed in the submission.
for seg_id in tqdm(submission.index):
    seg_df = pd.read_csv('../input/test/' + seg_id + '.csv')
    x = seg_df['acoustic_data'].values
    # The model expects a leading batch axis; predict returns a (1, 1) array.
    prediction = model.predict(np.expand_dims(create_X(x), 0))
    # Write by label with .loc: chained `submission.time_to_failure[i] = ...`
    # assigns through an intermediate Series and relies on positional integer
    # access to a string-labelled index.
    submission.loc[seg_id, 'time_to_failure'] = prediction[0, 0]

submission.head()

100%|██████████| 2624/2624 [28:32<00:00,  1.42it/s]


Unnamed: 0_level_0,time_to_failure
seg_id,Unnamed: 1_level_1
seg_00030f,5.116579055786133
seg_0012b5,4.282918453216553
seg_00184e,4.95469331741333
seg_003339,7.273122310638428
seg_0042cc,4.48853874206543


In [None]:
# Write the predictions to the working directory; the seg_id index is saved as the first column.
submission.to_csv('submission.csv')