In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold, RepeatedKFold, train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression

In [None]:
# Load the training signal with explicit dtypes: int16 for the acoustic samples, float64 for the target.
data = pd.read_csv(
    '../input/LANL-Earthquake-Prediction/train.csv',
    dtype={'acoustic_data': np.int16, 'time_to_failure': np.float64},
)

# Ordered hold-out split: shuffle=False keeps the first 70% of rows for training and the last 30% for testing.
train, test = train_test_split(data, test_size=0.3, shuffle=False)

# A bare `frame.head()` is only rendered when it is the last expression of a cell,
# so the previews are printed explicitly instead of being silently discarded.
print(train.head(10))
print(test.head(10))

# Only `train` and `test` are needed after the split; drop the name of the unsplit frame.
del data

In [None]:
# Report (rows, columns) of the training split, then of the hold-out split, one per line.
print(train.shape, test.shape, sep='\n')

In [None]:
# Keep every `sample_stride`-th row so the whole training signal fits into one figure.
sample_stride = 75
acoustic_data_ = train['acoustic_data'].values[::sample_stride]
time_to_failure_ = train['time_to_failure'].values[::sample_stride]

fig, ax1 = plt.subplots(figsize=(16, 8))
# The percentage is derived from the stride so the title cannot drift from the sampling actually used
# (a stride of 75 keeps ~1.3% of the rows, not the 2% the title used to claim).
ax1.set_title(
    "Trends of acoustic_data and time_to_failure. {:.1f}% of data (sampled)".format(100 / sample_stride)
)
ax1.plot(acoustic_data_, color='b')
ax1.set_ylabel('acoustic_data', color='b')
ax1.legend(['acoustic_data'])

# Second y-axis on the same x-axis: the two series have very different value ranges.
ax2 = ax1.twinx()
ax2.plot(time_to_failure_, color='g')
ax2.set_ylabel('time_to_failure', color='g')
ax2.legend(['time_to_failure'], loc=(0.875, 0.9))
ax2.grid(False)

In [None]:
# Zoom in: take the leading rows of the training split without any sub-sampling.
leading_rows = slice(None, 6291455)
acoustic_data_ = train['acoustic_data'].values[leading_rows]
time_to_failure_ = train['time_to_failure'].values[leading_rows]

fig, ax1 = plt.subplots(figsize=(16, 8))

# Left axis: raw acoustic signal
ax1.set_title("More detailed acoustic data (1% of data)")
ax1.plot(acoustic_data_, color='b')
ax1.set_ylabel('acoustic_data', color='b')
ax1.legend(['acoustic_data'])

# Right axis: time remaining until failure, drawn over the same x-axis
ax2 = ax1.twinx()
ax2.plot(time_to_failure_, color='g')
ax2.set_ylabel('time_to_failure', color='g')
ax2.legend(['time_to_failure'], loc=(0.875, 0.9))
ax2.grid(False)

# The plotted slices are not needed afterwards
del acoustic_data_, time_to_failure_

In [None]:
rows = 150000  # Amount of rows per segment
train_segments = len(train) // rows  # Whole segments only; a trailing partial segment is not counted
print("Amount of segments: ", train_segments)

# One row per segment, pre-filled with NaN; the feature columns are filled in by extract_features.
feature_names = [
    'mean', 'std', 'min', 'max', 'skew', 'kurtosis',
    'Imean', 'Rmean', 'Imin', 'Rmin', 'Imax', 'Rmax',
    'max_to_min',
]
segment_index = range(train_segments)
x_train = pd.DataFrame(index=segment_index, columns=feature_names, dtype=np.float64)
y_train = pd.DataFrame(index=segment_index, columns=['time_to_failure'], dtype=np.float64)

In [None]:
def extract_features(seg_id, seg, X):
    """Compute summary features for one segment and store them in row ``seg_id`` of ``X``.

    Parameters
    ----------
    seg_id : hashable
        Row label of ``X`` that receives the features.
    seg : pandas.DataFrame
        Segment data; only its ``'acoustic_data'`` column is read.
    X : pandas.DataFrame
        Feature table, modified in place. The columns written are ``mean``, ``std``,
        ``min``, ``max``, ``skew``, ``kurtosis``, ``Imean``, ``Rmean``, ``Imin``,
        ``Rmin``, ``Imax``, ``Rmax`` and ``max_to_min``.

    Returns
    -------
    None
    """
    # Promote to float64 before doing any arithmetic. The training signal is loaded as
    # int16, and `max - min` on int16 scalars silently wraps around once the range
    # exceeds 32767, which would store a garbage (possibly negative) 'max_to_min'.
    signal = seg['acoustic_data'].values.astype(np.float64)
    values = pd.Series(signal)

    # Full complex FFT of the segment; real and imaginary parts are summarised separately.
    values_fft = np.fft.fft(signal)
    values_real = np.real(values_fft)
    values_imag = np.imag(values_fft)

    lowest = values.min()
    highest = values.max()

    # Insertion order matches the column order used for the feature tables.
    features = {
        'mean': values.mean(),
        'std': values.std(),
        'min': lowest,
        'max': highest,
        'skew': values.skew(),
        'kurtosis': values.kurt(),
        'Imean': values_imag.mean(),
        'Rmean': values_real.mean(),
        'Imin': values_imag.min(),
        'Rmin': values_real.min(),
        'Imax': values_imag.max(),
        'Rmax': values_real.max(),
        'max_to_min': highest - lowest,
    }
    for column, value in features.items():
        X.loc[seg_id, column] = value

In [None]:
# Walk the training split in consecutive, non-overlapping windows of `rows` samples.
for seg_id, start in enumerate(range(0, train_segments * rows, rows)):
    seg = train.iloc[start:start + rows]  # Data points belonging to this segment
    extract_features(seg_id, seg, x_train)  # Fill row `seg_id` of the feature table
    # Target for the segment: time_to_failure at its final sample
    y_train.loc[seg_id, 'time_to_failure'] = seg['time_to_failure'].values[-1]

In [None]:
# Dimensions of the training feature table, followed by a preview of its first ten rows.
feature_table_shape = x_train.shape
print("Output shape: ", feature_table_shape)
x_train.iloc[:10]

In [None]:
# Number of whole `rows`-sized segments available in the hold-out split
test_segments = len(test) // rows
print("Amount of test segments: ", test_segments)

# Same feature layout as the training tables, one row per hold-out segment
test_feature_names = [
    'mean', 'std', 'min', 'max', 'skew', 'kurtosis',
    'Imean', 'Rmean', 'Imin', 'Rmin', 'Imax', 'Rmax',
    'max_to_min',
]
x_test = pd.DataFrame(index=range(test_segments), columns=test_feature_names, dtype=np.float64)
y_test = pd.DataFrame(index=range(test_segments), columns=['time_to_failure'], dtype=np.float64)

# Consecutive, non-overlapping windows; the target is the last time_to_failure of each window
for seg_id, start in enumerate(range(0, test_segments * rows, rows)):
    seg = test.iloc[start:start + rows]
    extract_features(seg_id, seg, x_test)
    y_test.loc[seg_id, 'time_to_failure'] = seg['time_to_failure'].values[-1]

print("Test output shape: ", x_test.shape)
x_test.iloc[:10]

In [None]:
# Learn the standardisation (per-feature mean and scale) from the training segments only.
train_scaler = StandardScaler().fit(x_train)

# Held-out data must go through the SAME transform the model is trained on. Fitting a
# second scaler on x_test maps the hold-out features onto a different scale than the
# training features (and than the submission features, which use train_scaler), so the
# reported test error would not reflect how the model is actually applied.
# The name `test_scaler` is kept as an alias so any code referring to it keeps working.
test_scaler = train_scaler

x_train_scaled = train_scaler.transform(x_train)
x_test_scaled = train_scaler.transform(x_test)

In [None]:
# The raw splits are no longer needed once the per-segment feature tables exist.
del train, test

In [None]:
def train_model(x_train, y_train, x_test, y_test, model):
    """Fit ``model`` on the training data, print train/test MSE and plot predictions against targets.

    Parameters
    ----------
    x_train, x_test : array-like
        Feature matrices passed to ``model.fit`` / ``model.predict``.
    y_train, y_test : pandas.DataFrame or pandas.Series
        Targets; accessed through ``.values`` and flattened to 1-D.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.

    Returns
    -------
    None
        Scores are printed and the scatter plot is drawn on a new figure.
    """
    y_train_flat = y_train.values.flatten()
    y_test_flat = y_test.values.flatten()

    model.fit(x_train, y_train_flat)

    train_pred = model.predict(x_train)
    test_pred = model.predict(x_test)

    train_score = mean_squared_error(y_train_flat, train_pred)  # MSE on the data the model was fitted on
    test_score = mean_squared_error(y_test_flat, test_pred)  # MSE on the held-out data

    print("Train MSE: ", train_score)
    print("Test MSE: ", test_score)

    # Draw on a dedicated figure so repeated calls never pile their points onto a previous plot.
    fig, ax = plt.subplots()
    ax.scatter(y_train_flat, train_pred, label="Train")
    ax.scatter(y_test_flat, test_pred, label="Test")
    # Reference diagonal (perfect prediction). x and y are given as plain coordinate lists;
    # nested (x, y) tuples are read by matplotlib as 2-D arrays and draw the line twice.
    ax.plot([0, 16], [0, 16], color='g')
    ax.set_xlim([0, 16])
    ax.set_ylim([0, 16])
    # x-axis carries the ground-truth targets, y-axis the model output.
    ax.set_xlabel("Actual")
    ax.set_ylabel("Predicted")
    ax.legend()

In [None]:
# Baseline: ordinary least-squares regression on the standardised features
linear_model = LinearRegression()
train_model(x_train_scaled, y_train, x_test_scaled, y_test, linear_model)

In [None]:
# Seed the forest: its bootstrap sampling is random, so an unseeded model reports
# different train/test scores on every run and the notebook output cannot be reproduced.
train_model(x_train_scaled, y_train, x_test_scaled, y_test, RandomForestRegressor(random_state=42))

In [None]:
# The sample submission provides the segment ids (as index) whose targets must be predicted.
submission = pd.read_csv('../input/LANL-Earthquake-Prediction/sample_submission.csv', index_col='seg_id')
submission_x_test = pd.DataFrame(columns=x_train.columns, dtype=np.float64, index=submission.index)

# Each submission segment lives in its own CSV named after its id.
for seg_id in submission_x_test.index:
    # str.format instead of `+` so a non-string index label cannot raise a TypeError
    seg = pd.read_csv('../input/LANL-Earthquake-Prediction/test/{}.csv'.format(seg_id))
    extract_features(seg_id, seg, submission_x_test)

# Apply the standardisation learned on the training features.
submission_x_test_scaled = train_scaler.transform(submission_x_test)

# Fixed seed so that re-running the notebook regenerates the same submission file.
model = RandomForestRegressor(random_state=42)
model.fit(x_train_scaled, y_train.values.flatten())
submission['time_to_failure'] = model.predict(submission_x_test_scaled)
submission.to_csv('submission.csv', index=True)
print(submission)