In [1]:
import gc
import os
import time
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import lightgbm as lgb
from scipy import stats
from scipy.signal import hann
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from scipy.signal import hilbert
from scipy.signal import convolve
from sklearn.svm import NuSVR, SVR
from catboost import CatBoostRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold,StratifiedKFold, RepeatedKFold
warnings.filterwarnings("ignore")
from sklearn.metrics import mean_squared_error

In [2]:
origin_train_data = pd.read_csv("train/train.csv")

In [3]:
print("Orignial Dataset contains {} rows, {} columns"
      .format(origin_train_data.shape[0], origin_train_data.shape[1]))

Orignial Dataset contains 629145480 rows, 2 columns


In [4]:
segments = np.floor(origin_train_data.shape[0] / 150000)
print("The original dataset contain {} segments".format(segments))

The original dataset contain 4194.0 segments


In [5]:
def get_overlapped_seg_begin_point(data_length, skipping = 150000, window_size = 150000):
    last_valid_pos = data_length - data_length % window_size - window_size + 1
    return list(range(0, last_valid_pos, skipping))

In [6]:
begin_poses = get_overlapped_seg_begin_point(origin_train_data.shape[0], 50000)

In [7]:
def add_trend_feature(arr, abs_values=False):
    idx = np.array(range(len(arr)))
    if abs_values:
        arr = np.abs(arr)
    lr = LinearRegression()
    lr.fit(idx.reshape(-1, 1), arr)
    return lr.coef_[0]

def classic_sta_lta(x, length_sta, length_lta):
    sta = np.cumsum(x ** 2)
    # Convert to float
    sta = np.require(sta, dtype=np.float)
    # Copy for LTA
    lta = sta.copy()
    # Compute the STA and the LTA
    sta[length_sta:] = sta[length_sta:] - sta[:-length_sta]
    sta /= length_sta
    lta[length_lta:] = lta[length_lta:] - lta[:-length_lta]
    lta /= length_lta
    # Pad zeros
    sta[:length_lta - 1] = 0
    # Avoid division by zero by setting zero values to tiny float
    dtiny = np.finfo(0.0).tiny
    idx = lta < dtiny
    lta[idx] = dtiny
    return sta / lta

In [8]:
def create_features(seg_id, seg, X):
    xc = pd.Series(seg['acoustic_data'].values)
    
    X.loc[seg_id, 'mean'] = xc.mean()
    X.loc[seg_id, 'std'] = xc.std()
    X.loc[seg_id, 'max'] = xc.max()
    X.loc[seg_id, 'min'] = xc.min()
    
    X.loc[seg_id, 'mean_change_abs'] = np.mean(np.diff(xc))
    X.loc[seg_id, 'mean_change_rate'] = np.mean(np.nonzero((np.diff(xc) / xc[:-1]))[0])
    X.loc[seg_id, 'abs_max'] = np.abs(xc).max()
    X.loc[seg_id, 'abs_min'] = np.abs(xc).min()
    
    X.loc[seg_id, 'std_first_50000'] = xc[:50000].std()
    X.loc[seg_id, 'std_last_50000'] = xc[-50000:].std()
    X.loc[seg_id, 'std_first_10000'] = xc[:10000].std()
    X.loc[seg_id, 'std_last_10000'] = xc[-10000:].std()
    
    X.loc[seg_id, 'avg_first_50000'] = xc[:50000].mean()
    X.loc[seg_id, 'avg_last_50000'] = xc[-50000:].mean()
    X.loc[seg_id, 'avg_first_10000'] = xc[:10000].mean()
    X.loc[seg_id, 'avg_last_10000'] = xc[-10000:].mean()
    
    X.loc[seg_id, 'min_first_50000'] = xc[:50000].min()
    X.loc[seg_id, 'min_last_50000'] = xc[-50000:].min()
    X.loc[seg_id, 'min_first_10000'] = xc[:10000].min()
    X.loc[seg_id, 'min_last_10000'] = xc[-10000:].min()
    
    X.loc[seg_id, 'max_first_50000'] = xc[:50000].max()
    X.loc[seg_id, 'max_last_50000'] = xc[-50000:].max()
    X.loc[seg_id, 'max_first_10000'] = xc[:10000].max()
    X.loc[seg_id, 'max_last_10000'] = xc[-10000:].max()
    
    X.loc[seg_id, 'max_to_min'] = xc.max() / np.abs(xc.min())
    X.loc[seg_id, 'max_to_min_diff'] = xc.max() - np.abs(xc.min())
    X.loc[seg_id, 'count_big'] = len(xc[np.abs(xc) > 500])
    X.loc[seg_id, 'sum'] = xc.sum()
    
    X.loc[seg_id, 'mean_change_rate_first_50000'] = np.mean(np.nonzero((np.diff(xc[:50000]) / xc[:50000][:-1]))[0])
    X.loc[seg_id, 'mean_change_rate_last_50000'] = np.mean(np.nonzero((np.diff(xc[-50000:]) / xc[-50000:][:-1]))[0])
    X.loc[seg_id, 'mean_change_rate_first_10000'] = np.mean(np.nonzero((np.diff(xc[:10000]) / xc[:10000][:-1]))[0])
    X.loc[seg_id, 'mean_change_rate_last_10000'] = np.mean(np.nonzero((np.diff(xc[-10000:]) / xc[-10000:][:-1]))[0])
    
    X.loc[seg_id, 'q95'] = np.quantile(xc, 0.95)
    X.loc[seg_id, 'q99'] = np.quantile(xc, 0.99)
    X.loc[seg_id, 'q05'] = np.quantile(xc, 0.05)
    X.loc[seg_id, 'q01'] = np.quantile(xc, 0.01)
    
    X.loc[seg_id, 'abs_q95'] = np.quantile(np.abs(xc), 0.95)
    X.loc[seg_id, 'abs_q99'] = np.quantile(np.abs(xc), 0.99)
    X.loc[seg_id, 'abs_q05'] = np.quantile(np.abs(xc), 0.05)
    X.loc[seg_id, 'abs_q01'] = np.quantile(np.abs(xc), 0.01)
    
    X.loc[seg_id, 'trend'] = add_trend_feature(xc)
    X.loc[seg_id, 'abs_trend'] = add_trend_feature(xc, abs_values=True)
    X.loc[seg_id, 'abs_mean'] = np.abs(xc).mean()
    X.loc[seg_id, 'abs_std'] = np.abs(xc).std()
    
    X.loc[seg_id, 'mad'] = xc.mad()
    X.loc[seg_id, 'kurt'] = xc.kurtosis()
    X.loc[seg_id, 'skew'] = xc.skew()
    X.loc[seg_id, 'med'] = xc.median()
    
    X.loc[seg_id, 'Hilbert_mean'] = np.abs(hilbert(xc)).mean()
    X.loc[seg_id, 'Hann_window_mean'] = (convolve(xc, hann(150), mode='same') / sum(hann(150))).mean()
    X.loc[seg_id, 'Moving_average_700_mean'] = xc.rolling(window=700).mean().mean(skipna=True)
    X.loc[seg_id, 'Moving_average_1500_mean'] = xc.rolling(window=1500).mean().mean(skipna=True)
    X.loc[seg_id, 'Moving_average_3000_mean'] = xc.rolling(window=3000).mean().mean(skipna=True)
    X.loc[seg_id, 'Moving_average_6000_mean'] = xc.rolling(window=6000).mean().mean(skipna=True)
    ewma = pd.Series.ewm
    X.loc[seg_id, 'exp_Moving_average_300_mean'] = (ewma(xc, span=300).mean()).mean(skipna=True)
    X.loc[seg_id, 'exp_Moving_average_3000_mean'] = ewma(xc, span=3000).mean().mean(skipna=True)
    X.loc[seg_id, 'exp_Moving_average_30000_mean'] = ewma(xc, span=6000).mean().mean(skipna=True)
    no_of_std = 2
    X.loc[seg_id, 'MA_700MA_std_mean'] = xc.rolling(window=700).std().mean()
    X.loc[seg_id,'MA_700MA_BB_high_mean'] = (X.loc[seg_id, 'Moving_average_700_mean'] + no_of_std * X.loc[seg_id, 'MA_700MA_std_mean']).mean()
    X.loc[seg_id,'MA_700MA_BB_low_mean'] = (X.loc[seg_id, 'Moving_average_700_mean'] - no_of_std * X.loc[seg_id, 'MA_700MA_std_mean']).mean()
    X.loc[seg_id, 'MA_400MA_std_mean'] = xc.rolling(window=400).std().mean()
    X.loc[seg_id,'MA_400MA_BB_high_mean'] = (X.loc[seg_id, 'Moving_average_700_mean'] + no_of_std * X.loc[seg_id, 'MA_400MA_std_mean']).mean()
    X.loc[seg_id,'MA_400MA_BB_low_mean'] = (X.loc[seg_id, 'Moving_average_700_mean'] - no_of_std * X.loc[seg_id, 'MA_400MA_std_mean']).mean()
    X.loc[seg_id, 'MA_1000MA_std_mean'] = xc.rolling(window=1000).std().mean()
    
    X.loc[seg_id, 'iqr'] = np.subtract(*np.percentile(xc, [75, 25]))
    X.loc[seg_id, 'q999'] = np.quantile(xc,0.999)
    X.loc[seg_id, 'q001'] = np.quantile(xc,0.001)
    X.loc[seg_id, 'ave10'] = stats.trim_mean(xc, 0.1)
    
    for windows in [10, 100, 1000]:
        x_roll_std = xc.rolling(windows).std().dropna().values
        x_roll_mean = xc.rolling(windows).mean().dropna().values
        
        X.loc[seg_id, 'ave_roll_std_' + str(windows)] = x_roll_std.mean()
        X.loc[seg_id, 'std_roll_std_' + str(windows)] = x_roll_std.std()
        X.loc[seg_id, 'max_roll_std_' + str(windows)] = x_roll_std.max()
        X.loc[seg_id, 'min_roll_std_' + str(windows)] = x_roll_std.min()
        X.loc[seg_id, 'q01_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.01)
        X.loc[seg_id, 'q05_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.05)
        X.loc[seg_id, 'q95_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.95)
        X.loc[seg_id, 'q99_roll_std_' + str(windows)] = np.quantile(x_roll_std, 0.99)
        X.loc[seg_id, 'av_change_abs_roll_std_' + str(windows)] = np.mean(np.diff(x_roll_std))
        X.loc[seg_id, 'av_change_rate_roll_std_' + str(windows)] = np.mean(np.nonzero((np.diff(x_roll_std) / x_roll_std[:-1]))[0])
        X.loc[seg_id, 'abs_max_roll_std_' + str(windows)] = np.abs(x_roll_std).max()
        
        X.loc[seg_id, 'ave_roll_mean_' + str(windows)] = x_roll_mean.mean()
        X.loc[seg_id, 'std_roll_mean_' + str(windows)] = x_roll_mean.std()
        X.loc[seg_id, 'max_roll_mean_' + str(windows)] = x_roll_mean.max()
        X.loc[seg_id, 'min_roll_mean_' + str(windows)] = x_roll_mean.min()
        X.loc[seg_id, 'q01_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.01)
        X.loc[seg_id, 'q05_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.05)
        X.loc[seg_id, 'q95_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.95)
        X.loc[seg_id, 'q99_roll_mean_' + str(windows)] = np.quantile(x_roll_mean, 0.99)
        X.loc[seg_id, 'av_change_abs_roll_mean_' + str(windows)] = np.mean(np.diff(x_roll_mean))
        X.loc[seg_id, 'av_change_rate_roll_mean_' + str(windows)] = np.mean(np.nonzero((np.diff(x_roll_mean) / x_roll_mean[:-1]))[0])
        X.loc[seg_id, 'abs_max_roll_mean_' + str(windows)] = np.abs(x_roll_mean).max()

In [9]:
def create_features_2(seg_id, seg, X):
    X.loc[seg_id, "mean"] = seg.mean()
    X.loc[seg_id, "std"]  = seg.std()
    X.loc[seg_id, "95_quantile"] = np.quantile(seg, 0.95)
    X.loc[seg_id, "50_quantile"] = np.quantile(seg, 0.5)
    X.loc[seg_id, "10_quantile"] = np.quantile(seg, 0.1)
    X.loc[seg_id, "5_quantile"] = np.quantile(seg, 0.05)
    
    X.loc[seg_id, "first_5000_mean"] = seg.iloc[:5000].mean()
    X.loc[seg_id, "first_5000_std"] = seg.iloc[:5000].std()
    X.loc[seg_id, "95_quantile_f5000"] = np.quantile(seg.iloc[:5000], 0.95)
    X.loc[seg_id, "50_quantile_f5000"] = np.quantile(seg.iloc[:5000], 0.50)
    X.loc[seg_id, "10_quantile_f5000"] = np.quantile(seg.iloc[:5000], 0.10)
    X.loc[seg_id, "5_quantile_f5000"]  = np.quantile(seg.iloc[:5000], 0.05)
    
    X.loc[seg_id, "mid_5000_mean"] = seg.iloc[5000:10000].mean()
    X.loc[seg_id, "mide_5000_std"] = seg.iloc[5000:10000].std()
    X.loc[seg_id, "95_quantile_m5000"] = np.quantile(seg.iloc[5000:10000], 0.95)
    X.loc[seg_id, "50_quantile_m5000"] = np.quantile(seg.iloc[5000:10000], 0.50)
    X.loc[seg_id, "10_quantile_m5000"] = np.quantile(seg.iloc[5000:10000], 0.10)
    X.loc[seg_id, "5_quantile_m5000"]  = np.quantile(seg.iloc[5000:10000], 0.05)
    
    X.loc[seg_id, "last_5000_mean"] = seg.iloc[-5000:].mean()
    X.loc[seg_id, "last_5000_std"] = seg.iloc[-5000:].std()
    X.loc[seg_id, "95_quantile_l5000"] = np.quantile(seg.iloc[-5000:], 0.95)
    X.loc[seg_id, "50_quantile_l5000"] = np.quantile(seg.iloc[-5000:], 0.50)
    X.loc[seg_id, "10_quantile_l5000"] = np.quantile(seg.iloc[-5000:], 0.10)
    X.loc[seg_id, "5_quantile_l5000"]  = np.quantile(seg.iloc[-5000:], 0.05)
    
    rolling_window_100_mean = seg.rolling(100).mean().dropna()
    rolling_window_100_std = seg.rolling(100).std().dropna()
    X.loc[seg_id, "rw100_mean_mean"] = rolling_window_100_mean.mean()
    X.loc[seg_id, "rw100_mean_std"] = rolling_window_100_mean.std()
    X.loc[seg_id, "rw100_std_mean"] = rolling_window_100_std.mean()
    X.loc[seg_id, "rw100_std_std"] = rolling_window_100_std.std()
    X.loc[seg_id, "rw100_mean_diff_mean"] = rolling_window_100_mean.diff().dropna().mean()
    X.loc[seg_id, "rw100_mean_diff_std"] = rolling_window_100_mean.diff().dropna().std()
    X.loc[seg_id, "rw100_std_diff_mean"] = rolling_window_100_std.diff().dropna().mean()
    X.loc[seg_id, "rw100_std_diff_std"] = rolling_window_100_std.diff().dropna().std()
    
    rolling_window_1000_mean = seg.rolling(1000).mean().dropna()
    rolling_window_1000_std = seg.rolling(1000).std().dropna()
    X.loc[seg_id, "rw1000_mean_mean"] = rolling_window_1000_mean.mean()
    X.loc[seg_id, "rw1000_mean_std"] = rolling_window_1000_mean.std()
    X.loc[seg_id, "rw1000_std_mean"] = rolling_window_1000_std.mean()
    X.loc[seg_id, "rw1000_std_std"] = rolling_window_1000_std.std()
    X.loc[seg_id, "rw1000_mean_diff_mean"] = rolling_window_1000_mean.diff().dropna().mean()
    X.loc[seg_id, "rw1000_mean_diff_std"] = rolling_window_1000_mean.diff().dropna().std()
    X.loc[seg_id, "rw1000_std_diff_mean"] = rolling_window_1000_std.diff().dropna().mean()
    X.loc[seg_id, "rw1000_std_diff_std"] = rolling_window_1000_std.diff().dropna().std()
    
    rolling_window_5000_mean = seg.rolling(5000).mean().dropna()
    rolling_window_5000_std = seg.rolling(5000).std().dropna()
    X.loc[seg_id, "rw5000_mean_mean"] = rolling_window_5000_mean.mean()
    X.loc[seg_id, "rw5000_mean_std"] = rolling_window_5000_mean.std()
    X.loc[seg_id, "rw5000_std_mean"] = rolling_window_5000_std.mean()
    X.loc[seg_id, "rw5000_std_std"] = rolling_window_5000_std.std()
    X.loc[seg_id, "rw5000_mean_diff_mean"] = rolling_window_5000_mean.diff().dropna().mean()
    X.loc[seg_id, "rw5000_mean_diff_std"] = rolling_window_5000_mean.diff().dropna().std()
    X.loc[seg_id, "rw5000_std_diff_mean"] = rolling_window_5000_std.diff().dropna().mean()
    X.loc[seg_id, "rw5000_std_diff_std"] = rolling_window_5000_std.diff().dropna().std()

In [10]:
train_X = pd.DataFrame(index=range(int(segments)), dtype=np.float64)
train_y = pd.DataFrame(index=range(int(segments)), dtype=np.float64, columns=['time_to_failure'])
# iterate over all segments
for seg_id in tqdm_notebook(range(len(begin_poses))):
    begin_point = begin_poses[seg_id]
    seg = origin_train_data.iloc[begin_point:begin_point + 150000]
    create_features(seg_id, seg, train_X)
    train_y.loc[seg_id, 'time_to_failure'] = seg['time_to_failure'].values[-1]

HBox(children=(IntProgress(value=0, max=12580), HTML(value='')))

KeyboardInterrupt: 

In [12]:
train_X = pd.DataFrame(index=range(int(segments)), dtype=np.float64)
train_y = pd.DataFrame(index=range(int(segments)), dtype=np.float64, columns=['time_to_failure'])
# iterate over all segments
for seg_id in tqdm_notebook(range(len(begin_poses))):
    begin_point = begin_poses[seg_id]
    seg = origin_train_data.iloc[begin_point:begin_point + 150000]
    create_features_2(seg_id, seg["acoustic_data"], train_X)
    train_y.loc[seg_id, 'time_to_failure'] = seg['time_to_failure'].values[-1]

HBox(children=(IntProgress(value=0, max=12580), HTML(value='')))

In [10]:
train_X = pd.read_csv("./train_X with non_fft, 50000 skipping").drop(columns=["Unnamed: 0"])
train_y = pd.read_csv("./train_y with non_fft, 50000 skipping")['time_to_failure']

In [11]:
print(train_X.keys())

Index(['mean', 'std', 'max', 'min', 'mean_change_abs', 'mean_change_rate',
       'abs_max', 'abs_min', 'std_first_50000', 'std_last_50000',
       ...
       'std_roll_mean_1000', 'max_roll_mean_1000', 'min_roll_mean_1000',
       'q01_roll_mean_1000', 'q05_roll_mean_1000', 'q95_roll_mean_1000',
       'q99_roll_mean_1000', 'av_change_abs_roll_mean_1000',
       'av_change_rate_roll_mean_1000', 'abs_max_roll_mean_1000'],
      dtype='object', length=134)


# Gradient Boost Regressor from sklearn

In [13]:
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor
kf = KFold(n_splits=10,shuffle=True)
#all_crit = ["friedman_mse", "mse", "mae"]
performance_map = {}
#for crit in all_crit:
for estimator in [100, 150, 200, 250]:
    for depth in [3,5,8,10,15]:
        all_res = []
        for train_index, test_index in kf.split(train_X):
            X_train, X_test = train_X.iloc[train_index], train_X.iloc[test_index]
            y_train, y_test = train_y.iloc[train_index], train_y.iloc[test_index]
            model = GradientBoostingRegressor(loss = 'lad', n_estimators=estimator, max_depth=depth)            
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            acc = mean_absolute_error(y_pred, y_test)
            print("mae " + str(estimator) + " " + str(depth) + ": {}".format(acc))
            all_res.append(acc)
        sing_res = sum(all_res) / len(all_res)
        performance_map[crit + " " + str(estimator) + " " + str(depth)] = sing_res

mae 100 3: 1.9807034277241633


KeyboardInterrupt: 

In [None]:
performance_map

In [13]:
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor
kf = KFold(n_splits=10,shuffle=True)
# Feature from Kaggle without ttf
performance_map = {}
#for crit in all_crit:
for estimator in [100, 150, 200, 250]:
    for depth in [3,5,8,10,15]:
        all_res = []
        for train_index, test_index in kf.split(train_X):
            X_train, X_test = train_X.iloc[train_index], train_X.iloc[test_index]
            y_train, y_test = train_y.iloc[train_index], train_y.iloc[test_index]
            model = GradientBoostingRegressor(loss = 'lad', n_estimators=estimator, max_depth=depth)            
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            acc = mean_absolute_error(y_pred, y_test)
            print("mae " + str(estimator) + " " + str(depth) + ": {}".format(acc))
            all_res.append(acc)
        sing_res = sum(all_res) / len(all_res)
        performance_map["mae" + " " + str(estimator) + " " + str(depth)] = sing_res

mae 100 3: 2.036027476383842
mae 100 3: 2.0048389016980384
mae 100 3: 1.9319344666872509
mae 100 3: 2.0199291414959797
mae 100 3: 1.9992133165086141
mae 100 3: 1.929467560843402
mae 100 3: 1.9805003043297809
mae 100 3: 1.971274805673163
mae 100 3: 1.9965976650213835
mae 100 3: 2.0593125367818232
mae 100 5: 1.9213336379331325
mae 100 5: 1.9408464725609567
mae 100 5: 1.901435100844496
mae 100 5: 1.9936313886468373
mae 100 5: 1.987283832934347
mae 100 5: 1.9681274316078452
mae 100 5: 1.8902146644336904
mae 100 5: 1.9279478300278483
mae 100 5: 2.000603452684451
mae 100 5: 2.00789378068973
mae 100 8: 1.8743483997611108
mae 100 8: 1.9834236193642718
mae 100 8: 1.862381387892857
mae 100 8: 1.9406947384093183
mae 100 8: 1.9966805129382845
mae 100 8: 1.8841611642466927
mae 100 8: 2.00267699457772
mae 100 8: 1.8850807957570506
mae 100 8: 1.8285944562355096
mae 100 8: 1.9835704620722148
mae 100 10: 1.9001885640796254
mae 100 10: 1.907983594402614
mae 100 10: 1.8770044994815112
mae 100 10: 1.81330

In [16]:
performance_map

{'mae 100 3': 1.992909617542328,
 'mae 100 5': 1.9539317592363337,
 'mae 100 8': 1.924161253125503,
 'mae 100 10': 1.8952008990029554,
 'mae 100 15': 1.883430812212368,
 'mae 150 3': 1.9813249641190578,
 'mae 150 5': 1.9404788163296203,
 'mae 150 8': 1.9050412006745379,
 'mae 150 10': 1.8909768792041874,
 'mae 150 15': 1.8864429775313714,
 'mae 200 3': 1.9748647349292745,
 'mae 200 5': 1.9372617753394596,
 'mae 200 8': 1.8981214862369604,
 'mae 200 10': 1.8809249403477648,
 'mae 200 15': 1.8950632116740835,
 'mae 250 3': 1.9752884888895372,
 'mae 250 5': 1.9338264547331776,
 'mae 250 8': 1.89080214248234,
 'mae 250 10': 1.8768221797077527,
 'mae 250 15': 1.8869172444010887}

# Random Forest Regressor from sklearn

In [15]:
from sklearn.ensemble import RandomForestRegressor

In [29]:
all_RFR_result = []
RFR_performance_map = {}
for estimator in [100, 150, 200, 250]:
    for depth in [3,5,8,10,15]:
        all_RFR_result = []
        for train_index, test_index in kf.split(train_X):
            X_train, X_test = train_X.iloc[train_index], train_X.iloc[test_index]
            y_train, y_test = train_y.iloc[train_index], train_y.iloc[test_index]
            model = RandomForestRegressor(max_depth=depth, random_state=0,
                                                  n_estimators=estimator,)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            acc = mean_absolute_error(y_pred, y_test)
            print("mae " + str(estimator) + " " + str(depth) + ": {}".format(acc))
            all_RFR_result.append(acc)
        sing_res = sum(all_RFR_result) / len(all_RFR_result)
        RFR_performance_map[crit + " " + str(estimator) + " " + str(depth)] = sing_res

mae 100 3: 2.161387051140885
mae 100 3: 2.054604259042986
mae 100 3: 2.1581014511056393
mae 100 3: 2.212773796292269
mae 100 3: 2.110614115467644
mae 100 3: 2.1396863255239627
mae 100 3: 2.2514432712498476
mae 100 3: 2.2031328452603067
mae 100 3: 2.081217089628747
mae 100 3: 2.1237516525986875
mae 100 5: 2.099740616480439
mae 100 5: 2.1075818407858953
mae 100 5: 2.065043470351204
mae 100 5: 2.159921135948344
mae 100 5: 2.1449122923165995
mae 100 5: 2.1478832837964648
mae 100 5: 2.0755009876833976
mae 100 5: 2.085501346484551
mae 100 5: 2.060753139261848
mae 100 5: 2.100066265324292
mae 100 8: 2.149663316319457
mae 100 8: 2.0438438274127324
mae 100 8: 2.048436610581725
mae 100 8: 2.091792451991077
mae 100 8: 1.9833956971381281
mae 100 8: 2.0961451539917317
mae 100 8: 2.1244267691121217
mae 100 8: 2.0540567231038507
mae 100 8: 2.158621319934064
mae 100 8: 2.0770264449363616
mae 100 10: 2.141219300892099
mae 100 10: 2.100818888385697
mae 100 10: 2.083874909656423
mae 100 10: 2.00249792441

KeyboardInterrupt: 

In [30]:
RFR_performance_map

{'friedman_mse 100 3': 2.1496711857310977,
 'friedman_mse 100 5': 2.1046904378433036,
 'friedman_mse 100 8': 2.0827408314521247,
 'friedman_mse 100 10': 2.0868295555479492,
 'friedman_mse 100 15': 2.084469527118274,
 'friedman_mse 150 3': 2.1490434422334954,
 'friedman_mse 150 5': 2.1025923163267377,
 'friedman_mse 150 8': 2.086341999696324,
 'friedman_mse 150 10': 2.0812572888966967,
 'friedman_mse 150 15': 2.084983039303137,
 'friedman_mse 200 3': 2.1501319962787933,
 'friedman_mse 200 5': 2.1030188430083214,
 'friedman_mse 200 8': 2.084644606608139,
 'friedman_mse 200 10': 2.079097126326261}

In [None]:
sum(all_RFR_result) / len(all_RFR_result)

In [None]:
print(X_train.keys()[model.feature_importances_.argsort()][::-1])

In [None]:
model.feature_importances_

In [None]:
all_RFR_result = []
RFR_performance_map = {}
for estimator in [100, 150, 200, 250]:
    for depth in [3,5,8,10,15]:
        all_RFR_result = []
        for train_index, test_index in kf.split(train_X):
            X_train, X_test = train_X.iloc[train_index], train_X.iloc[test_index]
            y_train, y_test = train_y.iloc[train_index], train_y.iloc[test_index]
            model = RandomForestRegressor(max_depth=depth, random_state=0,
                                                  n_estimators=estimator,)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            acc = mean_absolute_error(y_pred, y_test)
            print("mae " + str(estimator) + " " + str(depth) + ": {}".format(acc))
            all_RFR_result.append(acc)
        sing_res = sum(all_RFR_result) / len(all_RFR_result)
        RFR_performance_map[" " + str(estimator) + " " + str(depth)] = sing_res

mae 100 3: 2.0469906958061395
mae 100 3: 2.0439816628788634
mae 100 3: 2.1065289161254093
mae 100 3: 2.0104419259332147
mae 100 3: 2.058890188510088
mae 100 3: 2.0681201504707682
mae 100 3: 2.0630779807830275
mae 100 3: 2.1239547744051
mae 100 3: 2.112927186569614
mae 100 3: 2.1426240799940452
mae 100 5: 2.076036718938432
mae 100 5: 2.130295501509978
mae 100 5: 1.962372352222085
mae 100 5: 1.964847986135427
mae 100 5: 2.11591591171807
mae 100 5: 2.0638687311437156
mae 100 5: 2.031378536903276
mae 100 5: 2.0565995384280162
mae 100 5: 2.013712403290341
mae 100 5: 1.9734881683685856
mae 100 8: 1.9892503542201585
mae 100 8: 1.9656838798252643
mae 100 8: 1.9185612997784363
mae 100 8: 1.9874033720701518
mae 100 8: 2.0326810929904187
mae 100 8: 1.9891404403746396
mae 100 8: 2.014141913353543
mae 100 8: 1.9496976884692128
mae 100 8: 2.0475623215256364
mae 100 8: 2.1077118631723857
mae 100 10: 1.906630812126869
mae 100 10: 1.9885526019990538
mae 100 10: 1.993176409906161
mae 100 10: 1.965244604

# Nerual Network from pytorch

In [None]:
import torch
dtype = torch.float
device = torch.device("cpu")
in_d, H_1, H_2,H_3, H_4, D_out = train_X.shape[1], 100, 80,60,20, 1
X_np = train_X.values
X = torch.from_numpy(X_np)

In [None]:
X.shape

In [None]:
y_np = train_y.values
y = torch.from_numpy(y_np)

In [None]:
model = torch.nn.Sequential(
    torch.nn.Linear(in_d, H_1),
    torch.nn.ReLU(),
    torch.nn.Linear(H_1, H_2),
    torch.nn.ReLU(),
    torch.nn.Linear(H_2, H_3),
    torch.nn.ReLU(),
    torch.nn.Linear(H_4, D_out),)

In [None]:
model

In [None]:
loss_fn = torch.nn.L1Loss(reduction='mean')
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
for t in range(10000):
    y_pred = model(X.float())
    loss = loss_fn(y.float(), y_pred)
    if(t%100 == 0):
        print(t, loss.item())
    
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [90]:
loss = loss_fn(y.float(), y_pred)

In [91]:
loss.item()

20.530147552490234