In [None]:
import math
import time

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression, Ridge, SGDRegressor
from sklearn.metrics import confusion_matrix, f1_score, mean_absolute_error, make_scorer
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

In [None]:
# Load the training and test tables from the local `data/` directory.
# The test frame is bound to `test_df` because every later cell refers to it
# by that name; with only `test` defined those cells raise NameError.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
test = test_df  # alias kept so code using the original name still works

In [None]:
%%time
# Trailing-window lengths used for the rolling statistics below.
window_sizes = [10, 25, 50, 100]

for size in window_sizes:
    tag = str(size)
    min_col = "rolling_min_" + tag
    max_col = "rolling_max_" + tag

    # One rolling view per window length, reused for every statistic.
    # With an integer window the first `size - 1` rows come out as NaN.
    windowed = train_df['signal'].rolling(window=size)
    train_df["rolling_mean_" + tag] = windowed.mean()
    train_df["rolling_std_" + tag] = windowed.std()
    train_df["rolling_var_" + tag] = windowed.var()
    train_df[min_col] = windowed.min()
    train_df[max_col] = windowed.max()

    # Relationship between the window extremes.
    span = train_df[max_col] - train_df[min_col]
    train_df["rolling_min_max_ratio_" + tag] = train_df[min_col] / train_df[max_col]
    train_df["rolling_min_max_diff_" + tag] = span

    # Position of the current sample inside the window's [min, max] range,
    # rescaled by the integer span floor(max) - ceil(min).
    position = (train_df['signal'] - train_df[min_col]) / span
    train_df["norm_" + tag] = position * (
        np.floor(train_df[max_col]) - np.ceil(train_df[min_col])
    )

# Divisions by zero can yield +/-inf; map those, together with the NaNs from
# the window warm-up, to 0.
train_df = train_df.replace([np.inf, -np.inf], np.nan).fillna(0)

# Split into target and feature matrix; 'time' is not used as a feature.
train_y = train_df['open_channels']
train_x = train_df.drop(columns=['time', 'open_channels'])

# The full frame is no longer needed once the split exists.
del train_df

In [None]:
# `train_x` and `train_y` are already produced by the feature-engineering cell
# above. Rebuilding them here from `kagg_train` cannot work on a fresh kernel:
# that name is never defined in this notebook, and doing so would also replace
# the engineered feature matrix. Only verify that the two are row-aligned.
assert len(train_x) == len(train_y), "features and target must have the same number of rows"

In [None]:
# Fit the standardiser on the training features only; the same fitted object
# is reused later to transform the test features.
scaler = StandardScaler().fit(train_x)

# `transform` returns a plain array, so re-attach the feature names.
train_x_scaled = pd.DataFrame(
    scaler.transform(train_x),
    columns=train_x.columns,
)

# The unscaled copy is not needed past this point.
del train_x

In [None]:
# Build the same rolling-window features for the test frame, in the same
# column order as for the training frame.
for size in window_sizes:
    tag = str(size)
    min_col = "rolling_min_" + tag
    max_col = "rolling_max_" + tag

    # One rolling view per window length, reused for every statistic.
    windowed = test_df['signal'].rolling(window=size)
    test_df["rolling_mean_" + tag] = windowed.mean()
    test_df["rolling_std_" + tag] = windowed.std()
    test_df["rolling_var_" + tag] = windowed.var()
    test_df[min_col] = windowed.min()
    test_df[max_col] = windowed.max()

    # Relationship between the window extremes.
    span = test_df[max_col] - test_df[min_col]
    test_df["rolling_min_max_ratio_" + tag] = test_df[min_col] / test_df[max_col]
    test_df["rolling_min_max_diff_" + tag] = span

    # Position of the current sample inside the window's [min, max] range,
    # rescaled by the integer span floor(max) - ceil(min).
    position = (test_df['signal'] - test_df[min_col]) / span
    test_df["norm_" + tag] = position * (
        np.floor(test_df[max_col]) - np.ceil(test_df[min_col])
    )

# Divisions by zero can yield +/-inf; map those, together with the NaNs from
# the window warm-up, to 0.
test_df = test_df.replace([np.inf, -np.inf], np.nan).fillna(0)

In [None]:
# Scale the test features with the scaler fitted on the training features.
# `test_df` still holds every column it was loaded with, whereas the training
# matrix had 'time' dropped before the scaler was fitted. Select exactly the
# training feature columns, in training order, so the transform sees the same
# layout it was fitted on (a missing feature fails loudly with KeyError).
feature_columns = train_x_scaled.columns
test_x_scaled = pd.DataFrame(
    scaler.transform(test_df[feature_columns]),
    columns=feature_columns,
)

In [None]:
# Cross-validation layout: shuffled K-fold with a fixed seed so the splits are
# reproducible between runs.
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)

# Keyword arguments unpacked into the model constructor by the training cell.
params = dict(
    num_leaves=128,
    min_data_in_leaf=64,
    objective='huber',
    max_depth=-1,
    learning_rate=0.005,
    boosting="gbdt",
    bagging_freq=5,
    bagging_fraction=0.8,
    bagging_seed=11,
    metric='mae',
    verbosity=-1,
    reg_alpha=0.1,
    reg_lambda=0.3,
)

In [None]:
# K-fold training. Each fold fits a fresh model, writes its validation
# predictions into the matching slice of `oof`, and adds its test prediction
# to `prediction`, which is averaged over the folds at the end.
# Requires `time` and `lightgbm as lgb` to be imported in the imports cell.
oof = np.zeros(len(train_x_scaled))
prediction = np.zeros(len(test_x_scaled))
scores = []

for fold_n, (train_index, valid_index) in enumerate(folds.split(train_x_scaled)):
    print('Fold', fold_n, 'started at', time.ctime())
    X_train, X_valid = train_x_scaled.iloc[train_index], train_x_scaled.iloc[valid_index]
    y_train, y_valid = train_y.iloc[train_index], train_y.iloc[valid_index]

    # NOTE(review): `verbose` and `early_stopping_rounds` as fit() keyword
    # arguments were removed in LightGBM 4.0; when upgrading, pass
    # `callbacks=[lgb.early_stopping(200), lgb.log_evaluation(500)]` instead.
    model = lgb.LGBMRegressor(**params, n_estimators=5000, n_jobs=-1)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric='mae',
              verbose=500, early_stopping_rounds=200)

    # Use the same (early-stopped) iteration for validation and test so the
    # reported fold score describes the model that produces the submission.
    best_iteration = model.best_iteration_
    y_pred_valid = model.predict(X_valid, num_iteration=best_iteration)
    y_pred = model.predict(test_x_scaled, num_iteration=best_iteration)

    oof[valid_index] = y_pred_valid.reshape(-1,)
    scores.append(mean_absolute_error(y_valid, y_pred_valid))

    prediction += y_pred

# Equal-weight average of the per-fold test predictions.
prediction /= n_fold

# Report the cross-validation result; the per-fold values stay in `scores`.
print('CV MAE: {:.4f} (std {:.4f})'.format(np.mean(scores), np.std(scores)))

In [None]:
# Build the submission from the sample file. `time` is read as text so its
# values are written back exactly as they appear in the template.
sub = pd.read_csv("data/sample_submission.csv", dtype={'time': str})

# Round the fold-averaged regression output to whole numbers.
# `np.int` was removed in NumPy 1.24 (it was only an alias of the builtin
# `int`), so use the builtin directly; the resulting dtype is the same.
sub['open_channels'] = np.round(prediction).astype(int)
sub.to_csv("submission.csv", index=False, float_format='%.4f')