In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score
from sklearn.ensemble import HistGradientBoostingRegressor, ExtraTreesRegressor

# Base tables plus the supplementary "*_new" tables that carry additional columns.
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
train_extra = pd.read_csv('./data/train_new.csv')
test_extra = pd.read_csv('./data/test_new.csv')

# Columns that already exist in the base table are removed from the extra table
# so the column-wise concat below cannot produce duplicated column names.
duplicate_train = [c for c in train_extra.columns if c in train.columns]
duplicate_test = [c for c in test_extra.columns if c in test.columns]
train_extra = train_extra.drop(columns=duplicate_train)
test_extra = test_extra.drop(columns=duplicate_test)

# The tables are joined purely by row position. concat(axis=1) aligns on the
# index, so a row-count mismatch would silently add NaN-padded rows; fail loudly.
if len(train_extra) != len(train):
    raise ValueError(
        f"train_new.csv has {len(train_extra)} rows but train.csv has {len(train)}"
    )
if len(test_extra) != len(test):
    raise ValueError(
        f"test_new.csv has {len(test_extra)} rows but test.csv has {len(test)}"
    )

# Both sides get a fresh 0..n-1 index so the alignment is strictly positional.
train = pd.concat(
    [train.reset_index(drop=True), train_extra.reset_index(drop=True)], axis=1
)
test = pd.concat(
    [test.reset_index(drop=True), test_extra.reset_index(drop=True)], axis=1
)

In [None]:
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor 
from catboost import CatBoostRegressor

In [None]:
# Quick look at the merged tables: shapes first, then each column list.
print(train.shape, test.shape)
for frame in (train, test):
    print(list(frame.columns))

In [None]:
all_columns = list(train.columns)

# Every train column except the targets, the row id and the time axis is
# treated as a base feature.
non_feature_columns = ("Y1", "Y2", "id", "time")
features = [col for col in train.columns if col not in non_feature_columns]

In [None]:
# Show the base feature names selected above.
print(features)

In [None]:
# NOTE: despite their names, `train_targets` is train WITHOUT the target columns
# and `test_ids` is test WITHOUT the id column.
# "id" is removed from train as well (when it exists) so both halves expose the
# same columns; otherwise the stacked frame would carry an "id" column that is
# NaN on every test row and would later be fed to the models as a feature.
train_targets = train.drop(columns=["Y1", "Y2"]).drop(columns=["id"], errors="ignore")
test_ids = test.drop(columns=["id"])

# Train rows first, then test rows, with a fresh 0..n-1 index.
full = pd.concat([train_targets, test_ids], ignore_index = True)

In [None]:
# Shape of the stacked train+test feature frame.
print(full.shape)

In [None]:
# Order rows chronologically before building lag/rolling features.
# A stable sort is required: later cells re-attach Y1/Y2 and id by row position,
# so rows sharing the same `time` must keep their original relative order
# (the default quicksort gives no such guarantee).
full = full.sort_values("time", kind = "mergesort").reset_index(drop = True)

In [None]:
# Row offsets used when creating shifted (lagged) copies of each base feature.
lag_list = [1, 2, 3, 5, 10]

In [None]:
# Lagged copies of every base feature: <feature>_lag<k> is the value k rows earlier.
# All new columns are built first and attached with a single concat; inserting
# them one at a time fragments the frame (pandas PerformanceWarning).
lag_columns = {
    f"{feature}_lag{k}": full[feature].shift(k)
    for feature in features
    for k in lag_list
}
# Dropping any previous copies keeps the cell safe to re-run.
full = pd.concat(
    [
        full.drop(columns=list(lag_columns), errors="ignore"),
        pd.DataFrame(lag_columns, index=full.index),
    ],
    axis=1,
)

In [None]:
# Spot check: the lag-1 column of the first base feature should now exist.
example = features[0]
expected = f"{example}_lag1"
lag_column_present = expected in full.columns
if not lag_column_present:
    print("Missing")

In [None]:
# First differences over 1 and 5 rows for every base feature.
# Columns are collected first and attached in one concat to avoid fragmenting
# the frame with repeated single-column inserts (pandas PerformanceWarning).
diff_columns = {}
for feature in features:
    s = full[feature]
    diff_columns[f"{feature}_diff1"] = s.diff(1)
    diff_columns[f"{feature}_diff5"] = s.diff(5)

# Dropping any previous copies keeps the cell safe to re-run.
full = pd.concat(
    [
        full.drop(columns=list(diff_columns), errors="ignore"),
        pd.DataFrame(diff_columns, index=full.index),
    ],
    axis=1,
)

In [None]:
# Trailing-window sizes (in rows) shared by the rolling mean / std features.
r_windows = [5, 20]

# Rolling means; min_periods=1 means the earliest rows use whatever history exists.
# Columns are built up front and attached with one concat so the frame is not
# fragmented by repeated single-column inserts (pandas PerformanceWarning).
rmean_columns = {
    f"{feature}_rmean{w}": full[feature].rolling(window = w, min_periods = 1).mean()
    for feature in features
    for w in r_windows
}
# Dropping any previous copies keeps the cell safe to re-run.
full = pd.concat(
    [
        full.drop(columns=list(rmean_columns), errors="ignore"),
        pd.DataFrame(rmean_columns, index=full.index),
    ],
    axis=1,
)


In [None]:
# Rolling dispersion and range features, collected first and attached in a single
# concat to avoid fragmenting the frame (pandas PerformanceWarning).
rolling_columns = {}

# Rolling standard deviation per window; NaN results (e.g. a window holding a
# single observation) are replaced by 0.0.
for feature in features:
    s = full[feature]
    for w in r_windows:
        rolling_columns[f"{feature}_rstd{w}"] = (
            s.rolling(window=w, min_periods=1).std().fillna(0.0)
        )

# Trailing 5-row minimum and maximum.
for feature in features:
    s = full[feature]
    rolling_columns[f"{feature}_rmin5"] = s.rolling(window = 5, min_periods = 1).min()
    rolling_columns[f"{feature}_rmax5"] = s.rolling(window = 5, min_periods = 1).max()

# Dropping any previous copies keeps the cell safe to re-run.
full = pd.concat(
    [
        full.drop(columns=list(rolling_columns), errors="ignore"),
        pd.DataFrame(rolling_columns, index=full.index),
    ],
    axis=1,
)

In [None]:
# Rebind `full` to a fresh copy of itself after the many column additions above.
full = full.copy()

In [None]:
# Split the engineered frame back into its train / test parts by row position.
no_train_rows = len(train)
train_fe = full.iloc[:no_train_rows].copy()
test_fe = full.iloc[no_train_rows:].copy()

# Y1/Y2 and id are later re-attached by position from the original `train` and
# `test` frames. That is only valid if sorting `full` by time left the rows in
# their original train-then-test order, so check it instead of assuming it.
if not np.array_equal(train_fe["time"].to_numpy(), train["time"].to_numpy()):
    raise ValueError(
        "Rows of `full` no longer line up with `train` after sorting by time; "
        "targets would be attached to the wrong rows."
    )
if not np.array_equal(test_fe["time"].to_numpy(), test["time"].to_numpy()):
    raise ValueError(
        "Rows of `full` no longer line up with `test` after sorting by time; "
        "ids would be attached to the wrong predictions."
    )

In [None]:
# Shapes of the engineered train and test parts.
for part in (train_fe, test_fe):
    print(part.shape)

In [None]:
# Latest timestamp in the train part and earliest timestamp in the test part.
train_time_max, test_time_min = train_fe["time"].max(), test_fe["time"].min()

In [None]:
# Print the two boundary timestamps as integers.
for boundary in (train_time_max, test_time_min):
    print(int(boundary))

In [None]:
# Squash every column except `time` through tanh, in both parts.
# NOTE: this overwrites the columns in place, so re-running the cell applies
# tanh a second time.
squashed_columns = [col for col in train_fe.columns if col != "time"]
train_fe[squashed_columns] = np.tanh(train_fe[squashed_columns])
test_fe[squashed_columns] = np.tanh(test_fe[squashed_columns])

In [None]:
# The first `max_lag` rows have incomplete lag features, so they are skipped.
# Derived from `lag_list` so the warm-up length cannot drift from the lags built.
max_lag = max(lag_list)
train_xy = train_fe.iloc[max_lag:].copy()
# Targets are attached by position: row i of `train` is taken to be row i of `train_fe`.
train_xy[["Y1","Y2"]] = train[["Y1","Y2"]].iloc[max_lag:].values

In [None]:
# Model inputs: everything except the two targets and the time axis.
non_input_columns = ["Y1", "Y2", "time"]
x = train_xy.drop(columns=non_input_columns)

# Targets as plain arrays.
y1, y2 = (train_xy[target_name].values for target_name in ("Y1", "Y2"))

In [None]:
# Feature matrix shape followed by the length of each target vector.
print(x.shape)
for target_values in (y1, y2):
    print(len(target_values))

In [None]:
# Cast the training feature matrix to float32 (x_test is cast the same way later).
x = x.astype(np.float32)

In [None]:
# Walk the 5 time-ordered splits and record the time range of each fold.
# Only the values from the final fold remain once the loop has finished.
tscv = TimeSeriesSplit(n_splits = 5)
fold = 0
time_axis = train_xy["time"]
for fold, (index, value) in enumerate(tscv.split(x), start = 1):
    fit_times = time_axis.iloc[index]
    holdout_times = time_axis.iloc[value]
    train_end = int(fit_times.max())
    value_start = int(holdout_times.min())
    value_end = int(holdout_times.max())

In [None]:
# Last fold: final training timestamp, then first validation timestamp.
for fold_boundary in (train_end, value_start):
    print(fold_boundary)

In [None]:
def cv(x, y, splits = 5):
    """Time-series cross-validate a 50/50 blend of HistGradientBoosting and ExtraTrees.

    For each `TimeSeriesSplit` fold, both regressors are fitted on the training
    indices, their validation predictions are averaged with equal weight, and
    the blend is scored with R^2. The mean score is printed.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix; rows are selected positionally with `.iloc`.
    y : array-like
        Target values, one per row of `x`. Converted to a NumPy array so a
        pandas Series is indexed by position rather than by label.
    splits : int, default 5
        Number of `TimeSeriesSplit` folds.

    Returns
    -------
    numpy.ndarray
        One R^2 score per fold, in fold order.
    """
    # Positional indexing below must not depend on a Series' index labels.
    y = np.asarray(y)

    hgb_params = dict(max_iter = 300, learning_rate = 0.06, max_leaf_nodes = 128, min_samples_leaf= 10, l2_regularization=0.0, random_state= 40)
    et_params = dict(n_estimators = 1200, min_samples_leaf = 3, max_features = 0.7, bootstrap = True, n_jobs = -1, max_samples = 0.9, random_state = 40, max_depth = None)

    tscv2 = TimeSeriesSplit(n_splits= splits)
    r2_scores = []
    for index, value in tscv2.split(x):
        x_train, x_value = x.iloc[index], x.iloc[value]
        y_train, y_value = y[index], y[value]

        # Fresh estimators per fold so nothing carries over between folds.
        hgb = HistGradientBoostingRegressor(**hgb_params)
        et = ExtraTreesRegressor(**et_params)

        hgb.fit(x_train, y_train)
        et.fit(x_train, y_train)

        # Equal-weight blend of the two models' validation predictions.
        predict_b = 0.5 * hgb.predict(x_value) + 0.5 * et.predict(x_value)
        r2_scores.append(r2_score(y_value, predict_b))

    r2_scores = np.array(r2_scores)
    print(r2_scores.mean())
    return r2_scores


In [None]:
# Blend CV for each target (Y1 first, then Y2), then the mean of the two means.
r2_y1, r2_y2 = (cv(x, target_values, splits = 5) for target_values in (y1, y2))
mean_r2_y1 = r2_y1.mean()
mean_r2_y2 = r2_y2.mean()
avg_cv = (mean_r2_y1 + mean_r2_y2) / 2

In [None]:
# Average of the two targets' R^2 on the final fold only.
last_fold_sum = r2_y1[-1] + r2_y2[-1]
print(last_fold_sum / 2)

In [None]:
# Test inputs: drop the time axis, take the training columns in the training
# order, and cast to float32 like `x`.
train_cols = list(x.columns)
x_test = test_fe.drop(columns = ["time"])[train_cols].astype(np.float32)
test_cols = list(x_test.columns)

def cv_single(x, y, model, splits = 5):
    """Time-series cross-validate a single estimator and return per-fold R^2.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix; rows are selected positionally with `.iloc`.
    y : array-like
        Target values, one per row of `x`. Converted to a NumPy array so a
        pandas Series is indexed by position rather than by label.
    model : estimator
        Template estimator exposing `get_params()`. It is never fitted itself;
        each fold fits a new instance of the same class built from its parameters.
    splits : int, default 5
        Number of `TimeSeriesSplit` folds.

    Returns
    -------
    numpy.ndarray
        One R^2 score per fold, in fold order.
    """
    # Positional indexing below must not depend on a Series' index labels.
    y = np.asarray(y)

    tscv3 = TimeSeriesSplit(n_splits= splits)
    scores = []
    for index, value in tscv3.split(x):
        # Rebuild an unfitted estimator with the template's parameters.
        m = model.__class__(**model.get_params())
        m.fit(x.iloc[index], y[index])
        fold_predictions = m.predict(x.iloc[value])
        scores.append(r2_score(y[value], fold_predictions))
    return np.array(scores)

# Template estimators for the per-model CV (cv_single fits fresh copies of them).
hgb_cv = HistGradientBoostingRegressor(
    max_iter = 300,
    learning_rate = 0.06,
    max_leaf_nodes = 128,
    min_samples_leaf = 10,
    l2_regularization = 0.0,
    random_state = 40,
)
et_cv = ExtraTreesRegressor(
    n_estimators = 1200,
    min_samples_leaf = 3,
    max_features = 0.7,
    bootstrap = True,
    n_jobs = -1,
    max_samples = 0.9,
    random_state = 40,
    max_depth = None,
)

# Per-fold R^2 for each model, Y1 first and then Y2.
r2_hgb_y1, r2_et_y1 = (cv_single(x, y1, template, splits = 5) for template in (hgb_cv, et_cv))
r2_hgb_y2, r2_et_y2 = (cv_single(x, y2, template, splits = 5) for template in (hgb_cv, et_cv))


In [None]:
# LightGBM settings.
# `subsample_freq = 1` is required for `subsample` to have any effect: LightGBM
# only performs row bagging when the bagging frequency is non-zero (default 0).
lgbm_params = dict(
    n_estimators = 1200, learning_rate = 0.03,
    num_leaves = 63, max_depth = -1,
    subsample = 0.9, subsample_freq = 1, colsample_bytree = 0.8,
    reg_lambda = 1.0, reg_alpha = 0.0,
    min_child_samples = 20, n_jobs = -1, random_state = 40
)

# XGBoost settings (histogram tree method).
xgb_params = dict(
    n_estimators = 1200, learning_rate = 0.03,
    max_depth = 8, subsample = 0.9, colsample_bytree = 0.8,
    reg_lambda = 1.0, reg_alpha = 0.0,
    min_child_weight = 2, tree_method = "hist", n_jobs = -1, random_state = 40
)

# CatBoost settings (RMSE objective, silent training).
cat_params = dict(
    depth = 8, learning_rate = 0.03, iterations = 1200,
    l2_leaf_reg = 3.0, loss_function = "RMSE",
    random_seed = 40, verbose = False
)

In [None]:
# Mean CV R^2 per model name, one dict per target; boosters are added later.
cv_scores_y1 = dict(hgb = r2_hgb_y1.mean(), et = r2_et_y1.mean())
cv_scores_y2 = dict(hgb = r2_hgb_y2.mean(), et = r2_et_y2.mean())

In [None]:
# Mean CV R^2 of each gradient-boosting library, for Y1 then Y2, recorded in
# the per-target score dicts under the library's short name.

# LightGBM
r2_lgbm_y1, r2_lgbm_y2 = (
    cv_single(x, target_values, LGBMRegressor(**lgbm_params), splits = 5).mean()
    for target_values in (y1, y2)
)
cv_scores_y1.update(lgbm = r2_lgbm_y1)
cv_scores_y2.update(lgbm = r2_lgbm_y2)

# XGBoost
r2_xgb_y1, r2_xgb_y2 = (
    cv_single(x, target_values, XGBRegressor(**xgb_params), splits = 5).mean()
    for target_values in (y1, y2)
)
cv_scores_y1.update(xgb = r2_xgb_y1)
cv_scores_y2.update(xgb = r2_xgb_y2)

# CatBoost
r2_cat_y1, r2_cat_y2 = (
    cv_single(x, target_values, CatBoostRegressor(**cat_params), splits = 5).mean()
    for target_values in (y1, y2)
)
cv_scores_y1.update(cat = r2_cat_y1)
cv_scores_y2.update(cat = r2_cat_y2)


In [None]:
# Keep only models whose mean CV R^2 is positive. If none qualifies, fall back
# to the single best-scoring model so the ensemble is never empty (averaging an
# empty list of predictions would yield NaN).
use_m_y1 = [name for name, s in cv_scores_y1.items() if s > 0.0] or [max(cv_scores_y1, key=cv_scores_y1.get)]
use_m_y2 = [name for name, s in cv_scores_y2.items() if s > 0.0] or [max(cv_scores_y2, key=cv_scores_y2.get)]

In [None]:
# Names of the models kept for Y1, then for Y2.
for selected_names in (use_m_y1, use_m_y2):
    print(selected_names)

In [None]:
# Shared settings for the final sklearn models fitted on all training rows.
# NOTE(review): max_iter is 700 here, whereas the CV cells used 300 — confirm
# this difference is intentional.
final_hgb_kwargs = dict(
    max_iter = 700,
    learning_rate = 0.06,
    max_leaf_nodes = 128,
    min_samples_leaf = 10,
    l2_regularization = 0.0,
    random_state = 40,
)
final_et_kwargs = dict(
    n_estimators = 1200,
    min_samples_leaf = 3,
    max_features = 0.7,
    bootstrap = True,
    n_jobs = -1,
    max_samples = 0.9,
    random_state = 40,
    max_depth = None,
)

# Y1 final models
final_hgb_y1 = HistGradientBoostingRegressor(**final_hgb_kwargs).fit(x, y1)
final_et_y1 = ExtraTreesRegressor(**final_et_kwargs).fit(x, y1)

# Y2 final models
final_hgb_y2 = HistGradientBoostingRegressor(**final_hgb_kwargs).fit(x, y2)
final_et_y2 = ExtraTreesRegressor(**final_et_kwargs).fit(x, y2)

# Fitted models keyed by the same short names used in the CV score dicts.
models_y1 = dict(hgb = final_hgb_y1, et = final_et_y1)
models_y2 = dict(hgb = final_hgb_y2, et = final_et_y2)

In [None]:
# Fit each boosting library on the full training set, once per target, and
# register it under its short name (Y1 is fitted before Y2 for every library).
booster_specs = (
    ("lgbm", LGBMRegressor, lgbm_params),
    ("xgb", XGBRegressor, xgb_params),
    ("cat", CatBoostRegressor, cat_params),
)
for booster_name, booster_cls, booster_kwargs in booster_specs:
    models_y1[booster_name] = booster_cls(**booster_kwargs).fit(x, y1)
    models_y2[booster_name] = booster_cls(**booster_kwargs).fit(x, y2)

In [None]:
# Test-set predictions from every selected model, one list per target.
P1 = []
for name in use_m_y1:
    P1.append(models_y1[name].predict(x_test))

P2 = []
for name in use_m_y2:
    P2.append(models_y2[name].predict(x_test))

# Unweighted average across the selected models.
predicted_y1 = np.mean(P1, axis = 0)
predicted_y2 = np.mean(P2, axis = 0)

In [None]:
# Mean and standard deviation of the averaged predictions, Y1 then Y2.
for averaged_predictions in (predicted_y1, predicted_y2):
    print(float(np.mean(averaged_predictions)))
    print(float(np.std(averaged_predictions)))

In [None]:
# Submission table: ids taken from `test` in its original row order, paired
# positionally with the averaged predictions.
sub = test[['id']].assign(Y1 = predicted_y1, Y2 = predicted_y2)

sub.to_csv('preds.csv', index=False)
print(sub.shape)