In [None]:
import os
import numpy as np 
import pandas as pd 
import random
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMRegressor
import time
from datetime import datetime
# auxiliary function, from a "dd/mm/YYYY" date string to a Unix timestamp
def totimestamp(s):
    """Convert a ``"dd/mm/YYYY"`` date string to epoch seconds as ``np.int32``.

    The date is treated as midnight UTC. The previous implementation used
    ``time.mktime``, which interprets the date in the machine's local
    timezone, so the train/test window bounds shifted on non-UTC hosts.
    Assumes the data's ``timestamp`` column is UTC epoch seconds — TODO confirm.

    Raises:
        ValueError: if ``s`` does not match the ``%d/%m/%Y`` format.
    """
    # Naive-datetime subtraction is timezone independent.
    elapsed = datetime.strptime(s, "%d/%m/%Y") - datetime(1970, 1, 1)
    return np.int32(elapsed.total_seconds())
import gresearch_crypto

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the competition training table from the Kaggle input directory and
# preview its first rows.
df_train = pd.read_csv('/kaggle/input/g-research-crypto-forecasting/train.csv')
df_train.head()

In [None]:
# Load the per-asset metadata table; later cells read its 'Asset_ID' and
# 'Asset_Name' columns to drive the per-asset training loop.
asset_data = pd.read_csv('/kaggle/input/g-research-crypto-forecasting/asset_details.csv')
asset_data

In [None]:
# Feature engineering part

def log_return(series, periods=1):
    """Return the difference of log(series) between each row and the row
    `periods` positions earlier (NaN where no earlier row exists)."""
    logged = np.log(series)
    lagged = logged.shift(periods)
    return logged - lagged

def zscore(x, window):
    """Rolling z-score of `x`.

    Each value is centred on the mean of the trailing `window` rows and divided
    by their population standard deviation (ddof=0). Windows are allowed to be
    as short as one row (min_periods=1), where the deviation is zero and the
    result is NaN.
    """
    trailing = x.rolling(window=window, min_periods=1)
    centred = x - trailing.mean()
    return centred / trailing.std(ddof=0)

# Upper candlestick wick: the high minus the top of the candle body
# (the larger of close and open).
def upper_shadow(df):
    """Return df['High'] minus the row-wise maximum of 'Close' and 'Open'."""
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top
# Lower candlestick wick: the bottom of the candle body (the smaller of close
# and open) minus the low.
def lower_shadow(df):
    """Return the row-wise minimum of 'Close' and 'Open' minus df['Low']."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']
# 15-period upper wick: highest high over the trailing 15 rows minus the candle
# body top (max of close/open) taken 15 rows earlier.
def upper_shadow_15(df):
    """Rolling 15-row maximum of 'High' (min_periods=1) minus the body top
    shifted by 15 rows; the first 15 rows are therefore NaN."""
    peak_high = df['High'].rolling(window=15, min_periods=1).max()
    lagged_body_top = np.maximum(df['Close'], df['Open']).shift(15)
    return peak_high - lagged_body_top
# 15-period lower wick: candle body bottom (min of close/open) taken 15 rows
# earlier minus the lowest low over the trailing 15 rows.
def lower_shadow_15(df):
    """Body bottom shifted by 15 rows minus the rolling 15-row minimum of
    'Low' (min_periods=1); the first 15 rows are therefore NaN."""
    lagged_body_bottom = np.minimum(df['Close'], df['Open']).shift(15)
    trough_low = df['Low'].rolling(window=15, min_periods=1).min()
    return lagged_body_bottom - trough_low

# Relative upper wick: ratio of the high to the candle body top (max of
# close/open), expressed as an excess over 1.
def upper_shadow_percent(df):
    """Return High / max(Close, Open) - 1, row by row."""
    body_top = np.maximum(df['Close'], df['Open'])
    ratio = df['High'] / body_top
    return ratio - 1
# Relative lower wick: ratio of the candle body bottom (min of close/open) to
# the low, expressed as an excess over 1.
def lower_shadow_percent(df):
    """Return min(Close, Open) / Low - 1, row by row."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    ratio = body_bottom / df['Low']
    return ratio - 1
# Relative 15-period upper wick: highest high over the trailing 15 rows divided
# by the candle body top from 15 rows earlier, minus 1.
def upper_shadow_15_perc(df):
    """Rolling 15-row max of 'High' (min_periods=1) over the body top shifted
    by 15 rows, minus 1; the first 15 rows are NaN."""
    peak_high = df['High'].rolling(window=15, min_periods=1).max()
    lagged_body_top = np.maximum(df['Close'], df['Open']).shift(15)
    ratio = peak_high / lagged_body_top
    return ratio - 1
def lower_shadow_15_perc(df):
    """Candle body bottom (min of close/open) from 15 rows earlier divided by
    the rolling 15-row minimum of 'Low' (min_periods=1), minus 1; the first 15
    rows are NaN."""
    lagged_body_bottom = np.minimum(df['Close'], df['Open']).shift(15)
    trough_low = df['Low'].rolling(window=15, min_periods=1).min()
    ratio = lagged_body_bottom / trough_low
    return ratio - 1

# Open-price amplitude over the window: highest open divided by lowest open.
def open_amp(df):
    """Ratio of the rolling 15-row max to the rolling 15-row min of 'Open'
    (windows may be shorter at the start, min_periods=1)."""
    window = df['Open'].rolling(window=15, min_periods=1)
    return window.max() / window.min()
# Close-price amplitude over the window: highest close divided by lowest close.
def close_amp(df):
    """Ratio of the rolling 15-row max to the rolling 15-row min of 'Close'
    (windows may be shorter at the start, min_periods=1)."""
    window = df['Close'].rolling(window=15, min_periods=1)
    return window.max() / window.min()
# High-price amplitude over the window: largest high divided by smallest high.
def high_amp(df):
    """Ratio of the rolling 15-row max to the rolling 15-row min of 'High'
    (windows may be shorter at the start, min_periods=1)."""
    window = df['High'].rolling(window=15, min_periods=1)
    return window.max() / window.min()
# Low-price amplitude over the window: largest low divided by smallest low.
def low_amp(df):
    """Ratio of the rolling 15-row max to the rolling 15-row min of 'Low'
    (windows may be shorter at the start, min_periods=1)."""
    window = df['Low'].rolling(window=15, min_periods=1)
    return window.max() / window.min()
# High/low amplitude over the window: highest high divided by lowest low.
def high_amp_low(df):
    """Rolling 15-row max of 'High' divided by the rolling 15-row min of 'Low'
    (both with min_periods=1)."""
    peak_high = df['High'].rolling(window=15, min_periods=1).max()
    trough_low = df['Low'].rolling(window=15, min_periods=1).min()
    return peak_high / trough_low
# Largest single-row high/low ratio seen within the window.
def high_amp_low_day(df):
    """Rolling 15-row maximum (min_periods=1) of the per-row High / Low ratio."""
    per_row_ratio = df['High'] / df['Low']
    return per_row_ratio.rolling(window=15, min_periods=1).max()

# Volume amplitude over the window: largest volume divided by smallest volume.
def vol_amp(df):
    """Ratio of the rolling 15-row max to the rolling 15-row min of 'Volume'
    (windows may be shorter at the start, min_periods=1)."""
    window = df['Volume'].rolling(window=15, min_periods=1)
    return window.max() / window.min()
# VWAP amplitude over the window: largest VWAP divided by smallest VWAP.
def amount_amp(df):
    """Ratio of the rolling 15-row max to the rolling 15-row min of 'VWAP'
    (windows may be shorter at the start, min_periods=1)."""
    window = df['VWAP'].rolling(window=15, min_periods=1)
    return window.max() / window.min()
# Trade-count amplitude over the window: largest count divided by smallest count.
def count_amp(df):
    """Ratio of the rolling 15-row max to the rolling 15-row min of 'Count'
    (windows may be shorter at the start, min_periods=1)."""
    window = df['Count'].rolling(window=15, min_periods=1)
    return window.max() / window.min()


In [None]:
def get_features(df):
    """Build the model feature frame for one asset's rows.

    Starts from a copy of the raw 'Count', 'Open', 'High', 'Low', 'Close',
    'Volume' and 'VWAP' columns and appends the engineered columns in a fixed
    order (callers convert the frame to a positional array, so order matters).
    Every engineered column is computed from the original `df`, not from the
    partially built result.
    """
    raw_columns = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']
    df_feat = df[raw_columns].copy()

    # (output column, builder) pairs, listed in output order.
    engineered = [
        ('upper_shadow', upper_shadow),
        ('lower_shadow', lower_shadow),
        ('upper_shadow_15', upper_shadow_15),
        ('lower_shadow_15', lower_shadow_15),
        ('upper_shadow_percent', upper_shadow_percent),
        ('lower_shadow_percent', lower_shadow_percent),
        ('upper_shadow_15_perc', upper_shadow_15_perc),
        ('lower_shadow_15_perc', lower_shadow_15_perc),
        ('open_amp', open_amp),
        ('close_amp', close_amp),
        ('high_amp', high_amp),
        ('low_amp', low_amp),
        ('high_amp_low', high_amp_low),
        ('high_amp_low_day', high_amp_low_day),
        ('zscoreH', lambda frame: zscore(frame['High'], 15)),
        ('zscoreL', lambda frame: zscore(frame['Low'], 15)),
        ('zscoreC', lambda frame: zscore(frame['Close'], 15)),
        ('zscoreO', lambda frame: zscore(frame['Open'], 15)),
        ('vol_amp', vol_amp),
        ('count_amp', count_amp),
        ('amount_amp', amount_amp),
        ('log_return5', lambda frame: log_return(frame.VWAP, periods=5)),
        # Note: this column holds the absolute value of the 1-step log return.
        ('log_return1', lambda frame: log_return(frame.VWAP, periods=1).abs()),
    ]
    for column_name, build in engineered:
        df_feat[column_name] = build(df)

    return df_feat

In [None]:
from sklearn.preprocessing import StandardScaler

def get_data_for_asset(df_train, asset_id, return_scaler=False):
    """Build standardized train/test arrays for a single asset.

    Rows of `df_train` whose 'Asset_ID' equals `asset_id` are indexed by
    'timestamp', run through `get_features`, and split into a May 2021 training
    window and a June 2021 test window. Features are standardized with a
    `StandardScaler` fitted on the training window only.

    Args:
        df_train: frame with at least 'Asset_ID', 'timestamp', 'Target' and the
            raw columns `get_features` reads.
        asset_id: value matched against the 'Asset_ID' column.
        return_scaler: when True, the fitted scaler is appended to the returned
            tuple so inference data can be transformed identically.

    Returns:
        (X_train_scaled, y_train, X_test_scaled, y_test), plus the fitted
        scaler as a fifth element when `return_scaler` is True.
    """
    df = df_train[df_train["Asset_ID"] == asset_id].set_index("timestamp")
    X = get_features(df)
    y = df['Target']

    # select training and test periods
    train_start, train_end = totimestamp("01/05/2021"), totimestamp("30/05/2021")
    test_start, test_end = totimestamp("01/06/2021"), totimestamp("30/06/2021")

    def _features_to_array(frame):
        # The ratio features divide by rolling minima, which can be zero and
        # produce +/-inf; fillna alone leaves those in place and StandardScaler
        # rejects non-finite input. Treat them like missing values (-> 0).
        return frame.replace([np.inf, -np.inf], np.nan).fillna(0).to_numpy()

    X_train = _features_to_array(X.loc[train_start:train_end])
    X_test = _features_to_array(X.loc[test_start:test_end])
    # NOTE(review): missing targets are replaced with 0 rather than dropped,
    # as in the original code — confirm this is intended.
    y_train = y.loc[train_start:train_end].fillna(0).to_numpy()
    y_test = y.loc[test_start:test_end].fillna(0).to_numpy()

    # Standardize using statistics from the training window only.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    if return_scaler:
        return X_train_scaled, y_train, X_test_scaled, y_test, scaler
    return X_train_scaled, y_train, X_test_scaled, y_test

In [None]:

def get_testdata_for_asset(df_train, asset_id):
    """Return the unscaled June 2021 feature matrix and targets for one asset.

    Rows of `df_train` whose 'Asset_ID' equals `asset_id` are indexed by
    'timestamp' and passed through `get_features`; the slice between
    01/06/2021 and 30/06/2021 is returned as NumPy arrays with NaNs replaced
    by zero. No scaling is applied here.

    Returns:
        (X_test, y_test)
    """
    asset_rows = df_train.loc[df_train["Asset_ID"] == asset_id].set_index("timestamp")
    feature_frame = get_features(asset_rows)
    target = asset_rows['Target']

    # Bounds of the evaluation window.
    window_start = totimestamp("01/06/2021")
    window_end = totimestamp("30/06/2021")

    X_test = feature_frame.loc[window_start:window_end].fillna(0).to_numpy()
    y_test = target.loc[window_start:window_end].fillna(0).to_numpy()
    return X_test, y_test

In [None]:
def model_training(X, y, n_estimators=5000, num_leaves=700, learning_rate=0.1, **lgbm_params):
    """Fit and return a LightGBM regressor on (X, y).

    The defaults reproduce the previously hard-coded configuration; they are
    exposed as keyword arguments so callers can tune the model (or pass extra
    estimator options such as `random_state`) without editing this function.

    Args:
        X: feature matrix.
        y: target vector.
        n_estimators: number of boosting rounds.
        num_leaves: maximum leaves per tree.
        learning_rate: boosting learning rate.
        **lgbm_params: any additional keyword arguments forwarded to
            `LGBMRegressor`.

    Returns:
        The fitted `LGBMRegressor`.
    """
    model = LGBMRegressor(
        n_estimators=n_estimators,
        num_leaves=num_leaves,
        learning_rate=learning_rate,
        **lgbm_params,
    )
    model.fit(X, y)
    return model

In [None]:
%%time
# Train one model per asset listed in `asset_data`, keeping the scaled training
# features, the training targets and the fitted model keyed by asset id.
Xs, ys, models = {}, {}, {}

for asset_id, asset_name in zip(asset_data['Asset_ID'], asset_data['Asset_Name']):
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")

    # The test-window arrays are returned as well but are not used in this loop.
    X_train_scaled, y_train, X_test_scaled, y_test = get_data_for_asset(df_train, asset_id)
    model = model_training(X_train_scaled, y_train)

    Xs[asset_id] = X_train_scaled
    ys[asset_id] = y_train
    models[asset_id] = model

In [None]:
# Scratch check: look at one row of the held-out feature matrix.
# NOTE(review): `asset_id` is not assigned in this cell; it is whatever value
# the per-asset training loop left behind (its last iteration) — confirm.
# NOTE(review): get_testdata_for_asset returns unscaled features, whereas the
# models were fitted on StandardScaler-transformed arrays.
#print("Check the model and it's possibility for the prediction")
X_test, y_test = get_testdata_for_asset(df_train, asset_id)
df_X_test = pd.DataFrame(X_test)
x = df_X_test.iloc[1]
print(df_X_test.iloc[1])
#y_pred = models[0].predict([x])
#y_pred[0]

In [None]:
# Print the whole held-out feature frame as plain text (pandas truncates long frames).
print(df_X_test)

In [None]:
#prediction test
# Smoke test: predict one row for asset 0 with its trained model.
# NOTE(review): `df_test` is not defined in any cell visible in this notebook;
# it presumably comes from the competition API — confirm it is loaded before
# this cell runs.
# NOTE(review): the features below are raw (not scaled, NaNs not filled),
# while models[0] was fitted on scaled, zero-filled arrays.
df_test_timestamp= df_test[df_test["Asset_ID"] == 0].set_index("timestamp") 
print(df_test_timestamp)
test_features = get_features(df_test_timestamp)
model = models[0]
print(test_features.iloc[1])
ytest_pred = model.predict([test_features.iloc[1]])
print(ytest_pred)

In [None]:
# Build the submission frame: for every trained asset model, predict a Target
# for each of that asset's rows in `df_test`, then order the result by row_id.
#
# Fixes relative to the previous version of this cell:
#   * predictions use each asset's own features (`asset_features`); previously
#     every asset was scored with `test_features`, i.e. asset 0's features left
#     over from an earlier scratch cell;
#   * all rows of an asset are predicted in one call instead of a hard-coded
#     first four rows, and the asset ids come from `models` instead of range(14);
#   * NaNs are zero-filled, matching how the training features were prepared;
#   * per-asset results are collected in a list and concatenated once.
#
# NOTE(review): `df_test` is not defined in any cell visible in this notebook —
# confirm it is loaded before this cell runs.
# NOTE(review): the models were fitted on StandardScaler-transformed features,
# but the fitted scalers are not kept anywhere, so these inputs are unscaled.
asset_results = []
for asset_id in sorted(models):
    df_asset_series = df_test[df_test["Asset_ID"] == asset_id].set_index("timestamp")
    if df_asset_series.empty:
        # Nothing to predict for this asset in the current test frame.
        continue

    asset_features = get_features(df_asset_series).fillna(0)
    predictions = models[asset_id].predict(asset_features.to_numpy())

    result = df_asset_series[['group_num', 'row_id']].copy()
    # ravel() guarantees a flat vector whose length matches the asset's rows.
    result['Target'] = np.asarray(predictions).ravel()
    asset_results.append(result)

if asset_results:
    sub_df = pd.concat(asset_results, axis=0)
else:
    sub_df = pd.DataFrame(columns=['group_num', 'row_id', 'Target'])
submission = sub_df.sort_values(by=['row_id'])
    
    


In [None]:
# Write the predictions to the Kaggle working directory. `index` is left at its
# default, so the frame's index (the timestamp set upstream) is written as the
# first CSV column — NOTE(review): confirm that is wanted in the submission file.
submission.to_csv('/kaggle/working/submission.csv')