In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input directory,
# in the same traversal order os.walk yields them, then print one path per line.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/g-research-crypto-forecasting/example_sample_submission.csv
/kaggle/input/g-research-crypto-forecasting/asset_details.csv
/kaggle/input/g-research-crypto-forecasting/example_test.csv
/kaggle/input/g-research-crypto-forecasting/train.csv
/kaggle/input/g-research-crypto-forecasting/supplemental_train.csv
/kaggle/input/g-research-crypto-forecasting/gresearch_crypto/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/g-research-crypto-forecasting/gresearch_crypto/__init__.py


In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import time
import xgboost as xgb
import gresearch_crypto
import traceback

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, RepeatedKFold
from sklearn.metrics import mean_squared_error
from matplotlib.colors import to_rgba

## <center>**EDA**</center>

### **train.csv - Column Description**
- **timestamp**: Unix timestamps in seconds (the number of seconds elapsed since 1970-01-01 00:00:00.000 UTC). Timestamps in this dataset are multiples of 60, indicating minute-by-minute data.
- **Asset_ID**: The asset ID corresponding to one of the crypto currencies (e.g. Asset_ID = 1 for Bitcoin). The mapping from Asset_ID to crypto asset is contained in asset_details.csv.
- **Count**: Total number of trades in the time interval (last minute).
- **Open**: Opening price of the time interval (in USD).
- **High**: Highest price reached during time interval (in USD).
- **Low**: Lowest price reached during time interval (in USD).
- **Close**: Closing price of the time interval (in USD).
- **Volume**: Quantity of asset bought or sold, displayed in base currency USD.
- **VWAP**: The average price of the asset over the time interval, weighted by volume. VWAP is an aggregated form of trade data.
- **Target**: Residual log-returns for the asset over a 15 minute horizon.

In [3]:
# Read Data

def read_data(nrows=None, data_dir='../input/g-research-crypto-forecasting'):
    """Load the training data and the asset metadata table.

    Parameters
    ----------
    nrows : int or None, default None
        Number of rows of ``train.csv`` to read; ``None`` reads the whole file.
        Only ``train.csv`` is limited by this argument.
    data_dir : str, default '../input/g-research-crypto-forecasting'
        Directory containing ``train.csv`` and ``asset_details.csv``. The
        default is the location used by this notebook, so existing calls
        behave exactly as before.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(data, asset_details)`` in that order.
    """
    # `os` is imported in the notebook's first cell.
    data = pd.read_csv(os.path.join(data_dir, 'train.csv'), nrows=nrows)
    asset_details = pd.read_csv(os.path.join(data_dir, 'asset_details.csv'))
    return data, asset_details

### **Checking Nulls And Inf**

In [4]:
def check_null_and_inf(data):
    """Print the null count of every column and the row positions that hold +/-inf.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. It is not modified.

    Notes
    -----
    Only numeric columns are scanned for infinities: ``np.isinf`` raises a
    ``TypeError`` on object/string columns. A row position is printed once
    per infinite cell it contains, so a row with two infinities appears twice.
    """
    print("Numbers of Nulls in Data:")
    print(data.isnull().sum(), end='\n\n')
    print("Inf in Data:")
    numeric = data.select_dtypes(include=[np.number])
    # First element of the nonzero tuple is the row position of each infinite cell.
    inf_rows = np.nonzero(np.isinf(numeric).to_numpy())[0]
    print(inf_rows)

In [5]:
# Drop Infinite and Nan
def drop_inf_and_nan(data):
    """Remove every row that contains NaN or +/-inf.

    Infinite values are first replaced by NaN, then all rows with at least one
    NaN are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is modified IN PLACE.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, returned for convenience so the
        call can be written as ``data = drop_inf_and_nan(data)``.
    """
    data.replace([np.inf, -np.inf], np.nan, inplace=True)
    data.dropna(axis=0, inplace=True)
    return data

## <center>**DataSet**</center>

### Hyperparameters

In [6]:
# Raw input columns fed to the models; get_features() selects these and
# appends the two derived shadow columns.
FEATURES = ['Count', 'Close','High', 'Low', 'Open', 'VWAP', 'Volume']
# Search space passed as `param_distributions` to RandomizedSearchCV in
# get_xgb_model_cv(); every value is a finite set of candidates.
PARAMS = {
    'colsample_bytree': [0.5, 0.7],
    'n_estimators': range(520, 600, 40),  # candidates: 520, 560
    'learning_rate': [0.01, 0.03, 0.05],
    'max_depth': range(11, 14, 1),  # candidates: 11, 12, 13
}

In [7]:
# def split_train_test(data, features):
#     X = data[features]
#     Y = data['Target']
#     X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=False)
    
#     return X_train, X_test, Y_train, Y_test

In [8]:
def crypto_df(asset_id, data):
    """Return the rows of ``data`` belonging to ``asset_id``, indexed by ``timestamp``.

    The input frame is left untouched; ``timestamp`` is moved from a column to
    the index of the returned frame.
    """
    is_asset = data["Asset_ID"] == asset_id
    return data.loc[is_asset].set_index("timestamp")

In [9]:
# Two new features from the competition tutorial
def upper_shadow(df):
    """Return ``High`` minus the larger of ``Close`` and ``Open``.

    Works element-wise on a DataFrame and on a single row (Series of scalars).
    """
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top

def lower_shadow(df):
    """Return the smaller of ``Close`` and ``Open`` minus ``Low``.

    Works element-wise on a DataFrame and on a single row (Series of scalars).
    """
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']

In [10]:
def get_features(df):
    """Build the model input: the FEATURES columns plus the two shadow features.

    A copy of the FEATURES selection is made, so ``df`` itself is not modified.
    ``Upper_Shadow`` is added first, then ``Lower_Shadow``.
    """
    features = df[FEATURES].copy()
    derived = (
        ('Upper_Shadow', upper_shadow),
        ('Lower_Shadow', lower_shadow),
    )
    # Item assignment (not .assign) so this also works when `df` is a single row Series.
    for column, compute in derived:
        features[column] = compute(features)

    return features

## <center>**Visualization**</center>

### **DATA DISTRIBUTION**

Training data distribution among different assets (cryptocurrencies)

In [11]:
def plot_dis(data):
    """Draw a bar chart of the number of rows per ``Asset_ID``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``Asset_ID`` column.

    Notes
    -----
    seaborn's ``countplot`` does the counting itself, so no manual per-asset
    count is needed. The plot is drawn on the current matplotlib axes;
    nothing is returned.
    """
    # countplot returns a matplotlib Axes, not a Figure.
    ax = sns.countplot(x="Asset_ID", data=data)
    ax.ticklabel_format(style='sci', axis='y')
    # Optional: label bars with asset names instead of numeric ids.
    # ax.set_xticklabels(asset_details.sort_values("Asset_ID")["Asset_Name"].tolist(), rotation=-30, horizontalalignment='left')
    ax.set(xlabel='Assets', ylabel='Number of Rows')

### **CANDLESTICK CHARTS**

In [12]:
def candelstick_chart(data,title):
    """Build a plotly candlestick figure from an OHLC frame.

    ``data`` must provide ``Open``, ``High``, ``Low`` and ``Close`` columns;
    its index is used for the x axis. ``title`` is interpolated into the
    figure title. The figure is returned, not shown.
    """
    ohlc_trace = go.Candlestick(
        x=data.index,
        open=data['Open'],
        high=data['High'],
        low=data['Low'],
        close=data['Close'],
    )
    figure = go.Figure(data=[ohlc_trace])

    # X axis: label plus a range slider under the chart.
    figure.update_xaxes(title_text='Minutes', rangeslider_visible=True)

    # Centered title near the top of the figure.
    title_layout = {
        'text': '{:} Candelstick Chart'.format(title),
        'y': 0.90,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    }
    figure.update_layout(title=title_layout)

    figure.update_yaxes(title_text='Price in USD', ticksuffix='$')

    return figure

### **AREA PLOT**

In [13]:
# Volume traded

def vol_traded(data):
    """Draw the ``Volume`` column as a filled area plot against the frame's index.

    Returns whatever ``plt.stackplot`` returns.
    """
    times, volumes = data.index, data.Volume
    return plt.stackplot(times, volumes, color='thistle')

## <center>**XG-Boosting**</center>

In [14]:
def xgb_cv(X, Y, params):
    """Run 3-fold XGBoost cross-validation and time it.

    Parameters
    ----------
    X, Y : features and labels, wrapped together in an ``xgb.DMatrix``.
    params : booster parameters forwarded to ``xgb.cv``.

    Returns
    -------
    (cv_results, run_time)
        The ``xgb.cv`` result (requested as a pandas object) and the
        wall-clock duration of the call in seconds.
    """
    cv_kwargs = dict(
        dtrain=xgb.DMatrix(data=X, label=Y),
        params=params,
        nfold=3,
        num_boost_round=50,
        early_stopping_rounds=10,
        metrics="rmse",
        as_pandas=True,
        seed=123,
    )

    # Only the cross-validation call itself is timed, not the DMatrix build.
    started = time.time()
    cv_results = xgb.cv(**cv_kwargs)
    run_time = time.time() - started

    return cv_results, run_time

## <center>**Pipeline**</center>

### Data (Training, Testing)

In [15]:
# Load the whole training file (nrows is left at None) together with the asset metadata table.
data, asset_details = read_data()

In [16]:
data.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [17]:
data.shape

(24236806, 10)

In [18]:
asset_details

Unnamed: 0,Asset_ID,Weight,Asset_Name
0,2,2.397895,Bitcoin Cash
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
3,5,1.386294,EOS.IO
4,7,2.079442,Ethereum Classic
5,6,5.894403,Ethereum
6,9,2.397895,Litecoin
7,11,1.609438,Monero
8,13,1.791759,TRON
9,12,2.079442,Stellar


In [19]:
# check_null_and_inf(data)

In [20]:
# Replace +/-inf with NaN and drop every row containing a NaN; `data` itself is modified in place.
data = drop_inf_and_nan(data)

In [21]:
# check_null_and_inf(data)

In [22]:
data['Asset_ID'].unique()

array([ 2,  0,  1,  5,  7,  6,  9, 11, 13, 12,  3,  8, 10,  4])

### Visualization

In [23]:
# Plot data distribution
# plot_dis(data)

In [24]:
# btc = crypto_df(1, data)
# eth = crypto_df (6, data)

In [25]:
# Plot candle stick

# btc_plot = candelstick_chart(btc[-100:], "Bitcoin")
# btc_plot.show()

In [26]:
# Plot volume
# vol_traded(btc[-50:])

### Model Training

In [27]:
# def get_train_test_for_asset(data, asset_id):
#     data = data[data["Asset_ID"] == asset_id]
#     X_train, X_test, Y_train, Y_test = split_train_test(data, FEATURES)
#     return X_train, X_test, Y_train, Y_test

In [28]:
def get_xgb_model(X_train, Y_train):
    """Fit an XGBoost regressor with fixed hyperparameters.

    Returns
    -------
    (estimator, time_elapsed)
        The fitted ``xgb.XGBRegressor`` and the fit duration rounded to whole
        seconds.
    """
    hyperparams = dict(
        n_estimators=500,
        max_depth=11,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.7,
        missing=-999,
        random_state=2020,
        tree_method='gpu_hist',  # GPU histogram algorithm: needs a GPU-enabled session
    )
    estimator = xgb.XGBRegressor(**hyperparams)

    # Time the fit only; constructing the estimator is excluded.
    fit_started = time.time()
    estimator.fit(X_train, Y_train)
    time_elapsed = round(time.time() - fit_started)

    return estimator, time_elapsed

In [29]:
def get_xgb_model_cv(X_train, Y_train):
    """Tune an XGBoost regressor with a randomized search, then refit the winner.

    The search samples candidates from the module-level ``PARAMS`` space and
    scores them with a 10-fold ``RepeatedKFold`` (one repeat). The selected
    values of ``colsample_bytree``, ``n_estimators``, ``learning_rate`` and
    ``max_depth`` are then used to train a final GPU model on all of the
    training data.

    Parameters
    ----------
    X_train, Y_train : training features and target.

    Returns
    -------
    (xgb_best, time_elapsed)
        The final fitted ``xgb.XGBRegressor`` and the duration of the search
        (not of the final fit) rounded to whole seconds.
    """
    # No fit on the base estimator: the search fits its own clones, so fitting
    # it here would only cost time without influencing the result.
    base_estimator = xgb.XGBRegressor(
        objective="reg:squarederror",
        nthread=4,
    )

    cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)

    random_search = RandomizedSearchCV(
        estimator=base_estimator,
        param_distributions=PARAMS,
        n_jobs=-1,
        cv=cv,
        random_state=1,
    )

    start_time = time.time()
    random_search.fit(X_train, Y_train)
    time_elapsed = round(time.time() - start_time)

    # best_params_ holds exactly the sampled values of the searched keys.
    best_params = random_search.best_params_

    # The final model is rebuilt rather than taken from best_estimator_ because it
    # uses extra settings (subsample, missing, GPU tree method) that the search did not.
    xgb_best = xgb.XGBRegressor(
        objective="reg:squarederror",
        nthread=4,
        colsample_bytree=best_params['colsample_bytree'],
        n_estimators=best_params['n_estimators'],
        learning_rate=best_params['learning_rate'],
        max_depth=best_params['max_depth'],
        subsample=0.9,
        random_state=1,
        missing=-999,
        tree_method='gpu_hist',
    )

    xgb_best.fit(X_train, Y_train)

    return xgb_best, time_elapsed

In [30]:
# Loop over all assets

def loop_over():
    """Train one model per asset listed in ``asset_details``.

    Reads the notebook-level ``data`` and ``asset_details`` frames. For each
    ``Asset_ID`` it selects that asset's rows, builds the feature frame with
    ``get_features`` and fits a model with ``get_xgb_model``.

    Returns
    -------
    (Xs_train, ys_train, models)
        Three dicts keyed by ``Asset_ID``: the feature frame and target series
        (both with a fresh 0..n-1 index) and the fitted model.
    """
    Xs_train = {}
    ys_train = {}
    models = {}

    for asset_id in asset_details['Asset_ID']:
        # Filter the full frame once per asset and reuse the result for X and y.
        asset_rows = data[data["Asset_ID"] == asset_id]

        # get_features selects the FEATURES columns itself and works on a copy.
        X_train = get_features(asset_rows)
        Y_train = asset_rows['Target']

        Xs_train[asset_id] = X_train.reset_index(drop=True)
        ys_train[asset_id] = Y_train.reset_index(drop=True)

        # The fit duration returned alongside the model is not used here.
        models[asset_id], _ = get_xgb_model(X_train, Y_train)

    return Xs_train, ys_train, models


In [31]:
# Train one model per asset; `models` is keyed by Asset_ID and is used by the submission loop below.
Xs_train, ys_train, models = loop_over()

In [32]:
# print('X train size:', Xs_train[1].shape)
## print('X test size:', Xs_test[1].shape)
# print('Y train size:', ys_train[1].shape)
## print('Y test size:', ys_test[1].shape)

In [33]:
## Save the models

# import pickle

# for index, asset_id in enumerate(asset_details['Asset_ID']):
#     filename = str(asset_id) + '.pkl'
#     with open(filename, 'wb') as file:
#         pickle.dump(models[index], file)

In [34]:
# # Load the models

# load_models = {}

# for index, asset_id in enumerate(asset_details['Asset_ID']):
#     filename = str(asset_id) + '.pkl'
#     with open(filename, 'rb') as file:
#         load_models[index] = pickle.load(file)
        
# load_models

In [35]:
# for i in range(len(asset_details)):
#     predicted = models[i].predict(Xs_test[i])
#     print(mean_squared_error(ys_test[i], predicted))

In [36]:
### Submit

In [37]:
# Submission loop: the competition API hands out one (test rows, prediction template)
# pair at a time and expects env.predict() to be called before the next pair.
env = gresearch_crypto.make_env()
iter_test = env.iter_test()

for df_test, df_pred in iter_test:
    for _, row in df_test.iterrows():
        row_mask = df_pred['row_id'] == row['row_id']

        # Fallback prediction, used when there is no model for the asset or
        # when the prediction fails.
        y_pred = 0

        # .get() so an Asset_ID without a trained model falls back to 0
        # instead of raising KeyError.
        model = models.get(row['Asset_ID'])
        if model is not None:
            try:
                x_test = get_features(row)
                y_pred = model.predict(pd.DataFrame([x_test]))[0]
            except Exception:
                # Best effort: log the failure and keep the fallback so the
                # submission continues. `Exception` (not a bare except) lets
                # KeyboardInterrupt and SystemExit still stop the run.
                traceback.print_exc()

        df_pred.loc[row_mask, 'Target'] = y_pred

    env.predict(df_pred)


This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
