# Import data and load libraries

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/directional-forecasting-in-cryptocurrencies/sample_submission.csv
/kaggle/input/directional-forecasting-in-cryptocurrencies/train.csv
/kaggle/input/directional-forecasting-in-cryptocurrencies/test.csv


In [2]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas import read_csv, set_option
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from xgboost import XGBClassifier, plot_importance
from bayes_opt import BayesianOptimization


In [3]:
# Read the competition's train and test CSV files into DataFrames (train first, then test).
train_df, test_df = (
    pd.read_csv('/kaggle/input/directional-forecasting-in-cryptocurrencies/train.csv'),
    pd.read_csv('/kaggle/input/directional-forecasting-in-cryptocurrencies/test.csv'),
)

In [4]:
# Report the (rows, columns) shape of each loaded frame as a quick check on the load.
print('train set shape:', train_df.shape)
print('test set shape:', test_df.shape)

train set shape: (2122438, 11)
test set shape: (909617, 11)


In [5]:
# Preview the first rows of the training data (rendered as the cell output).
train_df.head()

Unnamed: 0,timestamp,open,high,low,close,volume,quote_asset_volume,number_of_trades,taker_buy_base_volume,taker_buy_quote_volume,target
0,1525471260,0.9012,0.9013,0.9012,0.9013,134.98,121.646459,4.0,125.08,112.723589,1.0
1,1525471320,0.90185,0.90195,0.90185,0.90195,1070.54,965.505313,12.0,879.94,793.612703,0.0
2,1525471380,0.9014,0.9014,0.90139,0.90139,2293.06,2066.963991,5.0,0.0,0.0,0.0
3,1525471440,0.90139,0.9014,0.90138,0.90139,6850.59,6175.000909,19.0,1786.3,1610.149485,0.0
4,1525471500,0.90139,0.90139,0.9013,0.9013,832.3,750.222624,3.0,784.82,707.4289,0.0


In [6]:
# Replace the `timestamp` column (epoch seconds) with a datetime index named `Date`
# on both frames, in place.
# The guard keeps this cell safe to re-run: once a frame has been converted its
# `timestamp` column is gone, and repeating the conversion would raise a KeyError.
for frame in (train_df, test_df):
    if 'timestamp' in frame.columns:
        # Convert timestamp to date
        frame['Date'] = pd.to_datetime(frame['timestamp'], unit='s')
        # Set the date as the index
        frame.set_index('Date', inplace=True)
        # Drop the timestamp column
        frame.drop(columns='timestamp', inplace=True)

# Feature engineering

Build new, intuitive and informative features from the raw columns (prices, volumes and timestamps).

In [7]:
def create_price_related_features(df):
  """Add per-row price features to ``df`` in place and return the same frame.

  Reads the columns ``open``, ``high``, ``low`` and ``close``. Several features
  divide by ``high - low``; for rows where that range is zero the result is
  not finite.
  """
  open_, high, low, close = df['open'], df['high'], df['low'], df['close']

  body = close - open_   # signed open-to-close move of the row
  span = high - low      # high-to-low range of the row

  # absolute and relative open-to-close change
  df['price_change'] = body
  df['price_change_pct'] = body / open_

  # absolute and relative (to the open) high-low range
  df['price_range'] = span
  df['price_range_pct'] = span / open_

  # size and direction of the move relative to the row's range
  df['price_momentum_ratio'] = body / span

  # plain average of the four prices
  df['avg_price'] = high.add(low).add(open_).add(close).div(4)

  # 0/1 flags: close above the open (bullish) / below the open (bearish)
  df['bullish_momentum'] = close.gt(open_).astype(int)
  df['bearish_momentum'] = close.lt(open_).astype(int)

  # where the close sits inside the range: distance from the high (<= 0 when
  # close <= high) and from the low (>= 0 when close >= low), in units of the range
  df['close_to_high'] = (close - high) / span
  df['close_to_low'] = (close - low) / span

  # 0/1 flags comparing with the previous row; the first row has no
  # predecessor, so its comparison is False and the flag is 0
  df['is_new_high'] = high.gt(high.shift(1)).astype(int)
  df['is_new_low'] = low.lt(low.shift(1)).astype(int)

  return df

In [8]:
def create_volume_related_features(df):
  """Add per-row volume and trade-count features to ``df`` in place and return it.

  Reads the columns ``volume``, ``quote_asset_volume``, ``number_of_trades``,
  ``taker_buy_base_volume`` and ``taker_buy_quote_volume``.
  """
  base_vol = df['volume']
  quote_vol = df['quote_asset_volume']
  trades = df['number_of_trades']
  taker_base = df['taker_buy_base_volume']
  taker_quote = df['taker_buy_quote_volume']

  # share of the row's volume bought by takers, in base and in quote units
  df['taker_buy_ratio'] = taker_base.div(base_vol)
  df['taker_buy_quote_ratio'] = taker_quote.div(quote_vol)

  # change versus the previous row (NaN on the first row)
  df['volume_change'] = base_vol.sub(base_vol.shift(1))
  df['qav_change'] = quote_vol.sub(quote_vol.shift(1))

  # number of trades per unit of base volume and per unit of quote volume
  df['trades_per_volume'] = trades.div(base_vol)
  df['trades_per_quote_volume'] = trades.div(quote_vol)

  # quote volume per unit of base volume, i.e. the row's volume-weighted average price
  df['vwap'] = quote_vol.div(base_vol)

  # 1 when the row's taker buy volume exceeds the mean volume of the 5-row window
  # ending at this row; rows without a full window compare against NaN and get 0
  mean_vol_5 = base_vol.rolling(window=5).mean()
  df['buy_pressure'] = taker_base.gt(mean_vol_5).astype(int)

  return df


In [9]:
def create_time_features(df):
  """Add calendar features derived from the datetime index of ``df``, in place.

  Writes ``minute``, ``hour``, ``day_of_week``, ``month``, ``day`` and the 0/1
  flag ``is_weekend``, then returns the same frame.
  """
  stamps = df.index
  weekday = stamps.dayofweek   # pandas numbering: Monday = 0 ... Sunday = 6

  df['minute'] = stamps.minute
  df['hour'] = stamps.hour
  df['day_of_week'] = weekday
  df['month'] = stamps.month
  df['day'] = stamps.day
  # Saturday (5) and Sunday (6) count as the weekend
  df['is_weekend'] = weekday.isin([5, 6]).astype(int)

  return df

In [10]:
def diff_between_features(df, column, shift_size):
  """Add ``{column}_diff_{shift_size}`` to ``df`` in place and return the frame.

  The new column holds each row's value of ``column`` minus the value
  ``shift_size`` rows earlier; rows without such a predecessor are NaN.
  """
  current = df[column]
  lagged = current.shift(shift_size)
  df[f'{column}_diff_{shift_size}'] = current.sub(lagged)
  return df

In [11]:
# Build the price, volume and calendar features on both frames.
# Each builder assigns its columns on the frame it receives and returns that
# same frame, so train_df and test_df themselves end up with the new columns.
for df in (train_df, test_df):
    df = (
        df.pipe(create_price_related_features)
          .pipe(create_volume_related_features)
          .pipe(create_time_features)
    )

In [12]:
# Add lagged differences (1 to 10 rows back) of price_range and price_change to
# both frames: all price_range lags first, then all price_change lags.
for df in (train_df, test_df):
    for col, shift in ((name, lag) for name in ('price_range', 'price_change') for lag in range(1, 11)):
        df = diff_between_features(df, col, shift)

# Model Training

In [13]:
# Calculate scale_pos_weight to help with class imbalance
def calculate_scole_pos_weight(df):
  """Return the negative-to-positive ratio of ``df['target']``, rounded to 3 decimals.

  Positives are counted as the sum of the ``target`` column and negatives as
  the remaining rows, so the column is assumed to hold 0/1 labels
  (NOTE(review): confirm there are no missing labels, since they would be
  counted as negatives).

  Returns 1.0 when there are no positive rows. The previous one-line
  expression called ``.round`` on the plain int ``1`` in that case and raised
  AttributeError instead of returning the fallback.

  The misspelled function name is kept because existing code calls it.
  """
  pos_count = df['target'].sum()
  neg_count = len(df) - pos_count
  if not pos_count > 0:
    return 1.0
  # built-in round on a plain float works regardless of the scalar type sum() returned
  return round(float(neg_count / pos_count), 3)

In [14]:
# Negative-to-positive ratio of the training labels; passed to the XGBoost
# classifier as scale_pos_weight to help with class imbalance.
scale_pos_weight = calculate_scole_pos_weight(train_df)

In [15]:
#columns used to make predictions
predictors = [
    # raw volume / trade-count columns
    'volume', 'quote_asset_volume', 'number_of_trades',
    'taker_buy_base_volume', 'taker_buy_quote_volume',
    # engineered volume features
    'taker_buy_ratio', 'taker_buy_quote_ratio',
    'volume_change', 'qav_change',
    'trades_per_volume', 'trades_per_quote_volume',
    'vwap', 'buy_pressure',
    # engineered price features
    'price_change', 'price_change_pct',
    'price_range', 'price_range_pct',
    'price_momentum_ratio', 'avg_price',
    'bullish_momentum', 'bearish_momentum',
    'close_to_high', 'close_to_low',
    'is_new_high', 'is_new_low',
    # calendar features ('month' is created by create_time_features but not used here)
    'minute', 'hour', 'day_of_week', 'day', 'is_weekend',
    # lagged differences, 1 to 10 rows back
    *[f'price_range_diff_{lag}' for lag in range(1, 11)],
    *[f'price_change_diff_{lag}' for lag in range(1, 11)],
]

In [16]:
#train XGBoost classifier
# Hyperparameters are gathered in one mapping so they can be read and changed in one place.
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'use_label_encoder': False,
    'scale_pos_weight': scale_pos_weight,  # class-imbalance weight computed from the training labels
    'max_depth': 4,
    'learning_rate': 0.28738383094941533,
    'colsample_bytree': 0.5363483031257387,
    'subsample': 0.8796829879580419,
    'n_estimators': 123,
}
xgb_model = XGBClassifier(**xgb_params)

# Fit on the whole training frame, then predict a class label for every test row.
xgb_model.fit(train_df[predictors], train_df['target'])
y_pred = xgb_model.predict(test_df[predictors])

# Result Submission

In [17]:
#Save results
# Pair each test row_id with its predicted label and write the pair, without
# the Date index, to sub_1.csv.
submission = pd.DataFrame({'row_id': test_df['row_id'], 'target': y_pred})
submission.to_csv('sub_1.csv', index=False)