In [45]:
import glob
import os
import pickle
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import datetime as dt

from ta import add_all_ta_features

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#### Requirements
- pandas==0.25.1
- ta==0.4.7
- scikit-learn==0.21.3

#### Background on Trade Recommender Models

Trade recommender models were created with the goal of predicting whether the price of a cryptocurrency will go up or down in the next time period (the period is determined by the specific model). For example, if the time period for the model is 6 hours and the model predicts that the price will go up, that means that if you bought that cryptocurrency 6 hours after the prediction time (this time comes from the data point that the model is predicting off of), the price of the crypto should have gone up 6 hours after the time that you bought it.

100s of iterations of models were generated in this notebook and the best ones were selected from each exchange/trading pair based on which iteration returned the highest net profit. When training the random forest classifier models, performance was highly varied with different periods and parameters so there was no one size fits all model, and that resulted in the models having unique periods and parameters. The data was obtained from the respective exchanges via their api, and models were trained on 1 hour candlestick data from 2015 - Oct 2018. The test set contained data from Jan 2019 - Oct 2019 with a two month gap left between the train and test sets to prevent data leakage. The models' predictions output 0 (sell) and 1 (buy) and profit was calculated by backtesting on the 2019 test set. The profit calculation incorporated fees like in the real world and considered any consecutive "buy" prediction as a "hold" trade instead so that fees wouldn't have to be paid on those transactions. The final models were all profitable with gains anywhere from 40% - 95% within the Jan 1, 2019 to Oct 30, 2019 time period. Visualizations for how these models performed given a $10K portfolio can be viewed at https://github.com/Lambda-School-Labs/cryptolytic-ds/blob/master/finalized_notebooks/visualization/tr_performance_visualization.ipynb

The separate models created for each exchange/trading pair combination were:
- Bitfinex BTC/USD
- Bitfinex ETH/USD
- Bitfinex LTC/USD
- Coinbase Pro BTC/USD
- Coinbase Pro ETH/USD
- Coinbase Pro LTC/USD
- HitBTC BTC/USD
- HitBTC ETH/USD
- HitBTC LTC/USD

##### Folder Structure:

├── trade_recommender/                <-- The top-level directory for all trade recommender work
│   │
│   ├── trade_rec_models.ipynb        <-- Notebook for trade recommender models
│   │
│   ├── data/                         <-- Directory for csv files of 1 hr candle data
│   │     └── data.csv                
│   │
│   ├── pickles/                      <-- Directory for all trade rec models
│   │     └── models.pkl         
│   │              
│   └── tr_pickles/                   <-- Directory for best trade rec models
│         └── models.pkl

### Get all csv filenames into a variable - 1 hr candles

In [10]:
# Collect the paths of all 1 hr candle csv files.
# Note: glob.glob returns paths in arbitrary (OS-dependent) order, so the
# order of this list should not be relied on.
csv_filenames = glob.glob('data/*.csv') # modify to your filepath for data
print(len(csv_filenames))
csv_filenames

9


['data/bitfinex_ltc_usd_3600.csv',
 'data/bitfinex_btc_usd_3600.csv',
 'data/coinbase_pro_eth_usd_3600.csv',
 'data/coinbase_pro_ltc_usd_3600.csv',
 'data/hitbtc_eth_usdt_3600.csv',
 'data/bitfinex_eth_usd_3600.csv',
 'data/coinbase_pro_btc_usd_3600.csv',
 'data/hitbtc_ltc_usdt_3600.csv',
 'data/hitbtc_btc_usdt_3600.csv']

# Functions

#### OHLCV Data Resampling

In [11]:
def resample_ohlcv(df, period):
    """Change the candle size of cryptocurrency OHLCV data.

    Parameters
    ----------
    df : pandas.DataFrame
        Candle data with a datetime 'date' column plus 'open', 'high',
        'low', 'close' and 'base_volume' columns.
    period : str
        Pandas offset alias for the new candle size, written as
        '{time_in_minutes}T' (ex: '1T', '5T', '60T').

    Returns
    -------
    pandas.DataFrame
        One row per period, indexed by 'date' (the left edge of each bin),
        with columns open/high/low/close/base_volume. Periods that contain
        no source rows show up as NaNs in the price columns.
    """

    # Set date as the index. This is needed for resampling to work
    df = df.set_index(['date'])

    # How each column is combined when several candles fall into one period
    ohlc_dict = {'open': 'first',
                 'high': 'max',
                 'low': 'min',
                 'close': 'last',
                 'base_volume': 'sum'}

    # Apply resampling. The old `how=` keyword of resample() was deprecated
    # and later removed from pandas; .agg() is the supported equivalent.
    df = df.resample(period, closed='left', label='left').agg(ohlc_dict)

    return df

#### Filling NaNs

In [4]:
# resample_ohlcv function will create NaNs in df where there were gaps in the data.
# The gaps could be caused by exchanges being down, errors from cryptowatch or the 
# exchanges themselves

def fill_nan(df):
    """Fill the gaps that resampling leaves in the price columns.

    A missing 'close' takes the previous known close (forward fill); a
    missing 'open', 'high' or 'low' then takes the close value of its own
    row, i.e. a gap becomes a flat candle at the last traded price.

    Parameters
    ----------
    df : pandas.DataFrame
        Candle data containing a 'close' column and, optionally, 'open',
        'high' and 'low' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the price columns filled. The input frame is not
        modified and non-price columns are returned untouched.
    """

    # Work on a copy so the caller's frame is not modified in place.
    df = df.copy()

    # Forward fill close column.
    df['close'] = df['close'].ffill()

    # Backward fill along each row so open, high and low pick up the close
    # value. Only the price columns take part, with close as the right-most
    # one, so volume or indicator columns can never leak into a price and
    # keep their own dtypes.
    price_cols = [col for col in ('open', 'high', 'low', 'close') if col in df.columns]
    df[price_cols] = df[price_cols].bfill(axis=1)

    return df

#### Feature Engineering

In [26]:
def feature_engineering(df, period, target_threshold_pct=0.70):
    """Build the modeling dataset from 1 hour cryptocurrency candle data.

    Resamples the candles to `period`, flags and fills gaps, adds date and
    technical analysis features, and attaches a boolean target telling
    whether the NEXT candle closes more than `target_threshold_pct` percent
    above the current close.

    Parameters
    ----------
    df : pandas.DataFrame
        1 hour candle data with a 'closing_time' column (unix seconds) and
        open/high/low/close/base_volume columns. It is not modified.
    period : str
        Candle size to resample to, as '{time_in_minutes}T' (ex: '360T').
    target_threshold_pct : float, optional
        Minimum close-to-close gain, in percent, for a candle to count as a
        price increase. Defaults to 0.70.

    Returns
    -------
    pandas.DataFrame
        The selected feature columns plus 'date' and 'target'. The first and
        last resampled rows are dropped (no close_diff / no next candle).

    Raises
    ------
    KeyError
        If one of the selected feature columns is missing, e.g. because it
        was dropped for having 20% or more missing values.
    """

    # Work on a copy so the caller's frame does not gain a 'date' column
    df = df.copy()

    # Add a datetime column to df
    df['date'] = pd.to_datetime(df['closing_time'], unit='s')

    # Convert df to selected period
    df = resample_ohlcv(df, period)

    # Add feature to indicate gaps in the data (must happen before filling)
    df['nan_ohlc'] = df['close'].isnull().astype(int)

    # Fill in missing values using fill function
    df = fill_nan(df)

    # Reset index so 'date' is a regular column again
    df = df.reset_index()

    # Create additional date features
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day

    # Add technical analysis features
    df = add_all_ta_features(df, "open", "high", "low", "close", "base_volume")

    # Replace infinite values with NaNs
    df = df.replace([np.inf, -np.inf], np.nan)

    # Keep only the features with less than 20% missing values
    df = df[df.columns[df.isnull().mean() < .2]]

    # Replace remaining NaN values with the mean of each respective column.
    # NOTE(review): the mean is taken over the whole series, i.e. before the
    # train/test split done by the callers - confirm this is acceptable.
    df = df.apply(lambda x: x.fillna(x.mean()), axis=0)

    # Create a feature for close price difference (fractional change vs. previous candle)
    df['close_diff'] = (df['close'] - df['close'].shift(1))/df['close'].shift(1)

    # True where the candle gained more than the threshold (NaN compares as False)
    price_increase = (df['close_diff'] - (target_threshold_pct/100)) > 0

    # To make the prediction before it happens, put the target on the previous
    # observation: row i is labelled with the outcome of row i+1. The explicit
    # copy avoids assigning a column onto a slice of the original frame.
    df = df.iloc[:-1].copy()
    df['target'] = price_increase.iloc[1:].values

    # Remove first row of dataframe bc its close_diff has no previous candle
    df = df.iloc[1:]

    # Pick features
    features = ['open', 'high', 'low', 'close', 'base_volume', 'nan_ohlc',
                'year', 'month', 'day', 'volume_adi', 'volume_obv', 'volume_cmf',
                'volume_fi', 'volume_em', 'volume_vpt', 'volume_nvi', 'volatility_atr',
                'volatility_bbh', 'volatility_bbl', 'volatility_bbm', 'volatility_bbhi',
                'volatility_bbli', 'volatility_kcc', 'volatility_kch', 'volatility_kcl',
                'volatility_kchi', 'volatility_kcli', 'volatility_dch', 'volatility_dcl',
                'volatility_dchi', 'volatility_dcli', 'trend_macd', 'trend_macd_signal',
                'trend_macd_diff', 'trend_ema_fast', 'trend_ema_slow',
                'trend_adx_pos', 'trend_adx_neg', 'trend_vortex_ind_pos',
                'trend_vortex_ind_neg', 'trend_vortex_diff', 'trend_trix',
                'trend_mass_index', 'trend_cci', 'trend_dpo', 'trend_kst',
                'trend_kst_sig', 'trend_kst_diff', 'trend_ichimoku_a',
                'trend_ichimoku_b', 'trend_visual_ichimoku_a', 'trend_visual_ichimoku_b',
                'trend_aroon_up', 'trend_aroon_down', 'trend_aroon_ind', 'momentum_rsi',
                'momentum_mfi', 'momentum_tsi', 'momentum_uo', 'momentum_stoch',
                'momentum_stoch_signal', 'momentum_wr', 'momentum_ao',
                'others_dr', 'others_dlr', 'others_cr', 'close_diff', 'date', 'target']

    df = df[features]

    return df

#### Profit and Loss function

In [27]:
def performance(X_test, y_preds, fee_rate=0.35, portfolio=10000):
    """Back-test a model's buy/sell predictions and return profit or loss.

    A prediction made on one row is acted on in the next row, so the return
    earned is that next row's 'close_diff'. Consecutive buy predictions are
    treated as one held position: fees are charged once per entry (buy and
    sell side), not for every row the position is held. Returns are summed,
    not compounded.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Test features; must contain a 'close_diff' column. Not modified.
    y_preds : array-like of bool
        One prediction per row of X_test (True/1 = buy, False/0 = sell).
    fee_rate : float, optional
        Fee charged per transaction, in percent. Defaults to 0.35.
    portfolio : float, optional
        Portfolio size in USD used for the dollar figures. Defaults to 10000.

    Returns
    -------
    tuple
        (pct_performance, performance, fees, performance_net,
        performance_net_pct): gross gain as a fraction, gross gain in USD,
        fees in USD, net profit in USD, net profit as a fraction.
    """

    close_diff = X_test['close_diff']

    # Build the signal on its own Series instead of adding columns to the
    # caller's X_test, which would change the features seen by later code.
    signals = pd.Series(np.asarray(y_preds) == True, index=X_test.index)

    # 1 while a position is held, 0 otherwise. Shifted by one row because a
    # prediction can only be traded on the following candle.
    in_position = signals.shift(1, fill_value=False).astype(int)

    # gross result: sum of the close differences of the rows where a position was held
    pct_performance = (in_position * close_diff).sum()
    performance_gross = (portfolio * in_position * close_diff).sum()

    # An entry is a row where a position is held but was not held on the row
    # before. Comparing against the shifted signal (rather than differencing
    # twice) also counts an entry triggered by the very first prediction.
    previously_in_position = in_position.shift(1, fill_value=0)
    number_of_entries = int(((in_position == 1) & (previously_in_position == 0)).sum())

    # calculate fees in USD: every entry pays the fee twice (buy and sell)
    fees = number_of_entries * 2 * fee_rate / 100 * portfolio

    # calculate net profit in USD
    performance_net = performance_gross - fees

    # calculate net profit as a fraction of the portfolio
    performance_net_pct = performance_net / portfolio

    return pct_performance, performance_gross, fees, performance_net, performance_net_pct

#### Modeling Pipeline

In [35]:
def modeling_pipeline(csv_filenames, periods=('360T', '720T', '960T', '1440T')):
    """Grid-search random forest trade recommenders and keep the best per file.

    For every csv file, period and hyperparameter combination this performs
    feature engineering, a date based train/test split, fits a
    RandomForestClassifier, prints train/test accuracy, back-tests the
    predictions and pickles the model into pickles/. At the end, the model
    with the highest net profit for each exchange/trading pair is moved to
    tr_pickles/.

    Parameters
    ----------
    csv_filenames : list of str
        Paths to 1 hr candle csv files named '{exchange}_{pair}_3600.csv'.
    periods : iterable of str, optional
        Candle sizes to try, as '{time_in_minutes}T'.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Performance of every model trained, and performance of the best
        model per exchange/trading pair.
    """

    line = '------------'
    performance_list = []

    # Hyperparameter grids. Widen them to search more combinations, e.g.
    # max_depth_list = [17, 20, 25, 27]
    # max_features_list = [40, 45, 50, 55, 60]
    max_depth_list = [17]
    max_features_list = [40]

    # make sure the output directories exist before anything is written
    os.makedirs('pickles', exist_ok=True)
    os.makedirs('tr_pickles', exist_ok=True)

    for file in csv_filenames:

        # define model name: file name without the '_3600.csv' suffix.
        # basename keeps this working for any directory depth.
        name = os.path.basename(file)[:-9]

        # read csv
        csv = pd.read_csv(file, index_col=0)

        for period in periods:

            # engineer features on a copy of the csv. This only depends on
            # the period, so it is done once per period rather than once per
            # hyperparameter combination.
            df = feature_engineering(csv.copy(), period)

            # train test split, leaving a gap between the two sets
            train = df[df['date'] < '2018-10-30 23:00:00'] # cutoff oct 30 2018
            test = df[df['date'] > '2019-01-01 23:00:00'] # cutoff jan 01 2019
            print('train and test shape ({model}):'.format(model=name), train.shape, test.shape)

            # features and target
            features = df.drop(columns=['target', 'date']).columns.tolist()
            target = 'target'

            # define X, y vectors
            X_train = train[features]
            X_test = test[features]
            y_train = train[target]
            y_test = test[target]

            for max_depth in max_depth_list:
                for max_features in max_features_list:

                    print(line + name + ' ' + period + ' ' + str(max_depth) + ' ' + str(max_features) + line)

                    # instantiate model
                    model = RandomForestClassifier(max_features=max_features,
                                                   max_depth=max_depth,
                                                   n_estimators=100,
                                                   n_jobs=-1,
                                                   random_state=42)
                    try:
                        # filter out datasets that are too small
                        # NOTE(review): this checks the TEST set size while the
                        # other pipelines in this notebook check the TRAIN set
                        # size - confirm which one is intended.
                        if X_test.shape[0] > 500:
                            # fit model
                            model.fit(X_train, y_train)
                            print('model fitted')

                            # train accuracy
                            train_score = model.score(X_train, y_train)
                            print('train accuracy:', train_score)

                            # make predictions
                            y_preds = model.predict(X_test)
                            print('predictions made')

                            # test accuracy
                            score = accuracy_score(y_test, y_preds)
                            print('test accuracy:', score)

                            # get profit and loss. A copy is passed so X_test
                            # stays clean for the next parameter combination.
                            a, b, c, d, e = performance(X_test.copy(), y_preds)
                            print(f'net profits: {str(round(d,2))}')

                            # formatting for filename
                            t = period[:-1]

                            # save pickle; the with-block closes the file handle
                            pickle_path = f'pickles/{name}_{t}_{max_features}_{max_depth}.pkl'
                            with open(pickle_path, 'wb') as pickle_file:
                                pickle.dump(model, pickle_file)
                            print('{model} pickle saved!\n'.format(model=name))

                            # save net performance to list
                            performance_list.append([f'{name}', period, max_features, max_depth, a, b, c, d, e])

                        else:
                            print('{model} does not have enough data!\n'.format(model=name))

                    except Exception as err:
                        # keep going with the remaining combinations, but say
                        # what went wrong instead of hiding it
                        print('error with model:', repr(err))

    # create dataframe for model performance
    df = pd.DataFrame(performance_list, columns = ['ex_tp', 'period', 'max_features',
                                                   'max_depth','pct_gain','gain', 'fees',
                                                   'net_profit', 'pct_net_profit'])

    # sort by net profit descending and keep the best row per exchange/trading pair
    df2 = df.sort_values(by='net_profit', ascending=False).drop_duplicates(subset='ex_tp')

    # move the best models to the directory tr_pickles/
    for best in df2.itertuples(index=False):
        model_name = f'{best.ex_tp}_{best.period[:-1]}_{best.max_features}_{best.max_depth}'
        os.replace(f'pickles/{model_name}.pkl', f'tr_pickles/{best.ex_tp}.pkl')

    # returning the dataframes for model performance
    # df contains performance for all models trained
    # df2 contains performance for best models
    return df, df2

In [None]:
# Run the grid search for 6 hour ('360T') candles only.
# df holds the performance of every model trained, df2 the best model per
# exchange/trading pair.
periods=['360T']
df, df2 = modeling_pipeline(csv_filenames, periods)

In [None]:
# performance of every model trained
df

In [None]:
# performance of the best model per exchange/trading pair
df2

## training models with specific parameters

This part is not necessary if you ran the grid search above. Use it when you already know the best parameters for each exchange/trading pair and only want to train those models, so you don't have to train 100s of models.

In [None]:
def modeling_pipeline(csv_filenames, param_dict):
    """Train one random forest per csv file with known parameters.

    For every file this performs feature engineering, a date based
    train/test split, fits a RandomForestClassifier, prints train/test
    accuracy, back-tests the predictions and pickles the model into
    pickles/. The saved models are then moved to tr_pickles/.

    Note: this reuses the name `modeling_pipeline`, so running this cell
    replaces the grid-search version defined earlier in the notebook.

    Parameters
    ----------
    csv_filenames : list of str
        Paths to 1 hr candle csv files named '{exchange}_{pair}_3600.csv'.
    param_dict : dict
        Maps a model name ('{exchange}_{pair}') to a dict with the keys
        'period', 'max_features' and 'max_depth'. Files whose name is not in
        this dict are skipped.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Performance of every model trained, and the same rows sorted by net
        profit with one row per exchange/trading pair.
    """

    line = '------------'

    performance_list = []

    # make sure the output directories exist before anything is written
    os.makedirs('pickles', exist_ok=True)
    os.makedirs('tr_pickles', exist_ok=True)

    for file in csv_filenames:

        # define model name: file name without the '_3600.csv' suffix.
        # basename keeps this working for any directory depth.
        name = os.path.basename(file)[:-9]

        # skip files without parameters instead of failing halfway through
        if name not in param_dict:
            print('{model} has no entry in param_dict, skipping\n'.format(model=name))
            continue

        # read csv
        df = pd.read_csv(file, index_col=0)

        params = param_dict[name]
        print(params)
        period = params['period']
        print(period)
        max_features = params['max_features']
        max_depth = params['max_depth']

        print(line + name + ' ' + period + line)

        # engineer features
        df = feature_engineering(df, period)

        # train test split, leaving a gap between the two sets
        train = df[df['date'] < '2018-10-30 23:00:00'] # cutoff oct 30 2018
        test = df[df['date'] > '2019-01-01 23:00:00'] # cutoff jan 01 2019
        print('train and test shape ({model}):'.format(model=name), train.shape, test.shape)

        # features and target
        features = df.drop(columns=['target', 'date']).columns.tolist()
        target = 'target'

        # define X, y vectors
        X_train = train[features]
        X_test = test[features]
        y_train = train[target]
        y_test = test[target]

        # instantiate model
        model = RandomForestClassifier(max_features=max_features,
                                       max_depth=max_depth,
                                       n_estimators=100,
                                       n_jobs=-1,
                                       random_state=42)

        # filter out datasets that are too small
        if X_train.shape[0] > 500:
            # fit model
            model.fit(X_train, y_train)
            print('model fitted')

            # train accuracy
            train_score = model.score(X_train, y_train)
            print('train accuracy:', train_score)

            # make predictions
            y_preds = model.predict(X_test)
            print('predictions made')

            # test accuracy
            score = accuracy_score(y_test, y_preds)
            print('test accuracy:', score)

            # get profit and loss
            a, b, c, d, e = performance(X_test, y_preds)
            print(f'net profits: {str(round(d,2))}')

            # formatting for filename
            t = period[:-1]

            # save pickle; the with-block closes the file handle
            with open('pickles/{model}_{t}.pkl'.format(model=name, t=t), 'wb') as pickle_file:
                pickle.dump(model, pickle_file)
            print('{model} pickle saved!\n'.format(model=name))

            # save net performance to list
            performance_list.append([f'{name}', period, a, b, c, d, e])

        else:
            print('{model} does not have enough data!\n'.format(model=name))

    # create df of model performance
    df = pd.DataFrame(performance_list, columns = ['ex_tp', 'period', 'pct_gain',
                                                   'gain', 'fees', 'net_profit', 'pct_net_profit'])

    # sort performance by net_profit and drop duplicates
    df2 = df.sort_values(by='net_profit', ascending=False).drop_duplicates(subset='ex_tp')

    # move models to new dir tr_pickles
    for best in df2.itertuples(index=False):
        model_name = best.ex_tp + '_' + best.period[:-1]
        os.replace(f'pickles/{model_name}.pkl', f'tr_pickles/{best.ex_tp}.pkl')

    # returning the dataframes for model performance
    # df contains performance for all models trained
    # df2 contains performance for best models
    return df, df2

In [32]:
# Best parameters found for each exchange/trading pair.
# period is the candle size in minutes ('1440T' = 1 day).
param_dict = {'bitfinex_ltc_usd': {'period': '1440T', 'max_features': 50, 'max_depth': 20}, 
              'hitbtc_ltc_usdt': {'period': '1440T', 'max_features': 45, 'max_depth': 27},
              'coinbase_pro_ltc_usd': {'period': '960T', 'max_features': 50, 'max_depth': 17},
              'hitbtc_btc_usdt': {'period': '360T', 'max_features': 40, 'max_depth': 17},
              'coinbase_pro_btc_usd': {'period': '960T', 'max_features': 55, 'max_depth': 25},
              'coinbase_pro_eth_usd': {'period': '960T', 'max_features': 50, 'max_depth': 27},
              'bitfinex_btc_usd': {'period': '1200T', 'max_features': 55, 'max_depth': 25},
              'bitfinex_eth_usd': {'period': '1200T', 'max_features': 60, 'max_depth': 20}
              }

# 'hitbtc_eth_usdt': {'period': '1440T', 'max_depth': 50}
# ^ this cant go in param dict bc its trained differently

# Keep only the files that have an entry in param_dict. Selecting by name
# instead of deleting a fixed position keeps this correct whatever order
# glob returned the files in.
csv_paths = [path for path in csv_filenames
             if os.path.basename(path)[:-9] in param_dict]
print(csv_paths)
print(len(csv_paths))
len(csv_filenames)

['data/bitfinex_ltc_usd_3600.csv', 'data/bitfinex_btc_usd_3600.csv', 'data/coinbase_pro_eth_usd_3600.csv', 'data/coinbase_pro_ltc_usd_3600.csv', 'data/bitfinex_eth_usd_3600.csv', 'data/coinbase_pro_btc_usd_3600.csv', 'data/hitbtc_ltc_usdt_3600.csv', 'data/hitbtc_btc_usdt_3600.csv']
8


9

In [None]:
# Train the models with their known best parameters; param_dict is a
# required argument of this version of modeling_pipeline.
df, df2 = modeling_pipeline(csv_paths, param_dict)

#### Train the hitbtc eth_usdt model separately - it was a special case where it performed better with fewer parameters

In [42]:
# for the hitbtc eth usdt model
def modeling_pipeline(csv_filenames):
    """Train a random forest with fixed settings for each csv file.

    Uses 1 day ('1440T') candles and a RandomForestClassifier with
    max_depth=50 and no max_features limit. For every file this performs
    feature engineering, a date based train/test split, fits the model,
    prints train/test accuracy, back-tests the predictions and pickles the
    model into pickles/. The saved models are then moved to tr_pickles/.

    Note: this reuses the name `modeling_pipeline`, so running this cell
    replaces the versions defined earlier in the notebook.

    Parameters
    ----------
    csv_filenames : list of str
        Paths to 1 hr candle csv files named '{exchange}_{pair}_3600.csv'.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Performance of every model trained, and the same rows sorted by net
        profit with one row per exchange/trading pair.
    """

    line = '------------'

    performance_list = []

    # make sure the output directories exist before anything is written
    os.makedirs('pickles', exist_ok=True)
    os.makedirs('tr_pickles', exist_ok=True)

    for file in csv_filenames:

        # define model name: file name without the '_3600.csv' suffix.
        # basename keeps this working for any directory depth.
        name = os.path.basename(file)[:-9]

        # read csv
        df = pd.read_csv(file, index_col=0)

        period = '1440T'
        print(period)

        print(line + name + ' ' + period + line)

        # engineer features
        df = feature_engineering(df, period)

        # train test split, leaving a gap between the two sets
        train = df[df['date'] < '2018-10-30 23:00:00'] # cutoff oct 30 2018
        test = df[df['date'] > '2019-01-01 23:00:00'] # cutoff jan 01 2019
        print('train and test shape ({model}):'.format(model=name), train.shape, test.shape)

        # features and target
        features = df.drop(columns=['target', 'date']).columns.tolist()
        target = 'target'

        # define X, y vectors
        X_train = train[features]
        X_test = test[features]
        y_train = train[target]
        y_test = test[target]

        # instantiate model
        model = RandomForestClassifier(max_depth=50,
                                       n_estimators=100,
                                       n_jobs=-1,
                                       random_state=42)

        # filter out datasets that are too small
        if X_train.shape[0] > 500:
            # fit model
            model.fit(X_train, y_train)
            print('model fitted')

            # train accuracy
            train_score = model.score(X_train, y_train)
            print('train accuracy:', train_score)

            # make predictions
            y_preds = model.predict(X_test)
            print('predictions made')

            # test accuracy
            score = accuracy_score(y_test, y_preds)
            print('test accuracy:', score)

            # get profit and loss
            a, b, c, d, e = performance(X_test, y_preds)
            print(f'net profits: {str(round(d,2))}')

            # formatting for filename
            t = period[:-1]

            # save pickle; the with-block closes the file handle
            with open('pickles/{model}_{t}.pkl'.format(model=name, t=t), 'wb') as pickle_file:
                pickle.dump(model, pickle_file)
            print('{model} pickle saved!\n'.format(model=name))

            # save net performance to list
            performance_list.append([f'{name}', period, a, b, c, d, e])

        else:
            print('{model} does not have enough data!\n'.format(model=name))

    # create df of model performance
    df = pd.DataFrame(performance_list, columns = ['ex_tp', 'period', 'pct_gain',
                                                   'gain', 'fees', 'net_profit', 'pct_net_profit'])

    # Build df2 from this run's results. Without this line the code below
    # read a df2 left over in the notebook's global state (or failed with a
    # NameError on a fresh kernel).
    df2 = df.sort_values(by='net_profit', ascending=False).drop_duplicates(subset='ex_tp')

    # move model to new dir tr_pickles
    for best in df2.itertuples(index=False):
        model_name = best.ex_tp + '_' + best.period[:-1]
        os.replace(f'pickles/{model_name}.pkl', f'tr_pickles/{best.ex_tp}.pkl')

    # returning the dataframes for model performance
    # df contains performance for all models trained
    # df2 contains performance for best models
    return df, df2

In [43]:
filepath = ['data/hitbtc_eth_usdt_3600.csv']

In [44]:
df, df2 = modeling_pipeline(filepath)

1440T
------------hitbtc_eth_usdt 1440T------------
train and test shape (hitbtc_eth_usdt): (543, 69) (272, 69)
model fitted
train accuracy: 1.0
predictions made
test accuracy: 0.46691176470588236
net profits: 8874.99
hitbtc_eth_usdt pickle saved!



## What's next?

- neural networks
- implement NLP with data scraped from twitter to see how frequency of crypto discussion affects the predictions
- more exchange/trading pair support