# Crypto Trend Prediction Model


In [216]:
# Data manipulation library
import pandas as pd
import numpy as np
from sklearn import preprocessing


# Data sampling library 
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

# Model library
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Regression library

from keras.models import Sequential
from keras.layers import Dense

# Evaluation library

from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import mean_absolute_percentage_error

# Indicator
from stockstats import StockDataFrame

# Other
import random

In [157]:
# Thanks to https://www.kaggle.com/tencars/392-crypto-currency-pairs-at-minute-resolution/version/948?select=etheur.csv

# Function for data manipulation

The next block contains the following functions:
* ```make_df``` that import the csv file and transform the timestamp to datetime
* ```meanfill``` that fill the minutes that are missing in the data
* ```trim``` that split the df into timeframe
* ```trend``` that find the trend in the current timeframe (1 crescend 0 decrescent)
* ```make_label``` that find the inversion trend point into the timeframe
* ```ema``` that calculate the Exponential Moving Average for the Close values

In [158]:
# Importing data and removing some row

def make_df(csv_name):
    
    df = pd.read_csv(csv_name)
    df = df.iloc[10: , :]
      
    # timestamp to datetime for finding missing minutes
    df['time'] = pd.to_datetime(df.time, unit='ms')
    
    return df

# filling missing data with mean from last and next known value

def meanfill(df):
    backfill = df.set_index('time').asfreq(freq='1Min', method='backfill')
    pad = df.set_index('time').asfreq(freq='1Min', method='pad')
    
    return pd.concat([backfill, pad]).groupby(level=0).mean()

# grouping dataframe for selected timeframe

def trim(dataframe, n):
    df_list = []
    i = 0
    while i+n <= len(dataframe):
        df_list.append(dataframe.iloc[i:n+i])
        i += n
    return df_list

# selecting trend and removing timeframe with all missing values

def trend(dataframe, timeframe):
    
    df_list = trim(dataframe, timeframe)
    for i in range(len(df_list)):
        index_min = df_list[i]['low'].idxmin()
        index_max = df_list[i]['high'].idxmax()
        
        if df_list[i]['open'].mean() == df_list[i]['open'].iloc[0]:
            df_list[i].loc[:,'trend'] = np.nan
        elif index_min < index_max:
            df_list[i].loc[:,'trend'] = 1
    return df_list

# Buy signal: 1
# Sell signal: -1
# Wait signal: 0

def make_label(lista, enable = False):
     
    # Variable for variation
    variation = 0
    
    for i in range(len(lista)-1):
        current_trend = lista[i]['trend'][0]
        next_trend = lista[i+1]['trend'][0]
        
        # Crescent trend: 1
        if current_trend == 0 and next_trend == 1:
            current_min = min(lista[i]['low'])
            next_min = min(lista[i+1]['low'])
            
            # The inversion point is the min of current or next timeframe
            if next_min < current_min:
                change = lista[i+1]['low'].idxmin()
                lista[i+1].loc[change,'label'] = 1
                
            else:
                change = lista[i]['low'].idxmin()
                lista[i].loc[change,'label'] = 1
               
                
        # Decrescent trend: -1
        elif current_trend == 1 and next_trend == 0 and enable == True:
            current_max = max(lista[i]['high'])
            next_max = max(lista[i+1]['high'])
            
            # The inversion point trend is the max of current or next trend
            if next_max > current_max:
                change = lista[i+1]['high'].idxmax()
                lista[i+1].loc[change,'label'] = -1
                
            else:
                change = lista[i]['high'].idxmax()
                lista[i].loc[change,'label'] = -1
                
                
        elif current_trend != next_trend:
            lista[i].loc[:,'label'] = np.nan
            
        

        
        variation += 100/min(lista[i]['low']*(max(lista[i]['high'])) - (min(lista[i]['low'])))
        
    
    print("\nAverage variation during timeframe: {}".format(variation/len(lista)))
    
    return lista


def get_indicators(df, timeframe = 180):
    
    df = StockDataFrame.retype(df)
    df['macd'] 
    df['tema']
    df['rsi_12']
    
    # Max of last timeframe rows
    df['max_high']= df['high'].rolling(timeframe).max().shift(1)
    
    # Min of last timeframe rows
    df['min_low']= df['low'].rolling(timeframe).min().shift(1)
    
    
    return df[timeframe:].drop(['rs_12', 'close_-1_s', 'close_-1_d', 'macdh'], axis = 1).dropna()


# Making the dataframe

```df_input``` uses all past function for creating the dataframe that we will use for the model training

In [159]:
def df_input(csv_name, timeframe = 180, trend = True):
    
    # Creating df
    df = make_df(csv_name)
    
    # Filling missing data and setting up the timeframe
    df = meanfill(df)
      
    # Making indicators
    df = get_indicators(df)
     
    # Creating samples for evaluate profit 
    f_sample, s_sample, t_sample, df = all_sample(df)
    
    if trend:
        
        # Creating trend and label column
        df.loc[:,'trend'] = 0
        df.loc[:,'label'] = 0
        
        # Creating timeframe and fill trend column
        df_list = trend(df, timeframe)

        # Filling label column
        df_list = make_label(df_list, True)
        
        df = pd.concat(df_list[timeframe:]).dropna().drop(['trend'], axis = 1)
      
    else:
        # Shift min_low and max_high by timeframe, so the i_th row rappresents the max/min of the next "timeframe" rows

        df['min_low'] = df['min_low'].shift(-timeframe)
        df['max_high'] = df['max_high'].shift(-timeframe)

    return df.drop(['high', 'low', 'close'], axis = 1).dropna(), f_sample, s_sample, t_sample
    

# Test and Training data

The label column is full of zero, if we train the model on the raw data we'll obtain a balanced accuracy of 98% but the model will predict just zero.

We used the under sampling method for avoid this fact.

In [208]:
# Sampling data

def under_sampling(X_train, y_train, class_number = 2):
    
    if class_number == 2:
        
        sampling_strategy = {0:6000 , 1:1000}
        
    else:
        sampling_strategy = {0:6000 , 1:1000 , -1:1000}
          
    rus = RandomUnderSampler(sampling_strategy = sampling_strategy)
    X_res, y_res = rus.fit_resample(X_train, y_train)
    
    return X_res, y_res


def data_split(df, tag = "label", drop = "", train_size = None):
    
    # Drop label column
    m_df = df.drop([tag], axis = 1)
    
    # Drop other column if needed
    if drop != "":
        m_df = m_df.drop([drop], axis = 1)
        
    
    # Splitting into input and output
    X = m_df.reset_index(drop=True)
    y = df[tag]
    
    
    # Splitting into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, train_size=train_size)

    if tag == "label":
        # undersampling the data
        X_train, y_train = under_sampling(X_train, y_train)
  
        
    return X_train, y_train, X_test, y_test
    

# Models

We used different model for the prediction:

* ```Neural Network```
* ```Decision Tree```
* ```Random Forest```

And evaluted all of them

In [161]:
def neural_network(X_train, y_train, X_test, y_test):
    
    mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(10,2,1), random_state=1, max_iter = 10000)
    
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_scaled = scaler.transform(X_train)
    
    mlp.fit(X_scaled, y_train)
    
    y_pred = mlp.predict(X_test)
    
    print("Neural Network balanced accuracy " + str(balanced_accuracy_score(y_test, y_pred)))
    return mlp
    
def decision_tree(X_train, y_train, X_test, y_test):
   
    # Creiamo il decision Tree
    
    clf = DecisionTreeClassifier()

    # Addestriamo l'albero
    clf = clf.fit(X_train,y_train)

    # Eseguiamo delle predizioni sul set di test
    y_pred = clf.predict(X_test)
    
    # Valutiamo la sua accuratezza
    
    print("Decision Tree Balanced Accuracy: " + str(balanced_accuracy_score(y_test, y_pred)))
    
    return clf
    
def random_forest(X_train, y_train, X_test, y_test):

    # Creo il modello
    clf=RandomForestClassifier(n_estimators=100)

    # Addestro il modello
    clf.fit(X_train,y_train)
    
    # Eseguo delle predizioni sul test di test
    y_pred=clf.predict(X_test)
    
    # Valuto l'accuratezza 
    print("Random Forest Balanced Accuracy: " + str(balanced_accuracy_score(y_test, y_pred)))
    
    return clf

# Evaluating profit

I evaluated the models with a simple trading algorithm.
But first we will need to take out some random sample from our data, we will evaluate the profit after the training.

In [162]:
def single_sample(df):
    s = np.random.choice(df.index[:-5],1)
    sample = df.iloc[df.index.get_loc(s[0]):df.index.get_loc(s[0])+43200]
    return sample

def all_sample(df):
    
    
    f_sample = single_sample(df)
    s_sample = single_sample(df)
    t_sample = single_sample(df)

    df = pd.concat([df, f_sample, s_sample, t_sample]).drop_duplicates(keep=False)
    
    return f_sample, s_sample, t_sample, df

def evaluate_profit(sample, model, model_name):
    
    b_price = profit = amount =  transaction_number = transaction_time  = 0
    signals = model.predict(sample)
    prices = sample['open']
  
    
    # Column with all open value
    
    for i in range(len(prices)):
        
        if b_price != 0:
            possible_profit = (prices[i] - b_price) * amount
        else:
            possible_profit = 0
        
        if signals[i] == 1 and b_price == 0:
            transaction_number += 1
            b_price = prices[i]
            amount = 100/b_price
            #print("Bought {} coin at {}".format(amount, b_price))
            
        if b_price != 0:
            
            transaction_time += 1
            
        if b_price != 0 and possible_profit >= 3 :
            profit += possible_profit - 0.2
            #print("Sold {} coin at {} with profit of {}\n".format(amount, prices[i], possible_profit))
            b_price = 0
    
    
    print("TOTAL PROFIT for {}: {}".format(model_name, profit))
    
    if(transaction_number != 0):
        print("Transaction number: {}, average holding for transaction: {}\n".format(transaction_number, (transaction_time/transaction_number)/60))
    

def baseline_profit(sample):
    
    b_price = profit = amount =  transaction_number = transaction_time  = 0
    prices = sample['open']
    
    # Column with all open value
    
    for i in range(len(prices)):
        
        r = random.randint(0,1)
        if b_price != 0:
            possible_profit = (prices[i] - b_price) * amount
        else:
            possible_profit = 0
        
        if r == 1 and b_price == 0:
            transaction_number += 1
            b_price = prices[i]
            amount = 100/b_price
            #print("Bought {} coin at {}".format(amount, b_price))
            
        if b_price != 0:
            
            transaction_time += 1
        
        if b_price != 0 and possible_profit >= 3:
            profit += possible_profit - 0.2
            #print("Sold {} coin at {} with profit of {}\n".format(amount, prices[i], possible_profit))
            b_price = 0
          
    
    print("\n\nBASELINE PROFIT: {}".format(profit))
    
    if(transaction_number != 0):
        print("Transaction number: {}, average holding for transaction: {}\n\n".format(transaction_number, (transaction_time/transaction_number)/60))
   

# Here start the game

That's what we did :

* Created the samples and the dataframe
* Splitted the data into test and training set
* Trained all the models
* Evaluated them with balanced accuracy
* Evaluted them with our simple trading algo

## BITCOIN

In [124]:
# Creating df and samples

df, f_sample, s_sample, t_sample = df_input("./btceur.csv", 180)

# Split data into test and training

X_train, y_train, X_test, y_test = data_split(df)

# Random forest for BTC
BTCrf = random_forest(X_train, y_train, X_test, y_test)

# Decision tree for BTC
BTCdt = decision_tree(X_train, y_train, X_test, y_test)

# Neural Network for BTC
BTCnn = neural_network(X_train, y_train, X_test, y_test)

#collections.Counter(y_pred)

f_sample = f_sample.drop(['high', 'low', 'close', 'trend', 'label'], axis = 1)
s_sample = s_sample.drop(['high', 'low', 'close', 'trend', 'label'], axis = 1)
t_sample = t_sample.drop(['high', 'low', 'close', 'trend', 'label'], axis = 1)


# First sample

baseline_profit(f_sample)

evaluate_profit(f_sample, BTCnn, "Neural Network")

evaluate_profit(f_sample, BTCdt, "Decision Tree")

evaluate_profit(f_sample, BTCrf, "Random Forest")

# Second sample

baseline_profit(s_sample)

evaluate_profit(s_sample, BTCnn, "Neural Network")

evaluate_profit(s_sample, BTCdt, "Decision Tree")

evaluate_profit(s_sample, BTCrf, "Random Forest")

# Third sample

baseline_profit(t_sample)

evaluate_profit(t_sample, BTCnn, "Neural Network")

evaluate_profit(t_sample, BTCdt, "Decision Tree")

evaluate_profit(t_sample, BTCrf, "Random Forest")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)



Average variation during timeframe: 3.06614817558918e-06
Random Forest Balanced Accuracy: 0.7629673931176763
Decision Tree Balanced Accuracy: 0.7008693737407912
Neural Network balanced accuracy 0.35735619678749425


BASELINE PROFIT: 11.368588960069296
Transaction number: 5, average holding for transaction: 143.98333333333332


TOTAL PROFIT for Neural Network: 11.645683505302838
Transaction number: 5, average holding for transaction: 143.99666666666664

TOTAL PROFIT for Decision Tree: 11.421542629376287
Transaction number: 5, average holding for transaction: 142.23333333333332

TOTAL PROFIT for Random Forest: 14.237194718689548
Transaction number: 6, average holding for transaction: 117.75833333333334



BASELINE PROFIT: 49.49376497470577
Transaction number: 9, average holding for transaction: 79.9925925925926


TOTAL PROFIT for Neural Network: 51.39554410910335
Transaction number: 10, average holding for transaction: 71.37666666666668

TOTAL PROFIT for Decision Tree: 28.59477254245682

## ETH

In [125]:
# Creating df and samples

df, f_sample, s_sample, t_sample = df_input("./etheur.csv", 180)

# Split data into test and training

X_train, y_train, X_test, y_test = data_split(df)

# Random forest for BTC
ETHrf = random_forest(X_train, y_train, X_test, y_test)

# Decision tree for BTC
ETHdt = decision_tree(X_train, y_train, X_test, y_test)

# Neural Network for BTC
ETHnn = neural_network(X_train, y_train, X_test, y_test)

#collections.Counter(y_pred)

f_sample = f_sample.drop(['high', 'low', 'close', 'trend', 'label'], axis = 1)
s_sample = s_sample.drop(['high', 'low', 'close', 'trend', 'label'], axis = 1)
t_sample = t_sample.drop(['high', 'low', 'close', 'trend', 'label'], axis = 1)

# First sample

baseline_profit(f_sample)

evaluate_profit(f_sample, ETHnn, "Neural Network")

evaluate_profit(f_sample, ETHdt, "Decision Tree")

evaluate_profit(f_sample, ETHrf, "Random Forest")

# Second sampel

baseline_profit(s_sample)

evaluate_profit(s_sample, ETHnn, "Neural Network")

evaluate_profit(s_sample, ETHdt, "Decsion Tree")

evaluate_profit(s_sample, ETHrf, "Random Forest")

# Thrid sample

baseline_profit(t_sample)

evaluate_profit(t_sample, ETHnn, "Neural Network")

evaluate_profit(t_sample, ETHdt, "Decision Tree")

evaluate_profit(t_sample, ETHrf, "Random Forest")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)



Average variation during timeframe: 0.002654895516053407
Random Forest Balanced Accuracy: 0.7349018417762835
Decision Tree Balanced Accuracy: 0.6710596103758304
Neural Network balanced accuracy 0.3359122176899552


BASELINE PROFIT: 57.87198953622203
Transaction number: 21, average holding for transaction: 34.26984126984127


TOTAL PROFIT for Neural Network: 61.109456065870006
Transaction number: 22, average holding for transaction: 32.72727272727273

TOTAL PROFIT for Decision Tree: 58.16718157925055
Transaction number: 21, average holding for transaction: 33.053174603174604

TOTAL PROFIT for Random Forest: 55.0127238561214
Transaction number: 20, average holding for transaction: 34.115833333333335



BASELINE PROFIT: 20.49596264217177
Transaction number: 8, average holding for transaction: 89.97916666666667


TOTAL PROFIT for Neural Network: 20.281045973103915
Transaction number: 8, average holding for transaction: 90.0

TOTAL PROFIT for Decsion Tree: 14.526746724291826
Transaction nu

## Crypto local MAX and MIN Prediction

In this section we will try to predict the Timeframe Local Minimum and Maximum in order to buy when the value is less then the pred_min and sell when the value is greater then the pred_max

In [260]:
# Model

def model(X_train, y_train, X_test, y_test):
    
    # create model
    model = Sequential()
    model.add(Dense(6, input_dim=6, kernel_initializer='normal', activation='relu'))
    model.add(Dense(3, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    
    
    # Compile model
    
    model.compile(loss="mean_absolute_percentage_error", optimizer='adam')
    
    # Train the model
    model.fit(X_train, y_train, epochs=50, verbose = 0)
    
    # Evalute the model
    
    y_pred = model.predict(X_test)
    
    print("Mean absolute percentage error: {}".format(mean_absolute_percentage_error(y_test, y_pred)))
    
    #print(cross_val_score(model, X_test, y_test, scoring = "neg_mean_absolute_percentage_error"))
    
    return model

## Evaluate profit

We just changed the if condition of the evaluate_profit function

In [296]:
def not_evaluate_profit(sample, model, model_name):
    
    b_price = profit = amount =  transaction_number = transaction_time  = 0
    signals = model.predict(sample)
    prices = sample['open']
  
    
    # Column with all open value
    
    for i in range(len(prices)):
        
        if b_price != 0:
            possible_profit = (prices[i] - b_price) * amount
        else:
            possible_profit = 0
        
        if signals[i] <= prices[i] and b_price == 0:
            transaction_number += 1
            b_price = prices[i]
            amount = 100/b_price
            #print("Bought {} coin at {}".format(amount, b_price))
            
        if b_price != 0:
            
            transaction_time += 1
            
        if b_price != 0 and possible_profit >= 3:
            profit += possible_profit - 0.2
            #print("Sold {} coin at {} with profit of {}\n".format(amount, prices[i], possible_profit))
            b_price = 0
    
    
    print("TOTAL PROFIT for {}: {}".format(model_name, profit))
    
    if(transaction_number != 0):
        print("Transaction number: {}, average holding for transaction: {}\n".format(transaction_number, (transaction_time/transaction_number)/60))
    

## BTC

Making df, train the model, evaluate it with mean absolute percentage error

In [305]:
btc, f_sample, s_sample, t_sample = df_input("./btceur.csv", 360, False)

X_train, y_train, X_test, y_test = data_split(btc, "min_low", drop = "max_high", train_size = 5000)

In [306]:
btc_model = model(X_train, y_train, X_test, y_test)

Mean absolute percentage error: 0.013601065151229575


In [307]:
f_sample = f_sample.drop(['high', 'low', 'close', 'max_high', 'min_low'], axis = 1)
s_sample = s_sample.drop(['high', 'low', 'close', 'max_high', 'min_low'], axis = 1)
t_sample = t_sample.drop(['high', 'low', 'close', 'max_high', 'min_low'], axis = 1)

In [308]:
# Evaluate profit comparing it with random trading

baseline_profit(f_sample)
not_evaluate_profit(f_sample, btc_model, "First Sample")

baseline_profit(s_sample)
not_evaluate_profit(s_sample, btc_model, "Second Sample")

baseline_profit(t_sample)
not_evaluate_profit(t_sample, btc_model, "Thirt Sample")



BASELINE PROFIT: 2.805895100605823
Transaction number: 2, average holding for transaction: 359.9916666666667


TOTAL PROFIT for First Sample: 2.828533758869696
Transaction number: 2, average holding for transaction: 360.0



BASELINE PROFIT: 12.525894200233473
Transaction number: 5, average holding for transaction: 143.97


TOTAL PROFIT for Second Sample: 12.054900579737499
Transaction number: 5, average holding for transaction: 144.0



BASELINE PROFIT: 3.3126437187524957
Transaction number: 2, average holding for transaction: 360.0


TOTAL PROFIT for Thirt Sample: 3.3126437187524957
Transaction number: 2, average holding for transaction: 360.0



## ETH

Making df, train the model, evaluate it with mean absolute percentage error

In [309]:
eth, f_sample, s_sample, t_sample = df_input("./etheur.csv", 360, False)

X_train, y_train, X_test, y_test = data_split(btc, tag = "min_low", drop = "max_high", train_size = 5000)

In [310]:
eth_model = model(X_train, y_train, X_test, y_test)

Mean absolute percentage error: 0.9988777511394428


In [311]:
f_sample = f_sample.drop(['high', 'low', 'close', 'max_high', 'min_low'], axis = 1)
s_sample = s_sample.drop(['high', 'low', 'close', 'max_high', 'min_low'], axis = 1)
t_sample = t_sample.drop(['high', 'low', 'close', 'max_high', 'min_low'], axis = 1)

In [312]:
# Evaluate profit comparing it with random trading

baseline_profit(f_sample)
not_evaluate_profit(f_sample, eth_model, "First Sample")

baseline_profit(s_sample)
not_evaluate_profit(s_sample, eth_model, "Second Sample")

baseline_profit(t_sample)
not_evaluate_profit(t_sample, eth_model, "Thirt Sample")





BASELINE PROFIT: 17.576202813112094
Transaction number: 7, average holding for transaction: 102.81666666666666


TOTAL PROFIT for First Sample: 17.530878710338587
Transaction number: 7, average holding for transaction: 102.85714285714286



BASELINE PROFIT: 17.34800250063522
Transaction number: 7, average holding for transaction: 102.84523809523809


TOTAL PROFIT for Second Sample: 20.296911723494393
Transaction number: 8, average holding for transaction: 90.0



BASELINE PROFIT: 5.911817787025914
Transaction number: 3, average holding for transaction: 239.98333333333332


TOTAL PROFIT for Thirt Sample: 8.876754105458577
Transaction number: 4, average holding for transaction: 180.0

