# Crypto Trend Prediction Model


In [340]:
# Data manipulation library
import pandas as pd
import numpy as np
from sklearn import preprocessing


# Data sampling library 
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

# Model library
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Evaluation library

from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
import collections

In [341]:
# Thanks to https://www.kaggle.com/tencars/392-crypto-currency-pairs-at-minute-resolution/version/948?select=etheur.csv

# Function for data manipulation

The next block contains the following functions:
* ```make_df``` that import the csv file and transform the timestamp to datetime
* ```meanfill``` that fill the minutes that are missing in the data
* ```trim``` that split the df into timeframe
* ```trend``` that find the trend in the current timeframe (1 crescend 0 decrescent)
* ```make_label``` that find the inversion trend point into the timeframe
* ```ema``` that calculate the Exponential Moving Average for the Close values

In [342]:
# Importing data and removing some row

def make_df(csv_name):
    
    df = pd.read_csv(csv_name)
    df = df.iloc[10: , :]
    
    # Creating trend and label column
    df.loc[:,'trend'] = 0
    df.loc[:,'label'] = 0
    
    # timestamp to datetime for finding missing minutes
    df['time'] = pd.to_datetime(df.time, unit='ms')
    
    return df

# filling missing data with mean from last and next known value

def meanfill(df):
    backfill = df.set_index('time').asfreq(freq='1Min', method='backfill')
    pad = df.set_index('time').asfreq(freq='1Min', method='pad')
    
    return pd.concat([backfill, pad]).groupby(level=0).mean()

# grouping dataframe for selected timeframe

def trim(dataframe, n):
    df_list = []
    i = 0
    while i+n <= len(dataframe):
        df_list.append(dataframe.iloc[i:n+i])
        i += n
    return df_list

# selecting trend and removing timeframe with all missing values

def trend(dataframe, timeframe):
    
    df_list = trim(dataframe, timeframe)
    for i in range(len(df_list)):
        index_min = df_list[i]['low'].idxmin()
        index_max = df_list[i]['high'].idxmax()
        
        if df_list[i]['open'].mean() == df_list[i]['open'].iloc[0]:
            df_list[i].loc[:,'trend'] = np.nan
        elif index_min < index_max:
            df_list[i].loc[:,'trend'] = 1
    return df_list

# Buy signal: 1
# Sell signal: -1
# Wait signal: 0

def make_label(lista):
        
    for i in range(len(lista)-1):
        current_trend = lista[i]['trend'][0]
        next_trend = lista[i+1]['trend'][0]
        
        # Crescent trend: 1
        if current_trend == 0 and next_trend == 1:
            current_min = min(lista[i]['low'])
            next_min = min(lista[i+1]['low'])
            
            # The inversion point is the min of current or next timeframe
            if next_min < current_min:
                change = lista[i+1]['low'].idxmin()
                lista[i+1].loc[change,'label'] = 1
                
            else:
                change = lista[i]['low'].idxmin()
                lista[i].loc[change,'label'] = 1
               
                
        # Decrescent trend: -1
        elif current_trend == 1 and next_trend == 0:
            current_max = max(lista[i]['high'])
            next_max = max(lista[i+1]['high'])
            
            # The inversion point trend is the max of current or next trend
            if next_max > current_max:
                change = lista[i+1]['high'].idxmax()
                lista[i+1].loc[change,'label'] = -1
                
            else:
                change = lista[i]['high'].idxmax()
                lista[i].loc[change,'label'] = -1
                
                
        elif current_trend != next_trend:
            lista[i].loc[:,'label'] = np.nan
            
            
    return lista

# Exponential moving average

def ema(dataframe, span):
    sma = dataframe.rolling(window=span, min_periods=span).mean()[:span]
    rest = dataframe[span:]
    df1 = pd.concat([sma, rest]).ewm(span=span, adjust=False).mean()
    dataframe['ema'] = df1['close']
    dataframe['ema_volume'] = df1['volume']
    
    return dataframe

# Making the dataframe

```input_df``` uses all past function for creating the dataframe that we will use for the model training

In [343]:
def input_df(csv_name, timeframe):
    
    # Creating df
    df = make_df(csv_name)
    
    # Filling missing data and setting up the timeframe
    df = meanfill(df)
      
    # Making exponential moving average column
    df = ema(df, timeframe)
    
    # Creating samples for evaluate profit 
    f_sample, s_sample, t_sample, df = all_sample(df)
    
    # Creating timeframe and fill trend column
    df_list = trend(df, timeframe)
    
    # Filling label column
    df_list = make_label(df_list)
    
    return pd.concat(df_list[timeframe:]).dropna().drop(['high', 'low', 'close', 'trend'], axis = 1), f_sample, s_sample, t_sample
    

# Test and Training data

The label column is full of zero, if we train the model on the raw data we'll obtain a balanced accuracy of 98% but the model will predict just zero.

We used the under sampling method for avoid this fact.

In [344]:
# Sampling data

def under_sampling(X_train, y_train):
    sampling_strategy = {0:8000 , 1:1500 , -1:1500}
    rus = RandomUnderSampler(sampling_strategy = sampling_strategy)
    X_res, y_res = rus.fit_resample(X_train, y_train)
    
    return X_res, y_res


def data_split(df):
    
    # Drop label column
    
    m_df = df.drop(['label'], axis = 1)
    
    # Splitting into input and output
    
    X = m_df.reset_index(drop=True)
    y = df['label']
    
    # Splitting into train and test set

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    # undersampling the data
    
    X_train, y_train = under_sampling(X_train, y_train)
    
    return X_train, y_train, X_test, y_test
    

# Models

We used different model for the prediction:

* ```Neural Network```
* ```Decision Tree```
* ```Random Forest```

And evaluted all of them

In [345]:
def neural_network(X_train, y_train, X_test, y_test):
    
    mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(6,), random_state=1, max_iter = 10000)
    
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_scaled = scaler.transform(X_train)
    
    mlp.fit(X_scaled, y_train)
    
    y_pred = mlp.predict(X_test)
    
    print("Neural Networi balanced accuracy " + str(balanced_accuracy_score(y_test, y_pred)))
    return mlp
    
def decision_tree(X_train, y_train, X_test, y_test):
   
    # Creiamo il decision Tree
    
    clf = DecisionTreeClassifier()

    # Addestriamo l'albero
    clf = clf.fit(X_train,y_train)

    # Eseguiamo delle predizioni sul set di test
    y_pred = clf.predict(X_test)
    
    # Valutiamo la sua accuratezza
    
    print("Decision Tree Balanced Accuracy: " + str(balanced_accuracy_score(y_test, y_pred)))
    
    return clf
    
def random_forest(X_train, y_train, X_test, y_test):

    # Creo il modello
    clf=RandomForestClassifier(n_estimators=100)

    # Addestro il modello
    clf.fit(X_train,y_train)
    
    # Eseguo delle predizioni sul test di test
    y_pred=clf.predict(X_test)
    
    # Valuto l'accuratezza 
    print("Random Forest Balanced Accuracy: " + str(balanced_accuracy_score(y_test, y_pred)))
    
    return clf

# Evaluating profit

I know that you don't care about the balanced accuracy and all the things that I said.
You'r here just for profit, that's why I evaluated the models with a simple trading algorithm.
But first we will need to take out some random sample from our data, we will evaluate the profit after the training.

In [346]:
def single_sample(df):
    s = np.random.choice(df.index[:-5],1)
    sample = df.iloc[df.index.get_loc(s[0]):df.index.get_loc(s[0])+1800]
    return sample

def all_sample(df):
    
    
    f_sample = single_sample(df)
    s_sample = single_sample(df)
    t_sample = single_sample(df)

    df = pd.concat([df, f_sample, s_sample, t_sample]).drop_duplicates(keep=False)
    
    return f_sample, s_sample, t_sample, df

def evaluate_profit(sample, model, model_name):
    
    print("\nTESTING PROFIT FOR: {}\n".format(model_name))
    b_price = profit = amount = 0
    signals = model.predict(sample)
    prices = sample['open']
    
    # Column with all open value
    
    for i in range(len(prices)):
        
        if b_price != 0:
            possible_profit = (prices[i] - b_price) * amount
        else:
            possible_profit = 0
        
        if signals[i] == 1 and b_price == 0:
            b_price = prices[i]
            amount = 100/b_price
            print("Bought {} coin at {}".format(amount, b_price))
        
        if b_price != 0 and possible_profit >= 2 :
            profit += possible_profit - 1
            print("Sold {} coin at {} with profit of {}\n".format(amount, prices[i], possible_profit))
            b_price = 0
    
    print("TOTAL PROFIT: {}".format(profit))

# Here start the game

That's what we did :

* Created the samples and the dataframe
* Splitted the data into test and training set
* Trained all the models
* Evaluated them with balanced accuracy
* Evaluted them with our simple trading algo

## BITCOIN

In [347]:
# Creating df and samples

df, f_sample, s_sample, t_sample = input_df("./btceur.csv", 180)

# Split data into test and training

X_train, y_train, X_test, y_test = data_split(df)

# Random forest for BTC
BTCrf = random_forest(X_train, y_train, X_test, y_test)

# Decision tree for BTC
BTCdt = decision_tree(X_train, y_train, X_test, y_test)

# Neural Network for BTC
BTCnn = neural_network(X_train, y_train, X_test, y_test)

#collections.Counter(y_pred)

f_sample = f_sample.drop(['high', 'low', 'close', 'trend', 'label'], axis = 1)
s_sample = s_sample.drop(['high', 'low', 'close', 'trend', 'label'], axis = 1)
t_sample = t_sample.drop(['high', 'low', 'close', 'trend', 'label'], axis = 1)


# First sample

evaluate_profit(f_sample, BTCnn, "Neural Network")

evaluate_profit(f_sample, BTCdt, "Decision Tree")

evaluate_profit(f_sample, BTCrf, "Random Forest")

# Second sample

evaluate_profit(s_sample, BTCnn, "Neural Network")

evaluate_profit(s_sample, BTCdt, "Decision Tree")

evaluate_profit(s_sample, BTCrf, "Random Forest")

# Third sample

evaluate_profit(t_sample, BTCnn, "Neural Network")

evaluate_profit(t_sample, BTCdt, "Decision Tree")

evaluate_profit(t_sample, BTCrf, "Random Forest")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Random Forest Balanced Accuracy: 0.4244745256976576
Decision Tree Balanced Accuracy: 0.41751492703768006
Neural Networi balanced accuracy 0.6473529825446594

TESTING PROFIT FOR: Neural Network

Bought 0.045882083046570314 coin at 2179.5
Sold 0.045882083046570314 coin at 2236.8 with profit of 2.629043358568487

TOTAL PROFIT: 1.6290433585684871

TESTING PROFIT FOR: Decision Tree

Bought 0.04567670031516923 coin at 2189.3
Sold 0.04567670031516923 coin at 2236.8 with profit of 2.1696432649705386

Bought 0.0434398124298355 coin at 2302.0357226799997
Sold 0.0434398124298355 coin at 2360.7 with profit of 2.548365203112658

Bought 0.042366434595873685 coin at 2360.359113385
TOTAL PROFIT: 2.7180084680831964

TESTING PROFIT FOR: Random Forest

Bought 0.04510599909788002 coin at 2217.0
Sold 0.04510599909788002 coin at 2261.8 with profit of 2.020748759585033

Bought 0.0434398124298355 coin at 2302.0357226799997
Sold 0.0434398124298355 coin at 2360.7 with profit of 2.548365203112658

TOTAL PROFIT: 

## ETH

In [349]:
# Creating df and samples

df, f_sample, s_sample, t_sample = input_df("./etheur.csv", 180)

# Split data into test and training

X_train, y_train, X_test, y_test = data_split(df)

# Random forest for BTC
ETHrf = random_forest(X_train, y_train, X_test, y_test)

# Decision tree for BTC
ETHdt = decision_tree(X_train, y_train, X_test, y_test)

# Neural Network for BTC
ETHnn = neural_network(X_train, y_train, X_test, y_test)

#collections.Counter(y_pred)

f_sample = f_sample.drop(['high', 'low', 'close', 'trend', 'label'], axis = 1)
s_sample = s_sample.drop(['high', 'low', 'close', 'trend', 'label'], axis = 1)
t_sample = t_sample.drop(['high', 'low', 'close', 'trend', 'label'], axis = 1)

# First sample

evaluate_profit(f_sample, ETHnn, "Neural Network")

evaluate_profit(f_sample, ETHdt, "Decision Tree")

evaluate_profit(f_sample, ETHrf, "Random Forest")

# Second sampel

evaluate_profit(s_sample, ETHnn, "Neural Network")

evaluate_profit(s_sample, ETHdt, "Decsion Tree")

evaluate_profit(s_sample, ETHrf, "Random Forest")

# Thrid sample

evaluate_profit(t_sample, ETHnn, "Neural Network")

evaluate_profit(t_sample, ETHdt, "Decision Tree")

evaluate_profit(t_sample, ETHrf, "Random Forest")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


Random Forest Balanced Accuracy: 0.4238363113209241
Decision Tree Balanced Accuracy: 0.4077808919031858
Neural Networi balanced accuracy 0.5791797528322029

TESTING PROFIT FOR: Neural Network

Bought 0.5154373485902789 coin at 194.01
Sold 0.5154373485902789 coin at 198.31 with profit of 2.2163805989382053

Bought 0.5125445273058097 coin at 195.10500000000002
Sold 0.5125445273058097 coin at 199.05781425 with profit of 2.025993311293913

Bought 0.5079848741600934 coin at 196.85625515
TOTAL PROFIT: 2.2423739102321183

TESTING PROFIT FOR: Decision Tree

Bought 0.5124263387138098 coin at 195.15
Sold 0.5124263387138098 coin at 199.05781425 with profit of 2.0024669485011533

Bought 0.5029169181251257 coin at 198.84
TOTAL PROFIT: 1.0024669485011533

TESTING PROFIT FOR: Random Forest

Bought 0.5232588561561404 coin at 191.11
Sold 0.5232588561561404 coin at 195.04 with profit of 2.0564073046936207

Bought 0.5145621076463929 coin at 194.34
Sold 0.5145621076463929 coin at 198.31 with profit of 2.0