In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from collections import deque
import random
import warnings
warnings.filterwarnings('ignore')

In [2]:
SEQ_LENGTH = 60
PRED_PERIOD = 7
COIN_TO_PRED = 'ADA'

In [3]:
abs_path = '/Users/alex/desktop/python_work/python_projects/Crypto_Price_Predictor'
data_path = os.path.join(abs_path, 'Data')
os.chdir(data_path)

main_df = pd.DataFrame()

for file in os.listdir():
    df = pd.read_csv(file, date_parser=['snapped_at'], names=['date', f'{file[:-4]}_price',
                                                             f'{file[:-4]}_market_cap',
                                                             f'{file[:-4]}_total_volume'], )
    df = df[1:]
    df.set_index('date', inplace=True)
    df = df[[f'{file[:-4]}_price', f'{file[:-4]}_total_volume']]
    #df = df.iloc[-CLOSING_DAYS_LEN:]
    
    
    if len(main_df) == 0:
        main_df = df
    else:
        main_df = main_df.join(df)

os.chdir('..')

In [4]:
main_df.tail()

Unnamed: 0_level_0,BTC_price,BTC_total_volume,ETH_price,ETH_total_volume,ADA_price,ADA_total_volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-08-27 00:00:00 UTC,11465.002564032086,20869928785.800804,385.7524998749212,10454041361.377256,0.1144606140436145,448522382.0338251
2020-08-28 00:00:00 UTC,11300.398363810944,21595917446.04397,381.8376513210842,10466849103.532476,0.106835963062653,445998081.720968
2020-08-29 00:00:00 UTC,11519.118388160729,19271253870.82992,395.1382132386204,10196208352.394411,0.1092126650842092,372495384.20571584
2020-08-30 00:00:00 UTC,11481.481823317012,18415439613.538937,399.374543834064,9631959320.390598,0.1166042236595269,361328837.3964716
2020-08-31 00:00:00 UTC,11701.004008657852,19909556666.601448,428.2956791635218,12945668314.960146,0.1172077660023838,380535420.5953276


In [5]:
def classify(current, future):
    if float(future) > float(current):
        return 1
    else:
        return 0

In [6]:
main_df['future'] = main_df[f'{COIN_TO_PRED}_price'].shift(-PRED_PERIOD)

main_df['target'] = list(map(classify, main_df[f'{COIN_TO_PRED}_price'], main_df['future']))

In [7]:
def data_split(dataframe, test_size):
    
    test_index_start = int(len(dataframe) * test_size)
    train = dataframe.iloc[:-test_index_start]
    test = dataframe.iloc[-test_index_start:]
    
    return train, test

In [8]:
train, test = data_split(main_df, test_size=.1)

In [9]:
def preprocess(df):
    df.drop('future', axis=1, inplace=True)
    for col in df.columns:
        if col != 'target':
            df[col] = df[col].apply(lambda x: pd.to_numeric(x))
            df[col] = df[col].pct_change()
            df.dropna(inplace=True)
            df[col] = preprocessing.scale(df[col].values)
            
    df.dropna(inplace=True)
    
    seq_data = []
    prev_days = deque(maxlen=SEQ_LENGTH)
    
    for i in df.values:
        prev_days.append([n for n in i[:-1]])
        if len(prev_days) == SEQ_LENGTH:
            seq_data.append([np.array(prev_days), i[-1]])
            
    random.shuffle(seq_data)
    
    buys = []
    sells = []
    
    for seq, target in seq_data:
        if target == 0:
            sells.append([seq, target])
        elif target == 1:
            buys.append([seq, target])
            
    random.shuffle(buys)
    random.shuffle(sells)
    
    lower = min(len(buys), len(sells))
    
    buys = buys[:lower]
    sells = sells[:lower]
    
    seq_data = buys+sells
    random.shuffle(seq_data)
    
    X = []
    y = []
    
    for seq, target in seq_data:
        X.append(seq)
        y.append(target)
    
    return np.array(X), y

In [10]:
X_train, y_train = preprocess(train)
X_test, y_test = preprocess(test)

In [11]:
print(f'Training Data: {len(X_train)},  Test Data: {len(X_test)}')
print(f'Do Not Buy: {y_train.count(0)},  Buy: {y_train.count(1)}')
print(f'Test Do Not Buy: {y_test.count(0)},  Test Buy: {y_test.count(1)}')

Training Data: 598,  Test Data: 180
Do Not Buy: 299,  Buy: 299
Test Do Not Buy: 90,  Test Buy: 90


In [12]:
import pickle

def pickle_data(data, filename):
    with open(filename, 'wb') as f:
        pickle.dump(data, f)
    
pickle_data(X_train, 'X_train.pickle')
pickle_data(X_test, 'X_test.pickle')
pickle_data(y_train, 'y_train.pickle')
pickle_data(y_test, 'y_test.pickle')