In [1]:
import sys
sys.path.append('../')
from libs.kraken_conn import kraken
import numpy as np
import pandas as pd
import datetime as dt

In [2]:
# Names for the positional (integer-labelled) columns of the OHLCV tables.
columns = dict(enumerate(["datetime", "open", "high", "low", "close", "volume"]))

# header=None tells pandas not to treat the first row as a header, so the
# columns come back labelled 0..N-1.
# NOTE(review): the BTC file name has no "60" suffix like the ETH one —
# confirm both files hold the same candle interval.
btc_raw, eth_raw = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ("data/raw/btcusd.csv", "data/raw/ethusd60.csv")
)

In [3]:
# make df for historical data
def make_df(raw_df):
    """Build a close-price frame indexed by timestamp from a raw OHLCV table.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Header-less OHLCV data whose columns carry the default integer labels.
        Column 0 is read as an epoch timestamp in seconds and column 4 as the
        close price.

    Returns
    -------
    pandas.DataFrame
        A single ``close`` column on a DatetimeIndex named ``timestamp``.
        ``raw_df`` itself is left unmodified.
    """
    # Work on an explicit copy: assigning a new column into a slice of raw_df
    # triggers SettingWithCopyWarning.
    prices = raw_df.loc[:, [0, 4]].copy()
    prices.columns = ["datetime", "close"]

    # Epoch seconds are converted as UTC rather than through the machine's
    # local zone, so the index does not depend on where the notebook runs and
    # a daylight-saving fall-back cannot map two candles to the same label.
    prices["timestamp"] = pd.to_datetime(prices["datetime"], unit="s")
    return prices.drop(columns="datetime").set_index("timestamp")

In [4]:
# make df for current data (needs to divide timestamp by 1000)
def crypto_curr(raw_df):
    """Build a close-price frame indexed by timestamp from live OHLCV rows.

    Same output layout as the historical loader, but the timestamp column is
    expressed in epoch milliseconds instead of seconds.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        OHLCV data with default integer column labels. Column 0 is read as an
        epoch timestamp in milliseconds and column 4 as the close price.

    Returns
    -------
    pandas.DataFrame
        A single ``close`` column on a DatetimeIndex named ``timestamp``.
        ``raw_df`` itself is left unmodified.
    """
    # Work on an explicit copy: assigning a new column into a slice of raw_df
    # triggers SettingWithCopyWarning.
    prices = raw_df.loc[:, [0, 4]].copy()
    prices.columns = ["datetime", "close"]

    # Epoch milliseconds are converted as UTC rather than through the
    # machine's local zone, so the index does not depend on where the notebook
    # runs and a daylight-saving fall-back cannot produce repeated labels.
    prices["timestamp"] = pd.to_datetime(prices["datetime"], unit="ms")
    return prices.drop(columns="datetime").set_index("timestamp")

In [5]:
# fetchOHLCV will get ~last 30 days of hourly data

def get_recent(pair="BTC/USD"):
    """Fetch the latest hourly candles for ``pair`` as a close-price frame.

    Parameters
    ----------
    pair : str
        Market symbol passed to ``kraken.fetchOHLCV`` (default ``"BTC/USD"``).

    Returns
    -------
    pandas.DataFrame
        The result of ``crypto_curr`` applied to the fetched candles.
    """
    # presumably a list of [ms timestamp, open, high, low, close, volume]
    # rows — verify against the exchange client
    candles = kraken.fetchOHLCV(pair, "1h")

    # Build the frame directly: DataFrame.append was removed in pandas 2.0.
    return crypto_curr(pd.DataFrame(candles))

In [6]:
# Close-price frames built from the historical CSV exports.
btc, eth = (make_df(raw) for raw in (btc_raw, eth_raw))

# Close-price frames for the most recent hourly candles, fetched live.
btc_curr, eth_curr = (get_recent(pair=market) for market in ("BTC/USD", "ETH/USD"))

In [7]:
# Replace the generic "close" label with the asset symbol so the two assets
# stay distinguishable once their frames are merged.
for frame, label in ((btc, "btc"), (eth, "eth"), (btc_curr, "btc"), (eth_curr, "eth")):
    frame.rename(columns={"close": label}, inplace=True)

In [8]:
# The frames are joined with merge rather than
# pd.concat([btc, eth], axis=1, join="inner"): concat failed on the historical
# data even though the dates appeared to match.
# NOTE(review): a repeated timestamp in either index would explain that — confirm.
crypto_df = pd.merge(btc, eth, how="inner", on="timestamp")
crypto_recent = pd.merge(btc_curr, eth_curr, how="inner", on="timestamp")


In [9]:
# Preview the merged historical BTC/ETH close prices.
crypto_df.head()

Unnamed: 0_level_0,btc,eth
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-08-07 09:00:00,277.40328,3.0
2015-08-07 12:00:00,281.95386,3.0
2015-08-08 01:00:00,279.672,3.0
2015-08-08 04:00:00,283.77999,2.0
2015-08-08 11:00:00,273.3893,1.02


In [10]:
# Number of consecutive closes (current + lags) used as features per symbol.
num_ticks = 60
symbols = ["btc", "eth"]

# One single-column frame per symbol holding the return over the NEXT 24 rows:
# pct_change(24) looks 24 rows back and shift(-24) moves it onto the row where
# that window starts.
# NOTE(review): the preview of crypto_df shows gaps between rows, so 24 rows
# is not always 24 hours — confirm this is acceptable.
returns_df_list = []
for symbol in symbols:
    name = f"{symbol}_24_Return"
    forward_return = crypto_df[symbol].pct_change(periods=24).shift(-24)
    returns_df = forward_return.to_frame(name)
    returns_df_list.append(returns_df)

In [11]:
# Lag matrix: for every symbol, the current close plus its previous
# num_ticks - 1 closes, one column per lag ("btc", "btc-1", ..., "eth-59").
# The columns are collected first and joined once; inserting them one at a
# time fragments the frame and triggers pandas' PerformanceWarning.
lagged_columns = {}
for symbol in symbols:
    lagged_columns[symbol] = crypto_df[symbol].copy()
    for i in range(1, num_ticks):
        lagged_columns[f"{symbol}-{i}"] = crypto_df[symbol].shift(i)
shifted_df = pd.concat(lagged_columns, axis=1)
shifted_df.shape

(45978, 120)

In [12]:
# Preview the lag matrix; the leading rows are NaN wherever a lag reaches
# before the first available close.
shifted_df.head()

Unnamed: 0_level_0,btc,btc-1,btc-2,btc-3,btc-4,btc-5,btc-6,btc-7,btc-8,btc-9,...,eth-50,eth-51,eth-52,eth-53,eth-54,eth-55,eth-56,eth-57,eth-58,eth-59
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-08-07 09:00:00,277.40328,,,,,,,,,,...,,,,,,,,,,
2015-08-07 12:00:00,281.95386,277.40328,,,,,,,,,...,,,,,,,,,,
2015-08-08 01:00:00,279.672,281.95386,277.40328,,,,,,,,...,,,,,,,,,,
2015-08-08 04:00:00,283.77999,279.672,281.95386,277.40328,,,,,,,...,,,,,,,,,,
2015-08-08 11:00:00,273.3893,283.77999,279.672,281.95386,277.40328,,,,,,...,,,,,,,,,,


In [13]:
# Sanity check: the BTC target frame should have as many rows as the lag matrix.
returns_df_list[0].shape

(45978, 1)

In [14]:
# Recent-data sample used to produce live predictions.
test_data = pd.read_csv("data/sample_test.csv")

# Same lag layout as the training features. The columns are collected first
# and joined once, because inserting them one at a time fragments the frame
# and triggers pandas' PerformanceWarning.
test_lagged_columns = {}
for symbol in symbols:
    test_lagged_columns[symbol] = test_data[symbol].copy()
    for i in range(1, num_ticks):
        test_lagged_columns[f"{symbol}-{i}"] = test_data[symbol].shift(i)

# Row-over-row percent change of every lag column; rows that still contain a
# NaN (the warm-up rows at the top) are dropped. The original row labels of
# test_data are kept on the result.
test_shifted_df = pd.concat(test_lagged_columns, axis=1).pct_change().dropna()

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
# Percent-change features are identical for both assets, so compute them once.
lag_returns = shifted_df.pct_change()

# Features followed by the target as the last column; rows with any NaN
# (feature warm-up at the top, missing forward return at the bottom) are dropped.
btc_concat = pd.concat([lag_returns, returns_df_list[0]], axis=1).dropna()
eth_concat = pd.concat([lag_returns, returns_df_list[1]], axis=1).dropna()

In [17]:
# Feature matrix = every column except the last; target = the last column.
# Slicing relative to the target avoids hard-coding the feature count, which
# would silently go wrong if num_ticks or symbols change.
# To model ETH instead, take the same slices from eth_concat.
X = btc_concat.iloc[:, :-1].values
y = btc_concat.iloc[:, -1].values


In [18]:
# Ordinary least-squares fit of the BTC 24-row forward return on the lagged
# percent-change features. To model ETH instead, build X/y from eth_concat
# and fit an eth_model the same way.
btc_model = LinearRegression()
btc_model.fit(X,y)

LinearRegression()

In [19]:
# In-sample predictions on the rows the model was fitted on.
# (Swap in the ETH model and features to do the same for ETH.)
predicted_btc = btc_model.predict(X)

# Predictions for every row of the recent-data features. The model was fitted
# on a bare ndarray, so an ndarray is passed here too; handing it a DataFrame
# makes scikit-learn warn about feature names it never saw during fit.
# For the latest row only, use test_shifted_df.tail(1).values instead.
current_predicted_btc = btc_model.predict(test_shifted_df.values)

In [20]:
# Timestamps of the rows that survived dropna() when the lag features were
# built, looked up by row label so predictions and timestamps cannot drift
# out of step (instead of assuming exactly the first 60 rows were dropped).
prediction_times = test_data.loc[test_shifted_df.index, "timestamp"].values

new_df = pd.DataFrame({"timestamp": prediction_times, "pct_change": current_predicted_btc})
# Flag a buy when the predicted forward return exceeds 0.7%.
# NOTE(review): the classification target further down uses 0.01 — confirm
# the two thresholds are meant to differ.
new_df["buy"] = (new_df["pct_change"] > 0.007).astype("int64")
new_df


Unnamed: 0,timestamp,pct_change,buy
0,2021-06-04 07:00:00,0.006052,0
1,2021-06-04 08:00:00,0.005687,0
2,2021-06-04 09:00:00,0.006086,0
3,2021-06-04 10:00:00,0.004955,0
4,2021-06-04 11:00:00,0.005703,0
5,2021-06-04 12:00:00,0.005155,0
6,2021-06-04 13:00:00,0.005603,0
7,2021-06-04 14:00:00,0.006295,0
8,2021-06-04 15:00:00,0.006406,0
9,2021-06-04 16:00:00,0.007163,1


In [21]:
# predicted_df = pd.DataFrame({"actual": y, "predicted":predicted_btc})
# predicted_df.reset_index(inplace=True)
# predicted_df.drop(columns=["index"], inplace=True)
# predicted_df.to_csv("plot_data.csv")

In [22]:
from sklearn.metrics import r2_score
# R^2 of the linear model on the same rows it was fitted on (in-sample), so
# this is not an out-of-sample score.
r2 = r2_score(y,predicted_btc)
print(r2)

0.0037583232960800572


In [23]:
# Show the last row of the recent-data sample
# (use test_shifted_df.tail(1) to see the matching feature row instead).
test_data.tail(1)

Unnamed: 0,timestamp,btc,eth
719,2021-07-01 18:00:00,33501.5,2118.83


In [24]:
# Binary target: 1 where the 24-row forward BTC return exceeds 1%, else 0.
# NOTE(review): the regression-based signal earlier used a 0.007 cut-off —
# confirm the two thresholds are meant to differ.
forward_return = btc_concat.iloc[:, -1]
buy_df = (forward_return > 0.01).astype("int64").to_frame("signal")
buy_df.head(20)

Unnamed: 0_level_0,signal
timestamp,Unnamed: 1_level_1
2015-08-15 08:00:00,0
2015-08-15 09:00:00,0
2015-08-15 10:00:00,0
2015-08-15 12:00:00,0
2015-08-15 13:00:00,0
2015-08-15 14:00:00,0
2015-08-15 16:00:00,0
2015-08-15 20:00:00,0
2015-08-15 22:00:00,0
2015-08-15 23:00:00,0


In [25]:
from sklearn.model_selection import train_test_split
# Classification labels replace the regression target in y; X is reused as is.
y = buy_df["signal"].values

# Fixed seed so the split (and every score computed from it) is reproducible.
# NOTE(review): this is a shuffled split of time-ordered rows whose lag
# windows overlap, so neighbouring rows can land on both sides of the split —
# confirm whether a chronological split is wanted instead.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [26]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression baseline for the buy signal; the value displayed is the
# accuracy on the training split, not on held-out data.
classifier = LogisticRegression(solver="lbfgs")
classifier.fit(X_train, y_train)
classifier.score(X_train, y_train)

0.6330912260313771

In [28]:
# Held-out predictions, used by the confusion matrix and report below.
predictions = classifier.predict(X_test)

# Predictions on the recent-data features. The classifier was fitted on a
# bare ndarray, so an ndarray is passed here too; a DataFrame would make
# scikit-learn warn about feature names it never saw during fit.
current_pred_btc_log = classifier.predict(test_shifted_df.values)

# Number of recent rows flagged as "buy".
np.sum(current_pred_btc_log)

0

In [29]:
from sklearn.metrics import confusion_matrix
# Logistic-regression results on the test split: rows are the true class,
# columns the predicted class (0 = not buy, 1 = buy).
confusion_matrix(y_test, predictions)

array([[7253,    7],
       [4211,    3]], dtype=int64)

In [30]:
from sklearn.metrics import classification_report
# Display names for label 0 and label 1, in that order; reused by the later
# classification reports.
target_names = ["Not Buy", "Buy"]
print(classification_report(y_test, predictions, target_names=target_names))

              precision    recall  f1-score   support

     Not Buy       0.63      1.00      0.77      7260
         Buy       0.30      0.00      0.00      4214

    accuracy                           0.63     11474
   macro avg       0.47      0.50      0.39     11474
weighted avg       0.51      0.63      0.49     11474



In [31]:
from sklearn.ensemble import RandomForestClassifier
# Fixed seed so the forest, and therefore its predictions, are reproducible.
rf_model = RandomForestClassifier(n_estimators=500, random_state=42)
rf_model = rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

# The forest was fitted on a bare ndarray, so an ndarray is passed here too;
# a DataFrame would make scikit-learn warn about unseen feature names.
rf_pred_curr_btc = rf_model.predict(test_shifted_df.values)

In [32]:
# Number of "buy" labels: actual in the test split, predicted by the random
# forest on the test split, and predicted on the recent-data sample.
print(np.sum(y_test), np.sum(rf_predictions), np.sum(rf_pred_curr_btc))

4214 1013 74


In [33]:
# Pair each random-forest prediction with the timestamp of its source row.
# Timestamps are looked up by the row labels that survived dropna() in the
# feature frame, instead of assuming exactly the first 60 rows were dropped.
rf_df = pd.DataFrame({
    "timestamp": test_data.loc[test_shifted_df.index, "timestamp"].values,
    "buy": rf_pred_curr_btc,
})

rf_df

Unnamed: 0,timestamp,buy
0,2021-06-04 07:00:00,0
1,2021-06-04 08:00:00,0
2,2021-06-04 09:00:00,0
3,2021-06-04 10:00:00,0
4,2021-06-04 11:00:00,0
5,2021-06-04 12:00:00,0
6,2021-06-04 13:00:00,0
7,2021-06-04 14:00:00,0
8,2021-06-04 15:00:00,0
9,2021-06-04 16:00:00,0


In [73]:
# Per-class precision/recall of the random forest on the test split.
print(classification_report(y_test, rf_predictions, target_names=target_names))

              precision    recall  f1-score   support

     Not Buy       0.65      0.94      0.77      7260
         Buy       0.52      0.12      0.19      4214

    accuracy                           0.64     11474
   macro avg       0.58      0.53      0.48     11474
weighted avg       0.60      0.64      0.55     11474



In [34]:
from sklearn.ensemble import GradientBoostingClassifier

In [36]:
# max_features=10 makes every split consider a random subset of the features,
# so the seed is fixed to keep the fitted model reproducible.
gb_classifier = GradientBoostingClassifier(n_estimators=30,
                                    learning_rate=0.5,
                                    max_features=10,
                                    max_depth=6,
                                    random_state=42)
gb_classifier.fit(X_train,y_train)


gb_predictions = gb_classifier.predict(X_test)

In [None]:

# Scaffold for a hyper-parameter sweep over the gradient-boosting settings.
# Loop variables are named after the hyper-parameters rather than x/y/z/w so
# the sweep cannot overwrite the label array y.
count = 0
models=[]
# range() only accepts integers, so the 0.1 .. 0.8 learning rates are derived
# from an integer range.
learning_rates = [step / 10 for step in range(1, 9)]
for n_estimators in range(10,100,10):
    for learning_rate in learning_rates:
        for max_features in range(4,20,4):
            for max_depth in range(2,10,2):
                count += 1
                model = GradientBoostingClassifier(n_estimators=n_estimators,
                                    learning_rate=learning_rate,
                                    max_features=max_features,
                                    max_depth=max_depth)
                # TODO: fit the model on the training split
                # TODO: compute its classification report on the test split
                # TODO: keep the best report together with its four settings

In [37]:
# The classifier was fitted on a bare ndarray, so an ndarray is passed here
# too; a DataFrame would make scikit-learn warn about unseen feature names.
gb_pred_curr_btc = gb_classifier.predict(test_shifted_df.values)

# Pair each prediction with the timestamp of its source row. Timestamps are
# looked up by the row labels that survived dropna() in the feature frame,
# instead of assuming exactly the first 60 rows were dropped.
gb_df = pd.DataFrame({
    "timestamp": test_data.loc[test_shifted_df.index, "timestamp"].values,
    "buy": gb_pred_curr_btc,
})

gb_df

Unnamed: 0,timestamp,buy
0,2021-06-04 07:00:00,0
1,2021-06-04 08:00:00,1
2,2021-06-04 09:00:00,0
3,2021-06-04 10:00:00,0
4,2021-06-04 11:00:00,0
5,2021-06-04 12:00:00,1
6,2021-06-04 13:00:00,1
7,2021-06-04 14:00:00,0
8,2021-06-04 15:00:00,1
9,2021-06-04 16:00:00,0


In [78]:
# Per-class precision/recall of the gradient-boosting model on the test split.
print(classification_report(y_test, gb_predictions, target_names=target_names))

              precision    recall  f1-score   support

     Not Buy       0.65      0.85      0.74      7260
         Buy       0.47      0.23      0.30      4214

    accuracy                           0.62     11474
   macro avg       0.56      0.54      0.52     11474
weighted avg       0.59      0.62      0.58     11474



In [79]:
# Number of "buy" labels in the test split: actual vs. predicted by the
# gradient-boosting model.
print(np.sum(y_test), np.sum(gb_predictions))

4214 2030
