In [1]:
import numpy as np
import pandas as pd

def accuracy(true, pred):
    temp = np.sign(true*pred)+1
    return sum(temp)/len(temp)/2

In [80]:
btc_prices = pd.read_csv("Coinbase_BTCUSD_dailydata.csv", index_col = 6)
btc_prices.index = pd.to_datetime(btc_prices.index).strftime('%Y-%m-%d')

#get the right daterange
btc_prices = btc_prices[(btc_prices.index > '2021-01-01') & (btc_prices.index < '2021-10-01')].sort_index()

btc_prices['ret'] = btc_prices.close/btc_prices.close.shift(1)-1
btc_ret = btc_prices[["ret"]].fillna(0)

In [228]:
#get the time series nicely streamlined into the machine learning models
rolling_window = 5

train = btc_ret[(btc_ret.index > '2021-01-01') & (btc_ret.index <= '2021-09-01')]
test = btc_ret[(btc_ret.index > '2021-09-01')]

X = np.empty([len(train)-rolling_window,rolling_window])
for i in range(0,len(train)-rolling_window):
    X[i] = np.array(train.iloc[i:i+rolling_window,0])
y = np.array(train.iloc[rolling_window:,0])

X_test = np.empty([len(test)-rolling_window,rolling_window])
for i in range(0,len(test)-rolling_window):
    X_test[i] = np.array(test.iloc[i:i+rolling_window,0])
y_test = np.array(test.iloc[rolling_window:,0])


In [229]:
#naive method
naive_pred = np.roll(np.sign(test.ret),1)

In [230]:
#linear regression
from sklearn.linear_model import LinearRegression

linear_model = LinearRegression().fit(X, y)
linear_pred = linear_model.predict(X_test)

In [231]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression(random_state=0).fit(X, np.sign(y))
log_pred = log_model.predict(X_test)

In [232]:
#evaluate benchmark models
pred_comp1 = test.iloc[rolling_window:].copy()
pred_comp1['naive'] = naive_pred[rolling_window:]
pred_comp1['simple_LinR']  = linear_pred
pred_comp1['simple_LogR'] = log_pred

print("naive: ", accuracy(pred_comp1.ret, pred_comp1.naive) )
print("linear: ", accuracy(pred_comp1.ret, pred_comp1.simple_LinR) )
print("logistic:, ", accuracy(pred_comp1.ret, pred_comp1.simple_LogR))

naive:  0.4166666666666667
linear:  0.4583333333333333
logistic:,  0.4166666666666667


In [None]:
#lstm not done yet

In [14]:
#import the vector embeddings
topic_vectors = pd.read_csv("string.csv", index_col = 0)
popular_subs = pd.read_csv("sub_list.csv", index_col = 0)
popular_subs.index = pd.to_datetime(popular_subs.index).strftime('%Y-%m-%d')
topic_vectors.index = popular_subs.index


In [233]:
#decompose using PCA
from sklearn.decomposition import PCA
#pca components k
k = 10

pca_model = PCA(n_components=k)
reduced_X = pca_model.fit_transform(topic_vectors)
topic_vectors_reduced = pd.DataFrame(reduced_X, index = topic_vectors.index)

vector_btc_ret = btc_ret.join(topic_vectors_reduced, how= "left")
vector_train = vector_btc_ret[(vector_btc_ret.index > '2021-01-01') & (vector_btc_ret.index <= '2021-09-01')].dropna()
vector_test = vector_btc_ret[(vector_btc_ret.index > '2021-09-01')].dropna()

vector_linear_model = LinearRegression().fit(vector_train.iloc[:,1:], vector_train.iloc[:,0])
vector_linear_model.score(vector_test.iloc[:,1:], vector_test.iloc[:,0])

0.10394007758061474

In [175]:
vector_linear_pred = vector_linear_model.predict(vector_test.iloc[:,1:])

In [183]:
pred_comp = vector_test[['ret']].copy()
pred_comp['regression'] = vector_linear_pred

In [184]:
pred_comp

Unnamed: 0,ret,regression
2021-09-02,0.008999,0.019475
2021-09-03,0.015131,0.022776
2021-09-04,-0.00164,-0.000555
2021-09-05,0.036966,-0.001251
2021-09-06,0.017564,-0.005279
2021-09-07,-0.110141,-0.015312
2021-09-08,-0.017788,0.003918
2021-09-09,0.007374,0.009181
2021-09-10,-0.033374,-0.034481
2021-09-11,0.007143,-0.023188


In [189]:
accuracy(pred_comp.ret, pred_comp.regression)

0.6206896551724138

In [190]:
#september descriptive stats for comparison
print("positive returns:", len(y_test[y_test>0])/len(y_test))
print("negative returns:", len(y_test[y_test<0])/len(y_test))

positive returns: 0.4583333333333333
negative returns: 0.5416666666666666
