In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the BTC price history; the first CSV column becomes the row index.
btc_prices = pd.read_csv("btc_prices.csv", index_col=0)
# Re-key the rows as 'YYYY-MM-DD' strings, so the range filter below can
# compare dates as plain strings.
btc_prices.index = pd.to_datetime(btc_prices.index).strftime('%Y-%m-%d')
# Daily log return: ln(Close_t / Close_{t-1}), taken against the previous row.
# NOTE(review): this assumes the CSV rows are already in ascending date order,
# because sorting only happens after the returns are computed - TODO confirm.
btc_prices['log_ret'] = np.log(btc_prices['Close'] / btc_prices['Close'].shift(1))
# Keep only the return column and drop rows where it is missing.
btc_ret = btc_prices[['log_ret']].dropna()

# Restrict to 2016-01-01 (inclusive) .. 2021-01-01 (exclusive), sorted by date.
in_window = (btc_ret.index >= '2016-01-01') & (btc_ret.index < '2021-01-01')
btc_ret_train = btc_ret.loc[in_window].sort_index()


In [4]:
# Stack the yearly topic-vector files (2016..2020) in chronological order.
# A single pd.concat over a list replaces the DataFrame.append loop: append was
# removed in pandas 2.0 and re-copied the growing frame on every iteration.
topic_vectors = pd.concat(
    [pd.read_csv(f"bitcoin/btc_{year}.csv", index_col=0) for year in range(2016, 2021)]
)
# Re-key the rows by calendar day.
# NOTE(review): this assumes exactly one row per day starting 2016-01-01, with
# no gaps and no reordering across the yearly files - TODO confirm.
topic_vectors.index = pd.date_range('2016-01-01', periods=len(topic_vectors))

# btc_ret_train is keyed by 'YYYY-MM-DD' strings while topic_vectors is keyed by
# timestamps. Join on explicit datetime keys instead of relying on pandas to
# promote the string index implicitly, which is version-dependent.
btc_ret_by_day = btc_ret_train.copy()
btc_ret_by_day.index = pd.to_datetime(btc_ret_by_day.index)
# Left join keeps every return row; days lacking topic vectors are then dropped.
topic_vectors = btc_ret_by_day.join(topic_vectors, how="left").dropna()

In [5]:
#decompose using PCA
from sklearn.decomposition import PCA
#pca components k
k = 30

# NOTE(review): the PCA is fitted on every row, including the rows that are later
# held out as X_tune / X_test, so the components have seen the evaluation period.
# Consider fitting on the training rows only and calling transform() on the rest.
pca_model = PCA(n_components=k, random_state=0)
# Column 0 of topic_vectors is log_ret; the remaining columns are the topic features.
reduced_X = pca_model.fit_transform(topic_vectors.iloc[:, 1:])

# topic_vectors already holds log_ret row-aligned with reduced_X, so attach it
# positionally. This avoids a second join against the string-keyed btc_ret_train
# and guarantees both frames have the same number of rows.
topic_vectors_reduced = pd.DataFrame(reduced_X, index=topic_vectors.index)
topic_vectors_reduced.insert(0, "log_ret", topic_vectors.iloc[:, 0].values)

#get the time series nicely streamlined into the machine learning models
rolling_window = 5
# Measure the frame that is actually sliced below (previously len(topic_vectors)).
l = len(topic_vectors_reduced)
n_samples = l - rolling_window
returns = np.asarray(topic_vectors_reduced.iloc[:, 0], dtype=float)

# Lagged-return block: X_data[i, j] = returns[i + j], i.e. row i holds the
# rolling_window consecutive returns ending on day i + rolling_window - 1.
X_data = np.column_stack(
    [returns[lag:lag + n_samples] for lag in range(rolling_window)]
)
# Append the PCA components of the last day in each window (rows
# rolling_window-1 .. l-2), so features never come from the target day.
X_data = np.concatenate(
    (X_data, topic_vectors_reduced.iloc[rolling_window - 1:-1, 1:]), axis=1
)
# Target: the return on the day right after each window.
y_data = returns[rolling_window:].copy()

# Chronological split: last 100 rows for testing, the 100 before them for
# tuning, everything earlier for training.
X = X_data[:-200]
X_tune = X_data[-200:-100]
X_test = X_data[-100:]


y = y_data[:-200]
y_tune = y_data[-200:-100]
y_test = y_data[-100:]

In [44]:
#linear regression
from sklearn.linear_model import LinearRegression

def no_text(x, n_lags=5):
    """Return only the lagged-return columns of a feature matrix.

    Parameters
    ----------
    x : 2-D array-like, shape (n_samples, n_features)
        Feature matrix whose first ``n_lags`` columns are the lagged returns.
    n_lags : int, default 5
        Number of leading columns to keep. The default matches the value the
        notebook uses for ``rolling_window``.

    Returns
    -------
    numpy.ndarray of shape (n_samples, min(n_lags, n_features))
        Always 2-D. The result is deliberately not squeezed: squeezing turned a
        single-row input into a 1-D vector, which estimators reject.
    """
    return np.asarray(x)[:, :n_lags]

# Baseline: ordinary least squares on the lagged-return columns only.
linear_model = LinearRegression()
linear_model.fit(no_text(X), y)
# Turn the predicted return into a direction call (True = predicted positive).
linear_pred = linear_model.predict(no_text(X_test)) > 0

In [118]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
# Unpenalised logistic regression on the lagged returns; the target is
# "return is positive". C=np.inf switches the regularisation off and is accepted
# by old and new scikit-learn alike, whereas the string penalty='none' is
# rejected by current releases.
log_model = LogisticRegression(C=np.inf, random_state=0).fit(no_text(X), y > 0)
# Boolean direction calls for the test rows.
log_pred = log_model.predict(no_text(X_test))

In [104]:
# Sanity check: (rows, columns) of the training feature matrix.
X.shape

(1618, 35)

Now with vector embeddings: the same models, trained on the lagged returns plus the PCA-reduced topic vectors.

In [46]:
# Linear regression on the full feature matrix (all columns of X).
vector_linear_model = LinearRegression()
vector_linear_model.fit(X, y)
# Direction call on the test rows: True where the predicted return is positive.
vector_linear_pred = vector_linear_model.predict(X_test) > 0
# R^2 on the tuning rows (displayed as the cell output).
vector_linear_model.score(X_tune, y_tune)

0.01320151509287848

In [47]:
#logistic with vector embeddings
# Train on the same boolean "return is positive" target that log_model and the
# final scoring (y_test > 0) use. np.sign(y) would turn an exactly-zero return
# into a third class, making this model inconsistent with the others.
vector_logistic_model = LogisticRegression(random_state=0).fit(X, y > 0)
# predict() already returns booleans for a boolean target.
vector_logistic_pred = vector_logistic_model.predict(X_test)
# Accuracy on the tuning rows (displayed as the cell output).
vector_logistic_model.score(X_tune, y_tune > 0)

0.49

In [114]:
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


In [124]:
#score results
def score_model(true, pred, columns = ""):
    """Summarise binary-classification quality as a one-column DataFrame.

    Parameters
    ----------
    true, pred : array-like
        Reference and predicted labels, handed unchanged to the metric functions.
    columns : str, default ""
        Name given to the single output column.

    Returns
    -------
    pandas.DataFrame
        Rows 'accuracy', 'matthew_corr', 'f1', followed by the confusion-matrix
        cells 'tn', 'tp', 'fp', 'fn' expressed as fractions of all samples.

    Notes
    -----
    Cells [0, 0] .. [1, 1] are indexed directly, so the confusion matrix must be
    at least 2x2 (both classes have to occur in ``true`` or ``pred``).
    """
    counts = confusion_matrix(true, pred)
    # Normalise by the grand total so every cell is a share of all samples.
    shares = counts / counts.sum()
    rows = [
        ('accuracy', accuracy_score(true, pred)),
        ('matthew_corr', matthews_corrcoef(true, pred)),
        ('f1', f1_score(true, pred)),
        ('tn', shares[0, 0]),
        ('tp', shares[1, 1]),
        ('fp', shares[0, 1]),
        ('fn', shares[1, 0]),
    ]
    names = [name for name, _ in rows]
    values = [value for _, value in rows]
    return pd.DataFrame(data=values, index=names, columns=[columns])


In [125]:
benchmarks = ['linear', 'log', 'vector_linear','vector_logistic']
# Explicit name -> prediction mapping instead of eval() on constructed variable
# names: a missing model now fails with a clear NameError/KeyError here.
predictions = {
    'linear': linear_pred,
    'log': log_pred,
    'vector_linear': vector_linear_pred,
    'vector_logistic': vector_logistic_pred,
}
# One score column per model, placed side by side. `axis` has to be passed by
# keyword: pandas 2.0 no longer accepts it positionally.
pd.concat(
    [score_model(y_test > 0, predictions[name], name) for name in benchmarks],
    axis=1,
)

Unnamed: 0,linear,log,vector_linear,vector_logistic
accuracy,0.62,0.64,0.64,0.64
matthew_corr,0.033283,0.131145,0.254785,0.179374
f1,0.75641,0.777778,0.7,0.73913
tn,0.03,0.01,0.22,0.13
tp,0.59,0.63,0.42,0.51
fp,0.34,0.36,0.15,0.24
fn,0.04,0.0,0.21,0.12


In [123]:
# Sanity check: the confusion-matrix cells add up to the number of test rows.
# The total is the same whichever argument is treated as truth or prediction.
confusion_matrix(log_pred, y_test > 0).sum()

100