In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

#score results
def score_model(true, pred, columns = ""):
    """Summarise binary-classification quality as a one-column DataFrame.

    Parameters
    ----------
    true : array-like of bool or {0, 1}
        Ground-truth labels.
    pred : array-like of bool or {0, 1}
        Predicted labels.
    columns : str, optional
        Name given to the single output column (e.g. the model name).

    Returns
    -------
    pandas.DataFrame
        Rows ``accuracy``, ``matthew_corr`` and ``f1``, followed by the
        confusion-matrix cells ``tn``, ``tp``, ``fp`` and ``fn`` expressed as
        fractions of the total number of samples.
    """
    # Pin the label order so the matrix is always 2x2. Without `labels`, inputs
    # that contain a single class yield a 1x1 matrix and cm[1, 1] raises IndexError.
    cm = confusion_matrix(true, pred, labels=[0, 1])
    # Normalise counts to fractions of all samples.
    cm = cm / np.sum(cm)
    scores = [
        accuracy_score(true, pred),
        matthews_corrcoef(true, pred),
        f1_score(true, pred),
        cm[0, 0],  # tn: true negative, predicted negative
        cm[1, 1],  # tp: true positive, predicted positive
        cm[0, 1],  # fp: true negative, predicted positive
        cm[1, 0],  # fn: true positive, predicted negative
    ]
    return pd.DataFrame(
        data = scores,
        index = ['accuracy', 'matthew_corr', 'f1', 'tn', 'tp','fp','fn'],
        columns = [columns],
    )


In [4]:
#import eth price history
eth_prices = pd.read_csv("eth_prices.csv", index_col = 0)
# Normalise the index to 'YYYY-MM-DD' strings so it can be joined with the topic vectors below.
eth_prices.index = pd.to_datetime(eth_prices.index).strftime('%Y-%m-%d')
# Log return relative to the previous row; the first row has no predecessor and is dropped.
eth_prices['log_ret'] = np.log(eth_prices.Close/eth_prices.Close.shift(1))
eth_ret = eth_prices[["log_ret"]].dropna()

#get the right daterange
# The index holds ISO-formatted date strings, so string comparison orders them chronologically.
eth_ret_train = eth_ret[(eth_ret.index >= '2014-01-01') & (eth_ret.index < '2021-11-01')].sort_index()

# Read one topic-vector file per year and stack them with a single pd.concat:
# DataFrame.append was removed in pandas 2.0 and re-copies the frame on every call.
topic_vectors = pd.concat(
    [pd.read_csv(f"bitcoin/eth_{year}.csv", index_col = 0) for year in range(2014, 2022)]
)
# NOTE(review): assumes the stacked files hold exactly one row per calendar day
# starting at 2014-01-01 — confirm against the source files.
topic_vectors.index = pd.date_range('2014-01-01', periods=len(topic_vectors)).strftime('%Y-%m-%d')
# Keep only days that have both a return and a complete topic vector; log_ret becomes column 0.
topic_vectors = eth_ret_train.join(topic_vectors, how= "left").dropna()

In [21]:
#decompose using PCA for eth
from sklearn.decomposition import PCA
#pca components k
k = 5

# Compress the topic-vector columns (everything after the leading log_ret column) to k components.
# NOTE(review): the PCA is fitted on all rows, including those later held out
# for testing — confirm this look-ahead is intended.
pca_model = PCA(n_components=k,random_state=0)
reduced_X = pca_model.fit_transform(topic_vectors.iloc[:,1:])
topic_vectors_reduced = pd.DataFrame(reduced_X, index = topic_vectors.index)
# Re-attach the returns: column 0 is log_ret, the remaining columns are the k components.
topic_vectors_reduced = eth_ret_train.join(topic_vectors_reduced, how= "left").dropna()

#get the time series nicely streamlined into the machine learning models
rolling_window = 5
# Size the design matrix from the frame that is actually windowed below, so the
# row counts of the return windows and of the component columns cannot drift apart.
l = len(topic_vectors_reduced)
# Row i holds the returns of rows i .. i+rolling_window-1.
X_data = np.empty([l-rolling_window,rolling_window])
for i in range(0,l-rolling_window):
    X_data[i] = np.array(topic_vectors_reduced.iloc[i:i+rolling_window,0])
# Append the components belonging to the last row of each window.
X_data = np.concatenate((X_data,topic_vectors_reduced.iloc[rolling_window-1:-1,1:]),axis=1)
# Target: the return of the row immediately after each window.
y_data = np.array(topic_vectors_reduced.iloc[rolling_window:,0])

# Chronological split: the final 304 samples form the test set.
test_cutoff = -304

X = X_data[:test_cutoff]
#X_tune = X_data[-500:-300]
X_test = X_data[test_cutoff:]


y = y_data[:test_cutoff]
#y_tune = y_data[-500:-300]
y_test = y_data[test_cutoff:]

In [23]:
#import btc price history
btc_prices = pd.read_csv("btc_prices.csv", index_col = 0)
# Normalise the index to 'YYYY-MM-DD' strings so it can be joined with the topic vectors below.
btc_prices.index = pd.to_datetime(btc_prices.index).strftime('%Y-%m-%d')
# Log return relative to the previous row; the first row has no predecessor and is dropped.
btc_prices['log_ret'] = np.log(btc_prices.Close/btc_prices.Close.shift(1))
btc_ret = btc_prices[["log_ret"]].dropna()

#get the right daterange
# The index holds ISO-formatted date strings, so string comparison orders them chronologically.
btc_ret_train = btc_ret[(btc_ret.index >= '2016-01-01') & (btc_ret.index < '2021-11-01')].sort_index()

# Read one topic-vector file per year and stack them with a single pd.concat:
# DataFrame.append was removed in pandas 2.0 and re-copies the frame on every call.
topic_vectors = pd.concat(
    [pd.read_csv(f"bitcoin/btc_{year}.csv", index_col = 0) for year in range(2016, 2022)]
)
# NOTE(review): assumes the stacked files hold exactly one row per calendar day
# starting at 2016-01-01 — confirm against the source files.
topic_vectors.index = pd.date_range('2016-01-01', periods=len(topic_vectors)).strftime('%Y-%m-%d')
# Keep only days that have both a return and a complete topic vector; log_ret becomes column 0.
topic_vectors = btc_ret_train.join(topic_vectors, how= "left").dropna()

In [40]:
#decompose using PCA for btc
from sklearn.decomposition import PCA
#pca components k
k = 5

# Compress the topic-vector columns (everything after the leading log_ret column) to k components.
# NOTE(review): the PCA is fitted on all rows, including those later held out
# for testing — confirm this look-ahead is intended.
pca_model = PCA(n_components=k,random_state=0)
reduced_X = pca_model.fit_transform(topic_vectors.iloc[:,1:])
topic_vectors_reduced = pd.DataFrame(reduced_X, index = topic_vectors.index)
# Re-attach the returns: column 0 is log_ret, the remaining columns are the k components.
topic_vectors_reduced = btc_ret_train.join(topic_vectors_reduced, how= "left").dropna()

#get the time series nicely streamlined into the machine learning models
rolling_window = 5
# Size the design matrix from the frame that is actually windowed below, so the
# row counts of the return windows and of the component columns cannot drift apart.
l = len(topic_vectors_reduced)
# Row i holds the returns of rows i .. i+rolling_window-1.
X_data = np.empty([l-rolling_window,rolling_window])
for i in range(0,l-rolling_window):
    X_data[i] = np.array(topic_vectors_reduced.iloc[i:i+rolling_window,0])
# Append the components belonging to the last row of each window.
X_data = np.concatenate((X_data,topic_vectors_reduced.iloc[rolling_window-1:-1,1:]),axis=1)
# Target: the return of the row immediately after each window.
y_data = np.array(topic_vectors_reduced.iloc[rolling_window:,0])

# With at most one row per day, the 2016-01-01 .. 2021-10-31 range holds fewer
# than 3200 rows, so this start index is clamped to the first sample by slicing.
train_cutoff = -3200
test_cutoff = -304
# First sample of the evaluated window.
# NOTE(review): the window is [-608, -304), so the most recent 304 samples are
# not used in this cell — confirm that is intended.
test_start = -608

X = X_data[train_cutoff:test_start]
#X_tune = X_data[-200:-100]
X_test = X_data[test_start:test_cutoff]


y = y_data[train_cutoff:test_start]
#y_tune = y_data[-200:-100]
y_test = y_data[test_start:test_cutoff]

In [41]:
#linear regression
from sklearn.linear_model import LinearRegression

def no_text(x, n_lags=5):
    """Return only the leading lagged-return columns of a feature matrix.

    Parameters
    ----------
    x : numpy.ndarray, shape (n_samples, n_features)
        Feature matrix whose first ``n_lags`` columns are the return window.
    n_lags : int, optional
        Number of leading columns to keep (default 5).

    Returns
    -------
    numpy.ndarray, shape (n_samples, n_lags)
        The selected columns, always two-dimensional.
    """
    # Deliberately no np.squeeze: it would collapse a single-row matrix to 1-D,
    # which scikit-learn estimators reject as a feature matrix.
    return x[:, :n_lags]

# Baseline on lagged returns only; the sign of the predicted return is the class.
linear_model = LinearRegression().fit(no_text(X), y)
linear_pred = linear_model.predict(no_text(X_test))>0

#Logistic Regression
from sklearn.linear_model import LogisticRegression
# NOTE(review): penalty='none' is the spelling accepted by older scikit-learn
# releases; newer ones expect penalty=None — confirm against the installed version.
log_model = LogisticRegression(penalty = 'none', random_state =0, class_weight="balanced").fit(no_text(X), y>0)
log_pred = log_model.predict(no_text(X_test))

#linear with vector embeddings
vector_linear_model = LinearRegression().fit(X, y)
vector_linear_pred = vector_linear_model.predict(X_test)>0

#logistic with vector embeddings
# Train on the same boolean target (y > 0) that is used for scoring and for
# log_model; np.sign(y) would create a third class for returns of exactly 0.
vector_logistic_model = LogisticRegression(random_state=0, class_weight="balanced").fit(X, y>0)
vector_logistic_pred = vector_logistic_model.predict(X_test)

benchmarks = ['linear', 'log', 'vector_linear','vector_logistic']
# Explicit name -> prediction mapping instead of eval() on variable names.
predictions = {
    'linear': linear_pred,
    'log': log_pred,
    'vector_linear': vector_linear_pred,
    'vector_logistic': vector_logistic_pred,
}
# axis must be passed by keyword: pandas 2.x rejects a positional axis in concat.
pd.concat([score_model((y_test>0), predictions[i], i) for i in benchmarks], axis=1)



Unnamed: 0,linear,log,vector_linear,vector_logistic
accuracy,0.572368,0.529605,0.552632,0.496711
matthew_corr,0.040857,0.05888,0.079068,0.029856
f1,0.714912,0.562691,0.617978,0.470588
tn,0.036184,0.226974,0.190789,0.273026
tp,0.536184,0.302632,0.361842,0.223684
fp,0.391447,0.200658,0.236842,0.154605
fn,0.036184,0.269737,0.210526,0.348684


In [4]:
#linear regression
from sklearn.linear_model import LinearRegression

def no_text(x, n_lags=5):
    """Select the leading lagged-return columns of a feature matrix.

    Parameters
    ----------
    x : numpy.ndarray, shape (n_samples, n_features)
        Feature matrix whose first ``n_lags`` columns are the return window.
    n_lags : int, optional
        Number of leading columns to keep (default 5).

    Returns
    -------
    numpy.ndarray, shape (n_samples, n_lags)
        The selected columns, always two-dimensional.
    """
    # No np.squeeze here: squeezing turns a one-row matrix into a 1-D vector,
    # which scikit-learn estimators do not accept as a feature matrix.
    return x[:, :n_lags]

# Ordinary least squares on the lagged returns only.
linear_model = LinearRegression()
linear_model.fit(no_text(X), y)
# A positive predicted return counts as the "up" class.
linear_pred = linear_model.predict(no_text(X_test)) > 0

In [5]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
#weights to account for imbalance
# NOTE: `weights` is not passed to the estimator in this cell; the model below
# uses class_weight="balanced" instead.
weights = {0: 2, 1: 1}
log_model = LogisticRegression(solver='lbfgs', random_state=0, class_weight="balanced")
# Binary target: True when the next return is positive.
log_model.fit(no_text(X), y > 0)
log_pred = log_model.predict(no_text(X_test))

Now repeat the benchmarks with the vector embeddings included as features.

In [12]:
#linear with vector embeddings
# Same least-squares fit as the baseline, but on the full feature matrix.
vector_linear_model = LinearRegression()
vector_linear_model.fit(X, y)
# A positive predicted return counts as the "up" class.
vector_linear_pred = vector_linear_model.predict(X_test) > 0
#vector_linear_model.score(X_tune, y_tune)

In [13]:
#logistic with vector embeddings
# Train on the boolean target y > 0, the same definition used for scoring
# (y_test > 0) and for log_model; np.sign(y) would introduce a third class
# whenever a return is exactly 0.
vector_logistic_model = LogisticRegression(random_state=0).fit(X, y>0)
# predict() already returns booleans for a boolean target.
vector_logistic_pred = vector_logistic_model.predict(X_test)
#vector_logistic_model.score(X_tune, np.sign(y_tune))

In [165]:
benchmarks = ['linear', 'log', 'vector_linear','vector_logistic']
# Explicit name -> prediction mapping instead of eval() on variable names.
predictions = {
    'linear': linear_pred,
    'log': log_pred,
    'vector_linear': vector_linear_pred,
    'vector_logistic': vector_logistic_pred,
}
# One score column per model, side by side. axis must be passed by keyword:
# pandas 2.x rejects a positional axis in concat.
pd.concat([score_model((y_test>0), predictions[i], i) for i in benchmarks], axis=1)

Unnamed: 0,linear,log,vector_linear,vector_logistic
accuracy,0.566667,0.543333,0.506667,0.53
matthew_corr,0.039039,-0.085749,-0.023358,-0.023379
f1,0.694836,0.696231,0.586592,0.648379
tn,0.073333,0.02,0.156667,0.096667
tp,0.493333,0.523333,0.35,0.433333
fp,0.35,0.403333,0.266667,0.326667
fn,0.083333,0.053333,0.226667,0.143333


In [123]:
# Sanity check: total number of scored samples. Arguments follow the
# confusion_matrix(y_true, y_pred) order; the total itself does not depend on it.
np.sum(confusion_matrix(y_test>0, log_pred))

100