## Analysis of XLM_512

In [None]:
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import json,os,pdb
import tqdm
import warnings
import yfinance as yf
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix
import statsmodels.formula.api as smf
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the cleaned multi-asset dataset and keep only the XLM_512 rows.
df_clean = pd.read_csv('clean_df.csv')
# .copy() makes `df` an independent frame: later cells assign an index and new
# columns to it, which on a filtered slice of `df_clean` triggers
# SettingWithCopyWarning.
df = df_clean[df_clean['symbol_id'] == 'XLM_512'].copy()
df.shape

In [None]:
# Summary statistics of the XLM_512 frame.
df.describe()# author's observation: close price is 0.26 on average

In [None]:
# Use the closing timestamp as the row index while keeping it as a column too.
df = df.set_index('time_close', drop=False)
df

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Overlay the XLM_512 series and the macro series on one log-scaled y-axis.
series_to_plot = ['close', 'volume', 'marketcap',
                  'SPY', 'VIX', '10Y Treasury', 'Gold', 'Oil', 'EUR_USD']
ax = df.plot(y=series_to_plot, logy=True, figsize=(15, 10))
ax.set_xlabel('Date')
ax.set_ylabel('Price or volume')
ax.set_title('XLM_512 price plotted against other prices and volume')
plt.show()


In [None]:
    # Histogram of 'tomorrow change' restricted to values strictly between its
    # 1% and 99% quantiles; skewness and kurtosis use the full column.
    tmr_change = df['tomorrow change']
    upper_cut, lower_cut = tmr_change.quantile([0.99, 0.01])
    inside_cuts = (tmr_change > lower_cut) & (tmr_change < upper_cut)

    plt.figure(figsize=(15,10))
    tmr_change[inside_cuts].plot.hist(density=1, bins=100)
    plt.title('XLM_512 price change histogram')
    plt.show()

    print("Skewness: %f" % tmr_change.skew())
    print("Kurtosis: %f" % tmr_change.kurt())  # author's note: sharp peak

In [None]:
# For each calendar / macro feature: print its Pearson correlation with the
# close price and draw a scatter plot with a fitted regression line.
features_vs_close = ['month', 'day', 'weekday', 'day of week',
                     'SPY', 'VIX', '10Y Treasury', 'Gold', 'Oil', 'EUR_USD']
for feature in features_vs_close:
    pair_corr = df[['close', feature]].corr(method='pearson')
    print(pair_corr)
    plot = sns.lmplot(x=feature, y='close', data=df)

In [None]:
# Seasonality check: close price against day-of-year, one line per calendar year.
close_dates = pd.to_datetime(df['time_close'])
df['year'] = close_dates.dt.year.to_numpy()
df['time_close'] = close_dates

plt.figure(figsize=(15, 10))
# Author's reading: no special trend each year; close is low mid-year, high in the first half?
sns.lineplot(x=df['time_close'].dt.dayofyear, y=df['close'], hue=df['year'])

In [None]:
# The helper 'year' column was only needed for the seasonality plot.
df = df.drop('year', axis=1)

In [None]:
import statsmodels.api as sm

In [None]:
# Decompose the close series with a period of 100 observations and plot the
# components returned by statsmodels.
df_close = df.loc[:, ['time_close', 'close']].set_index('time_close')

decomp = sm.tsa.seasonal_decompose(df_close, period=100)
fig = decomp.plot()
fig.set_size_inches(10, 8)

In [None]:
# Correlation heatmap of the numeric columns.
# Non-numeric columns (e.g. 'symbol_id', the datetime 'time_close') are
# excluded explicitly: pandas >= 2.0 no longer drops them silently in
# DataFrame.corr() and raises instead.
corrmat = df.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, cmap="PiYG", linewidths=.5, ax=ax)

## check stationarity

In [None]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test per column. H_0: the series has a unit root
# (is non-stationary). H_0 is rejected when the test statistic is below the
# 5% critical value.
# NOTE(review): columns[2:] skips the first two columns positionally;
# presumably these are 'symbol_id' and 'time_close' - confirm column order.
for col in df.columns[2:]:
    timeseries = df[col]
    result = adfuller(timeseries)

    # Name the column first so the statistics below can be attributed.
    print(f'Column: {col}')
    print(f'ADF Statistic: {result[0]:.3f}')
    print(f'p-value: {result[1]:.3E}')
    if result[0] > result[4]["5%"]:
        conclusion = f"Failed to Reject H_0 at 5% -> {col} Time Series is Non-Stationary"
    else:
        conclusion = f"Reject H_0 at 5% -> {col} Time Series is Stationary"
    print(conclusion)
    print('\n')

In [None]:
# Add a '<name>_ret' column holding the period-over-period percentage change
# of each listed level series.
level_columns = ['SPY', 'VIX', 'Gold', 'Oil', 'EUR_USD', '10Y Treasury', 'marketcap']
for level_name in level_columns:
    df[f'{level_name}_ret'] = df[level_name].pct_change()
df

In [None]:
# pct_change() leaves NaN in the first row of every '*_ret' column; remove it.
# Filtering on NaN (instead of dropping "the first row" in place) makes the
# cell idempotent: re-running it no longer removes an additional valid row.
df = df.dropna(subset=['SPY_ret'])
df

In [None]:
# Show the column labels after adding the return columns.
df.columns

In [None]:
# Remove identifiers and raw level series. Note that 'VIX' itself is kept and
# its return column 'VIX_ret' is the one removed.
columns_to_remove = ['symbol_id', 'time_close', 'close', 'volume', 'marketcap',
                     'SPY', '10Y Treasury', 'Gold', 'Oil', 'EUR_USD', 'VIX_ret']
df = df.drop(columns_to_remove, axis=1)
df

In [None]:
# Chronological train/test split (no shuffling) followed by standardisation.
target_col = 'tomorrow change'  # selected by label rather than by column position

train_size = int(len(df) * 0.8)
test_size = len(df) - train_size
# NOTE(review): the split point is train_size + 1, so the training part has one
# row more than `train_size` and the test part one fewer than `test_size`.
# Kept as-is because later cells slice at the same position.
split_at = train_size + 1

features = df.drop(columns=target_col)
X_train, X_test = features.iloc[:split_at], features.iloc[split_at:]
y_train, y_test = df[target_col].iloc[:split_at], df[target_col].iloc[split_at:]

# Fit the scaler on the training data only and reuse its mean/std for the test
# data. Calling fit_transform on the test set would scale it with its own
# statistics, putting train and test features on different scales.
scaler = StandardScaler()
normalize_X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
normalize_X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [None]:
# Display the scaled training features, the unscaled test features and both target splits.
normalize_X_train, X_test, y_train, y_test

## Linear regression: X is all stationary variables, y is tomorrow's change

In [None]:
# Ordinary least-squares fit on the standardised training features, then
# predictions for the standardised test features.
model = LinearRegression()
model.fit(normalize_X_train, y_train)
y_pred = model.predict(normalize_X_test)

In [None]:
# Out-of-sample R^2 of the linear model (author's note: not accurate).
test_r2 = r2_score(y_test, y_pred)
print("Coefficient of determination: %.2f" % test_r2)

In [None]:
# Predicted values in red, true test values in the default colour; both are
# drawn against their position in the test set.
plt.plot(y_pred, color='red')
plt.plot(y_test.tolist())

In [None]:
# Column labels available for the regression formula below.
df.columns

## OLS

In [None]:
# OLS via the statsmodels formula API. Formula terms must be valid Python
# identifiers, so spaces become underscores and the column starting with a
# digit is renamed.
df2 = df.rename(columns=lambda name: name.replace(' ', '_'))
df2 = df2.rename(columns={'10Y_Treasury_ret': 'Treasury10y_ret'})

reg_mul = smf.ols(
    'tomorrow_change ~  month+day+weekday+day_of_week+price_change+volume_change+SPY_ret+VIX+Treasury10y_ret+Gold_ret+Oil_ret+EUR_USD_ret+marketcap_ret',
    data=df2,
)
res_mul = reg_mul.fit()
print(res_mul.summary())

In [None]:
# Index the full multi-asset frame by closing timestamp.
# Guarded so that re-running the cell does not raise KeyError once
# 'time_close' has already been moved into the index.
if 'time_close' in df_clean.columns:
    df_clean = df_clean.set_index('time_close')

In [None]:
# Attach the close price of six other symbols to a copy of the feature frame.
# Column assignment aligns each series with df3 on the index labels.
peer_symbols = {
    'ltc': 'LTC_2',
    'xrp': 'XRP_52',
    'doge': 'DOGE_74',
    'eth': 'ETH_1027',
    'btc': 'BTC_1',
    'xmr': 'XMR_328',
}
peer_close = {
    short_name: df_clean.loc[df_clean['symbol_id'] == symbol_id, 'close']
    for short_name, symbol_id in peer_symbols.items()
}
# Keep the individual series available under their original names.
ltc, xrp, doge, eth, btc, xmr = (peer_close[short_name] for short_name in peer_symbols)

df3 = df.copy()
for short_name, close_series in peer_close.items():
    df3[short_name] = close_series
df3.columns

In [None]:
# Percentage change of each peer close price, stored as '<coin>_ret'.
for coin in ['ltc', 'xrp', 'doge', 'eth', 'btc', 'xmr']:
    df3[coin + '_ret'] = df3[coin].pct_change()
df3

In [None]:
# Drop the row(s) carrying the first index label (the first peer return is NaN
# after pct_change), then discard the raw peer price levels.
first_label = df3.index[:1]
df3 = df3.drop(index=first_label)
df3 = df3.drop(['ltc', 'xrp', 'doge', 'eth', 'btc', 'xmr'], axis=1)
df3

In [None]:
# Same chronological split and scaling as before, now on the frame that also
# contains the peer-coin returns, followed by a linear regression.
target_col = 'tomorrow change'  # selected by label rather than by column position

train_size = int(len(df3) * 0.8)
test_size = len(df3) - train_size
# NOTE(review): split point is train_size + 1 (train has one extra row);
# kept because later cells slice df3 at the same position.
split_at = train_size + 1

features = df3.drop(columns=target_col)
X_train, X_test = features.iloc[:split_at], features.iloc[split_at:]
y_train, y_test = df3[target_col].iloc[:split_at], df3[target_col].iloc[split_at:]

# Fit the scaler on training data only; the test set is transformed with the
# training mean/std instead of being re-fitted on its own statistics.
scaler = StandardScaler()
normalize_X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
normalize_X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

model2 = LinearRegression().fit(normalize_X_train, y_train)
y_pred = model2.predict(normalize_X_test)
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

In [None]:
# Column labels of the frame including the peer-coin returns.
df3.columns

In [None]:
# OLS with the peer-coin returns added to the regressors.
# Author's recorded value for this cell: 0.071.
df4 = df3.rename(columns=lambda name: name.replace(' ', '_'))
df4 = df4.rename(columns={'10Y_Treasury_ret': 'Treasury10y_ret'})

reg_mul2 = smf.ols(
    'tomorrow_change ~  month+day+weekday+day_of_week+price_change+volume_change+SPY_ret+VIX+Treasury10y_ret+Gold_ret+Oil_ret+EUR_USD_ret+ltc_ret+xrp_ret+doge_ret+eth_ret+btc_ret+xmr_ret+marketcap_ret',
    data=df4,
)
res_mul2 = reg_mul2.fit()
print(res_mul2.summary())  # author's note: accuracy increases

In [None]:
#change label to close
#reg_mul3 = smf.ols('close ~  SPY+VIX+Treasury10y+Gold+Oil+EUR_USD+ltc+xrp', data = df4)
#res_mul3 = reg_mul3.fit()
#print(res_mul3.summary())

In [None]:
#X=df3[['SPY', 'VIX', '10Y Treasury', 'Gold', 'Oil', 'EUR_USD','ltc','xrp']]
#y=df3[['tomorrow change']]
#tss = TimeSeriesSplit(n_splits = 2)
#for train_index, test_index in tss.split(X):
#    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index,:]
#    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

#normalize_X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
#model8=LinearRegression().fit(normalize_X_train, y_train)
#normalize_X_test=pd.DataFrame(scaler.fit_transform(X_test),columns= X_test.columns)
#y_pred=model8.predict(normalize_X_test)
#print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

In [None]:
# Column labels of df3 before the classification targets are added.
df3.columns

## Logistic regression: X is index returns, y is the binary direction of tomorrow's return

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Binary direction label: 1.0 when tomorrow's change is strictly positive,
# 0.0 otherwise. Previously a change of exactly 0 matched neither condition and
# was left as NaN, which makes the classifier fit fail on NaN labels.
# Rows whose 'tomorrow change' is itself missing still get NaN.
tmr_change = df3['tomorrow change']
df3['target'] = (tmr_change > 0).astype(float).where(tmr_change.notna())
df3


In [None]:
# Column labels of df3 after adding the binary 'target'.
df3.columns

## Fit logistic regression on the binary target

In [None]:
from sklearn.metrics import accuracy_score

# Logistic regression on the binary 'target' (author's recorded accuracy: 0.49).
# The label is selected by name instead of by column position, so the cell no
# longer depends on the exact column order of df3.
split_at = train_size + 1  # same split point as the regression cells
# errors='ignore' also removes 'target2' if it already exists (e.g. on a
# re-run), so no label column can leak into the features.
features = df3.drop(columns=['tomorrow change', 'target', 'target2'], errors='ignore')
X_train, X_test = features.iloc[:split_at], features.iloc[split_at:]
y_train, y_test = df3['target'].iloc[:split_at], df3['target'].iloc[split_at:]

model3 = LogisticRegression(random_state=42).fit(X_train, y_train)
y_pred = model3.predict(X_test)
print(confusion_matrix(y_test, y_pred))
accuracy_score(y_test, y_pred)

In [None]:
import statsmodels.formula.api as smf

# Binomial GLM of the binary target on all regressors; column names are made
# formula-safe first (no spaces, no leading digit).
df7 = df3.rename(columns=lambda name: name.replace(' ', '_'))
df7 = df7.rename(columns={'10Y_Treasury_ret': 'Treasury10y_ret'})

formula = 'target ~ month+day+weekday+day_of_week+price_change+volume_change+SPY_ret+VIX+Treasury10y_ret+Gold_ret+Oil_ret+EUR_USD_ret+ltc_ret+xrp_ret+doge_ret+eth_ret+btc_ret+xmr_ret+marketcap_ret'
binomial_family = sm.families.Binomial()
model = smf.glm(formula=formula, data=df7, family=binomial_family)
result = model.fit()
print(result.summary())

In [None]:
# Binomial GLM restricted to the macro returns, VIX and two peer-coin returns.
df6 = df3.rename(columns=lambda name: name.replace(' ', '_'))
df6 = df6.rename(columns={'10Y_Treasury_ret': 'Treasury10y_ret'})

model8 = smf.glm(
    formula='target ~ SPY_ret+VIX+Treasury10y_ret+Gold_ret+Oil_ret+EUR_USD_ret+ltc_ret+xrp_ret',
    data=df6,
    family=sm.families.Binomial(),
)
results = model8.fit()

print(results.summary())

## Change y to 4 categories (decision tree)

In [None]:
# Quartiles of 'tomorrow change' (computed on df), used as class boundaries.
quartiles = df['tomorrow change'].quantile([0.25, 0.50, 0.75])
print(quartiles)
Q1, Q2, Q3 = quartiles.iloc[0], quartiles.iloc[1], quartiles.iloc[2]

In [None]:
# Bucket 'tomorrow change' into four classes delimited by Q1, Q2 and Q3:
# 0 below Q1, 1 in [Q1, Q2), 2 in [Q2, Q3), 3 at or above Q3.
# Rows matching no condition (missing values) stay NaN.
tmr_change = df3['tomorrow change']
bucket_conditions = [
    tmr_change < Q1,
    (tmr_change >= Q1) & (tmr_change < Q2),
    (tmr_change >= Q2) & (tmr_change < Q3),
    tmr_change >= Q3,
]
df3['target2'] = np.select(bucket_conditions, [0, 1, 2, 3], default=np.nan)
df3

In [None]:
# Column labels of df3 after adding the four-class 'target2'.
df3.columns

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Decision tree on the four-class label. 'target2' is selected by name instead
# of by column position, so the cell does not depend on df3's column order.
split_at = train_size + 1  # same split point as the regression cells
features = df3.drop(columns=['tomorrow change', 'target', 'target2'])
X_train, X_test = features.iloc[:split_at], features.iloc[split_at:]
y_train, y_test = df3['target2'].iloc[:split_at], df3['target2'].iloc[split_at:]

tree_clf = DecisionTreeClassifier(max_depth=100, random_state=42)
model4 = tree_clf.fit(X_train, y_train)

y_pred = model4.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
accuracy_score(y_test, y_pred)

## LASSO to predict tomorrow's return

In [None]:
#lasso
from sklearn.linear_model import Lasso
from sklearn.metrics import roc_auc_score
from sklearn.kernel_ridge import KernelRidge
import math
from sklearn.metrics import mean_squared_error

In [None]:
# Remove both classification labels before returning to regression models.
df3 = df3.drop(['target', 'target2'], axis=1)
df3.columns

In [None]:
# Lasso regression on standardised features.
model5 = Lasso(alpha=0.001)

target_col = 'tomorrow change'  # selected by label rather than by column position
split_at = train_size + 1       # same split point as the other cells
features = df3.drop(columns=target_col)
X_train, X_test = features.iloc[:split_at], features.iloc[split_at:]
y_train, y_test = df3[target_col].iloc[:split_at], df3[target_col].iloc[split_at:]

# Recompute the scaled frames here instead of relying on values left over from
# an earlier cell. The scaler is fitted on the training data only and then
# applied unchanged to the test data.
scaler = StandardScaler()
normalize_X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
normalize_X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

model5.fit(normalize_X_train, y_train)
y_pred = model5.predict(normalize_X_test)

print(np.sqrt(mean_squared_error(y_test, y_pred)))  # RMSE on the test set
print(r2_score(y_test, y_pred))                     # R^2 on the test set
# Author's note from an earlier run: poor performance, 0.037 and 0.064
# (presumably the two printed values; obtained before the scaling fix).


In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge

In [None]:
# Ridge regression on degree-5 polynomial features of the training set.
# Everything in this cell is in-sample: the model is scored and plotted on the
# same data it was fitted on.
X_poly = PolynomialFeatures(degree=5).fit_transform(X_train)
m_ridge = Ridge(alpha=0.5, solver='lsqr')
m_ridge.fit(X_poly, y_train)
score_ridge = m_ridge.score(X_poly, y_train)  # R^2 on the training data
y_pred_ridge = m_ridge.predict(X_poly)

# Plot both series against the same positional x-axis. The labels describe
# what is plotted: y_train is the 'tomorrow change' column of the XLM_512
# frame, and the score is a training R^2.
positions = np.arange(len(y_train))
fig, ax = plt.subplots(figsize=(28, 10))
ax.plot(positions, np.asarray(y_train), label='actual tomorrow change (train)')
ax.plot(positions, y_pred_ridge, 'g--',
        label='ridge model in-sample fit (train R^2: {})'.format(round(score_ridge, 3)))
ax.set_xlabel('Observation (chronological order)')
ax.set_ylabel('XLM_512 tomorrow change')
ax.legend(loc='upper right')
plt.show()
score_ridge

In [None]:
# Kernel ridge regression with a polynomial kernel on the standardised features.
model6 = KernelRidge(kernel='polynomial', alpha=1)
model6.fit(normalize_X_train, y_train)
y_pred = model6.predict(normalize_X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred)))  # RMSE on the test set
print(r2_score(y_test, y_pred))                     # R^2 on the test set

# Place training observations first and test observations after them on one
# positional axis, so the predictions line up with the actual test values
# (previously they were drawn over the start of the training range, and the
# same prediction was plotted twice).
n_train, n_test = len(y_train), len(y_test)
train_pos = np.arange(n_train)
test_pos = np.arange(n_train, n_train + n_test)

fig, ax = plt.subplots(figsize=[30, 10])
ax.plot(train_pos, np.asarray(y_train), color='green', label='train (actual)')
ax.plot(test_pos, np.asarray(y_test), label='test (actual)')
ax.plot(test_pos, y_pred, color='red', label='test (predicted)')
ax.legend()
plt.show()

## SVM

In [None]:
from sklearn import svm

# Support vector regression with a polynomial kernel on the standardised features.
model7 = svm.SVR(kernel = 'poly')
model7.fit(normalize_X_train, y_train)
train_r2 = model7.score(normalize_X_train, y_train)  # computed but not displayed

y_pred = model7.predict(normalize_X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(test_rmse)
print(r2_score(y_test, y_pred))

## generalized linear regression

In [None]:
import statsmodels.formula.api as smf

In [None]:
# Column labels of df3 at this point in the notebook.
df3.columns

In [None]:

#results.predict(y_

## ARIMA with exogenous regressors

In [None]:
# Display the training target and the scaled training features.
y_train,normalize_X_train

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# ARIMA(0,1,1) on the training target with the training features as
# exogenous regressors.
model = ARIMA(y_train, exog=X_train, order=(0, 1, 1))
model_fit = model.fit()
print(model_fit.summary())

# Out-of-sample forecast: one step per test row, driven by the test features.
# forecast() starts right after the last training observation, so the result
# has exactly len(X_test) values (the previous start/end range also included
# the last in-sample point).
forecast = model_fit.forecast(steps=len(X_test), exog=X_test)

# Predictions next to the actual test values, indexed like the test set.
# This replaces references to names that are not defined in this notebook
# (`scaler_output`, `test_X`, 'BTC Price next day').
predictions = pd.DataFrame(
    {'Pred': np.asarray(forecast), 'Actual': np.asarray(y_test)},
    index=X_test.index,
)
predictions.head()


In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Grid search over the polynomial degree for a single-feature regression.
# NOTE(review): Xtrain, ytrain, Xtest and ytest are not defined in the visible
# part of this notebook (it uses X_train / y_train / ...); confirm where they
# are meant to come from before running this cell.
X_fit = np.asarray(Xtrain).reshape(-1, 1)   # one feature column, any number of rows
X_eval = np.asarray(Xtest).reshape(-1, 1)
y_fit = np.asarray(ytrain).ravel()
y_true = np.asarray(ytest).ravel()          # flattened once; ytest itself is left untouched

for j in [2, 3, 5]:
    # Expand the single feature into polynomial terms up to degree j.
    poly = PolynomialFeatures(degree=j)
    X_poly = poly.fit_transform(X_fit)

    reg = LinearRegression()
    reg.fit(X_poly, y_fit)

    # Reuse the transformer fitted on the training data for the test data.
    ypred = reg.predict(poly.transform(X_eval)).ravel()

    plt.plot(ypred, label='predicted with degree' + str(j))
    plt.legend()

    print("POLYNOMIAL REGRESSION")

    # Root mean squared error on the test set.
    rmse = np.sqrt(np.mean((ypred - y_true) ** 2))
    print("Degree=", j, "        RMSE:", rmse)