In [22]:
import pandas as pd
import altair as alt
import datetime as datetime
import pandas_datareader as DataReader
import scipy
import sklearn
from sklearn.linear_model import LinearRegression
import numpy as np

In [31]:
# Round values to the nearest hundreth (cent)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [24]:
# Get the Bitcoin Historical Pricing Data
df = pd.read_csv("bitstampUSD.csv")
df.head()

FileNotFoundError: [Errno 2] File b'bitstampUSD.csv' does not exist: b'bitstampUSD.csv'

In [None]:
# Problems with the data set
# NaN values early on, data is by the second which doesn't correspond
# with most technical indicators and other financial instruments
df.info()

In [None]:
df.describe()

In [None]:
# what percent of set is null?
print(df.isnull().sum()*100/df.isnull().count())

In [None]:
# we will ignore earilier dates with NaN values
bitcoin = df.dropna()
bitcoin

In [25]:
# convert the timestamp variable to a datetime object
bitcoin['Timestamp']=pd.to_datetime(bitcoin['Timestamp'],unit='s')
# index by second
bitcoin.set_index('Timestamp', inplace = True)
# aggregate each group of seconds with by day for prediction, month for graphing
bitcoin_daily = bitcoin.resample('1D').mean()
bitcoin_monthly = bitcoin.resample('1M').mean()

KeyError: 'Timestamp'

In [None]:
bitcoin_daily

In [None]:
bitcoin_monthly

In [29]:
# Set Index to column so it can be referenced in altair
bitcoin_monthly["Month"] = bitcoin_monthly.index
# Graph Bitcoin Price over time
alt.Chart(bitcoin_monthly).mark_line().encode(
    x = alt.X('Month', title='Year'),
    y = alt.Y('Close', axis=alt.Axis(title='Closing Price (USD)')),
    color=alt.value('orange'),
).properties(
    title='Bitcoin Historical Price'
).configure_title(
    fontSize=20,
    font='Courier',
    anchor='start',
    color='gray'
)

Let's look at the general trend in Bitcoin Stock Prices over the past 8 years. There is a noticeable, meteoric rise in 2017 that is preceeded by years of slow growth.

In [17]:
# Do dates correspond with data?
bitcoin_daily["Month"] = pd.DatetimeIndex(bitcoin_daily['day']).month
bitcoin_daily["Year"] = pd.DatetimeIndex(bitcoin_daily['day']).year
bitcoin_daily.info

monthly_averages = bitcoin_daily.groupby('Month').mean()
monthly_averages["Month"] = monthly_averages.index
dates = {1:'January', 2:'February', 3 : 'March', 4: 'April', 5 : 'May', 6:'June', 7:'July', 8:'August', 9:'September', 10: 'October', 11: 'November', 12: 'December'}
monthly_averages.replace({"Month": dates}, inplace=True)

alt.Chart(monthly_averages).mark_bar().encode(
    x= alt.X('Month', sort=None),
    y= alt.Y('Close', axis=alt.Axis(format='$', title='"Average Price of Bitcoin in USD"'))
).properties(
    title = "Average Price of Bitcoin by Month"
).configure_title(
    fontSize=20,
    font='Courier',
    anchor='start',
    color='gray'
)

KeyError: 'Timestamp'

In [None]:
# Often we use basic market indicators to predict stock performance
# Is Bitcoin Coorelated with other instruments as a whole
start = bitcoin_monthly["Month"].iloc[0]
end = bitcoin_monthly["Month"].iloc[100]
bitcoin_monthly["S&P500"] = DataReader.DataReader(['sp500'], 'fred', start, end)
bitcoin_monthly["Nikkiei"] = DataReader.DataReader(['NIKKEI225'], 'fred', start, end)
#bitcoin_monthly["GDP"] = DataReader.DataReader('A191RL1Q225SBEA', 'fred', start, end)
bitcoin_monthly

In [None]:
# is bitcoin coorelated with these indicators
bitcoin_monthly.corr()['Close'].iloc[7:]

In [None]:
# Nicely visualized with chart below
bitcoin_sp = alt.Chart(bitcoin_monthly).mark_point().encode(
    x = alt.X('Close', axis=alt.Axis(format='$', title='Bitcoin (USD)')),
    y = alt.Y('S&P500',axis=alt.Axis(format='$', title='S&P (USD)')),
    color=alt.value('green'),
).mark_circle(size=30)
(bitcoin_sp + bitcoin_sp.transform_regression('Close', 'S&P500').mark_line()).configure_title(
    fontSize=20,
    font='Courier',
    anchor='start',
    color='gray'
).properties(
    title = "Coorelation Between Bitcoin, S&P 500"
)

In [None]:
bitcoin_sp = alt.Chart(bitcoin_monthly).mark_point().encode(
    x = alt.X('Close', axis=alt.Axis(format='$', title='Bitcoin (USD)')),
    y = alt.Y('Nikkiei',axis=alt.Axis(format='$', title='Nikkiei (Tokyo) (USD)')),
    color=alt.value('red'),
).mark_circle(size=30)
(bitcoin_sp + bitcoin_sp.transform_regression('Close', 'Nikkiei').mark_line()).configure_title(
    fontSize=20,
    font='Courier',
    anchor='start',
    color='gray'
).properties(
    title = "Coorelation Between Bitcoin, Nikkie Index"
)

In [None]:
# It seems like it is, so lets fit a model with these values
# first add data to day by day value
bitcoin_daily["day"] = bitcoin_daily.index


start = bitcoin_daily["day"].iloc[0]
end = bitcoin_daily["day"].iloc[3035]
bitcoin_daily["S&P500"] = DataReader.DataReader(['sp500'], 'fred', start, end)
bitcoin_daily["Nikkiei"] = DataReader.DataReader(['NIKKEI225'], 'fred', start, end)

In [None]:
print(bitcoin_daily.isnull().sum()*100/bitcoin_daily.isnull().count())

In [None]:
# NaNs correspond to weekends/holidays, so we drop them
bitcoin_daily.dropna(inplace = True)

In [None]:
bitcoin_daily

In [None]:
# Do dates correspond with data?
bitcoin_daily["Month"] = pd.DatetimeIndex(bitcoin_daily['day']).month
bitcoin_daily["Year"] = pd.DatetimeIndex(bitcoin_daily['day']).year
bitcoin_daily.info

In [None]:
monthly_averages = bitcoin_daily.groupby('Month').mean()
monthly_averages["Month"] = monthly_averages.index
dates = {1:'January', 2:'February', 3 : 'March', 4: 'April', 5 : 'May', 6:'June', 7:'July', 8:'August', 9:'September', 10: 'October', 11: 'November', 12: 'December'}
monthly_averages.replace({"Month": dates}, inplace=True)

In [None]:
alt.Chart(monthly_averages).mark_bar().encode(
    x= alt.X('Month', sort=None),
    y= alt.Y('Close', axis=alt.Axis(format='$', title='Average Price of Bitcoin in USD'))
).properties(
    title = "Average Price of Bitcoin by Month"
).configure_title(
    fontSize=20,
    font='Courier',
    anchor='start',
    color='gray'
)

In [None]:
# Make a model
def add_bias(data):
    data.insert(0, 'ones', np.ones(len(data)))

Y = bitcoin_daily["Close"]
X = bitcoin_daily[["Volume_(BTC)", "Volume_(Currency)", "S&P500", "Nikkiei", "Month"]]

In [None]:
add_bias(X)

In [None]:
X.replace({"Month": dates}, inplace=True)

In [None]:
X

In [None]:
# One Hot Encoding
from sklearn.feature_extraction import DictVectorizer

# one-hot encoding 
X_features_with_Month=X

Month = X_features_with_Month[['Month']].to_dict(orient='records') 

encoder = DictVectorizer(sparse=False)
Month_df = pd.DataFrame(
    data = encoder.fit_transform(Month),
    columns = encoder.feature_names_
)


# adjusting the index inconsistency issue
X_features_with_Month.reset_index(drop=True, inplace=True)
Month_df.reset_index(drop=True, inplace=True)

# Combine the features together with pd.concat
X_with_Month = pd.concat([X_features_with_Month,Month_df],ignore_index=False,axis=1).drop(['Month'],axis=1)
X_with_Month

In [None]:
def avg_squared_loss(y, y_hat):
    #return np.array([(y[i] - y_hat[i])**2 for i in range(0,len(y))]).sum()/len(y)
    return np.mean((y - y_hat)**2)

model = LinearRegression()
result = model.fit(X_with_Month,Y)
Y_hat = model.predict(X_with_Month)
loss = avg_squared_loss(Y_hat, Y)
loss

In [None]:
model.coef_

In [None]:
model.coef_.max()

In [None]:
alt.data_transformers.disable_max_rows()

source = pd.DataFrame({
    'Y': Y,
    'Y_hat': Y_hat
})

layer1 = alt.Chart(source).mark_circle(size=4).encode(
    x='Y',
    y='Y_hat',
    color = alt.value('red')
).properties(
    title='Expected (Y) vs Prediction (Y_Hat)'
)

layer1 + layer1.transform_regression('Y', 'Y_hat').mark_line()

In [None]:
# Find worse predictions of Linear Model SCIKIT Learn
bad_predictions = pd.DataFrame(data=Y, columns=["Y"])
bad_predictions["Predicted"] = Y
bad_predictions["Y_hat"] = Y_hat
bad_predictions["Difference"] = (bad_predictions["Predicted"] - bad_predictions["Y_hat"])**2

In [None]:
# Tends to be inaccurate for larger values of Y

In [None]:
bad_predictions["Bad Guess"] = bad_predictions["Difference"] > bad_predictions["Difference"].mean()

In [None]:
bad_predictions[bad_predictions["Bad Guess"]==True]
bitcoin_daily["Bad Guess"] = bad_predictions["Bad Guess"]

In [None]:
bad_dates = bitcoin_daily[bitcoin_daily["Bad Guess"]==True].groupby('Month').count()
bad_dates["Month"] = bad_dates.index
dates = {1:'January', 2:'February', 3 : 'March', 4: 'April', 5 : 'May', 6:'June', 7:'July', 8:'August', 9:'September', 10: 'October', 11: 'November', 12: 'December'}
bad_dates.replace({"Month": dates}, inplace=True)

In [None]:
alt.Chart(bad_dates).mark_bar().encode(
    x= alt.X('Month', sort=None),
    y= alt.Y('Close', axis=alt.Axis(title='"Count of Bad Predictions"'))
).properties(
    title = "Bad Predictions by Date"
).configure_title(
    fontSize=20,
    font='Courier',
    anchor='start',
    color='gray'
)

In [None]:
bitcoin_daily.loc[bad_predictions["Difference"].idxmax()]

In [None]:
bad_predictions

In [None]:
bad_dates2 = bitcoin_daily[bitcoin_daily["Bad Guess"]==True].groupby('Year').count()
bad_dates2["Year"] = bad_dates2.index
#dates = {1:'January', 2:'February', 3 : 'March', 4: 'April', 5 : 'May', 6:'June', 7:'July', 8:'August', 9:'September', 10: 'October', 11: 'November', 12: 'December'}
#bad_dates.replace({"Month": dates}, inplace=True)

In [None]:
alt.Chart(bad_dates2).mark_bar(size=10).encode(
    x= alt.X('Year:O'),
    y= alt.Y('Close', axis=alt.Axis(title='Count of Bad Predictions'))
).properties(
    title = "Bad Predictions by Year"
).configure_title(
    fontSize=20,
    font='Courier',
    anchor='start',
    color='gray'
)

In [None]:
# Notice that most of the changes took place during bitcoins meteoric rise, when the predicitons were lower.