In [2]:
from coinbase.rest import RESTClient
from json import dumps
import math
import requests
import pandas as pd
import datetime
import os
import numpy as np
import glob
import tqdm
import asyncio
import ccxt
from tqdm.auto import tqdm

import matplotlib as mpl
from matplotlib import pyplot as plt

from datetime import datetime, timedelta, timezone
from pprint import pprint

## WITH VIEW PERMISSIONS ONLY
api_key = "organizations/XXXXXX"
api_secret = "-----BEGIN EC PRIVATE KEY-----\XXXXXX\n-----END EC PRIVATE KEY-----\n"

client = RESTClient(api_key=api_key, api_secret=api_secret)

## Data Gathering

In [3]:
product = client.get_product("BTC-USD")
btc_usd_price = float(product["price"])
adjusted_btc_usd_price = str(math.floor(btc_usd_price - (btc_usd_price * 0.05)))
print(adjusted_btc_usd_price)

62215


In [4]:
base_url = 'https://api.pro.coinbase.com'

# Function to get historical rates
def fetch_data_in_chunks(product_id, start, end, granularity):
    chunk_size = timedelta(days=300*granularity/86400)  # Calculate the size of each chunk
    chunks = []
    
    current_start = start
    while current_start < end:
        current_end = min(current_start + chunk_size, end)
        url = f'https://api.pro.coinbase.com/products/{product_id}/candles'
        params = {
            'start': current_start.isoformat(),
            'end': current_end.isoformat(),
            'granularity': granularity
        }
        response = requests.get(url, params=params)
        if response.status_code == 200:
            # Convert to DataFrame and append to the list
            df = pd.DataFrame(response.json(), columns=['datetime', 'open', 'high', 'low', 'close', 'volume'])
            df['datetime'] = pd.to_datetime(df['datetime'], unit='s')
            chunks.append(df)
        else:
            print(f"Error fetching data: {response.json()}")
        current_start = current_end  # Update the start time for the next chunk
    
    # Combine all chunks into a single DataFrame
    return pd.concat(chunks, ignore_index=True)

# Function to get market orders
def get_market_orders(product_id):
    trades_url = f'{base_url}/products/{product_id}/trades'
    trades_response = requests.get(trades_url)
    trades_data = pd.DataFrame()
    book_data = pd.DataFrame()

    if trades_response.status_code == 200:
        trades_data = pd.DataFrame(trades_response.json())
    else:
        print(f"Error fetching trades: {trades_response.json()}")

    book_url = f'{base_url}/products/{product_id}/book?level=2'
    book_response = requests.get(book_url)
    if book_response.status_code == 200:
        bids = pd.DataFrame(book_response.json()['bids'], columns=['price', 'size', 'num-orders'])
        asks = pd.DataFrame(book_response.json()['asks'], columns=['price', 'size', 'num-orders'])
        book_data = pd.concat([bids, asks], ignore_index=True)
    else:
        print(f"Error fetching order book: {book_response.json()}")

    return trades_data, book_data

In [5]:
product_id = 'BTC-USD'
start_date = pd.to_datetime('2022-01-01')
end_date = pd.to_datetime('2024-03-22')
##end_date = datetime.now()
granularity = 3600  # 1 hour in seconds  # Daily data (60, 300, 900, 3600, 21600, 86400)

In [6]:
df = fetch_data_in_chunks(product_id, start_date, end_date, granularity)

# Fetch market orders
#trades, order_book = get_market_orders(product_id)

In [7]:
print(df)

                 datetime      open      high       low     close       volume
0     2022-01-13 12:00:00  43600.03  43833.73  43621.42  43820.28   339.543581
1     2022-01-13 11:00:00  43530.00  43852.73  43805.43  43618.57   377.402679
2     2022-01-13 10:00:00  43782.87  44043.69  43902.05  43805.42   248.686533
3     2022-01-13 09:00:00  43767.81  44009.08  43920.02  43902.03   268.967343
4     2022-01-13 08:00:00  43689.93  43924.39  43722.05  43920.02   220.509126
...                   ...       ...       ...       ...       ...          ...
19457 2024-03-11 04:00:00  68501.01  68838.04  68623.25  68577.58   217.306802
19458 2024-03-11 03:00:00  68314.32  68660.40  68349.05  68623.02   252.320047
19459 2024-03-11 02:00:00  68280.63  68559.04  68348.99  68354.81   398.627069
19460 2024-03-11 01:00:00  67636.62  68428.06  68167.86  68348.99  1180.293303
19461 2024-03-11 00:00:00  67112.21  69038.70  69032.80  68157.54  1316.015692

[19462 rows x 6 columns]


## Clean Data

## Real-time data fetching

In [None]:
import websocket

# Initialize an empty DataFrame to store real-time data
real_time_data = pd.DataFrame(columns=['time', 'price', 'volume'])

# Define the WebSocket URL
socket = "wss://ws-feed.pro.coinbase.com"

# Last update timestamp
last_update = datetime.datetime.utcnow()

def on_message(ws, message):
    global real_time_data, last_update
    
    # Parse the incoming message
    json_message = json.loads(message)
    
    # Process ticker messages only
    if json_message['type'] == 'ticker':
        # Convert timestamp to datetime
        timestamp = pd.to_datetime(json_message['time'])
        
        # Check if a minute has passed since the last update
        if (timestamp - last_update) >= datetime.timedelta(minutes=1):
            # Update the last update timestamp
            last_update = timestamp
            
            # Append the new data to the DataFrame
            new_data = {
                'time': timestamp,
                'price': float(json_message['price']),
                'volume': float(json_message['last_size']),
            }
            real_time_data = real_time_data.append(new_data, ignore_index=True)
            
            # Print the updated DataFrame
            print(real_time_data.tail())  # Print the last few rows

def on_error(ws, error):
    print(error)
    
def on_close(ws):
    print("WebSocket closed")
    
def on_open(ws):
    # Subscribe to the ticker channel
    subscribe_message = {
        "type": "subscribe",
        "channels": [{"name": "ticker", "product_ids": ["BTC-USD"]}]
    }
    ws.send(json.dumps(subscribe_message))

# Create and start the WebSocket client
ws = websocket.WebSocketApp(socket,
                            on_open=on_open,
                            on_message=on_message,
                            on_error=on_error,
                            on_close=on_close)

In [None]:
# Start the WebSocket client
ws.run_forever()

## Data Split

In [None]:
df = df[["date", "close", "volume", "trades"]]
df["Ret"] = df["close"].pct_change()
df["year"] = df["date"].dt.year
del df["close"]
df = df.loc[(df["year"] >= 2021)]  # To reduce the computational time
df["Ret"] = df["Ret"].fillna(0)
df["volume"] = df["volume"].fillna(0)
df["trades"] = df["trades"].fillna(0)
df = df.reindex(
    columns=[
        "date",
        "Ret",
        "volume",
        "trades",
        "year",
        "month",
        "day",
        "week",
        "weekday",
        "hour",
    ]
)
df = df[["date", "Ret"]]

df["Ret_10"] = df["Ret"].rolling(10).apply(lambda x: np.prod(1 + x / 100) - 1)
df["Ret_50"] = df["Ret"].rolling(50).apply(lambda x: np.prod(1 + x / 100) - 1)

df["Ret_25"] = df["Ret"].rolling(25).apply(lambda x: np.prod(1 + x / 100) - 1)
df["Ret25"] = df["Ret_25"].shift(-25)
del df["Ret_25"]
df = df.dropna()

In [None]:
Xdf, ydf = df.iloc[:, 1:-1], df.iloc[:, -1]
X = Xdf.astype("float32")
y = ydf.astype("float32")

In [None]:
val_split = 0.2
train_split = 0.625
train_size = int(len(df) * train_split)
val_size = int(train_size * val_split)
test_size = int(len(df) - train_size)

window_size = 30

ts = test_size
split_time = len(df) - ts
test_time = df.iloc[split_time + window_size :, 0:1].values


y_train_set = y[:split_time]
y_test_set = y[split_time:]

X_train_set = X[:split_time]
X_test_set = X[split_time:]

n_features = X_train_set.shape[1]

In [None]:
scaler_input = MinMaxScaler(feature_range=(-1, 1))
scaler_input.fit(X_train_set)
X_train_set_scaled = scaler_input.transform(X_train_set)
X_test_set_scaled = scaler_input.transform(X_test_set)

mean_ret = np.mean(y_train_set)

scaler_output = MinMaxScaler(feature_range=(-1, 1))
y_train_set = y_train_set.values.reshape(len(y_train_set), 1)
y_test_set = y_test_set.values.reshape(len(y_test_set), 1)
scaler_output.fit(y_train_set)
y_train_set_scaled = scaler_output.transform(y_train_set)

In [None]:
training_time = df.iloc[:split_time, 0:1].values

X_train = []
y_train = []

for i in range(window_size, y_train_set_scaled.shape[0]):
    X_train.append(X_train_set_scaled[i - window_size : i, :])
    y_train.append(y_train_set_scaled[i])

X_train, y_train = np.array(X_train), np.array(y_train)

print("Shape of training data", X_train.shape, y_train.shape)

X_test = []
y_test = y_test_set

for i in range(window_size, y_test_set.shape[0]):
    X_test.append(X_test_set_scaled[i - window_size : i, :])

X_test, y_test = np.array(X_test), np.array(y_test)

print("Shape of test data", X_test.shape, y_test.shape)

## Deep Learning for Estimating Fill Probabilities in a Limit Order Book

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

# Define the LSTM model
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(50, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(25))
model.add(Dense(1))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train, y_train, batch_size=1, epochs=20)

## Reinforcement Learning for Optimal Execution

In [None]:
import gym
import numpy as np

# Create an environment for trading strategy
env = CryptoTradingEnv()

# Define your Q-learning parameters
learning_rate = 0.1
discount_factor = 0.95
exploration_rate = 1.0
max_exploration_rate = 1.0
min_exploration_rate = 0.01
exploration_decay_rate = 0.001

# Initialize Q-table
q_table = np.zeros((env.observation_space.n, env.action_space.n))

# Q-learning algorithm
for episode in range(1000):
    state = env.reset()
    done = False
    
    while not done:
        # Exploration-exploitation trade-off
        if np.random.uniform(0, 1) < exploration_rate:
            action = env.action_space.sample()  # Explore action space
        else:
            action = np.argmax(q_table[state, :])  # Exploit learned values
        
        new_state, reward, done, info = env.step(action)
        
        # Update Q-table
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
            learning_rate * (reward + discount_factor * np.max(q_table[new_state, :]))
        
        state = new_state
        
    # Exploration rate decay
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate*episode)

## Variational Autoencoders

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda
from tensorflow.keras.models import Model
from tensorflow.keras.losses import mse
from tensorflow.keras import backend as K

original_dim = x_train.shape[1]
input_shape = (original_dim, )
intermediate_dim = 512
batch_size = 128
latent_dim = 2
epochs = 50

# VAE model = encoder + decoder
# build encoder model
inputs = Input(shape=input_shape, name='encoder_input')
x = Dense(intermediate_dim, activation='relu')(inputs)
z_mean = Dense(latent_dim, name='z_mean')(x)
z_log_var = Dense(latent_dim, name='z_log_var')(x)

# Use reparameterization trick to ensure correct gradient
def sampling(args):
    z_mean, z_log_var = args
    batch = K.shape(z_mean)[0]
    dim = K.int_shape(z_mean)[1]
    epsilon = K.random_normal(shape=(batch, dim))
    return z_mean + K.exp(0.5 * z_log_var) * epsilon

z = Lambda(sampling, output_shape=(latent_dim,), name='z')([z_mean, z_log_var])

# instantiate encoder model
encoder = Model(inputs, [z_mean, z_log_var, z], name='encoder')
encoder.summary()

# build decoder model
latent_inputs = Input(shape=(latent_dim,), name='z_sampling')
x = Dense(intermediate_dim, activation='relu')(latent_inputs)
outputs = Dense(original_dim, activation='sigmoid')(x)

# instantiate decoder model
decoder = Model(latent_inputs, outputs, name='decoder')
decoder.summary()

# instantiate VAE model
outputs = decoder(encoder(inputs)[2])
vae = Model(inputs, outputs, name='vae_mlp')

# VAE loss = mse_loss or xent_loss + kl_loss
reconstruction_loss = mse(inputs, outputs)
reconstruction_loss *= original_dim
kl_loss = 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var)
kl_loss = K.sum(kl_loss, axis=-1)
kl_loss *= -0.5
vae_loss = K.mean(reconstruction_loss + kl_loss)

vae.add_loss(vae_loss)
vae.compile(optimizer='adam')
vae.summary()

# Train the autoencoder
vae.fit(x_train, epochs=epochs, batch_size=batch_size, validation_data=(x_test, None))

## Sentiment Analysis with FinBERT

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
import requests
from bs4 import BeautifulSoup

# Load FinBERT model for sentiment analysis
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
nlp = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)

# Define a function to scrape
def scrape_news_sentiment(url):
    # Make a request to the website
    r = requests.get(url)
    
    # Initialize BeautifulSoup object to parse HTML
    soup = BeautifulSoup(r.text, 'html.parser')
    
    # Find all news articles on the page
    articles = soup.find_all('article')
    news_contents = []
    sentiments = []
    
    # Iterate over each article, scrape the text, and perform sentiment analysis
    for article in articles:
        # Scrape the text from each article
        text = article.get_text()
        
        # Use the nlp pipeline to perform sentiment analysis on the scraped text
        sentiment = nlp(text)
        
        # Append the text and sentiment to the corresponding lists
        news_contents.append(text)
        sentiments.append(sentiment)
    
    return news_contents, sentiments

# Define the URL of the site from which to scrape the news
news_url = "https://crypto.news/"
contents, analyzed_sentiments = scrape_news_sentiment(news_url)

# Output the contents and sentiments
for content, sentiment in zip(contents, analyzed_sentiments):
    print(f"Content: {content[:200]}")  # Truncated for brevity
    print(f"Sentiment: {sentiment}")
    print("\n")