In [None]:
import os, logging, shutil
import urllib.request
from datetime import date, datetime, timedelta, timezone
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


# Run parameters: study number (names the working directory), trading symbol,
# and the month of data to load, given as an integer in YYMMDD form.
config = dict(
    study=1,
    symbol='BTCUSDT',
    date=240901,  # YYMMDD
)

logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s %(levelname)s %(lineno)d] %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
)

# Print wide frames on one line instead of wrapping the columns.
pd.set_option('display.expand_frame_repr', False)

# Working directory for this study: <MY_ROOT or cwd>/_study/<study>.
root = os.environ.get('MY_ROOT', os.getcwd())
study_dir = os.path.join(root, '_study/{}'.format(config['study']))
os.makedirs(study_dir, exist_ok=True)

# Parse the YYMMDD integer; later cells read dt.year and dt.month.
dt = datetime.strptime(str(config['date']), '%y%m%d')

# Column-name registries, filled in by the target and feature cells.
outs = []
indicators = []

In [None]:
# Download historic data: one month of 1-second klines for config['symbol'].
# The download is skipped when the extracted CSV is already on disk.
url = 'https://data.binance.vision/data/spot/monthly/klines/{}/1s/{}-1s-{}-{:02}.zip'.format(config['symbol'], config['symbol'], dt.year, dt.month)
zipfile = '{}/download.zip'.format(study_dir)
csvfile = '{}/download/{}-1s-{}-{:02}.csv'.format(study_dir, config['symbol'], dt.year, dt.month)
if not os.path.exists(csvfile):
    # The archive can be large; say what is being fetched before blocking on it.
    logging.info('Downloading %s', url)
    urllib.request.urlretrieve(url, zipfile)
    shutil.unpack_archive(zipfile, '{}/download'.format(study_dir))
if not os.path.exists(csvfile):
    # The archive was unpacked but did not produce the expected file.
    # FileNotFoundError is still an Exception, so existing handlers keep working.
    raise FileNotFoundError('Missing {}'.format(csvfile))

# names= supplies the column labels, so every line of the file is read as data.
kline_columns = 'open_time open high low close volume close_time quote_volume count taker_buy_volume taker_buy_quote_volume ignore'.split()
df = pd.read_csv(csvfile, names=kline_columns)

In [None]:
# Prepare data
# close_time is an epoch timestamp. It is read as milliseconds unless the values
# are too large for that (> 1e14 ms would be past the year 5000), in which case
# they are read as microseconds.
# NOTE(review): Binance's public-data notes say spot files use microseconds from
# 2025-01-01 onwards -- confirm against the archive README.
close_time_unit = 'us' if df['close_time'].max() > 1e14 else 'ms'
df['time'] = pd.to_datetime(df['close_time'], unit=close_time_unit)
df = df.set_index('time', drop=False)   # pandas time based windowing needs time index
df['prev_close'] = df['close'].shift(1)
# Bar-over-bar return in basis points (bps); NaN on the first row.
df['ret'] = (df.close - df.prev_close) * 10000 / df.prev_close
# Alternative: log return
#df['ret'] = np.log(df.close/df.prev_close)

# Set targets/outs: forward return over the next i rows, for i in 1, 3, 5.
# Alternative (longer) horizons: range(10, 100, 10)
for i in range(1, 6, 2):
    out = 'out{}'.format(i)
    # Register each target once so re-running this cell does not duplicate it.
    if out not in outs:
        outs.append(out)
    out_close = df.close.shift(-i)      # close i rows ahead; NaN for the last i rows
    val = ((out_close - df.close) * 10000/ df.close)    # Return in basis points (bps)
    df[out] = val
    # Alternative target transforms that were tried:
    #df[out] = np.sign(val) * (np.log(np.abs(val) + 1))
    #std = val.rolling(window=10).std()
    #df[out] = np.where(std == 0, val, val / std)
    #df[out] = np.sign(val) * (np.sqrt(np.abs(val) + 1) - 1)


In [None]:
# Add Indicators/Features
# Compare taker volume on each side (buy vs. sell), each smoothed with an
# exponential moving window of the given span.
for span in range(1, 20, 2):
    ind = 'ind_trade_side_{}'.format(span)
    # Register each feature once so re-running this cell does not produce
    # duplicate columns when the feature matrix is built.
    if ind not in indicators:
        indicators.append(ind)
    b = df.taker_buy_volume.ewm(span=span).mean()
    # Sell-side taker volume is total volume minus the taker-buy part.
    s = (df.volume - df.taker_buy_volume).ewm(span=span).mean()
    total = b + s
    # (b - s) / ((b + s) / 2): imbalance relative to the average side volume.
    # Rows with no smoothed volume at all are set to 0.
    df[ind] = np.where(total == 0, 0, (b - s) * 2 / total)

In [None]:
# Add Indicators/Features
# Compare price to its bollinger bands i.e. 2 standard deviation away from mean, for a rolling window.
# NOTE: the rolling window includes the current close. With window=2 the close is
# always about 0.71 sample standard deviations from the rolling mean, so
# ind_bollinger_2 is 0 on every row. It is kept so the feature set is unchanged.
for window in range(2, 20, 2):
    ind = 'ind_bollinger_{}'.format(window)
    # Register each feature once so re-running this cell does not produce
    # duplicate columns when the feature matrix is built.
    if ind not in indicators:
        indicators.append(ind)
    rolling = df.close.rolling(window=window)
    mean = rolling.mean()
    std = rolling.std()
    # Upper side: zero from mean to (mean + std), then peaks at 1 on (mean + 2*std)
    # and falls back to zero at (mean + 3*std).
    upper_score = np.maximum(1 - np.abs(df.close - (mean + 2 * std)) / std, 0)
    # Lower side: the mirror image around (mean - 2*std), with a negative sign.
    lower_score = -1 * np.maximum(1 - np.abs(df.close - (mean - 2 * std)) / std, 0)
    # The first matching condition wins. Warm-up rows (NaN mean/std) match none
    # of them and get the default 0.
    df[ind] = np.select([std == 0, df.close >= mean, df.close < mean], [0, upper_score, lower_score], 0)
    # Alternative: plain z-score
    #df[ind] = np.select([std == 0], [0], (df.close - mean) / std)

In [None]:
# Add Indicators/Features
# https://technical-analysis-library-in-python.readthedocs.io/en/latest/index.html
# RSI over several window lengths, re-centred on 50 and divided by 50.
import ta.momentum
for window in range(2, 20, 2):
    ind = 'ind_rsi_{}'.format(window)
    # Register each feature once so re-running this cell does not produce
    # duplicate columns when the feature matrix is built.
    if ind not in indicators:
        indicators.append(ind)
    df[ind] = (ta.momentum.RSIIndicator(df.close, window=window).rsi() - 50) / 50

In [None]:
# Add Indicators/Features
# Previous returns: ind_ret_0 is the current row's return, ind_ret_k the return
# from k rows earlier.
for lag in range(0, 10):
    ind = 'ind_ret_{}'.format(lag)
    # Register each feature once so re-running this cell does not produce
    # duplicate columns when the feature matrix is built.
    if ind not in indicators:
        indicators.append(ind)
    df[ind] = df.ret.shift(lag)

In [None]:
# Add Indicators/Features
# How much does price move per trade volume. Compare that value for short and long windows.
# Alternative (coarser) grid of short windows: range(10, 100, 10)
for sw in range(1, 20, 2):
    lw = sw * 5     # the long window is five times the short one
    ind = 'ind_move_per_trade_{}'.format(sw)
    # Register each feature once so re-running this cell does not produce
    # duplicate columns when the feature matrix is built.
    if ind not in indicators:
        indicators.append(ind)
    # Price change over the window divided by the volume traded in it. `out` is
    # pre-filled with zeros and `where` skips zero-volume positions, so those
    # rows stay 0 instead of dividing by zero.
    sum_sw = df.volume.rolling(window=sw).sum()
    avg_sw = np.divide(df.close - df.close.shift(sw), sum_sw, where=sum_sw!=0, out=np.zeros_like(sum_sw))
    sum_lw = df.volume.rolling(window=lw).sum()
    avg_lw = np.divide(df.close - df.close.shift(lw), sum_lw, where=sum_lw!=0, out=np.zeros_like(sum_lw))
    # Short-window value relative to the magnitude of the long-window value:
    # the sign comes from the short window; 0 where the long-window value is 0.
    val = np.divide(avg_sw, np.abs(avg_lw), where=avg_lw!=0, out=np.zeros_like(avg_lw))
    df[ind] = val

In [None]:
# Check corr
# A higher correlation between an indicator and a target/out is better.
# df[indicators + outs].corr()

In [None]:
# Before rows with NaN are dropped, list every column that has more than 10
# missing values so that unexpected gaps stand out.
nasum = df.isna().sum()
print(nasum[nasum.gt(10)])

In [None]:
# Build models
# For each target/out. Later we will pick best performing model among these.

# Drop NaN values for model. LinearRegression does not accept NaN.
# (Removes the warm-up rows of rolling/shifted features and the tail rows whose
# forward-looking targets are undefined.)
df = df.dropna()
# Define features. De-duplicate while keeping order, so a feature cell that was
# run twice cannot put the same column into X more than once.
feature_cols = list(dict.fromkeys(indicators))
X = df[feature_cols]

model_rows = []
for out in dict.fromkeys(outs):     # each target once, in registration order
    # Define target
    y = df[out]
    # Chronological split: fit on the first 80% of rows, evaluate on the last 20%.
    # The rows are consecutive 1-second bars with overlapping feature windows and
    # target horizons, so a shuffled split would place near-duplicates of the
    # training rows in the test set and inflate the test score.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Initialize and train the model
    model = LinearRegression()
    model.fit(X_train, y_train)
    # Make predictions
    predictions = model.predict(X_test)
    # Evaluate the model on the held-out rows
    model_rows.append({
        'out': out,
        'mse': mean_squared_error(y_test, predictions),
        'r2': r2_score(y_test, predictions),
        'model': model,
    })

# Build the summary frame once (numeric mse/r2 columns), one row per target,
# indexed by the target name.
models = pd.DataFrame(model_rows, columns=['out', 'mse', 'r2', 'model'])
models.index = list(models['out'])
print(models)

In [None]:
# Order the fitted models by their test-set r2, best first, and keep the top row.
ranked_models = models.sort_values(by='r2', ascending=False)
model = ranked_models.iloc[0]
print(model)

# The final signal is the chosen model's prediction for every row of X.
df['signal'] = model['model'].predict(X)

In [None]:
# Simulate the trading strategy to measure pnl (profit and loss).
# Try several threshold values for entry/exit, pick the best-performing one on
# the training half, then run that single threshold on the test half.

# shuffle=False keeps the row order: the first half of df becomes df_train and
# the second half becomes df_test.
df_train, df_test = train_test_split(df, test_size=0.5, shuffle=False)      # TODO: do not test on data with which the model was built.

def simulate(df_run, threshold, cost_bps=0.5, qty=1):
    """Replay a threshold strategy over ``df_run`` and return its pnl in bps of turnover.

    Rows are walked in order. When ``signal > threshold`` and the position is
    flat or short, ``qty`` is bought; when ``signal < -threshold`` and the
    position is flat or long, ``qty`` is sold. The position therefore stays
    within ``-qty .. +qty``. Any position left open at the end is valued at the
    last close.

    Parameters:
        df_run: DataFrame with ``close`` and ``signal`` columns.
        threshold: Entry/exit level, compared against ``signal``.
        cost_bps: Trading cost charged on every fill, in basis points of notional.
        qty: Quantity traded per fill.

    Returns:
        pnl * 10000 / turnover, or 0.0 when no trade was made (including an
        empty ``df_run``), where the ratio would otherwise be 0/0.
    """
    # Pull the two columns out once; a per-row .iloc lookup is very slow on
    # frames with millions of rows.
    prices = df_run['close'].to_numpy()
    signals = df_run['signal'].to_numpy()

    balance = 0.0
    position = 0  # Signed quantity currently held
    turnover = 0.0
    for price, signal in zip(prices, signals):
        if signal > threshold and position <= 0:
            # Buy: pay the notional plus the cost.
            position += qty
            balance -= qty * price
            balance -= qty * price * cost_bps / 10000
            turnover += qty * price
        elif signal < -1 * threshold and position >= 0:
            # Sell: receive the notional minus the cost.
            position -= qty
            balance += qty * price
            balance -= (qty * price) * cost_bps / 10000
            turnover += qty * price

    if turnover == 0:
        # No fills: nothing was gained or lost, and the ratio is undefined.
        pnl = 0.0
        pnl_bps = 0.0
    else:
        # Mark any open position to the last close.
        pnl = balance + (position * prices[-1])
        pnl_bps = pnl * 10000 / turnover
    print(f'debug: simulate threshold: {threshold} pnl: {pnl:.2f} pnl_bps: {pnl_bps:.2f}')
    return pnl_bps


# Candidate thresholds are multiples of the training signal's standard deviation.
# Each one is simulated on the training half and its pnl (bps) stored beside it.
signal_std = df_train.signal.std()
thresholds = pd.DataFrame({'threshold': np.array([0.5, 1, 1.5, 2]) * signal_std})
thresholds['pnl_bps'] = [simulate(df_train, level) for level in thresholds['threshold']]
print(thresholds)


In [None]:
# Rank the candidate thresholds by training pnl (bps), best first, and keep the top row.
ranked_thresholds = thresholds.sort_values(by='pnl_bps', ascending=False)
threshold = ranked_thresholds.iloc[0]
print('\nPicked (train data):')
print(threshold)
print()

# Replay the strategy on df_test with that one threshold.
pnl_bps = simulate(df_test, threshold['threshold'])
print(f'\npnl_bps (test data): {pnl_bps:.2f}')