In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from statsmodels.tsa.arima.model import ARIMA
from arch import arch_model
from fracdiff.sklearn import Fracdiff 
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt
import tsfeatures
from statistics import mean 
import random
import math

In [3]:
def comp_vol(x):
    return np.sqrt(np.sum(x**2))

def read_and_preprocess_stock(filename):
    stock = pd.read_csv(filename)
    stock['WAP'] = (stock['bid_price1'] * stock['ask_size1'] + stock['ask_price1'] * stock['bid_size1']) / (stock['bid_size1'] + stock['ask_size1'])
    stock['BidAskSpread'] = (stock['ask_price1'] / stock['bid_price1']) - 1
    return stock

def compute_log_returns(stock):
    log_r1 = []
    unique_time_id = stock['time_id'].unique()
    for time_id in unique_time_id:
        data = stock[stock['time_id'] == time_id]
        sec = data['seconds_in_bucket'].values
        price = data['WAP'].values
        log_r = np.log(price[1:] / price[:-1])
        log_returns_df = pd.DataFrame({'time': sec[1:], 'log_return': log_r})
        time_no_change = np.setdiff1d(np.arange(1, 601), log_returns_df['time'])
        if len(time_no_change) > 0:
            new_df = pd.DataFrame({'time': time_no_change, 'log_return': [0] * len(time_no_change)})
            log_returns_df = pd.concat([log_returns_df, new_df]).sort_values('time')
        log_r1.append(log_returns_df)
    return log_r1

def calculate_mean_volatility(log_r1):
    vol = []
    for df in log_r1:
        df['time_bucket'] = np.ceil(df['time'] / 30).astype(int)
        grouped = df.groupby('time_bucket')['log_return'].agg(comp_vol)
        all_sum = sum(grouped)
        vol.append(all_sum)
    mean_vol = np.mean(vol)
    return round(mean_vol, 4)

In [4]:
skip_stock = [12, 24, 25, 45, 49, 54, 57, 65, 71, 79, 91, 92, 106, 117, 121]
vols = {}

for i in range(0, 127):
    if i%10 == 0:
        print(i)
    if i not in skip_stock:
        stock = read_and_preprocess_stock(f"individual_book_train/stock_{i}.csv")
        log_r1 = compute_log_returns(stock)
        mean_vol = calculate_mean_volatility(log_r1)
        vols[f'stock_{i}'] = mean_vol

0
10
20
30
40
50
60
70
80
90
100
110
120


In [224]:
random.seed(3888)
top_15p_stocks = []
sorted_stocks = sorted(vols.items(), key=lambda x: x[1], reverse=True)
top_15p_s = sorted_stocks[:int(len(sorted_stocks) * 0.15)]
for item in top_15p_s:
    top_15p_stocks.append(item[0])
stocks_chosen = random.sample(top_15p_stocks, 10)
stocks_chosen

['stock_27',
 'stock_100',
 'stock_18',
 'stock_62',
 'stock_75',
 'stock_9',
 'stock_126',
 'stock_97',
 'stock_33',
 'stock_80']

In [5]:
def compute_log_returns(stock):
    log_r1 = []
    unique_time_id = stock['time_id'].unique()
    for time_id in unique_time_id:
        data = stock[stock['time_id'] == time_id]
        sec = data['seconds_in_bucket'].values
        price = data['WAP'].values
        log_r = np.log(price[1:] / price[:-1])
        log_returns_df = pd.DataFrame({'time': sec[1:], 'log_return': log_r})
        time_no_change = np.setdiff1d(np.arange(1, 601), log_returns_df['time'])
        if len(time_no_change) > 0:
            new_df = pd.DataFrame({'time': time_no_change, 'log_return': [0] * len(time_no_change)})
            log_returns_df = pd.concat([log_returns_df, new_df]).sort_values('time')
        log_r1.append(log_returns_df)
    return log_r1, unique_time_id

def calculate_volatility_id(log_r1, unique_time_id):
    vol_timeid = {}
    count = 0
    for df in log_r1:
        df['time_bucket'] = np.ceil(df['time'] / 30).astype(int)
        grouped = df.groupby('time_bucket')['log_return'].agg(comp_vol)
        all_sum = sum(grouped)
        vol_timeid[f'{unique_time_id[count]}'] = round(all_sum, 4)
        count += 1
    return vol_timeid

In [6]:
chosen_stocks = stocks_chosen.copy()
time_id_list = {}

for stock_id in chosen_stocks:
    stock = pd.read_csv(f"individual_book_train/{stock_id}.csv")
    unique_time_id = stock['time_id'].unique()
    time_id_list[stock_id] = unique_time_id
    
def common_elements(lists_dict):
    lists = list(lists_dict.values())
    common_elements = set(lists[0])
    for lst in lists[1:]:
        common_elements.intersection_update(set(lst))
    return list(common_elements)

common_id = common_elements(time_id_list)

NameError: name 'stocks_chosen' is not defined

In [226]:
vol_timeid = {}

for stock_id in chosen_stocks:
    stock = read_and_preprocess_stock(f"individual_book_train/{stock_id}.csv")
    log_r1, unique_time_id = compute_log_returns(stock)
    stock_vol_timeid = calculate_volatility_id(log_r1, unique_time_id)
    for key, value in stock_vol_timeid.items():
        if int(key) not in common_id:
            next
        else:
            vol_timeid[key] = vol_timeid.get(key, 0) + value

In [231]:
random.seed(3888)
ids_chosen = []
random_10_id = random.sample(vol_timeid.items(), 10)
for i in random_10_id:
    ids_chosen.append(i[0])
ids_chosen

['4707',
 '30641',
 '103',
 '18491',
 '30023',
 '27361',
 '7270',
 '22300',
 '17377',
 '13148']

FOR LOW STOCKS AND TIME_ID

In [22]:
random.seed(3888)
btm_15p_stocks = []
sorted_stocks = sorted(vols.items(), key=lambda x: x[1], reverse=True)
btm_15p_s = sorted_stocks[:int(len(sorted_stocks) * 0.15)]
for item in btm_15p_s:
    btm_15p_stocks.append(item[0])
stocks_chosen = random.sample(btm_15p_stocks, 10)
stocks_chosen

['stock_27',
 'stock_100',
 'stock_18',
 'stock_62',
 'stock_75',
 'stock_9',
 'stock_126',
 'stock_97',
 'stock_33',
 'stock_80']

In [23]:
def compute_log_returns(stock):
    log_r1 = []
    unique_time_id = stock['time_id'].unique()
    for time_id in unique_time_id:
        data = stock[stock['time_id'] == time_id]
        sec = data['seconds_in_bucket'].values
        price = data['WAP'].values
        log_r = np.log(price[1:] / price[:-1])
        log_returns_df = pd.DataFrame({'time': sec[1:], 'log_return': log_r})
        time_no_change = np.setdiff1d(np.arange(1, 601), log_returns_df['time'])
        if len(time_no_change) > 0:
            new_df = pd.DataFrame({'time': time_no_change, 'log_return': [0] * len(time_no_change)})
            log_returns_df = pd.concat([log_returns_df, new_df]).sort_values('time')
        log_r1.append(log_returns_df)
    return log_r1, unique_time_id

def calculate_volatility_id(log_r1, unique_time_id):
    vol_timeid = {}
    count = 0
    for df in log_r1:
        df['time_bucket'] = np.ceil(df['time'] / 30).astype(int)
        grouped = df.groupby('time_bucket')['log_return'].agg(comp_vol)
        all_sum = sum(grouped)
        vol_timeid[f'{unique_time_id[count]}'] = round(all_sum, 4)
        count += 1
    return vol_timeid

In [26]:
chosen_stocks = stocks_chosen.copy()
time_id_list = {}

for stock_id in chosen_stocks:
    stock = pd.read_csv(f"individual_book_train/{stock_id}.csv")
    unique_time_id = stock['time_id'].unique()
    time_id_list[stock_id] = unique_time_id
    
def common_elements(lists_dict):
    lists = list(lists_dict.values())
    common_elements = set(lists[0])
    for lst in lists[1:]:
        common_elements.intersection_update(set(lst))
    return list(common_elements)

common_id = common_elements(time_id_list)

In [28]:
vol_timeid = {}

for stock_id in chosen_stocks:
    stock = read_and_preprocess_stock(f"individual_book_train/{stock_id}.csv")
    log_r1, unique_time_id = compute_log_returns(stock)
    stock_vol_timeid = calculate_volatility_id(log_r1, unique_time_id)
    for key, value in stock_vol_timeid.items():
        if int(key) not in common_id:
            next
        else:
            vol_timeid[key] = vol_timeid.get(key, 0) + value
vol_timeid

{'5': 0.3367,
 '11': 0.1496,
 '16': 0.1341,
 '31': 0.17500000000000002,
 '62': 0.1211,
 '72': 0.5254,
 '97': 0.41690000000000005,
 '103': 0.34120000000000006,
 '109': 0.15309999999999999,
 '123': 0.18650000000000003,
 '128': 0.19829999999999998,
 '146': 0.7082999999999999,
 '147': 0.1492,
 '152': 0.31059999999999993,
 '157': 0.40070000000000006,
 '159': 0.16570000000000001,
 '169': 0.2985,
 '207': 0.4952,
 '211': 0.13240000000000002,
 '213': 0.18209999999999998,
 '218': 0.12250000000000001,
 '227': 0.31029999999999996,
 '229': 0.0871,
 '232': 0.3414,
 '250': 0.18089999999999998,
 '254': 0.6134999999999999,
 '256': 0.1772,
 '266': 0.2684,
 '273': 0.1268,
 '289': 0.1845,
 '297': 0.15149999999999997,
 '303': 0.31670000000000004,
 '309': 0.1845,
 '310': 0.7810999999999999,
 '317': 0.18409999999999999,
 '319': 0.17589999999999997,
 '325': 0.2341,
 '326': 0.5792,
 '335': 0.2933,
 '337': 0.12259999999999999,
 '358': 0.46059999999999995,
 '371': 0.49169999999999997,
 '373': 0.34789999999999993

In [30]:
random.seed(3888)
ids_chosen = []
random_10_id = random.sample(vol_timeid.items(), 10)
for i in random_10_id:
    ids_chosen.append(i[0])
ids_chosen

['4707',
 '30641',
 '103',
 '18491',
 '30023',
 '27361',
 '7270',
 '22300',
 '17377',
 '13148']