<a href="https://colab.research.google.com/github/Christianchesire/AI_project/blob/master/BuildItchOrderBook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [3]:
import gzip
import shutil
from pathlib import Path
from urllib.request import urlretrieve
from urllib.parse import urljoin
from clint.textui import progress
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
from struct import unpack
from collections import namedtuple, Counter
from datetime import timedelta
from time import time

In [2]:
# Run this cell before the imports cell: the imports need `clint`, which had to
# be downloaded here. %pip installs into the running kernel's environment; the
# pin matches the version this notebook was last executed with.
%pip install clint==0.5.1

Collecting clint
  Downloading https://files.pythonhosted.org/packages/3d/b4/41ecb1516f1ba728f39ee7062b9dac1352d39823f513bb6f9e8aeb86e26d/clint-0.5.1.tar.gz
Collecting args
  Downloading https://files.pythonhosted.org/packages/e5/1c/b701b3f4bd8d3667df8342f311b3efaeab86078a840fb826bd204118cc6b/args-0.1.0.tar.gz
Building wheels for collected packages: clint, args
  Building wheel for clint (setup.py) ... [?25l[?25hdone
  Created wheel for clint: filename=clint-0.5.1-cp37-none-any.whl size=34472 sha256=ab6421cc1166650e43dd3bbf5386e3d64259c6486b37dcae7778e0a935629dae
  Stored in directory: /root/.cache/pip/wheels/4f/e9/45/223565e5b1a4b09e12c6de6f8ba7c2c0e9127dec17cf830f83
  Building wheel for args (setup.py) ... [?25l[?25hdone
  Created wheel for args: filename=args-0.1.0-cp37-none-any.whl size=3320 sha256=eb714b863897088ecd745bc6a65ded01d50104c640de66b9a9a279d85579cc24
  Stored in directory: /root/.cache/pip/wheels/58/54/ea/d995d18af68c057eb76b87b02c92bc66ac34d360ef141780f4
Successful

In [4]:
# Show the current working directory (explicit magic: does not depend on the
# automagic setting or on `pwd` being free as a Python name).
%pwd

'/content'

In [5]:
# List the working directory, hidden entries included; sample_data/ should be
# listed, since data_path points at '/content/sample_data'.
# Explicit %ls: a bare `ls -a` in a multi-line cell is not treated as a magic
# by newer IPython versions and would be evaluated as the expression `ls - a`.
%ls -a

[0m[01;34m.[0m/  [01;34m..[0m/  [01;34m.config[0m/  [01;34msample_data[0m/


## Get NASDAQ ITCH Data from FTP Server

In [6]:
 # store data in subdirectory
# Root folder for the downloaded and derived files
# (can point elsewhere, e.g. at an external hard drive).
data_path = Path('/content/sample_data')

# File names suggest HDF5 stores for the parsed messages and the order book
# (TODO confirm against the cells that write them).
# Note the differing types: itch_store is a plain str, order_book_store a Path.
itch_store = str(data_path.joinpath('itch.h5'))
order_book_store = data_path.joinpath('order_book.h5')

In [7]:
# Remote directory the ITCH archive is downloaded from.
# NOTE(review): FTP endpoint - confirm it is still reachable before a fresh run.
FTP_URL = 'ftp://emi.nasdaq.com/ITCH/Nasdaq_ITCH/'
# Archive to fetch; the text before the first '.' is later reused as `date`.
SOURCE_FILE = '03272019.NASDAQ_ITCH50.gz'

In [8]:
def may_be_download(url, data_dir=None):
    """Download and gunzip an ITCH file unless it is already on disk.

    Parameters
    ----------
    url : str
        Location of the gzipped source file; its last path component is
        used as the local file name.
    data_dir : path-like, optional
        Directory holding the archive and the unzipped copy. Defaults to
        the notebook-level ``data_path``.

    Returns
    -------
    pathlib.Path
        Path of the unzipped file: the archive name with its final suffix
        replaced by ``.bin``, located inside the data directory.
    """
    target_dir = Path(data_path if data_dir is None else data_dir)
    filename = target_dir / url.split('/')[-1]
    if not target_dir.exists():
        print('Creating directory')
        # parents=True so a nested data directory can be created in one go
        target_dir.mkdir(parents=True)
    if not filename.exists():
        print('Downloading...', url)
        urlretrieve(url, filename)
    unzipped = target_dir / (filename.stem + '.bin')
    # `unzipped` already contains the directory. Joining the directory a
    # second time only works by accident for absolute paths; for a relative
    # data directory it points at a path that never exists, which forces a
    # re-unzip on every call.
    if not unzipped.exists():
        print('Unzipping to', unzipped)
        with gzip.open(str(filename), 'rb') as f_in:
            with open(unzipped, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
    return unzipped

In [9]:
# Fetch the archive if needed and keep the path of the unzipped file.
file_name = may_be_download(
    urljoin(FTP_URL, SOURCE_FILE)
)
# Leading token of the file name, i.e. everything before the first '.'.
date = file_name.name.partition('.')[0]

Downloading... ftp://emi.nasdaq.com/ITCH/Nasdaq_ITCH/03272019.NASDAQ_ITCH50.gz
Unzipping to /content/sample_data/03272019.NASDAQ_ITCH50.bin


In [11]:
# Work inside the data directory so that relative file names used later
# (e.g. 'message_types.xlsx') resolve there. Using the absolute data_path
# keeps the cell re-runnable; a relative `cd sample_data` fails once the
# working directory is already sample_data.
%cd {data_path}

/content/sample_data


In [14]:
# Check that both the .gz archive and the unzipped .bin file are present.
%ls

03272019.NASDAQ_ITCH50.bin  california_housing_test.csv   mnist_train_small.csv
03272019.NASDAQ_ITCH50.gz   california_housing_train.csv  [0m[01;32mREADME.md[0m*
[01;32manscombe.json[0m*              mnist_test.csv


## ITCH format settings

The ITCH data is binary, so the messages are parsed with Python's `struct` module.

First, define the `struct` format string for each combination of field type and byte length.

In [15]:
# One-letter event code -> human-readable description
event_codes = {
    'O': 'Start of Messages',
    'S': 'Start of System Hours',
    'Q': 'Start of Market Hours',
    'M': 'End of  Market Hours',
    'E': 'End of System Hours',
    'C': 'End of Messages',
}

In [17]:
# Integer codes for single-character field values.
# Presumably keyed by the cleaned ITCH field name - verify against the parser.
encoding = {
    # Was misspelled 'primary_market_marker'; the ITCH field is
    # "Primary Market Maker", so the misspelled key could never match it.
    'primary_market_maker': {'Y': 1, 'N': 0},
    'printable': {'Y': 1, 'N': 0},
    'buy_sell_indicator': {'B': 1, 'S': -1},
    'cross_type': {'O': 0, 'C': 1, 'H': 2},
    # NOTE(review): 'B' and 'N' both map to 0, so the two values cannot be
    # told apart after encoding - confirm this is intended.
    'imbalance_direction': {'B': 0, 'S': 1, 'N': 0, 'O': -1},
}

In [18]:
# (field type, length in bytes) -> struct format code
formats = {
    # integer fields
    ('integer', 2): 'H',   # int of length 2 => format string 'H'
    ('integer', 4): 'I',
    ('integer', 6): '6s',  # int of length 6 => parse as string, convert later
    ('integer', 8): 'Q',
    # fixed-width character fields
    ('alpha', 1): 's',
    ('alpha', 2): '2s',
    ('alpha', 4): '4s',
    ('alpha', 8): '8s',
    # price fields
    ('price_4', 4): 'I',
    ('price_8', 8): 'Q',   # Cross Trade Message
}

Create message specs for binary data parser

In [19]:
# Upload the message spec workbook (message_types.xlsx) into the working directory.
# How-to: https://www.roelpeters.be/how-to-uploading-files-in-google-colab/
from google.colab import files

# Only prompt when the workbook is missing, so that re-running this cell (or
# the whole notebook) does not block on the interactive upload widget.
if Path('message_types.xlsx').exists():
    data_upload = {}  # nothing uploaded this run; the file on disk is reused
else:
    data_upload = files.upload()

Saving message_types.xlsx to message_types.xlsx


In [22]:
# Check that message_types.xlsx landed in the working directory.
%ls

03272019.NASDAQ_ITCH50.bin  california_housing_test.csv   mnist_test.csv
03272019.NASDAQ_ITCH50.gz   california_housing_train.csv  mnist_train_small.csv
[0m[01;32manscombe.json[0m*              message_types.xlsx            [01;32mREADME.md[0m*


In [25]:
# pandas needs an Excel engine to read message_types.xlsx.
# NOTE(review): xlrd 2.0+ reads only legacy .xls files; on current pandas an
# .xlsx workbook needs openpyxl instead - confirm which engine this runtime uses.
%pip install -q xlrd

In [30]:
# Read the 'messages' sheet, order the rows by their `id` column,
# then drop that column.
message_data = (
    pd.read_excel('message_types.xlsx', sheet_name='messages')
    .sort_values(by='id')
    .drop(columns='id')
)

In [31]:
# Preview the first rows of the message spec table.
message_data.head()

Unnamed: 0,Name,Offset,Length,Value,Notes
0,Message Type,0,1,S,System Event Message
1,Stock Locate,1,2,Integer,Always 0
2,Tracking Number,3,2,Integer,Nasdaq internal tracking number
3,Timestamp,5,6,Integer,Nanoseconds since midnight
4,Event Code,11,1,Alpha,See System Event Codes below


## Basic Cleaning

In [None]:
def clean