# Database Access with ClickHouse Connect

## Setup

Please install the following packages:
- clickhouse_connect
- python-dotenv

Need to open ssh tunnel to access remotely
- open terminal/command prompt
- run(with your username replaced): ssh -L 8123:localhost:8123 <user>@ppolak5.ams.stonybrook.edu

Your env file should look like this for remote access (just switch the comment if you're on campus):

#host= "ppolak5.ams.stonybrook.edu." 

host = "localhost"

server_user= "<server_username>"

server_password= "<server_password>"

db_user=  "<db_username>"

db_pass= "<db_password>"

In [None]:
ssh -L 3306:localhost:3306 mhaggerty@ppolak5.ams.stonybrook.edu

# Custom SQL prompts to Clickhouse Database

## Trades

In [1]:
# import helper functions
from utils.clickhouse_query import *

# Note - there is restriction to 1,000,000 rows per day/per user - so it's wise to limit the query to a specific time range for testing purposes - aggregation can also be used to reduce the number of rows returned
# Here is a way to restrict the query to a specific time range
start_hour = 9
end_hour = 10

# Define the query - this query grabs trades data from AAPL on 2017-01-05 between 9am and 11am
query = f"""
SELECT * 
FROM TRADESDB.trades2017view 
WHERE (Symbol = 'AAPL') 
AND (Date = '2017-01-05') 
AND (toHour(Time) BETWEEN {start_hour} AND {end_hour})
AND Trade_Volume > 0
AND Trade_Price > 0
"""

# Execute the query and store the resulting dataframe
data = get_trades(query)

In [None]:
import pandas as pd

query = f"""
SELECT * 
FROM TRADESDB.trades2017view 
WHERE (Symbol = 'AAPL') 
AND (Date = '2017-01-05') 
AND (toHour(Time) BETWEEN {start_hour} AND {end_hour})
AND Trade_Volume > 0
AND Trade_Price > 0
"""

results = client.command(query)

## Next level - use df prompts, don't need all that code

In [12]:
import pandas as pd

# Note - there is restriction to 1,000,000 rows per day/per user - so it's wise to limit the query to a specific time range for testing purposes - aggregation can also be used to reduce the number of rows returned
# Here is a way to restrict the query to a specific time range
start_hour = 9
end_hour = 10

query = f"""
SELECT * 
FROM TRADESDB.trades2017view 
WHERE (Symbol = 'AAPL') 
AND (Date = '2017-01-05') 
AND (toHour(Time) BETWEEN {start_hour} AND {end_hour})
AND Trade_Volume > 0
AND Trade_Price > 0
"""

results = client.query_df(query)


In [13]:
results

#convert to csv
results.to_csv('aapl_trades.csv')

Unnamed: 0,Time,Exchange,Symbol,Sale_Condition,Trade_Volume,Trade_Price,Trade_Stop_Stock_Indicator,Trade_Correction_Indicator,Sequence_Number,Trade_Id,Source_of_Trade,Trade_Reporting_Facility,Participant_Timestamp,Trade_Reporting_Facility_TRF_Timestamp,Trade_Through_Exempt_Indicator,Date,YearMonth
0,2017-01-05 09:00:20.099632006-05:00,P,AAPL,@ TI,1,116.03,,0,2744,77,N,,90020098955008,,0,2017-01-05,201701
1,2017-01-05 09:00:20.435260797-05:00,P,AAPL,@ TI,1,116.03,,0,2751,78,N,,90020434563840,,0,2017-01-05,201701
2,2017-01-05 09:01:08.877023201-05:00,K,AAPL,@FTI,42,116.07,,0,2795,48,N,,90108876758000,,1,2017-01-05,201701
3,2017-01-05 09:01:08.877479494-05:00,P,AAPL,@FTI,42,116.07,,0,2796,79,N,,90108876799744,,1,2017-01-05,201701
4,2017-01-05 09:01:08.884202687-05:00,P,AAPL,@FTI,66,116.07,,0,2797,80,N,,90108883539968,,1,2017-01-05,201701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47489,2017-01-05 10:59:59.950371094-05:00,Z,AAPL,@F,100,116.33,,0,532529,4840,N,,105959944003000,,1,2017-01-05,201701
47490,2017-01-05 10:59:59.950377156-05:00,Z,AAPL,@F,100,116.33,,0,532530,4841,N,,105959944084000,,1,2017-01-05,201701
47491,2017-01-05 10:59:59.950397053-05:00,Z,AAPL,@F,300,116.33,,0,532531,4842,N,,105959944770000,,1,2017-01-05,201701
47492,2017-01-05 10:59:59.950402747-05:00,Z,AAPL,@F,100,116.33,,0,532532,4843,N,,105959945311000,,1,2017-01-05,201701


In [17]:
results2 = results.set_index('Participant_Timestamp')
results2

Unnamed: 0_level_0,Time,Exchange,Symbol,Sale_Condition,Trade_Volume,Trade_Price,Trade_Stop_Stock_Indicator,Trade_Correction_Indicator,Sequence_Number,Trade_Id,Source_of_Trade,Trade_Reporting_Facility,Trade_Reporting_Facility_TRF_Timestamp,Trade_Through_Exempt_Indicator,Date,YearMonth
Participant_Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
90020098955008,2017-01-05 09:00:20.099632006-05:00,P,AAPL,@ TI,1,116.03,,0,2744,77,N,,,0,2017-01-05,201701
90020434563840,2017-01-05 09:00:20.435260797-05:00,P,AAPL,@ TI,1,116.03,,0,2751,78,N,,,0,2017-01-05,201701
90108876758000,2017-01-05 09:01:08.877023201-05:00,K,AAPL,@FTI,42,116.07,,0,2795,48,N,,,1,2017-01-05,201701
90108876799744,2017-01-05 09:01:08.877479494-05:00,P,AAPL,@FTI,42,116.07,,0,2796,79,N,,,1,2017-01-05,201701
90108883539968,2017-01-05 09:01:08.884202687-05:00,P,AAPL,@FTI,66,116.07,,0,2797,80,N,,,1,2017-01-05,201701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105959944003000,2017-01-05 10:59:59.950371094-05:00,Z,AAPL,@F,100,116.33,,0,532529,4840,N,,,1,2017-01-05,201701
105959944084000,2017-01-05 10:59:59.950377156-05:00,Z,AAPL,@F,100,116.33,,0,532530,4841,N,,,1,2017-01-05,201701
105959944770000,2017-01-05 10:59:59.950397053-05:00,Z,AAPL,@F,300,116.33,,0,532531,4842,N,,,1,2017-01-05,201701
105959945311000,2017-01-05 10:59:59.950402747-05:00,Z,AAPL,@F,100,116.33,,0,532532,4843,N,,,1,2017-01-05,201701


In [15]:
import pandas as pd

# Note - there is restriction to 1,000,000 rows per day/per user - so it's wise to limit the query to a specific time range for testing purposes - aggregation can also be used to reduce the number of rows returned
# Here is a way to restrict the query to a specific time range
start_hour = 9
end_hour = 10

query = f"""
SELECT * 
FROM QUOTESDB.quotes2017view 
WHERE (Symbol = 'AAPL') 
AND (Date = '2017-01-05') 
AND (toHour(Time) BETWEEN {start_hour} AND {end_hour})
"""

quotes = client.query_df(query)

In [16]:
quotes

Unnamed: 0,Time,Exchange,Symbol,Bid_Price,Bid_Size,Offer_Price,Offer_Size,Quote_Condition,Sequence_Number,National_BBO_Indicator,...,Short_Sale_Restriction_Indicator,LULD_BBO_Indicator,SIP_Generated_Message_Identifier,NBBO_LULD_Indicator,Participant_Timestamp,FINRA_ADF_Timestamp,FINRA_ADF_Market_Participant_Quote_Indicator,Security_Status_Indicator,Date,YearMonth
0,2017-01-05 09:00:03.544051121-05:00,P,AAPL,116.01,5.0,116.09,1.0,R,262437,0,...,0,,,,90003543338752,,,,2017-01-05,201701
1,2017-01-05 09:00:03.544730571-05:00,P,AAPL,116.01,4.0,116.09,1.0,R,262438,0,...,0,,,,90003544046592,,,,2017-01-05,201701
2,2017-01-05 09:00:07.806195362-05:00,P,AAPL,116.01,5.0,116.09,1.0,R,262556,0,...,0,,,,90007805484288,,,,2017-01-05,201701
3,2017-01-05 09:00:07.806554929-05:00,P,AAPL,116.01,4.0,116.09,1.0,R,262557,0,...,0,,,,90007805870080,,,,2017-01-05,201701
4,2017-01-05 09:00:09.915838847-05:00,P,AAPL,116.01,5.0,116.09,1.0,R,262765,0,...,0,,,,90009915143168,,,,2017-01-05,201701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
441077,2017-01-05 10:59:59.991147643-05:00,B,AAPL,116.21,1.0,116.37,1.0,R,6046876,0,...,0,,,A,105959991128891,,,,2017-01-05,201701
441078,2017-01-05 10:59:59.994241745-05:00,Z,AAPL,116.32,3.0,116.34,6.0,R,6046890,2,...,0,,,A,105959994037000,,,,2017-01-05,201701
441079,2017-01-05 10:59:59.994397253-05:00,Z,AAPL,116.32,4.0,116.34,6.0,R,6046892,0,...,0,,,A,105959994205000,,,,2017-01-05,201701
441080,2017-01-05 10:59:59.994716652-05:00,J,AAPL,116.10,1.0,116.34,1.0,R,6046896,0,...,0,,,A,105959994517000,,,,2017-01-05,201701


In [18]:
quotes2 = quotes.set_index('Participant_Timestamp')
quotes2

Unnamed: 0_level_0,Time,Exchange,Symbol,Bid_Price,Bid_Size,Offer_Price,Offer_Size,Quote_Condition,Sequence_Number,National_BBO_Indicator,...,Retail_Interest_Indicator,Short_Sale_Restriction_Indicator,LULD_BBO_Indicator,SIP_Generated_Message_Identifier,NBBO_LULD_Indicator,FINRA_ADF_Timestamp,FINRA_ADF_Market_Participant_Quote_Indicator,Security_Status_Indicator,Date,YearMonth
Participant_Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
90003543338752,2017-01-05 09:00:03.544051121-05:00,P,AAPL,116.01,5.0,116.09,1.0,R,262437,0,...,,0,,,,,,,2017-01-05,201701
90003544046592,2017-01-05 09:00:03.544730571-05:00,P,AAPL,116.01,4.0,116.09,1.0,R,262438,0,...,,0,,,,,,,2017-01-05,201701
90007805484288,2017-01-05 09:00:07.806195362-05:00,P,AAPL,116.01,5.0,116.09,1.0,R,262556,0,...,,0,,,,,,,2017-01-05,201701
90007805870080,2017-01-05 09:00:07.806554929-05:00,P,AAPL,116.01,4.0,116.09,1.0,R,262557,0,...,,0,,,,,,,2017-01-05,201701
90009915143168,2017-01-05 09:00:09.915838847-05:00,P,AAPL,116.01,5.0,116.09,1.0,R,262765,0,...,,0,,,,,,,2017-01-05,201701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105959991128891,2017-01-05 10:59:59.991147643-05:00,B,AAPL,116.21,1.0,116.37,1.0,R,6046876,0,...,,0,,,A,,,,2017-01-05,201701
105959994037000,2017-01-05 10:59:59.994241745-05:00,Z,AAPL,116.32,3.0,116.34,6.0,R,6046890,2,...,,0,,,A,,,,2017-01-05,201701
105959994205000,2017-01-05 10:59:59.994397253-05:00,Z,AAPL,116.32,4.0,116.34,6.0,R,6046892,0,...,,0,,,A,,,,2017-01-05,201701
105959994517000,2017-01-05 10:59:59.994716652-05:00,J,AAPL,116.10,1.0,116.34,1.0,R,6046896,0,...,,0,,,A,,,,2017-01-05,201701


In [19]:
merged_df = pd.merge(results2, quotes2, left_index=True, right_index=True, how='outer')


In [20]:
# need to figure out how to merge the two dataframes on the index - read papers
merged_df

Unnamed: 0_level_0,Time_x,Exchange_x,Symbol_x,Sale_Condition,Trade_Volume,Trade_Price,Trade_Stop_Stock_Indicator,Trade_Correction_Indicator,Sequence_Number_x,Trade_Id,...,Retail_Interest_Indicator,Short_Sale_Restriction_Indicator,LULD_BBO_Indicator,SIP_Generated_Message_Identifier,NBBO_LULD_Indicator,FINRA_ADF_Timestamp,FINRA_ADF_Market_Participant_Quote_Indicator,Security_Status_Indicator,Date_y,YearMonth_y
Participant_Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
90020098955008,2017-01-05 09:00:20.099632006-05:00,P,AAPL,@ TI,1,116.03,,0,2744,77,...,,,,,,,,,NaT,
90020434563840,2017-01-05 09:00:20.435260797-05:00,P,AAPL,@ TI,1,116.03,,0,2751,78,...,,,,,,,,,NaT,
90108876758000,2017-01-05 09:01:08.877023201-05:00,K,AAPL,@FTI,42,116.07,,0,2795,48,...,,,,,,,,,NaT,
90108876799744,2017-01-05 09:01:08.877479494-05:00,P,AAPL,@FTI,42,116.07,,0,2796,79,...,,,,,,,,,NaT,
90108883539968,2017-01-05 09:01:08.884202687-05:00,P,AAPL,@FTI,66,116.07,,0,2797,80,...,,,,,,,,,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95959740635262,NaT,,,,,,,,,,...,,0,,,A,,,,2017-01-05,201701
95959845466474,NaT,,,,,,,,,,...,,0,,,A,,,,2017-01-05,201701
95959883649187,NaT,,,,,,,,,,...,,0,,,A,,,,2017-01-05,201701
95959918971392,NaT,,,,,,,,,,...,B,0,,,A,,,,2017-01-05,201701


## Quotes

In [9]:
# import helper functions
from utils.clickhouse_query import *

# Note - there is restriction to 1,000,000 rows per day/per user - so it's wise to limit the query to a specific time range for testing purposes - aggregation can also be used to reduce the number of rows returned
# Here is a way to restrict the query to a specific time range
start_hour = 9
end_hour = 10

# Define the query - this query grabs quotes data from AAPL on 2017-01-05 between 9am and 11am
# get quotes
query = f'''
    SELECT * 
    FROM QUOTESDB.quotes2017
    WHERE Symbol = 'AAPL'
    AND Date = '2017-05-02'
    AND (toHour(Time) BETWEEN {start_hour} AND {end_hour})
'''
# Execute the query and store the resulting dataframe
quotes = get_quotes(query)

AttributeError: 'str' object has no attribute 'keys'

In [7]:
quotes.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Empty DataFrame


In [10]:
# NEED TO LOOK AT RESULTS - STRING
import pandas as pd

query = f'''
    SELECT * 
    FROM QUOTESDB.quotes2017view
    WHERE Symbol = 'AAPL'
    AND Date = '2017-05-02'
    LIMIT 5
'''

# Execute the query and store the resulting dataframe
data = get_quotes(query)

# Export the dataframe to a CSV file
data.to_csv('/Users/michael/Python/Stony Brook/TAQ-Query-Scripts/file.csv', index=False)

AttributeError: 'str' object has no attribute 'keys'

In [11]:
import pandas as pd

query = f'''
    SELECT * 
    FROM QUOTESDB.quotes2017view
    WHERE Symbol = 'AAPL'
    AND Date = '2017-05-02'
    LIMIT 5
'''

results = client.command(query)

Unexpected Http Driver Exception


OperationalError: Error HTTPConnectionPool(host='localhost', port=3306): Max retries exceeded with url: /?session_id=09392952-fb43-11ed-b0b4-f21898187414&database=default&wait_end_of_query=1&send_progress_in_http_headers=1&http_headers_progress_interval_ms=120000 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7ff4110e39d0>: Failed to establish a new connection: [Errno 61] Connection refused')) executing HTTP request http://localhost:3306

## Convert Timestamp to Participant Timestamp

In [67]:
trades = data

In [68]:
import pandas as pd

def convert_timestamp(df, column_name):
    df[column_name] = pd.to_datetime(df[column_name], format='%H%M%S%f').dt.time
    return df

trades = convert_timestamp(trades, 'Participant_Timestamp')

# Ensure "Date" column is in datetime format
trades['Date'] = pd.to_datetime(trades['Date'])

# Ensure "Participant_Timestamp" is of type str
trades['Participant_Timestamp'] = trades['Participant_Timestamp'].astype(str)

# Create new datetime column "DateTime"
trades['DateTime'] = pd.to_datetime(trades['Date'].dt.strftime('%Y-%m-%d') + ' ' + trades['Participant_Timestamp'])


In [74]:
trades

Unnamed: 0_level_0,Time,Exchange,Symbol,Sale_Condition,Trade_Volume,Trade_Price,Trade_Stop_Stock_Indicator,Trade_Correction_Indicator,Sequence_Number,Trade_Id,Source_of_Trade,Trade_Reporting_Facility,Participant_Timestamp,Trade_Reporting_Facility_TRF_Timestamp,Trade_Through_Exempt_Indicator,Date,YearMonth
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2017-01-05 09:00:20.098955,2017-01-05 09:00:20.099632006,P,AAPL,@ TI,1,116.03,,0,2744,77,N,,09:00:20.098955,,0,2017-01-05,201701
2017-01-05 09:00:20.434563,2017-01-05 09:00:20.435260797,P,AAPL,@ TI,1,116.03,,0,2751,78,N,,09:00:20.434563,,0,2017-01-05,201701
2017-01-05 09:01:08.876758,2017-01-05 09:01:08.877023201,K,AAPL,@FTI,42,116.07,,0,2795,48,N,,09:01:08.876758,,1,2017-01-05,201701
2017-01-05 09:01:08.876799,2017-01-05 09:01:08.877479494,P,AAPL,@FTI,42,116.07,,0,2796,79,N,,09:01:08.876799,,1,2017-01-05,201701
2017-01-05 09:01:08.883539,2017-01-05 09:01:08.884202687,P,AAPL,@FTI,66,116.07,,0,2797,80,N,,09:01:08.883539,,1,2017-01-05,201701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-01-05 10:59:59.944023,2017-01-05 10:59:59.949257478,K,AAPL,@F,200,116.33,,0,532527,6853,N,,10:59:59.944023,,1,2017-01-05,201701
2017-01-05 10:59:59.944003,2017-01-05 10:59:59.950371094,Z,AAPL,@F,100,116.33,,0,532529,4840,N,,10:59:59.944003,,1,2017-01-05,201701
2017-01-05 10:59:59.944084,2017-01-05 10:59:59.950377156,Z,AAPL,@F,100,116.33,,0,532530,4841,N,,10:59:59.944084,,1,2017-01-05,201701
2017-01-05 10:59:59.944770,2017-01-05 10:59:59.950397053,Z,AAPL,@F,300,116.33,,0,532531,4842,N,,10:59:59.944770,,1,2017-01-05,201701


In [75]:
import pandas as pd

# Assuming 'df' is your DataFrame
trades['Time - PTS'] = pd.to_datetime(trades['DateTime'])  # Ensuring DateTime column is of datetime type
trades.set_index('Time - PTS', inplace=True)  # Setting DateTime column as index


KeyError: 'DateTime'

In [70]:
trades

Unnamed: 0_level_0,Time,Exchange,Symbol,Sale_Condition,Trade_Volume,Trade_Price,Trade_Stop_Stock_Indicator,Trade_Correction_Indicator,Sequence_Number,Trade_Id,Source_of_Trade,Trade_Reporting_Facility,Participant_Timestamp,Trade_Reporting_Facility_TRF_Timestamp,Trade_Through_Exempt_Indicator,Date,YearMonth
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2017-01-05 09:00:20.098955,2017-01-05 09:00:20.099632006,P,AAPL,@ TI,1,116.03,,0,2744,77,N,,09:00:20.098955,,0,2017-01-05,201701
2017-01-05 09:00:20.434563,2017-01-05 09:00:20.435260797,P,AAPL,@ TI,1,116.03,,0,2751,78,N,,09:00:20.434563,,0,2017-01-05,201701
2017-01-05 09:01:08.876758,2017-01-05 09:01:08.877023201,K,AAPL,@FTI,42,116.07,,0,2795,48,N,,09:01:08.876758,,1,2017-01-05,201701
2017-01-05 09:01:08.876799,2017-01-05 09:01:08.877479494,P,AAPL,@FTI,42,116.07,,0,2796,79,N,,09:01:08.876799,,1,2017-01-05,201701
2017-01-05 09:01:08.883539,2017-01-05 09:01:08.884202687,P,AAPL,@FTI,66,116.07,,0,2797,80,N,,09:01:08.883539,,1,2017-01-05,201701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-01-05 10:59:59.944023,2017-01-05 10:59:59.949257478,K,AAPL,@F,200,116.33,,0,532527,6853,N,,10:59:59.944023,,1,2017-01-05,201701
2017-01-05 10:59:59.944003,2017-01-05 10:59:59.950371094,Z,AAPL,@F,100,116.33,,0,532529,4840,N,,10:59:59.944003,,1,2017-01-05,201701
2017-01-05 10:59:59.944084,2017-01-05 10:59:59.950377156,Z,AAPL,@F,100,116.33,,0,532530,4841,N,,10:59:59.944084,,1,2017-01-05,201701
2017-01-05 10:59:59.944770,2017-01-05 10:59:59.950397053,Z,AAPL,@F,300,116.33,,0,532531,4842,N,,10:59:59.944770,,1,2017-01-05,201701


In [1]:
# import helper functions
from utils.clickhouse_query import *
from utils.data import *
# Select apple trades from January of 2017 to April of 2017
query = f"""
SELECT * 
FROM TRADESDB.trades2017view 
WHERE (Symbol = 'AAPL') 
AND (Date = '2017-01-05') 
AND (toHour(Time) BETWEEN {start_hour} AND {end_hour})
AND Trade_Volume > 0
AND Trade_Price > 0
"""
data = load_and_preprocess_data(query)

ValueError: unconverted data remains: 000000