In [1]:
import MySQLdb as mdb
import pandas as pd
import numpy as np
from datetime import datetime
from pytz import timezone

In [2]:
# Create database connection
db_host = 'localhost'
db_user = 'crypto_user'
db_pass = 'Blue0811'
db_name = 'cryptocurrencies_master'
db_connection = mdb.connect(host=db_host, user=db_user,
                            passwd=db_pass, db=db_name)

In [3]:
# Extract cryptocurrency trading pairs from price_1min table
with db_connection.cursor() as cur:
    cur.execute('SELECT DISTINCT id FROM price_1min;')
    crypto_pairs = list(cur.fetchall())
print('Crypto-Pairs from 1-minute Data Table:')
for pair in crypto_pairs:
    print(pair[0])

Crypto-Pairs from 1-minute Data Table:
BCH-USD
BTC-USD
ETH-USD
LTC-USD


In [4]:
# Define function to transform unix timestamps to trading day indicies
def stamp_to_trading_day(timestamp):
    date = datetime.fromtimestamp(timestamp).astimezone(timezone('UTC'))
    minute = date.hour*60 + date.minute
    return (date.month, date.day, minute)

In [5]:
# Obtain volume data for each currency fom database
unix_start = 1572566400
unix_end = 1583020800
crypto_data = {}

for pair in crypto_pairs:
    pair = pair[0]
    with db_connection.cursor() as cur:
        execution_str = 'SELECT price_date, volume \nFROM price_1min \nWHERE id = "{}" AND '.format(pair)
        execution_str += '(price_date >= {} AND price_date < {})\n'.format(unix_start, unix_end)
        execution_str += 'ORDER BY price_date;'
        print('query:\n{}\n'.format(execution_str))
        cur.execute(execution_str)
        data = list(cur.fetchall())
    crypto_data[pair] = pd.DataFrame(data, columns=('price_date', 'volume'))
    crypto_data[pair]['price_date'] = crypto_data[pair]['price_date'].apply(stamp_to_trading_day)
    crypto_data[pair] = crypto_data[pair].set_index('price_date')

query:
SELECT price_date, volume 
FROM price_1min 
WHERE id = "BCH-USD" AND (price_date >= 1572566400 AND price_date < 1583020800)
ORDER BY price_date;

query:
SELECT price_date, volume 
FROM price_1min 
WHERE id = "BTC-USD" AND (price_date >= 1572566400 AND price_date < 1583020800)
ORDER BY price_date;

query:
SELECT price_date, volume 
FROM price_1min 
WHERE id = "ETH-USD" AND (price_date >= 1572566400 AND price_date < 1583020800)
ORDER BY price_date;

query:
SELECT price_date, volume 
FROM price_1min 
WHERE id = "LTC-USD" AND (price_date >= 1572566400 AND price_date < 1583020800)
ORDER BY price_date;



In [6]:
# Define the trading day indicies
trading_days = []
for m, n in ((11, 30), (12, 31), (1, 31), (2, 29)):
    trading_days += [(m, i) for i in range(1, n + 1)]
    
volume_cryptos = pd.DataFrame(columns=trading_days, index=range(0, 1440))

In [7]:
# Define function to compute the average volume for each trading day and minute 
def compute_vol_avg(trading_stamp, crypto_data):
    total_vol = 0
    for pair in crypto_data.keys():
        if crypto_data[pair].index.contains(trading_stamp):
            total_vol += crypto_data[pair]['volume'][trading_stamp]
        else:
            total_vol += 0
    return total_vol / len(crypto_data.keys())

In [8]:
# Compute volume data for the volume_cryptos table
for day in volume_cryptos.columns:
    for minute in volume_cryptos.index:
        trading_stamp = day + (minute,)
        volume_cryptos[day][minute] = compute_vol_avg(trading_stamp, crypto_data)

  """


In [9]:
print('Data Shape: {}'.format(volume_cryptos.shape))
volume_cryptos.head()

Data Shape: (1440, 121)


Unnamed: 0,"(11, 1)","(11, 2)","(11, 3)","(11, 4)","(11, 5)","(11, 6)","(11, 7)","(11, 8)","(11, 9)","(11, 10)",...,"(2, 20)","(2, 21)","(2, 22)","(2, 23)","(2, 24)","(2, 25)","(2, 26)","(2, 27)","(2, 28)","(2, 29)"
0,8.97077,9.52448,27.7077,28.2831,54.9851,78.7985,24.0819,33.2921,63.0435,6.37282,...,240.041,34.5393,16.8097,20.9787,146.763,243.769,346.125,132.473,163.899,72.0337
1,43.6276,213.665,61.8671,21.0084,21.4793,18.5754,77.7306,12.5094,72.7155,6.57368,...,186.461,138.815,82.3437,1.63956,132.843,29.1683,201.189,972.228,239.794,26.4632
2,11.6722,40.4598,33.2271,8.03392,10.453,11.5149,203.406,4.09453,2.22694,4.11494,...,421.362,86.2996,78.2372,5.75314,549.869,113.372,177.432,405.982,63.7563,35.4816
3,31.1001,5.30367,21.1073,1.75745,168.712,21.6921,13.9024,54.2846,1.26975,45.7626,...,435.134,35.3558,55.9405,256.768,204.554,37.7803,121.499,131.741,44.9759,26.2666
4,7.15688,3.05717,79.5344,7.33955,24.5172,5.23118,13.1275,12.9724,88.4375,11.0792,...,138.428,28.5055,4.95093,17.0059,176.265,31.8712,158.496,105.506,56.4955,291.459


In [10]:
# Compute volume data for the volume_cryptos table
volume_days = pd.DataFrame(columns=crypto_data.keys(), index=range(1440))
minutes = range(24*60)
trading_minutes = {minute: [day + (minute,) for day in trading_days] for minute in minutes}
for pair in crypto_data.keys():
    for minute in minutes:
        volume_days[pair][minute] = crypto_data[pair]['volume'][trading_minutes[minute]].mean()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self.loc[key]


In [11]:
print('Data Shape: {}'.format(volume_days.shape))
volume_days.head()

Data Shape: (1440, 4)


Unnamed: 0,BCH-USD,BTC-USD,ETH-USD,LTC-USD
0,53.5413,22.1059,167.07,241.191
1,44.7915,14.4794,105.412,177.137
2,32.0813,12.0744,108.982,166.337
3,31.7929,13.0342,95.402,127.053
4,25.0684,9.5097,73.9908,110.165


In [12]:
# Write volume data tables to pickle files
volume_cryptos.to_pickle('./volume_cryptos.pkl')
volume_days.to_pickle('./volume_days.pkl')