In [1]:
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
df = pd.read_csv('2014_to_march2020_btc_minutes_data')
df.shape

(2602679, 6)

In [4]:
df.head()

Unnamed: 0,time,open,close,high,low,volume
0,2014-01-01 08:01:00,746.99,747.0,747.0,746.99,2.0
1,2014-01-01 08:02:00,743.89,744.89,744.89,743.89,0.020695
2,2014-01-01 08:04:00,745.01,745.0,745.01,745.0,0.9
3,2014-01-01 08:05:00,747.0,747.3,747.3,747.0,2.5
4,2014-01-01 08:07:00,744.87,744.87,744.87,744.87,1.230769


In [5]:
def labelling(df, col_idx, window=11):
    """
    Takes a pandas dataframe, sliding window size (default=11), and column index for 'close'
    
    GOAL: create a slidding window, check the middle value of the window, if its the window min() = BUY
    if its the window max() = SELL, otherwise = HOLD. Over the time period of the window find local peaks
    and troughs which would be "BUY" and "SELL" signals.
    
    df: Pandas DataFrame, should have open, close, high, low, and volumn but must have close
    window: int, number of periods used to create the sliding window
    col_idx: int, index number of 'close' column
    
    return: Pandas DataFrame with new column 'labels' which has "BUY", "SELL", and "HOLD" values
    """
    
    # set up the sliding window, mid point, column index, and end value of for loop
    period = int(window)
    mid_point = int((period)/ 2)
    close_col = int(col_idx)
    end = len(df)

    # get start time and print out starting message
    start_time = datetime.now()
    print("starting to label the data")

    # loop through dataframe
    for i in range(end):
        # if window would extend out of index of dataframe pass
        if (i + period) > end:
            pass
        
        else:
            # create window, locate closing price at mid point index
            window = df.iloc[i:i+period, close_col]
            mid_price = df.iat[i+mid_point, close_col]
            
            # use window and mid point to update 'label' column with correct value
            # 1 = BUY, 2 = SELL, 0 = HOLD
            if window.min() == mid_price:
                df.at[i+mid_point, 'label'] = 1
            elif window.max() == mid_price:
                df.at[i+mid_point, 'label'] = 2
            else:
                df.at[i+mid_point, 'label'] = 0
                
    # get ending time and print out duration
    end_time = datetime.now()
    print('Duration: {}'.format(end_time - start_time))
    
    # return dataframe with labels
    return df

In [6]:
df = labelling(df, 2, 21)
df.head(10)

starting to label the data
Duration: 0:07:19.335700


Unnamed: 0,time,open,close,high,low,volume,label
0,2014-01-01 08:01:00,746.99,747.0,747.0,746.99,2.0,
1,2014-01-01 08:02:00,743.89,744.89,744.89,743.89,0.020695,
2,2014-01-01 08:04:00,745.01,745.0,745.01,745.0,0.9,
3,2014-01-01 08:05:00,747.0,747.3,747.3,747.0,2.5,
4,2014-01-01 08:07:00,744.87,744.87,744.87,744.87,1.230769,
5,2014-01-01 08:08:00,744.84,744.87,744.87,744.84,0.37169,
6,2014-01-01 08:10:00,744.84,744.84,744.84,744.84,0.090297,
7,2014-01-01 08:12:00,744.0,744.5,744.5,744.0,4.963859,
8,2014-01-01 08:13:00,744.84,744.5,744.84,744.17,0.959867,
9,2014-01-01 08:14:00,744.17,744.17,744.17,744.17,0.047933,


In [9]:
df.tail(50)

Unnamed: 0,time,open,close,high,low,volume,label
2602629,2020-03-26 01:22:00,6709.8,6701.1,6709.8,6701.1,0.440426,0.0
2602630,2020-03-26 01:23:00,6701.1,6692.3,6701.1,6692.237573,1.51554,0.0
2602631,2020-03-26 01:24:00,6692.237573,6691.4,6692.281604,6691.4,0.577485,0.0
2602632,2020-03-26 01:25:00,6691.4,6691.3,6691.4,6691.3,0.034365,0.0
2602633,2020-03-26 01:26:00,6691.3,6690.9,6691.3,6690.8,5.6961,1.0
2602634,2020-03-26 01:27:00,6692.0,6700.962864,6703.2,6692.0,0.901172,0.0
2602635,2020-03-26 01:28:00,6700.962864,6697.362307,6700.962864,6697.362307,0.022923,0.0
2602636,2020-03-26 01:29:00,6697.362307,6710.0,6710.0,6697.362307,3.884179,0.0
2602637,2020-03-26 01:30:00,6705.1,6715.0,6715.6,6705.1,7.967836,0.0
2602638,2020-03-26 01:31:00,6715.0,6716.9,6717.0,6715.0,0.497144,0.0


In [13]:
df['label'].value_counts(normalize=True)

0.0    0.900912
2.0    0.049960
1.0    0.049128
Name: label, dtype: float64

In [35]:
df.to_csv('btc_minutes_data_labeled')