In [3]:
import datetime
import time
from pathlib import Path

import ccxt
import pandas as pd

%load_ext autoreload
%autoreload 2

## Retrieve candlestick data
- Request data from Phemex up to a given end time, fetching in chunks to avoid hitting rate limits

In [13]:
# Phemex client; enableRateLimit asks ccxt to throttle outgoing requests itself
exchange = ccxt.phemex(dict(enableRateLimit=True))

# Market id and candle size passed to fetch_ohlcv
# (timeframe must be one of the keys understood by the helpers below)
symbol, timeframe = "BTCUSDT", "15m"

# Download window: the 365 days leading up to the moment this cell runs
end_time = datetime.datetime.now()
start_time = end_time - datetime.timedelta(days=365)


# Function to fetch data in chunks
def fetch_ohlcv_data(
    start_time, end_time, timeframe, *, market=None, client=None, max_retries=5
):
    """Download OHLCV candles for the window [start_time, end_time) in chunks.

    The request cursor is tracked as integer epoch milliseconds, so paging
    does not depend on the local timezone or on daylight-saving transitions,
    and both naive and timezone-aware datetimes are accepted (naive values
    are interpreted as local time by ``datetime.timestamp()``).

    Args:
        start_time: ``datetime`` at which to start fetching.
        end_time: ``datetime`` at which to stop; candles whose open time is
            at or after this instant are discarded.
        timeframe: One of "1m", "3m", "5m", "10m", "15m", "1h", "4h", "1d".
        market: Symbol to request. Defaults to the notebook-level ``symbol``.
        client: Object exposing ``fetch_ohlcv(symbol, timeframe, since=, limit=)``
            and a ``rateLimit`` attribute in milliseconds. Defaults to the
            notebook-level ``exchange``.
        max_retries: Number of consecutive network failures tolerated before
            giving up and returning what has been collected so far.

    Returns:
        ``(all_candles, execution_time)``: the list of candle rows as returned
        by the client (first element of each row is the open time in
        milliseconds) and the wall-clock duration of the download in seconds.

    Raises:
        ValueError: If ``timeframe`` is not one of the supported values.
    """
    # Define timeframe durations
    timeframe_durations = {
        "1m": datetime.timedelta(minutes=1),
        "3m": datetime.timedelta(minutes=3),
        "5m": datetime.timedelta(minutes=5),
        "10m": datetime.timedelta(minutes=10),
        "15m": datetime.timedelta(minutes=15),
        "1h": datetime.timedelta(hours=1),
        "4h": datetime.timedelta(hours=4),
        "1d": datetime.timedelta(days=1),
    }

    if timeframe not in timeframe_durations:
        raise ValueError(f"Unsupported timeframe: {timeframe}")

    # Fall back to the notebook-level objects when no override is supplied
    client = exchange if client is None else client
    market = symbol if market is None else market

    # Work in epoch milliseconds: this is the unit of both the `since`
    # argument and the candle timestamps, and it avoids naive-datetime
    # round trips that can step backwards across a DST change.
    step_ms = int(timeframe_durations[timeframe].total_seconds() * 1000)
    since_ms = int(start_time.timestamp() * 1000)
    end_ms = int(end_time.timestamp() * 1000)

    all_candles = []
    current_time = start_time  # only used for the progress messages
    consecutive_failures = 0

    # Start the timer
    start_time_perf = time.time()

    while since_ms < end_ms:
        try:
            print(f"Fetching data from {current_time}")
            candles = client.fetch_ohlcv(
                market,
                timeframe,
                since=since_ms,
                limit=1000,  # Phemex allows up to 1000 candles per request
            )
            consecutive_failures = 0

            if not candles:
                break

            # Next request starts one timeframe after the last candle received
            next_since_ms = candles[-1][0] + step_ms
            if next_since_ms <= since_ms:
                # The batch did not move the cursor forward; continuing would
                # loop forever and pile up duplicates.
                print(
                    "Exchange returned no candles newer than the requested start. Stopping."
                )
                break

            # The final batch may run past the requested window; drop the excess
            all_candles.extend(row for row in candles if row[0] < end_ms)

            since_ms = next_since_ms
            current_time = datetime.datetime.fromtimestamp(since_ms / 1000)

            print(f"Fetched {len(candles)} candles. Next fetch from {current_time}")

            # Sleep to respect rate limits
            time.sleep(client.rateLimit / 1000)  # rateLimit is in milliseconds

        except ccxt.NetworkError as e:
            consecutive_failures += 1
            if consecutive_failures > max_retries:
                print(
                    f"Network error occurred: {str(e)}. "
                    f"Giving up after {max_retries} retries."
                )
                break
            print(f"Network error occurred: {str(e)}. Retrying in 10 seconds...")
            time.sleep(10)
        except ccxt.ExchangeError as e:
            print(f"Exchange error occurred: {str(e)}. Stopping.")
            break

    # End the timer
    end_time_perf = time.time()

    # Calculate the execution time
    execution_time = end_time_perf - start_time_perf

    print(f"Data fetching completed in {execution_time:.2f} seconds")

    return all_candles, execution_time


# Download the candles and keep the elapsed time for the summary further down
candles, fetch_time = fetch_ohlcv_data(start_time, end_time, timeframe)

# Build the frame in a single chain: label the columns, turn the
# epoch-millisecond timestamps into a datetime index, then make every
# price/volume column float64
df = (
    pd.DataFrame(
        candles, columns=["Timestamp", "Open", "High", "Low", "Close", "Volume"]
    )
    .assign(Timestamp=lambda frame: pd.to_datetime(frame["Timestamp"], unit="ms"))
    .set_index("Timestamp")
    .astype(dict.fromkeys(["Open", "High", "Low", "Close", "Volume"], "float64"))
)

print(df.dtypes)  # Print data types
print(df.head())  # Print the first few rows of the dataframe


# Function to calculate expected number of candles
def calculate_expected_candles(start_time, end_time, timeframe):
    """Return how many whole candles of ``timeframe`` fit between two datetimes.

    The span ``end_time - start_time`` is converted to minutes, divided by
    the candle length in minutes, and truncated to an integer.

    Raises:
        ValueError: If ``timeframe`` is not one of the supported labels.
    """
    labels = ("1m", "3m", "5m", "10m", "15m", "1h", "4h", "1d")
    lengths = (1, 3, 5, 10, 15, 60, 240, 1440)
    minutes_per_candle = dict(zip(labels, lengths)).get(timeframe)

    if minutes_per_candle is None:
        raise ValueError(f"Unsupported timeframe: {timeframe}")

    span_minutes = (end_time - start_time).total_seconds() / 60
    return int(span_minutes / minutes_per_candle)


# Compare how many candles the window should hold with how many were
# actually downloaded, and report how long the download took
expected_candles = calculate_expected_candles(start_time, end_time, timeframe)
print(
    f"Expected number of candles: {expected_candles}",
    f"Actual number of candles: {len(df)}",
    f"Total execution time: {fetch_time:.2f} seconds",
    sep="\n",
)

df  # Return the dataframe for display in Jupyter

Fetching data from 2023-08-17 19:49:19.841113
Fetched 1000 candles. Next fetch from 2023-08-28 06:00:00
Fetching data from 2023-08-28 06:00:00
Fetched 1000 candles. Next fetch from 2023-09-07 16:00:00
Fetching data from 2023-09-07 16:00:00
Fetched 1000 candles. Next fetch from 2023-09-18 02:00:00
Fetching data from 2023-09-18 02:00:00
Fetched 1000 candles. Next fetch from 2023-09-28 12:00:00
Fetching data from 2023-09-28 12:00:00
Fetched 1000 candles. Next fetch from 2023-10-08 22:00:00
Fetching data from 2023-10-08 22:00:00
Fetched 1000 candles. Next fetch from 2023-10-19 08:00:00
Fetching data from 2023-10-19 08:00:00
Fetched 1000 candles. Next fetch from 2023-10-29 18:00:00
Fetching data from 2023-10-29 18:00:00
Fetched 1000 candles. Next fetch from 2023-11-09 03:00:00
Fetching data from 2023-11-09 03:00:00
Fetched 1000 candles. Next fetch from 2023-11-19 13:00:00
Fetching data from 2023-11-19 13:00:00
Fetched 1000 candles. Next fetch from 2023-11-29 23:00:00
Fetching data from 2023

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-08-18 00:00:00,26608.9,26828.2,26582.9,26785.4,1109.459
2023-08-18 00:15:00,26769.9,26788.1,26702.8,26716.9,1119.466
2023-08-18 00:30:00,26718.1,26728.4,26649.1,26672.5,2838.674
2023-08-18 00:45:00,26680.0,26740.2,26632.5,26740.2,1721.856
2023-08-18 01:00:00,26723.9,26752.9,26550.1,26559.4,1454.375
...,...,...,...,...,...
2024-08-16 22:30:00,59090.0,59151.9,59016.4,59097.8,25.564
2024-08-16 22:45:00,59097.6,59097.6,58941.9,58941.9,7.609
2024-08-16 23:00:00,58942.0,58963.8,58700.0,58700.0,34.808
2024-08-16 23:15:00,58712.3,58993.1,58712.3,58987.8,18.994


### Generate a dynamic output file name 

In [14]:
# The timestamps live in the index when it is named 'Timestamp';
# otherwise read them from the 'Timestamp' column
_timestamps = df.index if df.index.name == "Timestamp" else df["Timestamp"]

# First and last candle dates, formatted as YYYYMMDD for the file name
start_date = _timestamps.min().strftime("%Y%m%d")
end_date = _timestamps.max().strftime("%Y%m%d")

# <symbol>_<timeframe>_<first date>_<last date>.parquet inside the save folder
output_filename = (
    f"./saved_candlestick_data/{symbol}_{timeframe}_{start_date}_{end_date}.parquet"
)

print(f"Generated filename: {output_filename}")

Generated filename: ./saved_candlestick_data/BTCUSDT_15m_20230818_20240816.parquet


### Save the candles dataframe to a parquet file on disk
_Only run this cell after the previous two: fetching the candlestick data and creating the output filename._

In [15]:
%time df.to_parquet(output_filename)

CPU times: user 12.2 ms, sys: 2.9 ms, total: 15.1 ms
Wall time: 15.8 ms
