_If you are fetching Binance data from the US, you need a VPN._
- TODO: update the script to show a progress bar in the fetch-candles cell. These are the packages I need:
```python
## Use this to add a progress bar to get data
from termcolor import colored, cprint
from progress.bar import IncrementalBar, ShadyBar
```

In [1]:
import pandas as pd
import ccxt
import datetime
import time
import os

%load_ext autoreload
%autoreload 2

## Retrieve candle stick data 
- Requests data from the exchange (Binance by default; Phemex is the commented-out alternative) up to a certain point, chunking the requests to prevent rate limiting

In [None]:
# Initialize the exchange - Binance is the active client; the Phemex client is kept as a commented-out alternative
# exchange = ccxt.phemex({"enableRateLimit": True})
exchange = ccxt.binance()  # need a VPN if you're in the US

# Define the symbol and candle size to download
symbol = "BTCUSDT"  # NOTE(review): originally annotated as the Phemex USDT-margined perpetual symbol - confirm it is the intended Binance market
timeframe = "1h"  # Must be one of the keys fetch_ohlcv_data accepts (e.g. "1m", "15m", "1h", "1d")

# Calculate start and end times
end_time = datetime.datetime.now()  # naive datetime in the machine's local time
start_time = end_time - datetime.timedelta(days=365)  # 1 year worth of data


# Function to fetch data in chunks
def fetch_ohlcv_data(start_time, end_time, timeframe, *, client=None, market=None):
    """Download OHLCV candles for a time window using paginated requests.

    The window is walked forward one page (up to 1000 candles) at a time.
    The paging cursor is kept as an integer epoch-millisecond value, so it
    never round-trips through naive local datetimes; this keeps the loop
    working for timezone-aware inputs and across DST changes.

    Args:
        start_time: ``datetime`` marking the start of the window (inclusive).
        end_time: ``datetime`` marking the end of the window (exclusive).
        timeframe: Candle size; one of "1m", "3m", "5m", "15m", "30m",
            "1h", "4h", "1d".
        client: Object exposing ``fetch_ohlcv(symbol, timeframe, since=,
            limit=)`` and ``rateLimit`` (milliseconds). Defaults to the
            notebook-level ``exchange``.
        market: Symbol to request. Defaults to the notebook-level ``symbol``.

    Returns:
        A ``(candles, execution_time)`` tuple: the list of raw candle rows
        whose timestamp (first element, epoch ms) is before ``end_time``,
        and the wall-clock seconds spent fetching.

    Raises:
        ValueError: If ``timeframe`` is not one of the supported keys.
    """
    # Define timeframe durations
    timeframe_durations = {
        "1m": datetime.timedelta(minutes=1),
        "3m": datetime.timedelta(minutes=3),
        "5m": datetime.timedelta(minutes=5),
        "15m": datetime.timedelta(minutes=15),
        "30m": datetime.timedelta(minutes=30),
        "1h": datetime.timedelta(hours=1),
        "4h": datetime.timedelta(hours=4),
        "1d": datetime.timedelta(days=1),
    }

    if timeframe not in timeframe_durations:
        raise ValueError(f"Unsupported timeframe: {timeframe}")

    # Fall back to the notebook globals so existing three-argument calls keep working.
    client = exchange if client is None else client
    market = symbol if market is None else market

    step_ms = int(timeframe_durations[timeframe].total_seconds() * 1000)
    since_ms = int(start_time.timestamp() * 1000)
    end_ms = int(end_time.timestamp() * 1000)

    all_candles = []

    # Start the timer
    start_time_perf = time.time()

    while since_ms < end_ms:
        try:
            print(f"Fetching data from {datetime.datetime.fromtimestamp(since_ms / 1000)}")
            candles = client.fetch_ohlcv(
                market,
                timeframe,
                since=since_ms,
                limit=1000,  # page size: up to 1000 candles per request
            )
        except ccxt.NetworkError as e:
            # Transient failure: wait and retry the same page.
            print(f"Network error occurred: {str(e)}. Retrying in 10 seconds...")
            time.sleep(10)
            continue
        except ccxt.ExchangeError as e:
            print(f"Exchange error occurred: {str(e)}. Stopping.")
            break

        if not candles:
            break

        # A page can run past the requested window when end_time is in the
        # past; keep only the candles that start before end_time.
        all_candles.extend(candle for candle in candles if candle[0] < end_ms)

        # Next page starts one timeframe after the last candle received.
        next_since_ms = candles[-1][0] + step_ms
        if next_since_ms <= since_ms:
            # The page did not move the cursor forward; stop rather than loop forever.
            print("Exchange returned no newer candles. Stopping.")
            break
        since_ms = next_since_ms

        print(
            f"Fetched {len(candles)} candles. "
            f"Next fetch from {datetime.datetime.fromtimestamp(since_ms / 1000)}"
        )

        # Sleep to respect rate limits
        time.sleep(client.rateLimit / 1000)  # rateLimit is in milliseconds

    # End the timer
    end_time_perf = time.time()

    # Calculate the execution time
    execution_time = end_time_perf - start_time_perf

    print(f"Data fetching completed in {execution_time:.2f} seconds")

    return all_candles, execution_time


# Download the candles; keep the elapsed time for the summary printed later
candles, fetch_time = fetch_ohlcv_data(start_time, end_time, timeframe)

# Build the frame from the raw candle rows
df = pd.DataFrame(
    candles, columns=["Timestamp", "Open", "High", "Low", "Close", "Volume"]
)

# Turn the millisecond timestamps into datetimes, index by them,
# and force every remaining column to float64
df["Timestamp"] = pd.to_datetime(df["Timestamp"], unit="ms")
df = df.set_index("Timestamp").astype(
    dict.fromkeys(["Open", "High", "Low", "Close", "Volume"], "float64")
)

print(df.dtypes)  # Print data types
print(df.head())  # Print the first few rows of the dataframe


# Function to calculate expected number of candles
def calculate_expected_candles(start_time, end_time, timeframe):
    """Return how many whole candles of ``timeframe`` fit in a time window.

    Args:
        start_time: ``datetime`` at the start of the window.
        end_time: ``datetime`` at the end of the window.
        timeframe: Candle size; one of "1m", "3m", "5m", "10m", "15m",
            "30m", "1h", "4h", "1d".

    Returns:
        The window length divided by the candle length, truncated to an int.

    Raises:
        ValueError: If ``timeframe`` is not one of the supported keys.
    """
    timeframe_minutes = {
        "1m": 1,
        "3m": 3,
        "5m": 5,
        "10m": 10,
        "15m": 15,
        "30m": 30,  # accepted by fetch_ohlcv_data, so it must be countable here too
        "1h": 60,
        "4h": 240,
        "1d": 1440,
    }
    if timeframe not in timeframe_minutes:
        raise ValueError(f"Unsupported timeframe: {timeframe}")
    duration = end_time - start_time
    total_minutes = duration.total_seconds() / 60
    expected_candles = total_minutes / timeframe_minutes[timeframe]
    return int(expected_candles)


# Compare the candle count the window should hold with what was actually fetched
expected_candles = calculate_expected_candles(start_time, end_time, timeframe)
print(
    f"Expected number of candles: {expected_candles}",
    f"Actual number of candles: {len(df)}",
    f"Total execution time: {fetch_time:.2f} seconds",
    sep="\n",
)

df  # Return the dataframe for display in Jupyter

### Generate a dynamic output file name 

In [None]:
# Extract the exchange name
exchange_name = exchange.id  # This will give us 'binance' or 'phemex'

# Locate the candle timestamps whether 'Timestamp' is the index or a regular column
timestamps = df.index if df.index.name == "Timestamp" else df["Timestamp"]

# An empty frame has no first/last date; fail with a clear message up front
# instead of the opaque error raised by calling strftime on NaT
if len(timestamps) == 0:
    raise ValueError("No candles available: cannot derive start/end dates for the filename")

start_date = timestamps.min().strftime("%Y%m%d")
end_date = timestamps.max().strftime("%Y%m%d")

# Create the directory structure
base_dir = "./saved_candlestick_data"
directory = os.path.join(base_dir, exchange_name)
os.makedirs(directory, exist_ok=True)

# Symbols written in ccxt's unified form (e.g. "BTC/USDT" or "BTC/USDT:USDT")
# contain characters that would be read as path separators or are invalid in
# file names; plain ids such as "BTCUSDT" pass through unchanged
safe_symbol = symbol.replace("/", "").replace(":", "_")

# Create the dynamic filename
if exchange_name == "binance":
    filename = f"{exchange_name}_{safe_symbol}_{timeframe}_{start_date}_{end_date}.csv"
else:  # for phemex and potentially other exchanges
    filename = f"{safe_symbol}_{timeframe}_{start_date}_{end_date}.csv"

output_filename = os.path.join(directory, filename)

print(f"Generated filepath: {output_filename}")

### Save the candles dataframe to a CSV file on disk
_Only run this cell after the previous two cells: fetching the candlestick data and creating the output filename._

In [None]:
# Parquet alternative, currently disabled (output_filename is generated with a .csv extension):
# %time df.to_parquet(output_filename)
# Write the dataframe to output_filename as CSV; the %time magic reports how long the write takes
%time df.to_csv(output_filename)

In [None]:
!eza -la saved_candlestick_data/binance

In [None]:
!pwd