In [3]:
import shutil
import subprocess
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

import numpy as np
import pandas as pd
import requests


In [4]:
#===================SETTINGS=======================#
# Check that the path exists for this currency and these dates first!
curreny = "BTCUSDT"
start_date = "2019-01-01" #"2017-08-17"
end_date = "2019-01-10"   #"2023-11-30"
# Browsable listing: https://data.binance.vision/?prefix=data/spot/daily/klines/BTCUSDT/1m/
base_url = f"https://data.binance.vision/data/spot/daily/klines/{curreny}/1m/"
csv_filename = f"{curreny}_daily.csv"
foldername = f"{curreny}_DL"
sample_period = "D" #daily
#==================================================#
# One archive URL per calendar day between start_date and end_date (inclusive)
date_range = pd.date_range(start=start_date, end=end_date, freq='D')
file_urls = [
    f"{base_url}{curreny}-1m-{day}.zip"
    for day in date_range.strftime('%Y-%m-%d')
]

# Folder that receives the downloaded archives; created when it does not exist yet
destination_folder = Path(foldername)
destination_folder.mkdir(parents=True, exist_ok=True)

def check_file_existence(url, timeout=30):
    """Return True when a HEAD request for ``url`` answers with HTTP 200.

    Args:
        url: Address of the remote file to probe.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection can block the calling thread
            indefinitely.

    Returns:
        bool: True only for a 200 status code; any other status yields False.

    Raises:
        requests.RequestException: Propagated from ``requests.head`` on
            connection problems or when the timeout expires.
    """
    response = requests.head(url, timeout=timeout)
    return response.status_code == 200

def download_file(url, destination_path, timeout=60):
    """Download ``url`` to ``destination_path`` if the remote file exists.

    The URL is first probed with ``check_file_existence``; when the probe
    fails nothing is requested or written.

    Args:
        url: Address of the file to download.
        destination_path: Local path the response body is written to
            (opened in binary write mode, so an existing file is replaced).
        timeout: Seconds to wait for the server on the GET request.

    Returns:
        bool: True when the file was written, False when the existence probe
        failed and the download was skipped.

    Raises:
        requests.HTTPError: If the GET request answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    if not check_file_existence(url):
        return False

    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of saving an HTTP error body under the target name.
    response.raise_for_status()
    with open(destination_path, 'wb') as file:
        file.write(response.content)
    return True

# Function to download files in parallel
def download_files_parallel(file_urls, destination_folder):
    """Download every URL concurrently into ``destination_folder``.

    Each URL is handed to ``download_file`` on a pool of 10 worker threads;
    the local file name is the last path segment of the URL. The call
    returns once every download task has finished, re-raising the first
    exception encountered while collecting results.
    """
    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = []
        for link in file_urls:
            target = destination_folder / link.split("/")[-1]  # Folder/file.zip
            pending.append(pool.submit(download_file, link, target))
        print("Download started...")
        # Block until each task is done, in submission order
        for task in pending:
            task.result()

# Function to download files sequentially
def download_files_sequential(file_urls, destination_folder):
    """Download every URL one after another into ``destination_folder``.

    Args:
        file_urls: Iterable of URLs; the local file name is the last path
            segment of each URL.
        destination_folder: Folder (``str`` or ``Path``) the files are saved
            in, matching the behaviour of ``download_files_parallel``.
    """
    target_dir = Path(destination_folder)
    for url in file_urls:
        filename = url.split("/")[-1]
        # Save inside the destination folder, not the current working directory.
        download_file(url, target_dir / filename)

# Fetch all daily archives concurrently into the destination folder
download_files_parallel(file_urls=file_urls, destination_folder=destination_folder)
print("---------Download complete----------")


Download started...


---------Download complete----------


# Unzip files #

In [5]:
import os
import zipfile

def unzip_all_files(destination_folder):
    """Extract every ``.zip`` archive found directly in ``destination_folder``.

    Archive contents are extracted into the same folder. Files that are not
    valid zip archives are reported and skipped. When the folder does not
    exist a message is printed and nothing else happens.
    """
    # Nothing to do without the folder
    if not os.path.exists(destination_folder):
        print(f"The folder '{destination_folder}' does not exist.")
        return

    # Snapshot the archive names up front so extracted files are not revisited
    archive_names = [entry for entry in os.listdir(destination_folder) if entry.endswith('.zip')]

    for archive_name in archive_names:
        archive_path = os.path.join(destination_folder, archive_name)
        try:
            with zipfile.ZipFile(archive_path, 'r') as archive:
                archive.extractall(destination_folder)
        except zipfile.BadZipFile:
            print(f"Skipped: {archive_name} (Not a valid zip file)")
    print("---------Unzip complete----------")

# Extract all downloaded archives inside the destination folder
unzip_all_files(destination_folder=destination_folder)


---------Unzip complete----------


# Delete ZIP files #

In [6]:
# Tally the extracted .csv files and the downloaded .zip archives in the destination folder
csv_files = [*destination_folder.glob("*.csv")]
zip_files = [*destination_folder.glob("*.zip")]
print(f"Number of .csv files: {len(csv_files)}")
print(f"Number of .zip files: {len(zip_files)}")

# The archives are only discarded when both counts agree
if len(csv_files) != len(zip_files):
    print("Error: Amount of CSV files not same than ZIP files.")
else:
    for zip_file in zip_files:
        zip_file.unlink()
    print("ZIP files deleted")


Number of .csv files: 10
Number of .zip files: 10
ZIP files deleted


# Clean Dataframes

In [7]:
# Column names assigned to the header-less kline CSV files, in file order
KLINE_COLUMNS = [
    "time", "open", "high", "low", "close", "volume BTC",
    "close_time", "volume USD", "num_trades",
    "taker_buy_volume", "taker_buy_quote_volume", "ignore",
]
# Columns that are not needed for the resampled output
UNUSED_COLUMNS = ["close_time", "num_trades", "taker_buy_volume", "taker_buy_quote_volume", "ignore"]

CSV_df = []
for csv_file in csv_files:
    # Read the raw values only. The "time" column is converted exactly once
    # below; combining `parse_dates` with a second `to_datetime(unit="ms")`
    # made the result depend on how read_csv treats numeric date columns.
    _data = pd.read_csv(csv_file, names=KLINE_COLUMNS)

    # Format and clean data
    # NOTE(review): assumes "time" holds epoch milliseconds, as unit="ms" implies;
    # confirm for the date range being downloaded.
    _data["time"] = pd.to_datetime(_data["time"], unit="ms")
    _data = _data.set_index("time").drop(columns=UNUSED_COLUMNS)
    CSV_df.append(_data)


# Merge and Resample #

In [8]:
# How each column is collapsed when several rows fall into one sample period
ohlcv_rules = {
    'open': 'first',
    'high': 'max',
    'low': 'min',
    'close': 'last',
    'volume BTC': 'sum',
    'volume USD': 'sum'
}

# Stack the per-file frames, order them by time, then aggregate per sample period
merged_df = (
    pd.concat(CSV_df)
    .sort_values(by="time")
    .resample(sample_period)
    .agg(ohlcv_rules)
)


# Export #

In [9]:
# Write the resampled table to disk; the file name is defined in the settings
merged_df.to_csv(path_or_buf=csv_filename)


# Check missing data #

In [14]:
# Re-read the exported file so the check covers what was actually written
df_test = pd.read_csv(csv_filename)

# Generate the expected timestamps at the same frequency that was used for
# resampling (sample_period), covering start_date through the end of end_date.
range_end = pd.Timestamp(end_date) + pd.Timedelta(days=1)
expected_timestamps = pd.date_range(start=start_date, end=range_end, freq=sample_period)
# Keep the range half-open so the day after end_date is not expected
expected_timestamps = expected_timestamps[expected_timestamps < range_end]

actual_timestamps = pd.to_datetime(df_test['time'])
missing_timestamps = expected_timestamps[~expected_timestamps.isin(actual_timestamps)]

if missing_timestamps.empty:
    print("The dataset has no missing timestamps.")
else:
    print(f"The dataset has {len(missing_timestamps)} missing timestamps:")
    print(missing_timestamps)


The dataset has no missing timestamps.


In [11]:
# Check specific rows
# df_test[df_test["time"] >= "2017-09-06 16:00:00"]


# Remove temp files #

In [12]:
# Remove the temporary download folder, but only when the exported CSV exists
# and the earlier CSV/ZIP count check matched.
if len(csv_files) == len(zip_files) and Path(csv_filename).exists():
    # Delete the configured folder instead of a hard-coded "BTCUSDT_DL/" so the
    # cleanup follows the symbol chosen in the settings; pure Python also keeps
    # the cell independent of a Unix shell.
    if destination_folder.exists():
        shutil.rmtree(destination_folder)
    print("Temp files removed")
else:
    print("Error: Temp files not removed!")


Temp files removed


In [13]:
# Show the contents of the CSV file that was created
df_test


Unnamed: 0,time,open,high,low,close,volume BTC,volume USD
0,2019-01-01,3701.23,3810.16,3642.0,3797.14,23741.687033,88149250.0
1,2019-01-02,3796.45,3882.14,3750.45,3858.56,35156.463369,133876600.0
2,2019-01-03,3857.57,3862.74,3730.0,3766.78,29406.948359,111657400.0
3,2019-01-04,3767.2,3823.64,3703.57,3792.01,29519.554671,111034600.0
4,2019-01-05,3790.09,3840.99,3751.0,3770.96,30490.667751,115893500.0
5,2019-01-06,3771.12,4027.71,3740.0,3987.6,36553.806709,142198800.0
6,2019-01-07,3987.62,4017.9,3921.53,3975.45,31869.846264,126830400.0
7,2019-01-08,3976.76,4069.8,3903.0,3955.13,38901.423122,154778800.0
8,2019-01-09,3955.45,4006.81,3930.04,3966.65,28989.439511,115219000.0
9,2019-01-10,3966.06,3996.01,3540.0,3585.88,59402.22851,221789300.0
