# 保存币安合约历史数据

本文档用于下载和保存币安合约的历史K线数据。

In [1]:
import ccxt
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import time
import os
import json
from tqdm import tqdm

In [2]:
# 设置数据获取参数
params = {
    # 数据获取配置
    'begin_date': '2025-03-01',
    'end_date': '2025-03-24',
    'time_intervals': ['1m'],  # 可选: ['1m', '5m', '15m', '1h', '4h', '1d']
    'use_all_usdt_pairs': False,  # 设置为True则获取所有USDT交易对
    'specific_symbols': ['1000PEPE/USDT:USDT'],  # 当use_all_usdt_pairs为False时使用
    'base_path': r'\\znas\Main\futures',  # 指定数据保存的根目录
    
    # 代理配置
    'proxy': {
        'host': '127.0.0.1',
        'port': 5878
    },
    
    # 交易所基础配置
    'exchange_config': {
        'timeout': 9474,
        'enableRateLimit': False,
        'options': {
            'defaultType': 'future'  # 设置为合约模式
        }
    }
}

# 确保根目录存在
os.makedirs(params['base_path'], exist_ok=True)

# 生成日期列表
start_date = datetime.strptime(params['begin_date'], '%Y-%m-%d')
end_date = datetime.strptime(params['end_date'], '%Y-%m-%d')
date_list = pd.date_range(start=start_date, end=end_date, freq='D').strftime('%Y-%m-%d').tolist()

In [3]:
def scan_existing_files(base_path):
    """预扫描已存在的文件"""
    existing_files = set()
    for root, _, files in os.walk(base_path):
        for file in files:
            if file.endswith('.csv'):
                existing_files.add(file)
    print(f"已扫描到 {len(existing_files)} 个现有文件")
    return existing_files

def analyze_download_status(target_symbols, existing_files, date_list, time_intervals):
    """分析每个交易对的下载情况"""
    stats = {}
    for symbol in target_symbols:
        stats[symbol] = {
            'total_expected': len(date_list) * len(time_intervals),
            'downloaded': 0,
            'missing_dates': []
        }
        
        for start_time in date_list:
            for time_interval in time_intervals:
                date_str = str(pd.to_datetime(start_time).date())
                file_name = f"{date_str}_{symbol.replace('/', '_').replace(':', '_')}_{time_interval}.csv"
                if file_name in existing_files:
                    stats[symbol]['downloaded'] += 1
                else:
                    stats[symbol]['missing_dates'].append(f"{date_str}_{time_interval}")

    print("\n下载统计信息:")
    incomplete_symbols = []
    for symbol, data in stats.items():
        completion_rate = (data['downloaded'] / data['total_expected']) * 100
        print(f"{symbol}: 完成率 {completion_rate:.2f}% ({data['downloaded']}/{data['total_expected']})")
        if data['downloaded'] < data['total_expected']:
            incomplete_symbols.append(symbol)
            if len(data['missing_dates']) <= 10:
                print(f"  缺失数据: {data['missing_dates'][:10]}")
            else:
                print(f"  缺失数据过多，共{len(data['missing_dates'])}个日期")

    print(f"\n未完全下载的交易对数量: {len(incomplete_symbols)}/{len(target_symbols)}")
    return incomplete_symbols

In [4]:
def init_exchange():
    """初始化交易所接口"""
    config = {
        **params['exchange_config'],
        'proxies': {
            'http': f"http://{params['proxy']['host']}:{params['proxy']['port']}",
            'https': f"http://{params['proxy']['host']}:{params['proxy']['port']}"
        }
    }
    return ccxt.binance(config)

exchange = init_exchange()

In [5]:
def get_available_symbols():
    """获取可用的交易对列表"""
    if params['use_all_usdt_pairs']:
        markets = exchange.load_markets()
        return [symbol for symbol in markets.keys() if ':USDT' in symbol]
    else:
        return params['specific_symbols']

def fetch_and_save_data(symbol, timeframe, start_time):
    """获取并保存单个交易对的数据"""
    try:
        # 构建文件名和路径
        date_str = str(pd.to_datetime(start_time).date())
        date_path = os.path.join(params['base_path'], date_str)
        os.makedirs(date_path, exist_ok=True)
        
        file_name = f"{date_str}_{symbol.replace('/', '').replace(':', '_')}_{timeframe}.csv"
        save_path = os.path.join(date_path, file_name)
        
        # 如果文件已存在，跳过
        if os.path.exists(save_path):
            # print(f"文件 {file_name} 已存在，跳过下载")
            return True, None
        
        # 获取数据
        print(f'正在获取 {exchange.id} {symbol} {timeframe} {start_time} 的数据')
        since = int(pd.Timestamp(f'{start_time} 00:00:00').timestamp() * 1000)
        end = int(pd.Timestamp(f'{start_time} 23:59:59').timestamp() * 1000)
        
        all_data = []
        current_since = since
        
        while current_since < end:
            data = exchange.fetch_ohlcv(
                symbol=symbol,
                timeframe=timeframe,
                since=current_since,
                limit=1000
            )
            
            if not data:
                break
                
            all_data.extend(data)
            current_since = data[-1][0] + 1
            time.sleep(0.9)
        
        if not all_data:
            print(f"{symbol} 在 {start_time} 无数据")
            return False, None
            
        # 转换为DataFrame并保存
        df = pd.DataFrame(
            all_data,
            columns=['datetime', 'open', 'high', 'low', 'close', 'volume']
        )
        df['datetime'] = pd.to_datetime(df['datetime'], unit='ms')
        
        # 筛选当天数据
        target_date = pd.to_datetime(start_time).date()
        df = df[df['datetime'].dt.date == target_date]
        
        # 数据清理
        df = df.drop_duplicates(subset=['datetime'], keep='last')\
               .sort_values('datetime')\
               .reset_index(drop=True)
        
        if not df.empty:
            df.to_csv(save_path, index=False)
            # print(f'成功下载并保存 {symbol} 在 {start_time} 的数据，数据形状: {df.shape}')
            return True, df
        else:
            print(f"{symbol} 在 {start_time} 筛选后无数据")
            return False, None
        
    except Exception as e:
        print(f'获取数据失败: {symbol}_{timeframe}_{start_time}, 错误: {e}')
        return False, None

In [6]:
# 获取现有文件列表
existing_files = scan_existing_files(params['base_path'])

# 获取要处理的交易对
target_symbols = get_available_symbols()
print(f"将处理以下交易对: {target_symbols}")

# 主循环
error_list = []
empty_data_count = {}  # 用于记录每个交易对的连续空数据天数

# 反转日期列表，从最新日期开始获取
date_list.reverse()

for symbol in target_symbols:
    empty_data_count[symbol] = 0
    
    for start_time in date_list:
        if empty_data_count[symbol] >= 3:
            print(f'{symbol} 连续 {empty_data_count[symbol]} 天无数据，跳转到下一个交易对')
            break
            
        for time_interval in params['time_intervals']:
            success, df = fetch_and_save_data(symbol, time_interval, start_time)
            
            if not success:
                error_msg = f'{exchange.id}_{symbol}_{time_interval}_{start_time}'
                error_list.append(error_msg)
                empty_data_count[symbol] += 1
            else:
                empty_data_count[symbol] = 0
                
        if empty_data_count[symbol] >= 3:
            break

# 分析下载情况
print("\n开始分析下载情况...")
existing_files = scan_existing_files(params['base_path'])  # 重新扫描
incomplete_symbols = analyze_download_status(
    target_symbols,
    existing_files,
    date_list,
    params['time_intervals']
)

# 询问是否重新下载未完成的交易对
if incomplete_symbols:
    print("\n是否要重新下载未完成的交易对？(y/n)")
    if input().lower() == 'y':
        target_symbols = incomplete_symbols
        # 重新运行主循环
        print("\n开始重新下载未完成的交易对...")
        for symbol in target_symbols:
            empty_data_count[symbol] = 0
            
            for start_time in date_list:
                if empty_data_count[symbol] >= 3:
                    print(f'{symbol} 连续 {empty_data_count[symbol]} 天无数据，跳转到下一个交易对')
                    break
                    
                for time_interval in params['time_intervals']:
                    success, df = fetch_and_save_data(symbol, time_interval, start_time)
                    
                    if not success:
                        error_msg = f'{exchange.id}_{symbol}_{time_interval}_{start_time}'
                        error_list.append(error_msg)
                        empty_data_count[symbol] += 1
                    else:
                        empty_data_count[symbol] = 0
                        
                if empty_data_count[symbol] >= 3:
                    break        

已扫描到 203197 个现有文件
将处理以下交易对: ['1000PEPE/USDT:USDT']
正在获取 binance 1000PEPE/USDT:USDT 1m 2025-03-24 的数据
正在获取 binance 1000PEPE/USDT:USDT 1m 2025-03-23 的数据
正在获取 binance 1000PEPE/USDT:USDT 1m 2025-03-22 的数据
正在获取 binance 1000PEPE/USDT:USDT 1m 2025-03-21 的数据
正在获取 binance 1000PEPE/USDT:USDT 1m 2025-03-20 的数据
正在获取 binance 1000PEPE/USDT:USDT 1m 2025-03-19 的数据

开始分析下载情况...
已扫描到 203203 个现有文件

下载统计信息:
1000PEPE/USDT:USDT: 完成率 0.00% (0/24)
  缺失数据过多，共24个日期

未完全下载的交易对数量: 1/1

是否要重新下载未完成的交易对？(y/n)

开始重新下载未完成的交易对...


In [7]:
import os
import pandas as pd
from datetime import datetime
import glob
import csv

def create_data_availability_matrix(root_path):
    # 获取所有日期文件夹
    date_folders = glob.glob(os.path.join(root_path, '*'))
    
    # 收集所有交易对和日期
    all_pairs = set()
    all_dates = set()
    data_dict = {}
    
    for folder in date_folders:
        date = os.path.basename(folder)
        try:
            datetime.strptime(date, '%Y-%m-%d')  # 验证是否为日期格式
        except ValueError:
            continue
            
        all_dates.add(date)
        csv_files = glob.glob(os.path.join(folder, f'{date}_*_USDT_1m.csv'))
        
        for file in csv_files:
            # 从文件名中提取交易对名称
            filename = os.path.basename(file)
            pair = filename.split('_')[1]
            all_pairs.add(pair)
            
            # 文件存在即标记为1
            if date not in data_dict:
                data_dict[date] = {}
            data_dict[date][pair] = 1
    
    # 创建DataFrame
    all_dates = sorted(list(all_dates))
    all_pairs = sorted(list(all_pairs))
    
    df = pd.DataFrame(index=all_pairs, columns=all_dates)
    
    # 填充数据
    for date in all_dates:
        for pair in all_pairs:
            df.loc[pair, date] = data_dict.get(date, {}).get(pair, 0)
    
    # 保存结果
    output_file = os.path.join(root_path, 'data_availability_matrix.csv')
    df.to_csv(output_file)
    
    print(f"数据可用性矩阵已保存到: {output_file}")
    
    # 打印统计信息
    total_pairs = len(all_pairs)
    total_dates = len(all_dates)
    complete_data_points = (df == 1).sum().sum()
    
    print(f"\n统计信息:")
    print(f"总交易对数量: {total_pairs}")
    print(f"总日期数量: {total_dates}")
    print(f"有数据的点数量: {complete_data_points}")
    print(f"数据覆盖率: {(complete_data_points/(total_pairs*total_dates)*100):.2f}%")
    
    return df

# 使用示例
root_path = r"\\znas\Main\futures"
matrix = create_data_availability_matrix(root_path)

数据可用性矩阵已保存到: \\znas\Main\futures\data_availability_matrix.csv

统计信息:
总交易对数量: 404
总日期数量: 814
有数据的点数量: 202691
数据覆盖率: 61.64%


In [8]:
import os
import pandas as pd
from datetime import datetime
import glob
import csv
import numpy as np

def analyze_data_gaps(df, max_continuous_gap=10, ignore_tail_gaps=True):
    """
    分析数据缺口
    df: 数据可用性矩阵
    max_continuous_gap: 最大连续空值天数，超过这个天数的空值认为是历史数据真实不存在
    ignore_tail_gaps: 是否忽略尾部的连续空值（历史数据通常从某个时间点开始）
    """
    problematic_pairs = {}
    
    for pair in df.index:
        row_data = df.loc[pair].values
        gaps = []
        gap_start = None
        continuous_zeros = 0
        
        for i, value in enumerate(row_data):
            if value == 0:
                continuous_zeros += 1
                if gap_start is None:
                    gap_start = df.columns[i]
            else:
                if gap_start is not None:
                    # 如果连续空值天数小于阈值，记录这个缺口
                    if continuous_zeros < max_continuous_gap:
                        gaps.append({
                            'start_date': gap_start,
                            'end_date': df.columns[i-1],
                            'days': continuous_zeros
                        })
                gap_start = None
                continuous_zeros = 0
        
        # 处理最后的空值序列
        if gap_start is not None and not ignore_tail_gaps:
            if continuous_zeros < max_continuous_gap:
                gaps.append({
                    'start_date': gap_start,
                    'end_date': df.columns[-1],
                    'days': continuous_zeros
                })
        
        # 如果存在需要处理的缺口，添加到问题交易对列表
        if gaps:
            problematic_pairs[pair] = gaps
    
    return problematic_pairs

def create_data_availability_matrix(root_path):
    # 获取所有日期文件夹
    date_folders = glob.glob(os.path.join(root_path, '*'))
    
    # 收集所有交易对和日期
    all_pairs = set()
    all_dates = set()
    data_dict = {}
    
    for folder in date_folders:
        date = os.path.basename(folder)
        try:
            datetime.strptime(date, '%Y-%m-%d')  # 验证是否为日期格式
        except ValueError:
            continue
            
        all_dates.add(date)
        csv_files = glob.glob(os.path.join(folder, f'{date}_*_USDT_1m.csv'))
        
        for file in csv_files:
            # 从文件名中提取交易对名称
            filename = os.path.basename(file)
            pair = filename.split('_')[1]
            all_pairs.add(pair)
            
            # 文件存在即标记为1
            if date not in data_dict:
                data_dict[date] = {}
            data_dict[date][pair] = 1
    
    # 创建DataFrame
    all_dates = sorted(list(all_dates))
    all_pairs = sorted(list(all_pairs))
    
    df = pd.DataFrame(index=all_pairs, columns=all_dates)
    
    # 填充数据
    for date in all_dates:
        for pair in all_pairs:
            df.loc[pair, date] = data_dict.get(date, {}).get(pair, 0)
    
    # 保存结果
    output_file = os.path.join(root_path, 'data_availability_matrix.csv')
    df.to_csv(output_file)
    
    print(f"数据可用性矩阵已保存到: {output_file}")
    
    # 分析数据缺口
    problematic_pairs = analyze_data_gaps(df)
    
    # 打印统计信息
    total_pairs = len(all_pairs)
    total_dates = len(all_dates)
    complete_data_points = (df == 1).sum().sum()
    
    print(f"\n基本统计信息:")
    print(f"总交易对数量: {total_pairs}")
    print(f"总日期数量: {total_dates}")
    print(f"有数据的点数量: {complete_data_points}")
    print(f"数据覆盖率: {(complete_data_points/(total_pairs*total_dates)*100):.2f}%")
    
    print(f"\n数据缺失分析:")
    print(f"发现 {len(problematic_pairs)} 个交易对存在不连续的数据缺失")
    
    # 保存需要重新获取数据的清单
    redownload_list = []
    for pair, gaps in problematic_pairs.items():
        print(f"\n交易对 {pair} 的数据缺口:")
        for gap in gaps:
            print(f"  从 {gap['start_date']} 到 {gap['end_date']} (共 {gap['days']} 天)")
            # 将每个缺失的日期都添加到重新下载列表
            current_date = datetime.strptime(gap['start_date'], '%Y-%m-%d')
            end_date = datetime.strptime(gap['end_date'], '%Y-%m-%d')
            while current_date <= end_date:
                redownload_list.append({
                    'symbol': pair,
                    'date': current_date.strftime('%Y-%m-%d')
                })
                current_date = current_date + pd.Timedelta(days=1)
    
    # 保存需要重新下载的数据清单
    redownload_df = pd.DataFrame(redownload_list)
    redownload_file = os.path.join(root_path, 'redownload_list.csv')
    redownload_df.to_csv(redownload_file, index=False)
    print(f"\n需要重新下载的数据清单已保存到: {redownload_file}")
    
    return df, problematic_pairs

# 使用示例
root_path = r"\\znas\Main\futures"
matrix, problems = create_data_availability_matrix(root_path)

数据可用性矩阵已保存到: \\znas\Main\futures\data_availability_matrix.csv

基本统计信息:
总交易对数量: 404
总日期数量: 814
有数据的点数量: 202691
数据覆盖率: 61.64%

数据缺失分析:
发现 0 个交易对存在不连续的数据缺失

需要重新下载的数据清单已保存到: \\znas\Main\futures\redownload_list.csv


In [9]:
def redownload_missing_data(params):
    """
    根据redownload_list.csv补充下载缺失数据
    复用原有的fetch_and_save_data函数
    """
    # 初始化交易所
    exchange = init_exchange()
    
    # 读取需要重新下载的数据清单
    redownload_file = os.path.join(params['base_path'], 'redownload_list.csv')
    if not os.path.exists(redownload_file):
        print("未找到重新下载清单文件！")
        return
    
    redownload_df = pd.read_csv(redownload_file)
    
    # 按交易对分组
    grouped_downloads = redownload_df.groupby('symbol')
    
    print(f"开始补充下载缺失数据...")
    print(f"共有 {len(grouped_downloads)} 个交易对需要补充数据")
    
    # 用于记录错误
    error_list = []
    empty_data_count = {}
    
    # 获取可用的交易对列表
    available_markets = exchange.load_markets()
    
    for symbol, group in grouped_downloads:
        print(f"\n处理交易对: {symbol}")
        empty_data_count[symbol] = 0
        dates = sorted(group['date'].unique())
        
        # 构建交易所格式的symbol
        base_symbol = symbol.replace('USDT', '')  # 移除USDT后缀
        exchange_symbol = f"{base_symbol}/USDT:USDT"
        
        # 检查交易对是否可用
        if exchange_symbol not in available_markets:
            print(f"交易对 {exchange_symbol} 在交易所中不可用，跳过")
            continue
        
        for start_time in dates:
            if empty_data_count[symbol] >= 3:
                print(f'{symbol} 连续 {empty_data_count[symbol]} 天无数据，跳转到下一个交易对')
                break
                
            print(f"  下载 {start_time} 的数据...")
            
            # 删除可能存在的不完整文件
            date_str = str(pd.to_datetime(start_time).date())
            date_path = os.path.join(params['base_path'], date_str)
            os.makedirs(date_path, exist_ok=True)
            
            for time_interval in params['time_intervals']:
                success, df = fetch_and_save_data(exchange_symbol, time_interval, start_time)
                
                if not success:
                    error_msg = f'{exchange.id}_{symbol}_{time_interval}_{start_time}'
                    error_list.append(error_msg)
                    empty_data_count[symbol] += 1
                    print(f"    {time_interval} 数据下载失败")
                else:
                    empty_data_count[symbol] = 0
                    print(f"    {time_interval} 数据下载成功")
            
            if empty_data_count[symbol] >= 3:
                break
    
    # 保存下载失败的记录
    if error_list:
        error_file = os.path.join(params['base_path'], 'redownload_errors.txt')
        with open(error_file, 'w') as f:
            f.write('\n'.join(error_list))
        print(f"\n仍有部分数据下载失败，详细信息已保存到: {error_file}")
    else:
        print("\n所有缺失数据补充完成！")
    
    # 重新生成可用性矩阵
    print("\n重新生成数据可用性矩阵...")
    matrix, problems = create_data_availability_matrix(params['base_path'])
    
    # 检查是否还有问题数据
    if problems:
        print(f"\n警告：仍有 {len(problems)} 个交易对存在数据缺失")
        print("建议检查 data_availability_matrix.csv 查看具体情况")
    else:
        print("\n所有数据已完整补充！")

# 运行补充下载
redownload_missing_data(params)

EmptyDataError: No columns to parse from file