In [4]:
# 1. Import
import os
import time
from datetime import datetime, timezone

import pandas as pd
import requests
from pybit.unified_trading import HTTP

# 2. Common Config
SYMBOL = 'HYPEUSDT'
INTERVAL_BYBIT = '1' # Bybit takes the interval as a number (passed here as a numeric string)
INTERVAL_BINANCE = '1m'  # Binance takes the interval as a string such as '1m'
# Range boundaries in 'YYYY/MM/DD/HH/MM' form (parsed with '%Y/%m/%d/%H/%M').
START_DATE = '2025/09/01/00/00'
END_DATE = '2025/09/26/00/00'

# 3. Helper Functions
def to_timestamp(date_str):
    """Convert a 'YYYY/MM/DD/HH/MM' string to epoch milliseconds, read as UTC.

    The string is interpreted as UTC so the result does not depend on the
    machine's local time zone and agrees with ``to_datetime`` (which renders
    epoch milliseconds as UTC).

    Args:
        date_str: Date/time in the form ``'%Y/%m/%d/%H/%M'``.

    Returns:
        Integer number of milliseconds since the Unix epoch.

    Raises:
        ValueError: If ``date_str`` does not match the expected format.
    """
    dt = datetime.strptime(date_str, '%Y/%m/%d/%H/%M')
    # A naive datetime's .timestamp() would be interpreted in local time;
    # attach UTC explicitly before converting.
    return int(dt.replace(tzinfo=timezone.utc).timestamp() * 1000)

def to_datetime(ms):
    """Convert epoch milliseconds to a naive ``datetime`` expressed in UTC.

    Args:
        ms: Milliseconds since the Unix epoch.

    Returns:
        A timezone-naive ``datetime`` holding the UTC wall-clock time.
    """
    # datetime.utcfromtimestamp() is deprecated since Python 3.12; build an
    # aware UTC datetime and strip tzinfo to keep the naive return value.
    return datetime.fromtimestamp(ms / 1000, tz=timezone.utc).replace(tzinfo=None)

# 4. Bybit Kline Fetch
def fetch_bybit_kline(symbol, interval, start_ts, end_ts):
    """Fetch Bybit 'linear' category klines between two timestamps.

    Pages backwards from ``end_ts``: each request asks for up to 1000 candles
    ending at the current cursor, then the cursor moves to just before the
    oldest candle received. Paging stops when the cursor passes ``start_ts``,
    when the API returns no rows, or when a request fails (in which case the
    pages collected so far are still returned).

    Args:
        symbol: Instrument symbol, e.g. ``'HYPEUSDT'``.
        interval: Bybit interval value, e.g. ``'1'``.
        start_ts: Range start in epoch milliseconds.
        end_ts: Range end in epoch milliseconds.

    Returns:
        A DataFrame sorted oldest-to-newest with columns
        ``timestamp, O, H, L, C, V, T`` (``timestamp`` converted to pandas
        datetimes; the other columns are left as returned by the API), or an
        empty DataFrame when nothing was fetched.
    """
    session = HTTP(testnet=False)
    all_dfs = []
    current_end = end_ts
    step = 1

    # Stop once the cursor has moved before the requested start; this avoids
    # sending a final request whose end precedes its start.
    while current_end >= start_ts:
        try:
            response = session.get_kline(
                category='linear',
                symbol=symbol,
                interval=interval,
                start=start_ts,
                end=current_end,
                limit=1000
            )
            df = pd.DataFrame(response['result']['list'])
        except Exception as e:
            # Best-effort: report the failure and keep what was collected.
            print(f"[Error] Bybit fetch failed: {e}")
            break

        if df.empty:
            print("[Info] No more Bybit data.")
            break

        all_dfs.append(df)
        # Each page is ordered newest-first, so its last row is the oldest.
        oldest = int(df.iloc[-1, 0])
        current_end = oldest - 1
        print(f"[Bybit Step {step}] Oldest: {to_datetime(oldest)} Rows: {len(df)}")
        time.sleep(0.1)
        step += 1

    if not all_dfs:
        return pd.DataFrame()

    master_df = pd.concat(all_dfs, ignore_index=True)
    # Reverse newest->oldest into oldest->newest and give the result a clean
    # ascending index.
    master_df = master_df.iloc[::-1].reset_index(drop=True)
    master_df.columns = ["timestamp", "O", "H", "L", "C", "V", "T"]
    master_df["timestamp"] = master_df["timestamp"].astype('int64')
    master_df["timestamp"] = pd.to_datetime(master_df["timestamp"], unit='ms')
    return master_df

# 5. Binance Kline Fetch
def fetch_binance_kline(symbol, interval, start_ts, end_ts, timeout=10):
    """Fetch Binance futures klines and derive taker buy/sell volumes.

    Pages forward from ``start_ts`` in windows of 1000 intervals. After each
    page the cursor moves to one millisecond past the last candle's close
    time. Paging stops at ``end_ts``, on an empty or non-list response, or
    after 500 requests.

    Args:
        symbol: Instrument symbol, e.g. ``'HYPEUSDT'``.
        interval: One of ``'1m'``, ``'5m'``, ``'15m'``, ``'30m'``, ``'1h'``,
            ``'4h'``.
        start_ts: Range start in epoch milliseconds.
        end_ts: Range end in epoch milliseconds.
        timeout: Seconds to wait for each HTTP request before giving up, so
            a stalled connection cannot hang the script indefinitely.

    Returns:
        A DataFrame with columns ``timestamp`` (pandas datetimes),
        ``number_of_trades``, ``taker_buy_base_asset_volume`` and
        ``taker_sell_base_asset_volume`` (total volume minus taker buy
        volume), or an empty DataFrame when nothing was fetched.

    Raises:
        ValueError: If ``interval`` is not one of the supported values.
    """
    endpoint = 'https://fapi.binance.com/fapi/v1/klines'
    if interval in ['1m', '5m', '15m', '30m']:
        interval_minutes = int(interval.replace('m', ''))
    elif interval in ['1h', '4h']:
        interval_minutes = int(interval.replace('h', '')) * 60
    else:
        raise ValueError(f"Unsupported interval: {interval}")
    interval_milliseconds = interval_minutes * 60 * 1000

    all_data = []
    current_start = start_ts
    step = 1
    MAX_STEPS = 500

    while current_start < end_ts:
        # Request at most 1000 intervals per call, clipped to the range end.
        current_end = min(current_start + interval_milliseconds * 1000, end_ts)

        params = {
            'symbol': symbol,
            'interval': interval,
            'limit': 1000,
            'startTime': current_start,
            'endTime': current_end
        }
        response = requests.get(endpoint, params=params, timeout=timeout)
        data = response.json()

        # Anything other than a non-empty list is treated as an error or as
        # the end of available data.
        if not isinstance(data, list) or not data:
            print(f"[Binance Step {step}] No data or error response.")
            break

        all_data.extend(data)
        oldest_timestamp = data[0][0]
        print(f"[Binance Step {step}] Oldest: {to_datetime(oldest_timestamp)} Rows: {len(data)}")

        # Field 6 is the candle close time; resume right after it.
        last_close_time = data[-1][6]
        current_start = last_close_time + 1
        step += 1
        time.sleep(0.1)

        if step > MAX_STEPS:
            print("Reached max Binance steps. Breaking.")
            break

    if not all_data:
        return pd.DataFrame()

    columns = [
        'timestamp', 'open', 'high', 'low', 'close', 'volume',
        'close_time', 'quote_asset_volume', 'number_of_trades',
        'taker_buy_base_asset_volume', 'taker_buy_quote_asset_volume', 'ignore'
    ]
    df = pd.DataFrame(all_data, columns=columns)

    # Keep only the columns needed for the output.
    df_selected = df[['timestamp', 'number_of_trades', 'taker_buy_base_asset_volume', 'volume']].copy()
    for col in ['taker_buy_base_asset_volume', 'volume']:
        df_selected[col] = df_selected[col].astype(float)

    df_selected['taker_sell_base_asset_volume'] = df_selected['volume'] - df_selected['taker_buy_base_asset_volume']
    df_selected = df_selected.drop(columns=['volume'])
    df_selected["timestamp"] = pd.to_datetime(df_selected["timestamp"], unit='ms')
    return df_selected

# 6. Main Function
def main():
    """Fetch Bybit and Binance klines, merge them, validate, and save a CSV.

    Steps: download both datasets for the configured symbol and date range,
    inner-join them on ``timestamp``, verify there are no missing values and
    that consecutive timestamps are exactly one interval apart, then write
    the merged frame under ``01.data/``.

    Returns early (after printing an error) when either source, or the
    merged result, is empty.

    Raises:
        ValueError: If the merged data contains missing values or unevenly
            spaced timestamps.
    """
    start_ts = to_timestamp(START_DATE)
    end_ts = to_timestamp(END_DATE)

    # Fetch Bybit data
    print(f"\nFetching Bybit data for {SYMBOL} ({INTERVAL_BYBIT}min)...")
    bybit_df = fetch_bybit_kline(SYMBOL, INTERVAL_BYBIT, start_ts, end_ts)
    if bybit_df.empty:
        print("[Error] No Bybit data.")
        return

    # Fetch Binance data
    print(f"\nFetching Binance data for {SYMBOL} ({INTERVAL_BINANCE})...")
    binance_df = fetch_binance_kline(SYMBOL, INTERVAL_BINANCE, start_ts, end_ts)
    if binance_df.empty:
        print("[Error] No Binance data.")
        return

    # --- Merge ---
    print("\nMerging Bybit and Binance data...")
    merged_df = pd.merge(bybit_df, binance_df, on='timestamp', how='inner')

    # An empty inner join would pass both checks below and produce an empty
    # CSV, so stop here instead.
    if merged_df.empty:
        print("[Error] No overlapping timestamps between Bybit and Binance data.")
        return

    # --- Checks ---
    print("\n【Mergedデータチェック】")

    # 1. Missing-value check
    missing = merged_df.isnull().sum()
    print("\n【欠損値チェック】")
    print(missing)

    if missing.any():
        print("❗ 欠損値があります。処理を中断します。")
        raise ValueError("欠損値検出")
    else:
        print("✅ 欠損値はありません。")

    # 2. Timestamp spacing check
    merged_df = merged_df.sort_values('timestamp').reset_index(drop=True)
    diff = merged_df['timestamp'].diff().dropna()
    expected_interval = pd.Timedelta(minutes=int(INTERVAL_BYBIT))
    interval_check = (diff == expected_interval)

    print("\n【タイムスタンプ間隔チェック】")
    print(f"ずれている箇所数: {(~interval_check).sum()}箇所")

    if (~interval_check).any():
        mismatch_idx = interval_check[~interval_check].index
        print("❗ タイムスタンプに等間隔でない箇所があります。例:")
        for idx in mismatch_idx[:5]:  # show only the first five examples
            # idx >= 1 because the first (NaT) diff was dropped above.
            prev_ts = merged_df.loc[idx - 1, 'timestamp']
            curr_ts = merged_df.loc[idx, 'timestamp']
            print(f"ズレ: {prev_ts} → {curr_ts} （差分: {curr_ts - prev_ts}）")
        raise ValueError("タイムスタンプ間隔異常")
    else:
        print("✅ タイムスタンプも等間隔できれいに並んでいます。")

    # --- Save ---
    start_str = datetime.strptime(START_DATE, "%Y/%m/%d/%H/%M").strftime("%Y%m%d")
    end_str = datetime.strptime(END_DATE, "%Y/%m/%d/%H/%M").strftime("%Y%m%d")
    file_name = f"01.data/Market_Bybit+Binance_{SYMBOL}_{INTERVAL_BYBIT}min_{start_str}-{end_str}.csv"
    # Create the output directory if needed so to_csv does not fail on a
    # fresh checkout.
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    merged_df.to_csv(file_name, index=False)
    print(f"[Success] Merged data saved to {file_name}")

# 7. Execute
# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()



Fetching Bybit data for HYPEUSDT (1min)...
[Bybit Step 1] Oldest: 2025-09-25 03:21:00 Rows: 1000
[Bybit Step 2] Oldest: 2025-09-24 10:41:00 Rows: 1000
[Bybit Step 3] Oldest: 2025-09-23 18:01:00 Rows: 1000
[Bybit Step 4] Oldest: 2025-09-23 01:21:00 Rows: 1000
[Bybit Step 5] Oldest: 2025-09-22 08:41:00 Rows: 1000
[Bybit Step 6] Oldest: 2025-09-21 16:01:00 Rows: 1000
[Bybit Step 7] Oldest: 2025-09-20 23:21:00 Rows: 1000
[Bybit Step 8] Oldest: 2025-09-20 06:41:00 Rows: 1000
[Bybit Step 9] Oldest: 2025-09-19 14:01:00 Rows: 1000
[Bybit Step 10] Oldest: 2025-09-18 21:21:00 Rows: 1000
[Bybit Step 11] Oldest: 2025-09-18 04:41:00 Rows: 1000
[Bybit Step 12] Oldest: 2025-09-17 12:01:00 Rows: 1000
[Bybit Step 13] Oldest: 2025-09-16 19:21:00 Rows: 1000
[Bybit Step 14] Oldest: 2025-09-16 02:41:00 Rows: 1000
[Bybit Step 15] Oldest: 2025-09-15 10:01:00 Rows: 1000
[Bybit Step 16] Oldest: 2025-09-14 17:21:00 Rows: 1000
[Bybit Step 17] Oldest: 2025-09-14 00:41:00 Rows: 1000
[Bybit Step 18] Oldest: 2025-0