In [1]:
import ccxt
import pandas as pd
import time
from pathlib import Path

### 이건 간단하게 CCXT를 이용해서 데이터(1분봉)를 가져오는 코드 예시.

In [None]:
EXCHANGE = ccxt.upbit({"enableRateLimit" : True})
SYMBOL = "BTC/KRW" # 업비트 심볼 표기 방법. 이더리움(ETH/KRW), 리플(XRP/KRW), 솔라나(SOL/KRW), 바이낸스코인(BNB/KRW)로 바꾸면댐.
TIMEFRAME = "1m" # 1분단위로 가져올 것.
LIMIT = 200 # 업비트 OHLCV 호출 최대치 근처

In [3]:
EXCHANGE.load_markets()
ohlcv = EXCHANGE.fetch_ohlcv(SYMBOL, timeframe = TIMEFRAME, limit = LIMIT)
# CCXT 표준 컬럼이 [timestamp, open, high, low, close, volume]임.
df = pd.DataFrame(ohlcv, columns = ["ts", "open", "high", "low", "close", "volume"])
df["time"] = pd.to_datetime(df["ts"], unit = "ms", utc = True).dt.tz_convert("Asia/Seoul") # UTC 기준 타임스탬프 -> 한국 표준 시간으로 변경

In [4]:
df

Unnamed: 0,ts,open,high,low,close,volume,time
0,1761788580000,165105000.0,165172000.0,165099000.0,165172000.0,0.122555,2025-10-30 10:43:00+09:00
1,1761788640000,165172000.0,165230000.0,165106000.0,165224000.0,0.899353,2025-10-30 10:44:00+09:00
2,1761788700000,165173000.0,165321000.0,165172000.0,165236000.0,0.852458,2025-10-30 10:45:00+09:00
3,1761788760000,165236000.0,165325000.0,165212000.0,165236000.0,0.342143,2025-10-30 10:46:00+09:00
4,1761788820000,165304000.0,165326000.0,165212000.0,165325000.0,0.453248,2025-10-30 10:47:00+09:00
...,...,...,...,...,...,...,...
195,1761800280000,163615000.0,163789000.0,163375000.0,163375000.0,5.216646,2025-10-30 13:58:00+09:00
196,1761800340000,163375000.0,163472000.0,163144000.0,163145000.0,4.598579,2025-10-30 13:59:00+09:00
197,1761800400000,163147000.0,163147000.0,163010000.0,163100000.0,5.546417,2025-10-30 14:00:00+09:00
198,1761800460000,163100000.0,163164000.0,163000000.0,163000000.0,6.831265,2025-10-30 14:01:00+09:00


### 이제 데이터를 쌓고 저장해보자. (비트코인)

In [5]:
SYMBOL = "BTC/KRW"                # 비트코인 데이터
TIMEFRAME = "1m"                  # 1분봉
CHUNK_LIMIT = 200                 # Upbit 한 번에 가져올 최대 근처
SAVE_PATH = "data/ohlcv_full.parquet"
START_ISO = "2020-01-01T00:00:00Z" # 시작일(원하면 바꾸세요)
END_BUFFER_SEC = 60               # 최신 캔들 확정 여유(초)
MAX_EMPTY_CHUNKS = 10             # 연속 빈 응답 허용 횟수(초과 시 종료)

In [6]:
# timeframe → 밀리초
TIMEFRAME_MS = {
    "1m": 60_000, "3m": 180_000, "5m": 300_000, "15m": 900_000, "30m": 1_800_000,
    "1h": 3_600_000, "4h": 14_400_000, "1d": 86_400_000,
}[TIMEFRAME]

In [7]:
# Upbit 객체 생성
ex = ccxt.upbit({"enableRateLimit": True})
ex.load_markets()

{'BERA/BTC': {'id': 'BTC-BERA',
  'lowercaseId': None,
  'symbol': 'BERA/BTC',
  'base': 'BERA',
  'quote': 'BTC',
  'settle': None,
  'baseId': 'BERA',
  'quoteId': 'BTC',
  'settleId': None,
  'type': 'spot',
  'spot': True,
  'margin': False,
  'swap': False,
  'future': False,
  'option': False,
  'index': False,
  'active': True,
  'contract': False,
  'linear': None,
  'inverse': None,
  'subType': None,
  'taker': 0.0025,
  'maker': 0.0025,
  'contractSize': None,
  'expiry': None,
  'expiryDatetime': None,
  'strike': None,
  'optionType': None,
  'precision': {'amount': 1e-08,
   'price': 1e-08,
   'cost': None,
   'base': None,
   'quote': None},
  'limits': {'leverage': {'min': None, 'max': None},
   'amount': {'min': None, 'max': None},
   'price': {'min': None, 'max': None},
   'cost': {'min': None, 'max': None}},
  'marginModes': {'cross': None, 'isolated': None},
  'created': None,
  'info': {'market': 'BTC-BERA',
   'korean_name': '베라체인',
   'english_name': 'Berachain'}

In [8]:
# 저장 폴더
Path("data").mkdir(exist_ok=True, parents=True)

In [9]:
def append_parquet(path: str, df_new: pd.DataFrame):
    df_new = df_new.drop_duplicates(subset=["ts"]).sort_values("ts")
    if Path(path).exists():
        old = pd.read_parquet(path)
        df = pd.concat([old, df_new], ignore_index=True)
        df = df.drop_duplicates(subset=["ts"]).sort_values("ts")
    else:
        df = df_new
    df.to_parquet(path, index=False)

In [10]:
def main():
    since = ex.parse8601(START_ISO)
    end_ms = ex.milliseconds() - END_BUFFER_SEC * 1000

    empty_streak = 0
    batch_count  = 0
    buffer = []  # 메모리 절약하려면 주기적으로 파일에 flush

    while since < end_ms:
        try:
            ohlcv = ex.fetch_ohlcv(SYMBOL, timeframe=TIMEFRAME, since=since, limit=CHUNK_LIMIT)

            if ohlcv:
                df = pd.DataFrame(ohlcv, columns=["ts","open","high","low","close","volume"])
                df["time"] = pd.to_datetime(df["ts"], unit="ms", utc=True).dt.tz_convert("Asia/Seoul")
                buffer.append(df)

                last_ts = int(df["ts"].iloc[-1])
                since   = last_ts + 1              # 정확히 이어붙이기
                empty_streak = 0
                batch_count += 1
                
                # 주기적으로 저장(메모리 절약 & 중간 보존)
                if batch_count % 20 == 0:          # 20 청크마다 저장
                    append_parquet(SAVE_PATH, pd.concat(buffer, ignore_index=True))
                    print(f"[flush] up to {df['time'].iloc[-1]} → {SAVE_PATH}")
                    buffer = []

                print(f"{df['time'].iloc[0]} ~ {df['time'].iloc[-1]} (n={len(df)})")

            else:
                # 비어 있으면 ‘끝’으로 보지 말고 커서를 한 덩어리 건너뛰어 진행
                empty_streak += 1
                if empty_streak > MAX_EMPTY_CHUNKS:
                    print(f"연속 빈 응답 {empty_streak}회 → 종료(실제 끝으로 판단)")
                    break

                jump = TIMEFRAME_MS * CHUNK_LIMIT
                since = min(since + jump, end_ms)
                print(f"[empty #{empty_streak}] jump cursor → {pd.to_datetime(since, unit='ms', utc=True).tz_convert('Asia/Seoul')}")
                time.sleep(ex.rateLimit / 1000)

        except Exception as e:
            print("오류:", e)
            time.sleep(3)  # 지수 백오프로 바꿔도 좋음

    # 남은 버퍼 최종 저장
    if buffer:
        append_parquet(SAVE_PATH, pd.concat(buffer, ignore_index=True))
        print(f"[final flush] → {SAVE_PATH}")

    print("✅ 수집 종료")


In [11]:
if __name__ == "__main__":
    main()

2020-01-01 09:00:00+09:00 ~ 2020-01-01 12:19:00+09:00 (n=192)
2020-01-01 12:20:00+09:00 ~ 2020-01-01 15:39:00+09:00 (n=177)
2020-01-01 15:40:00+09:00 ~ 2020-01-01 18:59:00+09:00 (n=193)
2020-01-01 19:00:00+09:00 ~ 2020-01-01 22:19:00+09:00 (n=188)
2020-01-01 22:20:00+09:00 ~ 2020-01-02 01:39:00+09:00 (n=188)
2020-01-02 01:40:00+09:00 ~ 2020-01-02 04:59:00+09:00 (n=140)
2020-01-02 05:02:00+09:00 ~ 2020-01-02 08:19:00+09:00 (n=146)
2020-01-02 08:21:00+09:00 ~ 2020-01-02 11:39:00+09:00 (n=195)
2020-01-02 11:40:00+09:00 ~ 2020-01-02 14:59:00+09:00 (n=199)
2020-01-02 15:00:00+09:00 ~ 2020-01-02 18:19:00+09:00 (n=199)
2020-01-02 18:20:00+09:00 ~ 2020-01-02 21:39:00+09:00 (n=195)
2020-01-02 21:40:00+09:00 ~ 2020-01-03 00:59:00+09:00 (n=185)
2020-01-03 01:00:00+09:00 ~ 2020-01-03 04:19:00+09:00 (n=190)
2020-01-03 04:20:00+09:00 ~ 2020-01-03 07:39:00+09:00 (n=185)
2020-01-03 07:40:00+09:00 ~ 2020-01-03 10:59:00+09:00 (n=198)
2020-01-03 11:00:00+09:00 ~ 2020-01-03 14:19:00+09:00 (n=200)
2020-01-