In [12]:
import ccxt
import pandas as pd
import time
from pathlib import Path

### CCXT를 이용해 업비트에서 1분봉(OHLCV) 데이터를 가져오는 간단한 코드 예시.

In [13]:
# Settings for the one-shot sample fetch below.
EXCHANGE = ccxt.upbit({"enableRateLimit" : True})
SYMBOL = "BTC/KRW" # Upbit symbol notation. Swap in e.g. ETH/KRW, XRP/KRW, SOL/KRW or BNB/KRW to fetch another coin.
TIMEFRAME = "1m" # Fetch 1-minute candles.
LIMIT = 200 # Candles per request; per the original note, close to Upbit's per-call OHLCV maximum.

In [14]:
# Load the market list, then request LIMIT candles of SYMBOL at TIMEFRAME.
EXCHANGE.load_markets()
ohlcv = EXCHANGE.fetch_ohlcv(SYMBOL, timeframe=TIMEFRAME, limit=LIMIT)

# ccxt's standard candle layout is [timestamp, open, high, low, close, volume].
# "ts" is parsed as epoch milliseconds in UTC and exposed as a Korea-time "time" column.
df = pd.DataFrame(
    ohlcv,
    columns=["ts", "open", "high", "low", "close", "volume"],
).assign(
    time=lambda frame: pd.to_datetime(frame["ts"], unit="ms", utc=True).dt.tz_convert("Asia/Seoul")
)

In [15]:
# Show the fetched candles as this cell's output.
df

Unnamed: 0,ts,open,high,low,close,volume,time
0,1762802280000,156458000.0,156470000.0,156401000.0,156445000.0,0.162265,2025-11-11 04:18:00+09:00
1,1762802340000,156445000.0,156445000.0,156335000.0,156390000.0,0.186800,2025-11-11 04:19:00+09:00
2,1762802400000,156401000.0,156441000.0,156335000.0,156436000.0,0.086252,2025-11-11 04:20:00+09:00
3,1762802460000,156449000.0,156449000.0,156333000.0,156360000.0,0.576959,2025-11-11 04:21:00+09:00
4,1762802520000,156360000.0,156376000.0,156333000.0,156334000.0,0.053675,2025-11-11 04:22:00+09:00
...,...,...,...,...,...,...,...
195,1762813980000,156441000.0,156444000.0,156402000.0,156444000.0,0.172503,2025-11-11 07:33:00+09:00
196,1762814040000,156444000.0,156444000.0,156402000.0,156444000.0,0.243487,2025-11-11 07:34:00+09:00
197,1762814100000,156410000.0,156411000.0,156403000.0,156410000.0,0.144949,2025-11-11 07:35:00+09:00
198,1762814160000,156410000.0,156410000.0,156403000.0,156410000.0,0.207515,2025-11-11 07:36:00+09:00


### 이제 데이터를 쌓고 저장해보자. (SYMBOL에 지정한 코인 — 아래 예시는 솔라나)

In [None]:
# --- Settings for the bulk history collector ---

# Market to collect. Other examples: BTC/KRW, ETH/KRW, XRP/KRW, BNB/KRW.
SYMBOL = "SOL/KRW"

# Candle size: 5-minute candles.
TIMEFRAME = "5m"

# Candles requested per call (per the original note, close to Upbit's per-call maximum).
CHUNK_LIMIT = 200

# Output file; the symbol (with "/" removed) and the timeframe are embedded in the name.
SAVE_PATH = f"data/ohlcv_full_{SYMBOL.replace('/', '')}_{TIMEFRAME}.parquet"

# Collection start (ISO 8601, UTC); change as needed.
START_ISO = "2022-01-01T00:00:00Z"

# Safety margin in seconds so the newest candle has time to be finalized.
END_BUFFER_SEC = 60

# Number of consecutive empty responses tolerated; collection stops once this is exceeded.
MAX_EMPTY_CHUNKS = 10

In [17]:
# timeframe → 밀리초
# Length of one TIMEFRAME candle in milliseconds.
# The table is written in minutes and scaled by 60_000 ms per minute;
# a timeframe label missing from the table raises KeyError.
TIMEFRAME_MS = {
    "1m": 1,
    "3m": 3,
    "5m": 5,
    "15m": 15,
    "30m": 30,
    "1h": 60,
    "4h": 240,
    "1d": 1_440,
}[TIMEFRAME] * 60_000

In [18]:
# Upbit 객체 생성
# Create the Upbit client (with the enableRateLimit option switched on)
# and load its market list; the returned market dict is shown as the cell output.
ex = ccxt.upbit({"enableRateLimit": True})
ex.load_markets()

{'BERA/BTC': {'id': 'BTC-BERA',
  'lowercaseId': None,
  'symbol': 'BERA/BTC',
  'base': 'BERA',
  'quote': 'BTC',
  'settle': None,
  'baseId': 'BERA',
  'quoteId': 'BTC',
  'settleId': None,
  'type': 'spot',
  'spot': True,
  'margin': False,
  'swap': False,
  'future': False,
  'option': False,
  'index': False,
  'active': True,
  'contract': False,
  'linear': None,
  'inverse': None,
  'subType': None,
  'taker': 0.0025,
  'maker': 0.0025,
  'contractSize': None,
  'expiry': None,
  'expiryDatetime': None,
  'strike': None,
  'optionType': None,
  'precision': {'amount': 1e-08,
   'price': 1e-08,
   'cost': None,
   'base': None,
   'quote': None},
  'limits': {'leverage': {'min': None, 'max': None},
   'amount': {'min': None, 'max': None},
   'price': {'min': None, 'max': None},
   'cost': {'min': None, 'max': None}},
  'marginModes': {'cross': None, 'isolated': None},
  'created': None,
  'info': {'market': 'BTC-BERA',
   'korean_name': '베라체인',
   'english_name': 'Berachain'}

In [19]:
# 저장 폴더
# Make sure the output folder exists (no error if it is already there).
Path("data").mkdir(exist_ok=True, parents=True)

In [20]:
def append_parquet(path: str, df_new: pd.DataFrame) -> None:
    """Merge ``df_new`` into the Parquet file at ``path``, keeping one row per ``ts``.

    Rows already stored in the file are combined with ``df_new``, de-duplicated
    on the ``ts`` column and written back sorted by ``ts`` (without the index).
    When the same ``ts`` appears more than once, the most recently supplied row
    wins, so a candle that was stored while still forming is replaced by its
    re-fetched version instead of shadowing it.

    Args:
        path: Destination Parquet file. Missing parent directories are created.
        df_new: New rows; must contain a ``ts`` column.

    Raises:
        KeyError: If the combined data has no ``ts`` column.
        ImportError: If pandas cannot find a Parquet engine.
    """
    target = Path(path)
    file_exists = target.exists()

    # Nothing new to merge: leave the existing file untouched.
    if file_exists and df_new.empty:
        return

    frames = [pd.read_parquet(target), df_new] if file_exists else [df_new]
    merged = (
        pd.concat(frames, ignore_index=True)
        # Old rows come first in the concat, so keep="last" prefers fresh data.
        .drop_duplicates(subset=["ts"], keep="last")
        .sort_values("ts")
    )

    target.parent.mkdir(parents=True, exist_ok=True)
    merged.to_parquet(target, index=False)

In [21]:
def main():
    """Collect OHLCV history for SYMBOL from START_ISO until now and save it to SAVE_PATH.

    Candles are requested in chunks of CHUNK_LIMIT starting at a moving ``since``
    cursor. Each chunk gets a Korea-time ``time`` column, is buffered in memory
    and written through ``append_parquet`` every 20 chunks plus once at the end.

    Behaviour worth knowing:
      * Only candles that closed at least END_BUFFER_SEC seconds ago are stored,
        so a still-forming candle is never written.
      * An empty response moves the cursor forward by one chunk's worth of time;
        more than MAX_EMPTY_CHUNKS consecutive empty responses ends the run.
      * A failed fetch (``ccxt.BaseError``) is retried with capped exponential
        backoff; after too many consecutive failures the run stops instead of
        looping forever. Any other exception propagates to the caller.
      * Whatever is still buffered is written even when the loop is interrupted.
    """
    max_error_retries = 8   # consecutive failed fetches tolerated before giving up
    base_backoff_sec = 3    # delay before the first retry; doubles on each further failure
    max_backoff_sec = 60    # upper bound for a single retry delay

    since = ex.parse8601(START_ISO)
    # Candles must have closed before this instant to count as final.
    end_ms = ex.milliseconds() - END_BUFFER_SEC * 1000

    empty_streak = 0
    error_streak = 0
    batch_count = 0
    buffer = []  # chunks waiting to be written; flushed periodically to bound memory use

    try:
        while since < end_ms:
            # Keep the try body to the network call so that storage or
            # programming errors are not mistaken for transient fetch failures.
            try:
                ohlcv = ex.fetch_ohlcv(SYMBOL, timeframe=TIMEFRAME, since=since, limit=CHUNK_LIMIT)
            except ccxt.BaseError as e:
                error_streak += 1
                print("오류:", e)
                if error_streak > max_error_retries:
                    print(f"연속 오류 {error_streak}회 → 중단")
                    break
                time.sleep(min(base_backoff_sec * 2 ** (error_streak - 1), max_backoff_sec))
                continue
            error_streak = 0

            if not ohlcv:
                # An empty chunk is not taken as the end of history: skip one
                # chunk's worth of time, unless too many chunks in a row are empty.
                empty_streak += 1
                if empty_streak > MAX_EMPTY_CHUNKS:
                    print(f"연속 빈 응답 {empty_streak}회 → 종료(실제 끝으로 판단)")
                    break

                jump = TIMEFRAME_MS * CHUNK_LIMIT
                since = min(since + jump, end_ms)
                print(f"[empty #{empty_streak}] jump cursor → {pd.to_datetime(since, unit='ms', utc=True).tz_convert('Asia/Seoul')}")
                time.sleep(ex.rateLimit / 1000)
                continue

            df = pd.DataFrame(ohlcv, columns=["ts", "open", "high", "low", "close", "volume"])
            df["time"] = pd.to_datetime(df["ts"], unit="ms", utc=True).dt.tz_convert("Asia/Seoul")

            # Resume right after the newest candle received.
            since = int(df["ts"].max()) + 1
            empty_streak = 0

            # A candle opened at ts closes at ts + TIMEFRAME_MS; drop the ones
            # that had not closed by end_ms because their values can still change.
            closed = df[df["ts"] + TIMEFRAME_MS <= end_ms]
            reached_end = len(closed) < len(df)

            if not closed.empty:
                buffer.append(closed)
                batch_count += 1

                # Periodic write: saves memory and preserves progress mid-run.
                if batch_count % 20 == 0:
                    append_parquet(SAVE_PATH, pd.concat(buffer, ignore_index=True))
                    print(f"[flush] up to {closed['time'].iloc[-1]} → {SAVE_PATH}")
                    buffer = []

                print(f"{closed['time'].iloc[0]} ~ {closed['time'].iloc[-1]} (n={len(closed)})")

            if reached_end:
                # Everything newer than this chunk is not final yet.
                break
    finally:
        # Write whatever is still buffered, even if the loop was interrupted.
        if buffer:
            append_parquet(SAVE_PATH, pd.concat(buffer, ignore_index=True))
            print(f"[final flush] → {SAVE_PATH}")

    print("✅ 수집 종료")


In [22]:
# Start the collection only when this code runs as the main module.
if __name__ == "__main__":
    main()

2020-01-01 09:00:00+09:00 ~ 2020-01-02 01:25:00+09:00 (n=189)
2020-01-02 01:45:00+09:00 ~ 2020-01-02 18:05:00+09:00 (n=177)
2020-01-02 18:10:00+09:00 ~ 2020-01-03 10:45:00+09:00 (n=193)
2020-01-03 10:50:00+09:00 ~ 2020-01-04 03:20:00+09:00 (n=199)
2020-01-04 03:35:00+09:00 ~ 2020-01-04 20:00:00+09:00 (n=178)
2020-01-04 20:05:00+09:00 ~ 2020-01-05 12:40:00+09:00 (n=184)
2020-01-05 12:45:00+09:00 ~ 2020-01-06 05:20:00+09:00 (n=197)
2020-01-06 05:25:00+09:00 ~ 2020-01-06 22:00:00+09:00 (n=199)
2020-01-06 22:05:00+09:00 ~ 2020-01-07 14:40:00+09:00 (n=200)
2020-01-07 14:45:00+09:00 ~ 2020-01-08 07:20:00+09:00 (n=200)
2020-01-08 07:25:00+09:00 ~ 2020-01-09 00:00:00+09:00 (n=200)
2020-01-09 00:05:00+09:00 ~ 2020-01-09 16:40:00+09:00 (n=200)
2020-01-09 16:45:00+09:00 ~ 2020-01-10 09:20:00+09:00 (n=197)
2020-01-10 09:25:00+09:00 ~ 2020-01-11 02:00:00+09:00 (n=200)
2020-01-11 02:05:00+09:00 ~ 2020-01-11 18:40:00+09:00 (n=199)
2020-01-11 18:45:00+09:00 ~ 2020-01-12 11:20:00+09:00 (n=200)
2020-01-