In [None]:
import pandas as pd
import numpy as np
import glob
from pathlib import Path
from tqdm.auto import tqdm

from gym_trading_env.downloader import download, EXCHANGE_LIMIT_RATES, _download_symbols
from datetime import datetime
from dateutil.relativedelta import relativedelta
import os, shutil, uuid
from multiprocessing.pool import ThreadPool
import asyncio

In [None]:
# --- Storage layout (all paths are relative to the working directory) ---
# Raw downloads, resampled frames, and the flat folders the resampled frames
# are copied into by the processing cell.
raw_data = "data/raw"
train_data = "data/training"
test_data = "data/testing"
processed_train_data = "data/processed/training/"
processed_test_data = "data/processed/testing/"

# --- Market selection ---
EXCHANGE = "binance"
SYMBOL = "ETH/TRY"
DOWNLOAD_TIMEFRAME = "30m"  # candle size requested from the exchange
TARGET_TIMEFRAME = "1h"     # candle size produced by the resampling cell
test_ratio = 0.2            # NOTE(review): not referenced by any cell visible here

# Add a "btcturk" entry to the downloader's rate-limit table.
# NOTE(review): EXCHANGE above is "binance"; confirm this entry is still needed.
EXCHANGE_LIMIT_RATES["btcturk"] = {"limit": 500, "pause_every": 120, "pause": 2}

# --- Download windows ---
START = datetime(2020, 1, 1)        # windows are generated back to this date
INTERVAL = relativedelta(weeks=1)   # length of each download window
DELTA = relativedelta(days=1)       # step between consecutive sliding windows

# The newest window ends "now" (naive local time) and spans one INTERVAL.
until = datetime.now()
since = until - INTERVAL

In [None]:
# Delete every file under ./data/ while leaving the directory tree in place.
# NOTE(review): this also removes anything stored under data/raw, so the
# "already downloaded" check in the download cell only helps when this cell
# is skipped -- confirm that is intended.
for folder, _subfolders, filenames in os.walk("./data/"):
    for filename in filenames:
        os.remove(os.path.join(folder, filename))

In [None]:
# Build the list of (since, until) download windows, newest first.
#
# The cursors below are local copies: the module-level `since` / `until` set in
# the configuration cell are never rebound here. Previously both loops reused
# those names, so running this cell a second time started from whichever
# window was touched last instead of from the configured newest window.
cursor_since, cursor_until = since, until
first = True
intervals = []  # [(since, until), ...]
while cursor_since > START and cursor_until > START:
    intervals.append((cursor_since, cursor_until))
    # After the newest window, jump back a whole INTERVAL so that window does
    # not overlap any other; every later window slides back by DELTA.
    step = INTERVAL if first else DELTA
    first = False
    cursor_since -= step
    cursor_until -= step

# Download each window that is not already present on disk.
for window_since, window_until in tqdm(intervals):
    # NOTE(review): assumes download() stores its result at exactly this path;
    # confirm against the downloader version in use.
    target_path = (
        f"{raw_data}/{EXCHANGE}-{SYMBOL.replace('/', '')}"
        f"-{window_since:%Y%m%d}-{window_until:%Y%m%d}"
        f"-{DOWNLOAD_TIMEFRAME}.pkl"
    )
    if os.path.exists(target_path):
        continue
    try:
        download(
            exchange_names=[EXCHANGE],
            symbols=[SYMBOL],
            timeframe=DOWNLOAD_TIMEFRAME,
            dir=raw_data,
            since=window_since,
            until=window_until,
        )
    except Exception as exc:
        # Best effort: report the failure and move on to the next window.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt
        # working. Nothing is retried automatically; re-running this cell
        # picks up the windows that are still missing.
        print(f"Failed to download {target_path}: {exc!r}\nSkipping this window.")

In [None]:
# Resample every raw download to TARGET_TIMEFRAME and split the results into
# a test set (the first path after sorting) and a training set (all others).

# Create the output folders up front; without them every write below fails.
for out_dir in (processed_train_data, processed_test_data, train_data, test_data):
    os.makedirs(out_dir, exist_ok=True)

# glob returns matches in arbitrary order, so sort explicitly. With descending
# order the first path is the lexicographically largest file name.
# NOTE(review): assumes raw file names embed the window dates as YYYYMMDD (as
# the download cell's existence check expects), which makes that first path
# the most recent window -- confirm.
raw_paths = sorted(glob.glob(f"{raw_data}/*pkl"), reverse=True)

# Per-column reducers for one resampled bin; an empty bin yields NaN so that
# it is removed by dropna() below. Built once because it never changes.
ohlcv_agg = {
    "date_close": lambda x: x.iloc[-1] if len(x) > 0 else np.nan,
    "open": lambda x: x.iloc[0] if len(x) > 0 else np.nan,
    "high": lambda x: max(x) if len(x) > 0 else np.nan,
    "low": lambda x: min(x) if len(x) > 0 else np.nan,
    "close": lambda x: x.iloc[-1] if len(x) > 0 else np.nan,
    "volume": lambda x: sum(x) if len(x) > 0 else np.nan,
}
target_delta = pd.Timedelta(TARGET_TIMEFRAME)

for i, raw_path in enumerate(tqdm(raw_paths)):
    try:
        name = Path(raw_path).name.split(".")[0].replace(DOWNLOAD_TIMEFRAME, TARGET_TIMEFRAME)
        # NOTE(review): pickle files can execute code when loaded; only read
        # files produced by this notebook's own download step.
        df: pd.DataFrame = pd.read_pickle(raw_path)

        # Native candle spacing = the most frequent gap between index entries.
        timeframe = df.index.to_series().diff().value_counts().index[0]

        # One resampled frame per possible alignment of the target bins
        # (e.g. two alignments when going from 30-minute to 1-hour candles).
        for offset in range(target_delta // timeframe):
            resampled = df.resample(target_delta, offset=offset * timeframe).agg(ohlcv_agg)
            # Drop the first and last bin, then any bin that had no data.
            process_df = resampled.iloc[1:-1].dropna()

            random_filename = uuid.uuid4().hex
            if i == 0:
                # Test set: only the first alignment of the first path is kept.
                process_df.to_pickle(f"{processed_test_data}/{name}-{offset}.pkl")
                shutil.copy(f"{processed_test_data}/{name}-{offset}.pkl", f"{test_data}/{random_filename}.pkl")
                break
            else:
                process_df.to_pickle(f"{processed_train_data}/{name}-{offset}.pkl")
                shutil.copy(f"{processed_train_data}/{name}-{offset}.pkl", f"{train_data}/{random_filename}.pkl")
    except Exception as exc:
        # Best effort: a bad file must not stop the whole run, but report it
        # instead of hiding it. `Exception` leaves KeyboardInterrupt alone.
        print(f"Skipping {raw_path}: {exc!r}")