In [1]:
import os
import time
from abc import ABC, abstractmethod
from typing import Tuple
from typing import Dict, Any, List, Optional, Callable

import pandas as pd
import requests
from sklearn.preprocessing import MinMaxScaler

# Lift pandas' row limit so displayed DataFrames are never truncated.
# NOTE(review): the fetched frame spans several years of hourly rows, so keep
# using .head()/.tail() when displaying it or the output will be enormous.
pd.set_option('display.max_rows', None)

In [2]:
class IDataFetcher(ABC):
    """Interface for objects that load data for a time window into a DataFrame."""

    @abstractmethod
    def fetch(self, start_time: int, end_time: int) -> pd.DataFrame:
        """Return the data between ``start_time`` and ``end_time``.

        Concrete fetchers must override this method; the base version does
        nothing and returns ``None``.
        """
        ...

In [3]:
class MarketDataFetcher(IDataFetcher):
    """Fetch time-series records from an HTTP JSON API into a DataFrame.

    The API key is sent in the ``X-API-Key`` header and the response is
    expected to be a JSON object whose ``"data"`` entry is a list of records,
    each carrying a ``start_time`` field in epoch milliseconds.
    """

    def __init__(
        self,
        api_key: str,
        base_url: str,
        endpoint: str,
        params_template: Dict[str, Any],
        flatten_json: bool = False,
        rename_columns: Optional[Dict[str, str]] = None,
        timeout: Optional[float] = 60.0
    ):
        """Store the request configuration.

        Args:
            api_key: Value sent in the ``X-API-Key`` request header.
            base_url: Root URL of the API.
            endpoint: Path appended to ``base_url``.
            params_template: Query parameters sent with every request;
                ``start_time`` and ``end_time`` are added per call.
            flatten_json: When true, records are flattened with
                ``pd.json_normalize`` instead of ``pd.DataFrame``.
            rename_columns: Optional ``{old: new}`` column renames applied to
                the result.
            timeout: Seconds to wait for the server before giving up.
                ``None`` waits indefinitely.
        """
        self.api_key = api_key
        self.base_url = base_url
        self.endpoint = endpoint
        self.params_template = params_template
        self.flatten_json = flatten_json
        self.rename_columns = rename_columns or {}
        self.timeout = timeout

    def fetch(self, start_time: int, end_time: int) -> pd.DataFrame:
        """Request the records for a time window and return them as a DataFrame.

        Args:
            start_time: Start of the window, forwarded as the ``start_time``
                query parameter.
            end_time: End of the window, forwarded as the ``end_time`` query
                parameter.

        Returns:
            A DataFrame indexed by a ``datetime`` index built from each
            record's ``start_time``. If the API returns no records, an empty
            DataFrame with an empty ``datetime`` index is returned.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            KeyError: If the records do not contain a ``start_time`` field.
        """
        headers = {"X-API-Key": self.api_key}
        # Join without producing "//" when either side already has a slash.
        url = f"{self.base_url.rstrip('/')}/{self.endpoint.lstrip('/')}"
        # Build a fresh dict so the shared template is never mutated.
        params = {
            **self.params_template,
            "start_time": start_time,
            "end_time": end_time,
        }

        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(
            url, headers=headers, params=params, timeout=self.timeout
        )
        response.raise_for_status()
        data = response.json().get("data", [])

        if not data:
            # No records: return an empty, correctly indexed frame rather than
            # failing on the missing 'start_time' column below.
            return pd.DataFrame(index=pd.DatetimeIndex([], name="datetime"))

        # Use flatten if nested JSON
        df = pd.json_normalize(data) if self.flatten_json else pd.DataFrame(data)

        if "start_time" not in df.columns:
            raise KeyError(
                "API records have no 'start_time' field; "
                f"received columns: {list(df.columns)}"
            )

        # Replace the millisecond timestamp column with a datetime index.
        timestamps = pd.DatetimeIndex(
            pd.to_datetime(df["start_time"], unit="ms"), name="datetime"
        )
        return (
            df.drop(columns=["start_time"])
            .set_index(timestamps)
            .rename(columns=self.rename_columns)
        )


In [4]:
# Query window: the last six 365-day years up to now, in epoch milliseconds.
MS_PER_DAY = 24 * 60 * 60 * 1000
LOOKBACK_DAYS = 6 * 365

now = int(time.time() * 1000)
start_time = now - LOOKBACK_DAYS * MS_PER_DAY

# Read the credential from the environment so it is never stored in the
# notebook. Any key that was previously committed here should be treated as
# leaked and rotated.
api_key = os.environ.get("CYBOTRADE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the CYBOTRADE_API_KEY environment variable before running this cell."
    )

cryptoquant_fetcher = MarketDataFetcher(
    api_key=api_key,
    base_url="https://api.datasource.cybotrade.rs/cryptoquant",
    endpoint="btc/market-data/price-ohlcv",
    params_template={"window": "hour", "flatten": "true"}
)

cryptoquant_data = cryptoquant_fetcher.fetch(start_time, now)
print(cryptoquant_data.head(10))

                           close         high          low         open  \
datetime                                                                  
2019-04-12 08:00:00  5005.201510  5065.578853  4975.212093  5013.896480   
2019-04-12 09:00:00  5048.045167  5048.550739  4979.091315  5005.631050   
2019-04-12 10:00:00  5068.236543  5110.133518  4984.604596  5055.378017   
2019-04-12 11:00:00  5068.619189  5119.707024  5043.530011  5078.031336   
2019-04-12 12:00:00  5057.879621  5106.376465  5040.506185  5072.300779   
2019-04-12 13:00:00  5081.996580  5107.645747  5040.438517  5101.843124   
2019-04-12 14:00:00  5063.260539  5110.291395  5033.162575  5080.586778   
2019-04-12 15:00:00  5040.445252  5112.870477  5039.333747  5110.351250   
2019-04-12 16:00:00  5080.702022  5119.608518  5051.085948  5085.039931   
2019-04-12 17:00:00  5047.019451  5128.006080  5045.177233  5077.783590   

                          volume  
datetime                          
2019-04-12 08:00:00  2342.331

In [5]:
class IPreprocessor(ABC):
    """Interface for a preprocessing step that maps one DataFrame to another."""

    @abstractmethod
    def process(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return the processed version of ``df``.

        Concrete preprocessors must override this method; the base version
        does nothing and returns ``None``.
        """
        ...

In [6]:
class Normalizer(IPreprocessor):
    """Rescale the OHLCV columns with a ``MinMaxScaler``."""

    def __init__(self):
        # Was misspelled `_init_`, so the scaler was never created and
        # process() failed with AttributeError.
        self.scaler = MinMaxScaler()

    def process(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with its OHLCV columns min-max scaled.

        The scaler is re-fitted on ``df`` at every call. The input frame is
        left untouched; all other columns are carried over unchanged.

        Args:
            df: Frame containing 'open', 'high', 'low', 'close' and 'volume'
                columns.

        Returns:
            A new DataFrame with the five OHLCV columns replaced by their
            scaled values.

        Raises:
            KeyError: If any of the OHLCV columns is missing.
        """
        ohlcv_columns = ['open', 'high', 'low', 'close', 'volume']
        # NOTE(review): fitting on the full frame before a train/test split
        # lets test-set ranges influence the scaling — confirm this is intended.
        scaled = df.copy()  # do not overwrite the caller's data
        scaled[ohlcv_columns] = self.scaler.fit_transform(scaled[ohlcv_columns])
        return scaled

In [7]:
class MissingTimestampHandler(IPreprocessor):
    """Put the data on a regular time grid and fill the gaps by interpolation."""

    def __init__(self, frequency: str = '1h'):
        """Store the target grid spacing.

        Args:
            frequency: pandas offset alias passed to ``DataFrame.resample``.
        """
        # Was misspelled `_init_`, so `frequency` was never set and passing a
        # frequency to the constructor raised TypeError.
        self.frequency = frequency

    def process(self, df: pd.DataFrame) -> pd.DataFrame:
        """Resample ``df`` to ``self.frequency`` and linearly fill missing rows.

        Rows that fall into the same bin are averaged; bins with no rows
        become NaN and are then filled by linear interpolation.

        Args:
            df: Frame to regularise; ``resample`` requires a datetime-like index.

        Returns:
            A new DataFrame on the regular grid. The input is not modified.
        """
        return df.resample(self.frequency).mean().interpolate(method='linear')

In [8]:
# Only needed if you want to change the data interval
class Resampler(IPreprocessor):
    """Aggregate OHLCV rows into bars of a different interval."""

    def __init__(self, frequency: str):
        """Store the target bar interval.

        Args:
            frequency: pandas offset alias passed to ``DataFrame.resample``.
        """
        # Was misspelled `_init_`, which made `Resampler(frequency)` raise
        # TypeError and left the class unusable.
        self.frequency = frequency

    def process(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` aggregated to ``self.frequency`` bars.

        Within each bar: 'open' is the first value, 'high' the maximum, 'low'
        the minimum, 'close' the last value and 'volume' the sum.

        Args:
            df: Frame with 'open', 'high', 'low', 'close' and 'volume'
                columns; ``resample`` requires a datetime-like index.

        Returns:
            A new DataFrame with one row per bar and the five OHLCV columns.
        """
        aggregation = {
            'open': 'first',
            'high': 'max',
            'low': 'min',
            'close': 'last',
            'volume': 'sum'
        }
        return df.resample(self.frequency).agg(aggregation)

In [9]:
class TrainTestSplitter:
    """Cut a DataFrame into a leading training part and a trailing test part."""

    def __init__(self, train_ratio: float = 0.7):
        """Remember which fraction of the rows goes to the training part."""
        self.train_ratio = train_ratio

    def split(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Return ``(train, test)`` without shuffling.

        The first ``int(len(df) * train_ratio)`` rows form the training part
        and the remaining rows form the test part.
        """
        cutoff = int(len(df) * self.train_ratio)
        return df.iloc[:cutoff], df.iloc[cutoff:]

    def to_array(self, df: pd.DataFrame) -> list:
        """Convert the frame's values to a nested list, one inner list per row."""
        rows = df.values
        return rows.tolist()
