In [1]:
import os
import time
from abc import ABC, abstractmethod
from typing import Tuple
from typing import Dict, Any, List, Optional, Callable

import pandas as pd
import requests
from sklearn.preprocessing import MinMaxScaler

# Remove pandas' cap on the number of rows shown when a frame is displayed.
# NOTE(review): with the cap lifted, displaying a whole multi-year hourly frame
# floods the cell output; prefer .head()/.tail() when previewing.
pd.set_option('display.max_rows', None)

In [2]:
class IDataFetcher(ABC):
    """Interface for objects that load time-bounded data into a DataFrame."""

    @abstractmethod
    def fetch(self, start_time: int, end_time: int) -> pd.DataFrame:
        """Return the data between ``start_time`` and ``end_time``.

        Concrete subclasses must override this method; the base
        implementation does nothing and returns ``None``.
        """

In [3]:
class MarketDataFetcher(IDataFetcher):
    """Fetch time-series records from an HTTP JSON endpoint into a DataFrame.

    The API key is sent in the ``X-API-Key`` header and the requested window
    is passed as ``start_time`` / ``end_time`` query parameters on top of
    ``params_template``. Records are read from the ``"data"`` field of the
    JSON response, and each record's ``start_time`` (epoch milliseconds) is
    turned into a ``datetime`` index.
    """

    def __init__(
        self,
        api_key: str,
        base_url: str,
        endpoint: str,
        params_template: Dict[str, Any],
        flatten_json: bool = False,
        rename_columns: Optional[Dict[str, str]] = None,
        timeout: float = 30.0
    ):
        """Store the connection settings.

        Args:
            api_key: Value sent in the ``X-API-Key`` request header.
            base_url: Root URL of the service.
            endpoint: Path appended to ``base_url``.
            params_template: Query parameters sent with every request; it is
                never modified by ``fetch``.
            flatten_json: When true, nested records are flattened with
                ``pd.json_normalize`` instead of being loaded as-is.
            rename_columns: Optional ``{old: new}`` column mapping applied to
                the resulting frame.
            timeout: Seconds to wait for the server before giving up.
        """
        self.api_key = api_key
        self.base_url = base_url
        self.endpoint = endpoint
        self.params_template = params_template
        self.flatten_json = flatten_json
        self.rename_columns = rename_columns or {}
        self.timeout = timeout

    def fetch(self, start_time: int, end_time: int) -> pd.DataFrame:
        """Request the window ``[start_time, end_time]`` and return it as a frame.

        Args:
            start_time: Start of the window, forwarded as the ``start_time``
                query parameter.
            end_time: End of the window, forwarded as the ``end_time`` query
                parameter.

        Returns:
            A DataFrame indexed by ``datetime`` with the ``start_time`` column
            removed and ``rename_columns`` applied. When the response carries
            no records, an empty frame with an empty ``datetime`` index is
            returned.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            KeyError: If records are returned but lack a ``start_time`` field.
        """
        headers = {"X-API-Key": self.api_key}
        # Strip redundant slashes so "host/" + "/path" does not become "host//path".
        url = f"{self.base_url.rstrip('/')}/{self.endpoint.lstrip('/')}"
        params = {
            **self.params_template,
            "start_time": start_time,
            "end_time": end_time,
        }

        # Without an explicit timeout a stalled connection would block forever.
        response = requests.get(
            url, headers=headers, params=params, timeout=self.timeout
        )
        response.raise_for_status()
        # A missing or null "data" field is treated as "no records".
        data = response.json().get("data") or []

        # Use flatten if nested JSON
        df = pd.json_normalize(data) if self.flatten_json else pd.DataFrame(data)

        if df.empty:
            # Nothing to index: return an empty frame shaped like a normal result
            # instead of failing on the missing 'start_time' column.
            return pd.DataFrame(index=pd.DatetimeIndex([], name='datetime'))

        return (
            df.assign(datetime=pd.to_datetime(df['start_time'], unit='ms'))
            .set_index('datetime')
            .drop(columns=['start_time'])
            .rename(columns=self.rename_columns)
        )


In [4]:
def get_cryptoquant_data(api_key: Optional[str] = None) -> pd.DataFrame:
    """Download roughly six years of hourly XRP OHLCV data ending now.

    Args:
        api_key: Key for the data service. When omitted, the key is read from
            the ``CYBOTRADE_API_KEY`` environment variable so that no secret
            has to live in the notebook.

    Returns:
        The frame produced by ``MarketDataFetcher.fetch`` for the window
        ``[now - 6 * 365 days, now]``.

    Raises:
        RuntimeError: If no key is passed and the environment variable is
            unset or empty.
    """
    # Credentials must never be committed with the notebook. Any key that was
    # previously embedded in this cell should be considered leaked and rotated.
    resolved_key = api_key or os.environ.get("CYBOTRADE_API_KEY", "")
    if not resolved_key:
        raise RuntimeError(
            "No API key available: pass api_key=... or set the "
            "CYBOTRADE_API_KEY environment variable."
        )

    # Look-back window expressed in milliseconds (6 years of 365 days).
    lookback_ms = 6 * 365 * 24 * 60 * 60 * 1000
    now = int(time.time() * 1000)
    start_time = now - lookback_ms

    fetcher = MarketDataFetcher(
        api_key=resolved_key,
        base_url="https://api.datasource.cybotrade.rs/cryptoquant",
        endpoint="xrp/market-data/price-ohlcv",
        params_template={"window": "hour", "flatten": "true"}
    )

    return fetcher.fetch(start_time, now)


In [5]:
# Download the hourly XRP OHLCV history and preview its first ten rows.
cryptoquant_data = get_cryptoquant_data()
print(cryptoquant_data.head(10))

                        close      high       low      open        volume
datetime                                                                 
2019-04-13 08:00:00  0.327959  0.331140  0.326784  0.329983  5.267267e+06
2019-04-13 09:00:00  0.327912  0.330566  0.326139  0.328055  3.579032e+06
2019-04-13 10:00:00  0.322813  0.329290  0.321690  0.327018  7.187208e+06
2019-04-13 11:00:00  0.324656  0.325658  0.321414  0.321631  5.438059e+06
2019-04-13 12:00:00  0.323879  0.327485  0.322847  0.325811  2.254610e+06
2019-04-13 13:00:00  0.324818  0.327435  0.323023  0.324852  1.373858e+06
2019-04-13 14:00:00  0.326190  0.327597  0.324320  0.324870  2.731291e+06
2019-04-13 15:00:00  0.324661  0.327893  0.324107  0.326253  3.749263e+06
2019-04-13 16:00:00  0.323255  0.325904  0.322509  0.324662  2.397102e+06
2019-04-13 17:00:00  0.324473  0.326343  0.323093  0.323429  1.597702e+06


In [6]:
class IPreprocessor(ABC):
    """Interface for a single DataFrame-to-DataFrame preprocessing step."""

    @abstractmethod
    def process(self, df: pd.DataFrame) -> pd.DataFrame:
        """Transform ``df`` and return the result.

        Concrete subclasses must override this method; the base
        implementation does nothing and returns ``None``.
        """

In [7]:
class Normalizer(IPreprocessor):
    """Rescale the OHLCV columns of a price frame with a ``MinMaxScaler``.

    NOTE(review): the scaler is re-fitted on whatever frame is passed to
    ``process``. Fitting on the full series before a train/test split lets the
    test period's minima/maxima influence the training data.
    """

    def __init__(self):
        # Kept on the instance so the fitted scaler can be reused afterwards.
        self.scaler = MinMaxScaler()

    def process(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a scaled copy of ``df``; the input frame is left untouched.

        Args:
            df: Frame containing at least the ``open``, ``high``, ``low``,
                ``close`` and ``volume`` columns.

        Returns:
            A new DataFrame with the same index and columns in which the five
            OHLCV columns hold the scaler's output. Other columns are copied
            unchanged.

        Raises:
            KeyError: If any of the OHLCV columns is missing from ``df``.
        """
        ohlcv_columns = ['open', 'high', 'low', 'close', 'volume']
        # Work on a copy: assigning into ``df`` itself would silently overwrite
        # the caller's raw data (and anything an earlier cell displayed).
        scaled = df.copy()
        scaled[ohlcv_columns] = self.scaler.fit_transform(df[ohlcv_columns])
        return scaled

In [8]:
class MissingTimestampHandler(IPreprocessor):
    """Put a frame on a regular time grid and fill the gaps linearly."""

    def __init__(self, frequency: str = '1h'):
        # Pandas offset alias describing the spacing of the target grid.
        self.frequency = frequency

    def process(self, df: pd.DataFrame) -> pd.DataFrame:
        """Resample ``df`` to ``self.frequency`` and interpolate missing rows.

        Rows falling into the same bin are averaged; bins without any row are
        filled by linear interpolation between their neighbours.
        """
        regular_grid = df.resample(self.frequency).mean()
        return regular_grid.interpolate(method='linear')

In [9]:
# Only needed when the data interval has to be changed.
class Resampler(IPreprocessor):
    """Aggregate OHLCV bars into bars of a different interval."""

    def __init__(self, frequency: str):
        # Pandas offset alias of the target bar size.
        self.frequency = frequency

    def process(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return ``df`` re-bucketed to ``self.frequency``.

        Each new bar takes the first open, highest high, lowest low, last
        close and summed volume of the rows that fall inside it.
        """
        how_to_aggregate = dict(
            open='first',
            high='max',
            low='min',
            close='last',
            volume='sum',
        )
        bars = df.resample(self.frequency)
        return bars.agg(how_to_aggregate)

In [10]:
# Run the fetched frame through a Normalizer and preview the first ten scaled rows.
normalizer = Normalizer()
normalized_data = normalizer.process(cryptoquant_data)
print(normalized_data.head(10))

                        close      high       low      open    volume
datetime                                                             
2019-04-13 08:00:00  0.063652  0.060760  0.067005  0.064995  0.002603
2019-04-13 09:00:00  0.063637  0.060584  0.066805  0.064398  0.001762
2019-04-13 10:00:00  0.062073  0.060193  0.065425  0.064077  0.003559
2019-04-13 11:00:00  0.062638  0.059080  0.065339  0.062409  0.002688
2019-04-13 12:00:00  0.062400  0.059640  0.065784  0.063703  0.001103
2019-04-13 13:00:00  0.062688  0.059624  0.065838  0.063406  0.000664
2019-04-13 14:00:00  0.063109  0.059674  0.066240  0.063412  0.001340
2019-04-13 15:00:00  0.062640  0.059765  0.066174  0.063840  0.001847
2019-04-13 16:00:00  0.062209  0.059155  0.065679  0.063347  0.001174
2019-04-13 17:00:00  0.062582  0.059290  0.065860  0.062965  0.000776


In [11]:
class TrainTestSplitter:
    """Split a frame by position into a leading train part and a trailing test part.

    Rows are never shuffled, so for a time-sorted frame the test part is the
    most recent data.
    """

    def __init__(self, train_ratio: float = 0.7):
        """Store the fraction of rows that goes to the training part.

        Args:
            train_ratio: Share of rows assigned to the training part; must lie
                in ``[0, 1]``.

        Raises:
            ValueError: If ``train_ratio`` is outside ``[0, 1]`` (or NaN).
        """
        # A negative ratio would make the slice count from the end, and a ratio
        # above 1 would leave the test part empty without any warning.
        if not 0 <= train_ratio <= 1:
            raise ValueError(
                f"train_ratio must be between 0 and 1 inclusive, got {train_ratio!r}"
            )
        self.train_ratio = train_ratio

    def split(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Return ``(train, test)`` where train holds the first rows of ``df``.

        The boundary is ``int(len(df) * train_ratio)``, i.e. it is rounded
        down, so the two parts always add up to ``len(df)`` rows.
        """
        split_index = int(len(df) * self.train_ratio)
        df_train = df.iloc[:split_index]
        df_test = df.iloc[split_index:]
        return df_train, df_test

    def to_array(self, df: pd.DataFrame) -> list:
        """Return the values of ``df`` as a list of row lists (index dropped)."""
        return df.values.tolist()
