In [2]:
!pip install numpy
!pip install pandas

In [9]:
import copy
import os
from abc import ABC, abstractmethod
from datetime import datetime
from datetime import timezone
from os.path import abspath
from os.path import join
from pathlib import Path
from typing import List

import numpy as np
import pandas as pd

In [10]:
class Data:
    """Container for an index column, a feature matrix and an optional target.

    The constructor stores its arguments as-is; nothing is copied or validated.
    """

    def __init__(self, idx, features, target, headers=None):
        """Keep references to the given parts.

        Args:
            idx: Row labels, one per row of ``features``.
            features: Feature values (a 2-D array when built by ``from_csv``).
            target: Target values, or ``None`` when there are none.
            headers: Optional column names; stored but not used by this class.
        """
        self.headers = headers
        self.idx = idx
        self.features = features
        self.target = target

    @staticmethod
    def from_csv(path):
        """Build a ``Data`` object from a CSV file.

        The first CSV column becomes ``idx`` and all remaining columns become
        ``features``; ``target`` is set to ``None``.

        Args:
            path: Path to the CSV file (made absolute before reading).

        Returns:
            A new ``Data`` instance.
        """
        abs_path = abspath(path)
        dataframe = pd.read_csv(abs_path)

        # One 2-D array for the whole table; slice columns directly instead of
        # transposing back and forth.
        values = np.array(dataframe)
        idx = values[:, 0]
        features = values[:, 1:]

        return Data(idx=idx, features=features, target=None)

    def as_pd_dataframe(self, headers: List[str] = None):
        """Return the features as a DataFrame indexed by ``idx``.

        Args:
            headers: Column names for the frame; pandas assigns default
                integer labels when omitted.

        Returns:
            A ``pandas.DataFrame`` built from ``features`` and ``idx``.
        """
        frame = pd.DataFrame(data=self.features, index=self.idx, columns=headers)

        return frame

    def __repr__(self):
        # __repr__ must return a str: returning the dict itself makes
        # repr(obj) raise TypeError. The dict's repr keeps the same text that
        # __str__ has always produced.
        return repr({
            'idx': self.idx,
            'features': self.features,
            'target': self.target,
        })

    def __str__(self):
        return self.__repr__()


class PreprocessStrategy(ABC):
    """Abstract base for preprocessing steps that work on one ``Data`` object.

    Subclasses implement ``process`` and may use the timestamp helper below.
    """

    def __init__(self, data: Data):
        """Remember the ``Data`` object this strategy will process."""
        self.data = data

    @abstractmethod
    def process(self) -> Data:
        """Run the preprocessing and return the resulting ``Data``."""
        raise NotImplementedError

    def _chek_and_clean_timestamps(self):
        """Replace ``self.data.idx`` with formatted UTC date strings.

        Every entry of ``idx`` is converted with ``int()``, treated as
        milliseconds since the Unix epoch and formatted as
        ``'%Y-%m-%d %H:%M:%S'``. The result is stored back as a list.

        The misspelled name is kept because subclasses call it.
        """
        # datetime.utcfromtimestamp() is deprecated since Python 3.12; an
        # aware UTC datetime formats to exactly the same string.
        self.data.idx = [
            datetime.fromtimestamp(int(timestamp) / 1000, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
            for timestamp in self.data.idx
        ]


class OHLCPreprocess(PreprocessStrategy):
    """Preprocessing that formats the timestamps and keeps four feature columns."""

    def __init__(self, data: Data):
        super().__init__(data)

    def process(self) -> Data:
        """Return a cleaned copy of the data.

        The index timestamps are converted by ``_chek_and_clean_timestamps``
        and ``features`` is cut down to its first four columns.

        Returns:
            The processed ``Data`` object (also available as ``self.data``).
        """
        # Work on a shallow copy: the attributes are rebound below, never
        # modified in place, so the object passed to the constructor stays
        # untouched and the calling cell can be re-run safely.
        self.data = copy.copy(self.data)

        self._chek_and_clean_timestamps()

        # Transpose, take the first four rows, transpose back: this keeps the
        # first four columns.
        self.data.features = self.data.features.T[:4].T

        return self.data


In [12]:
def default_result_path():
    """Return the ``NerpaResults`` directory in the user's home, creating it if needed.

    Returns:
        The directory path as a string.

    Raises:
        FileExistsError: If a non-directory entry named ``NerpaResults``
            already exists in the home directory.
    """
    name = 'NerpaResults'
    default_data_path = os.path.join(str(Path.home()), name)
    # exist_ok=True makes the call idempotent and avoids both listing the
    # whole home directory and the check-then-create race.
    os.makedirs(default_data_path, exist_ok=True)
    return default_data_path

In [13]:
# Path to the raw CSV file; change it to point at your own data.
# `raw_data` is a `Data` object holding the index column (`idx`), the feature
# columns (`features`) and a `target`, which `from_csv` leaves as None.
raw_data = Data.from_csv('data/raw.csv')

In [14]:
# OHLCPreprocess converts the millisecond timestamps in the index to
# 'YYYY-MM-DD HH:MM:SS' strings and keeps only the first four feature columns.
cleaned_data = OHLCPreprocess(raw_data).process()
headers = ['Open', 'High', 'Low', 'Close']
frame = cleaned_data.as_pd_dataframe(headers=headers)

# Save the result as result.csv in the NerpaResults folder of the user's home directory.
frame.to_csv(join(default_result_path(), 'result.csv'))