Skip to content

Commit

Permalink
Add PRef operator (microsoft#988)
Browse files Browse the repository at this point in the history
  • Loading branch information
Chaoyingz committed Mar 22, 2022
1 parent 73d90f7 commit a59b793
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 15 deletions.
4 changes: 2 additions & 2 deletions qlib/data/base.py
Expand Up @@ -254,10 +254,10 @@ class PFeature(Feature):
def __str__(self):
return "$$" + self._name

def _load_internal(self, instrument, start_index, end_index, cur_time):
def _load_internal(self, instrument, start_index, end_index, cur_time, period=None):
from .data import PITD # pylint: disable=C0415

return PITD.period_feature(instrument, str(self), start_index, end_index, cur_time)
return PITD.period_feature(instrument, str(self), start_index, end_index, cur_time, period)


class ExpressionOps(Expression):
Expand Down
31 changes: 22 additions & 9 deletions qlib/data/data.py
Expand Up @@ -12,7 +12,7 @@
import bisect
import numpy as np
import pandas as pd
from typing import List, Union
from typing import List, Union, Tuple

# For supporting multiprocessing in outer code, joblib is used
from joblib import delayed
Expand Down Expand Up @@ -335,7 +335,15 @@ def feature(self, instrument, field, start_time, end_time, freq):

class PITProvider(abc.ABC):
@abc.abstractmethod
def period_feature(self, instrument, field, start_index: int, end_index: int, cur_time: pd.Timestamp) -> pd.Series:
def period_feature(
self,
instrument,
field,
start_index: int,
end_index: int,
cur_time: pd.Timestamp,
period_list: Tuple[int] = None,
) -> pd.Series:
"""
get the historical periods data series between `start_index` and `end_index`
Expand Down Expand Up @@ -732,7 +740,7 @@ class LocalPITProvider(PITProvider):
# TODO: Add PIT backend file storage
# NOTE: This class is not multi-threading-safe!!!!

def period_feature(self, instrument, field, start_index, end_index, cur_time):
def period_feature(self, instrument, field, start_index, end_index, cur_time, period=None):
if not isinstance(cur_time, pd.Timestamp):
raise ValueError(
f"Expected pd.Timestamp for `cur_time`, got '{cur_time}'. Advices: you can't query PIT data directly(e.g. '$$roewa_q'), you must use `P` operator to convert data to each day (e.g. 'P($$roewa_q)')"
Expand Down Expand Up @@ -771,8 +779,8 @@ def period_feature(self, instrument, field, start_index, end_index, cur_time):
if not (index_path.exists() and data_path.exists()):
raise FileNotFoundError("No file is found. Raise exception and ")
# NOTE: The most significant performance loss is here.
# Does the accelration that makes the program complicated really matters?
# - It make parameters parameters of the interface complicate
# Does the acceleration that makes the program complicated really matters?
# - It makes parameters of the interface complicate
# - It does not performance in the optimal way (places all the pieces together, we may achieve higher performance)
# - If we design it carefully, we can go through for only once to get the historical evolution of the data.
# So I decide to deprecated previous implementation and keep the logic of the program simple
Expand All @@ -786,14 +794,19 @@ def period_feature(self, instrument, field, start_index, end_index, cur_time):
return pd.Series()
last_period = data["period"][:loc].max() # return the latest quarter
first_period = data["period"][:loc].min()

period_list = get_period_list(first_period, last_period, quarterly)
period_list = period_list[max(0, len(period_list) + start_index - 1) : len(period_list) + end_index]
if period is not None:
if period not in period_list:
return pd.Series()
else:
period_list = [period]
else:
period_list = period_list[max(0, len(period_list) + start_index - 1) : len(period_list) + end_index]
value = np.full((len(period_list),), np.nan, dtype=VALUE_DTYPE)
for i, period in enumerate(period_list):
for i, p in enumerate(period_list):
# last_period_index = self.period_index[field].get(period) # For acceleration
value[i], now_period_index = read_period_data(
index_path, data_path, period, cur_time_int, quarterly # , last_period_index # For acceleration
index_path, data_path, p, cur_time_int, quarterly # , last_period_index # For acceleration
)
# self.period_index[field].update({period: now_period_index}) # For acceleration
# NOTE: the index is period_list; So it may result in unexpected values(e.g. nan)
Expand Down
4 changes: 2 additions & 2 deletions qlib/data/ops.py
Expand Up @@ -1643,10 +1643,10 @@ def register_all_ops(C):
"""register all operator"""
logger = get_module_logger("ops")

from qlib.data.pit import P # pylint: disable=C0415
from qlib.data.pit import P, PRef # pylint: disable=C0415

Operators.reset()
Operators.register(OpsList + [P])
Operators.register(OpsList + [P, PRef])

if getattr(C, "custom_ops", None) is not None:
Operators.register(C.custom_ops)
Expand Down
17 changes: 16 additions & 1 deletion qlib/data/pit.py
Expand Up @@ -37,7 +37,7 @@ def _load_internal(self, instrument, start_index, end_index, freq):

# The calculated value will always the last element, so the end_offset is zero.
try:
s = self.feature.load(instrument, -start_ws, 0, cur_time)
s = self._load_feature(instrument, -start_ws, 0, cur_time)
resample_data[cur_index - start_index] = s.iloc[-1] if len(s) > 0 else np.nan
except FileNotFoundError:
get_module_logger("base").warning(f"WARN: period data not found for {str(self)}")
Expand All @@ -48,10 +48,25 @@ def _load_internal(self, instrument, start_index, end_index, freq):
)
return resample_series

def _load_feature(self, instrument, start_index, end_index, cur_time):
return self.feature.load(instrument, start_index, end_index, cur_time)

def get_longest_back_rolling(self):
# The period data will collapse as a normal feature. So no extending and looking back
return 0

def get_extended_window_size(self):
# The period data will collapse as a normal feature. So no extending and looking back
return 0, 0


class PRef(P):
def __init__(self, feature, period):
super().__init__(feature)
self.period = period

def __str__(self):
return f"{super().__str__()}[{self.period}]"

def _load_feature(self, instrument, start_index, end_index, cur_time):
return self.feature.load(instrument, start_index, end_index, cur_time, self.period)
43 changes: 42 additions & 1 deletion scripts/data_collector/pit/test_pit.py
Expand Up @@ -92,7 +92,7 @@ def test_expr(self):
"P((Ref($$roewa_q, 1) +$$roewa_q) / 2)",
]
instruments = ["sh600519"]
data = D.features(instruments, fields, start_time="2019-01-01", end_time="20190719", freq="day")
data = D.features(instruments, fields, start_time="2019-01-01", end_time="2019-07-19", freq="day")
expect = """
P(Mean($$roewa_q, 1)) P($$roewa_q) P(Mean($$roewa_q, 2)) P(Ref($$roewa_q, 1)) P((Ref($$roewa_q, 1) +$$roewa_q) / 2)
instrument datetime
Expand Down Expand Up @@ -189,6 +189,47 @@ def test_expr2(self):
fields += ["P(Sum($$yoyni_q, 4))"]
fields += ["$close", "P($$roewa_q) * $close"]
data = D.features(instruments, fields, start_time="2019-01-01", end_time="2020-01-01", freq="day")
except_data = """
P($$roewa_q) P($$yoyni_q) P(($$roewa_q / $$yoyni_q) / Ref($$roewa_q / $$yoyni_q, 1) - 1) P(Sum($$yoyni_q, 4)) $close P($$roewa_q) * $close
instrument datetime
sh600519 2019-01-02 0.255220 0.243892 1.484224 1.661578 63.595333 16.230801
2019-01-03 0.255220 0.243892 1.484224 1.661578 62.641907 15.987467
2019-01-04 0.255220 0.243892 1.484224 1.661578 63.915985 16.312637
2019-01-07 0.255220 0.243892 1.484224 1.661578 64.286530 16.407207
2019-01-08 0.255220 0.243892 1.484224 1.661578 64.212196 16.388237
... ... ... ... ... ... ...
2019-12-25 0.255819 0.219821 0.677052 1.081693 122.150467 31.248409
2019-12-26 0.255819 0.219821 0.677052 1.081693 122.301315 31.286999
2019-12-27 0.255819 0.219821 0.677052 1.081693 125.307404 32.056015
2019-12-30 0.255819 0.219821 0.677052 1.081693 127.763992 32.684456
2019-12-31 0.255819 0.219821 0.677052 1.081693 127.462303 32.607277
[244 rows x 6 columns]
"""
self.check_same(data, except_data)

def test_pref_operator(self):
instruments = ["sh600519"]
fields = ["PRef($$roewa_q, 201902)", "PRef($$yoyni_q, 201801)", "P($$roewa_q)"]
data = D.features(instruments, fields, start_time="2018-04-28", end_time="2019-07-19", freq="day")
except_data = """
PRef($$roewa_q, 201902) PRef($$yoyni_q, 201801) P($$roewa_q)
instrument datetime
sh600519 2018-05-02 NaN 0.395075 0.088887
2018-05-03 NaN 0.395075 0.088887
2018-05-04 NaN 0.395075 0.088887
2018-05-07 NaN 0.395075 0.088887
2018-05-08 NaN 0.395075 0.088887
... ... ... ...
2019-07-15 0.000000 0.395075 0.000000
2019-07-16 0.000000 0.395075 0.000000
2019-07-17 0.000000 0.395075 0.000000
2019-07-18 0.175322 0.395075 0.175322
2019-07-19 0.175322 0.395075 0.175322
[299 rows x 3 columns]
"""
self.check_same(data, except_data)


if __name__ == "__main__":
Expand Down

0 comments on commit a59b793

Please sign in to comment.