In [1]:
import datetime
import json
import os

import numpy as np
import pandas as pd

from copy import deepcopy
from collections import defaultdict
from pprint import pprint
from pandas import ExcelWriter
from pathlib import Path
from typing import List, Text, Tuple, Union

In [2]:
import mlflow
import optuna
import qlib

from qlib.contrib.data.utils.neutralize import get_riskest_features
from qlib.data.dataset import DataHandlerLP
from qlib.data.dataset.weight import Reweighter
from qlib.constant import REG_CN, REG_US
from qlib.contrib.report import analysis_model, analysis_position
from qlib.utils import init_instance_by_config
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord, SigAnaRecord

In [3]:
def timestamp():
    """Return today's local date as a compact ``YYYYMMDD`` string."""
    now = datetime.datetime.now()
    return now.strftime('%Y%m%d')


def get_diff_date(d, diff, date_format="%Y-%m-%d"):
    """Shift a date string by a number of calendar days.

    Args:
        d: Date string laid out according to ``date_format``.
        diff: Number of calendar days to add (negative values move backwards).
        date_format: ``strptime``/``strftime`` pattern used for both parsing
            ``d`` and formatting the result.

    Returns:
        The shifted date, formatted with ``date_format``.
    """
    parsed = datetime.datetime.strptime(d, date_format)
    shifted = parsed + datetime.timedelta(days=diff)
    return shifted.strftime(date_format)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [4]:
def get_params_from_file(file, key):
    """Look up one entry of a JSON parameter file.

    Args:
        file: Path of a JSON file whose top level is an object.
        key: Name of the entry to read.

    Returns:
        The value stored under ``key``, or ``None`` when the key is absent.
    """
    with open(file, "r") as handle:
        stored = json.load(handle)
    return stored.get(key)


def update_params_to_file(file, key, value):
    """Insert or overwrite one entry of a JSON parameter file.

    Every other entry already stored in the file is preserved. A file that
    does not exist yet is treated as an empty mapping and created (together
    with its parent folder), so the very first save for a fresh checkout
    works instead of raising ``FileNotFoundError``.

    Args:
        file: Path of the JSON file to update.
        key: Name of the entry to write.
        value: JSON-serialisable value stored under ``key``.
    """
    try:
        with open(file, "r") as f:
            params = json.load(f)
    except FileNotFoundError:
        # First write: there is nothing to merge with yet.
        params = {}
    params[key] = value

    parent = os.path.dirname(file)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(file, "w") as f:
        json.dump(params, f, indent=4)
        
        
def update_report_df(folder, key, df):
    """Write ``df`` to ``<folder>/<key>.xlsx``.

    The target folder is created when it is missing so that saving the first
    report does not fail.

    Args:
        folder: Directory that receives the workbook.
        key: File name (without extension) of the workbook.
        df: Object exposing ``to_excel(path)``, e.g. a pandas DataFrame.
    """
    os.makedirs(folder, exist_ok=True)
    df.to_excel(f"{folder}/{key}.xlsx")

In [5]:
# Locations of persisted results, relative to the notebook's working directory.
# JSON file holding the selected strategy parameters, one entry per ALL_CONFIG_KEY.
STRATEGY_PARAMS_FILE = "../data/params/strategy.json"
# JSON file holding the tuned model hyper-parameters, one entry per ALL_CONFIG_KEY.
MODEL_PARAMS_FILE = "../data/params/model.json"
# Folder receiving one "<ALL_CONFIG_KEY>.xlsx" report per configuration.
REPORT_DF_FOLDER = "../data/report_df"

In [6]:
# Maps each supported MODEL_LOSS to the name of the validation metric that
# objective() reads from evals_result["valid"].
MODEL_LOSS_KEY_DICT = {
    "mse": "l2",
    "mse_log": "l2",
    "binary": "binary_logloss",
    "lambdarank": "ndcg@5",
}

In [7]:
# Run configuration. These values are combined into ALL_CONFIG_KEY, which
# keys the persisted parameter files and the report workbook.
REGION = REG_CN
INSTRUMENTS = "csi300"
DEAL_PRICE = "open"
# One of the keys of MODEL_LOSS_KEY_DICT.
MODEL_LOSS = "mse"
# Processor class applied to the label for the "mse" / "mse_log" losses.
LABEL_NORM = "CSRankNorm"
ENABLE_NEUTRALIZE = True

# the day when you have the stock data after close
PRED_DATE = "2023-02-17"
TEST_END_DATE = PRED_DATE
# One *calendar* day before PRED_DATE (get_diff_date does not know trading days).
BACKTEST_END_DATE = get_diff_date(PRED_DATE, -1)

# When True, reuse the entry stored in the matching params file instead of searching again.
USE_BEST_STRATEGY_PARAMS = False
USE_BEST_MODEL_PARAMS = True

In [8]:
# Identifier of the current configuration; used as the key in the params JSON
# files and as the report workbook's file name.
ALL_CONFIG_KEY = f"{REGION}_{INSTRUMENTS}_{DEAL_PRICE}_{LABEL_NORM}_{MODEL_LOSS}"
if ENABLE_NEUTRALIZE:
    ALL_CONFIG_KEY += "_neutralize"

In [9]:
ALL_CONFIG_KEY

'cn_csi300_open_CSRankNorm_mse_neutralize'

In [10]:
# Benchmark index symbol for each supported instrument universe.
# NOTE(review): the name is a misspelling of "BENCHMARK"; it is kept because
# REGION_CONFIG looks the mapping up under this exact name.
BECHMARK_PARAMS = {
    "csi300": "SH000300",
    "csi500": "SH000905",
    "csi800": "SH000906",
    # https://github.com/microsoft/qlib/issues/720
    "SP500": "^gspc",
    "NASDAQ100": "^ndx",
}

In [11]:
# Candidate values of the (topk, n_drop) grid evaluated by get_best_topk_n_drop;
# combinations with topk < n_drop are skipped there.
TOPK_LIST = [1, 2, 4, 6, 8, 10]
N_DROP_LIST = [1, 2, 3, 4, 5]

In [12]:
# Per-region backtest settings (benchmark symbol and exchange kwargs) read by
# get_port_analysis_config. Both regions resolve the benchmark and the "codes"
# entry from the single INSTRUMENTS value configured above.
REGION_CONFIG = {
    REG_CN: {
        "benchmark": BECHMARK_PARAMS[INSTRUMENTS],
        "exchange_kwargs": {
            "codes": INSTRUMENTS,
            "freq": "day",
            "trade_unit": 100,
            "limit_threshold": 0.095,
            "deal_price": DEAL_PRICE,
            "open_cost": 0.0005,
            "close_cost": 0.0015,
            "min_cost": 5,
        }
    },
    REG_US: {
        "benchmark": BECHMARK_PARAMS[INSTRUMENTS],
        "exchange_kwargs": {
            "codes": INSTRUMENTS,
            "freq": "day",
            "trade_unit": 1,
            "limit_threshold": None,
            "deal_price": DEAL_PRICE,
            # estimated from moomoo sg
            "open_cost": 0.003,
            "close_cost": 0.005,
            "min_cost": 0
        }
    }
}

In [13]:
# Per-region date ranges (inclusive "start"/"end" strings) of the train, valid
# and test segments and of the backtest window. The test segment ends on
# TEST_END_DATE and the backtest on BACKTEST_END_DATE. Both regions currently
# use identical ranges.
DATE_CONFIG = {
    REG_CN: {
        "train": {
            "start": "2008-01-01",
            "end": "2016-12-31"
        },
        "valid": {
            "start": "2017-01-01",
            "end": "2018-12-31"
        },
        "test": {
            "start": "2019-01-01",
            "end": TEST_END_DATE
        },
        "backtest": {
            "start": "2019-01-01",
            "end": BACKTEST_END_DATE
        }
    },
    REG_US: {
        "train": {
            "start": "2008-01-01",
            "end": "2016-12-31"
        },
        "valid": {
            "start": "2017-01-01",
            "end": "2018-12-31"
        },
        "test": {
            "start": "2019-01-01",
            "end": TEST_END_DATE
        },
        "backtest": {
            "start": "2019-01-01",
            "end": BACKTEST_END_DATE
        }
    }
}

In [14]:
# Experiment name passed to R.start / R.get_recorder in the cells below.
EXP_NAME = "tutorial_exp"

# Data

In [15]:
# Initialise qlib against the local data bundle of the configured region
# (the path is derived from REGION, e.g. "~/.qlib/qlib_data/cn_data").
qlib.init(provider_uri=f"~/.qlib/qlib_data/{REGION}_data", region=REGION)

[94379:MainThread](2023-02-18 20:33:53,037) INFO - qlib.Initialization - [config.py:416] - default_conf: client.
[94379:MainThread](2023-02-18 20:33:53,045) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[94379:MainThread](2023-02-18 20:33:53,047) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/Users/chenglong.chen/.qlib/qlib_data/cn_data')}


## Off-the-shelf dataset

Qlib already ships with several ready-to-use datasets.

In [16]:
class IdentityReweighter(Reweighter):
    """Reweighter that gives every sample the same weight of 1.0."""

    def __init__(self):
        # Nothing to configure; the base-class initialiser is intentionally not called.
        pass

    def reweight(self, data: Union[pd.DataFrame, pd.Series]):
        """Return a Series of ones sharing the index of ``data``."""
        uniform_weights = pd.Series(data=1.0, index=data.index)
        return uniform_weights

    
class TargetReweighter(Reweighter):
    """Reweighter that weights each sample by the absolute value of its label."""

    def __init__(self):
        # Nothing to configure; the base-class initialiser is intentionally not called.
        pass

    def reweight(self, data: Union[pd.DataFrame, pd.Series]):
        """Return ``|data["label"]|`` flattened into a one-dimensional array."""
        label_magnitude = data["label"].abs()
        return label_magnitude.values.flatten()

In [17]:
# Sample weighting used by every model.fit call below.
reweighter = IdentityReweighter()

# Features receive the same inference-time treatment for every loss.
infer_processors = [
    {"class": "Fillna", "kwargs": {"fields_group": "feature"}},
    {"class": "CSRankNorm", "kwargs": {"fields_group": "feature"}},
]
# Rows without a label are always dropped for learning; the remaining label
# processing depends on the loss and is appended below.
learn_processors = [
    {"class": "DropnaLabel"},
]

if MODEL_LOSS == "binary":
    # 1 when the price two steps ahead is above the price one step ahead, else 0.
    labels = [f'If(Gt(Ref(${DEAL_PRICE}, -2), Ref(${DEAL_PRICE}, -1)), 1, 0)']
elif MODEL_LOSS == "lambdarank":
    # Simple forward return, bucketised per cross-section for ranking.
    labels = [f'Ref(${DEAL_PRICE}, -2)/Ref(${DEAL_PRICE}, -1) - 1']
    learn_processors.append({"class": "CSBucketizeLabel", "kwargs": {"bucket_size": 10}})
elif MODEL_LOSS == "mse_log":
    # Log forward return, normalised with LABEL_NORM.
    labels = [f'Log(Ref(${DEAL_PRICE}, -2)/Ref(${DEAL_PRICE}, -1))']
    learn_processors.append({"class": LABEL_NORM, "kwargs": {"fields_group": "label"}})
else:
    # Simple forward return, normalised with LABEL_NORM.
    labels = [f'Ref(${DEAL_PRICE}, -2)/Ref(${DEAL_PRICE}, -1) - 1']
    learn_processors.append({"class": LABEL_NORM, "kwargs": {"fields_group": "label"}})

label_names = ['LABEL0']

# The handler loads train start .. test end and fits its processors on the
# train segment only.
_region_dates = DATE_CONFIG[REGION]
handler_kwargs = dict(
    start_time=_region_dates["train"]["start"],
    end_time=_region_dates["test"]["end"],
    fit_start_time=_region_dates["train"]["start"],
    fit_end_time=_region_dates["train"]["end"],
    instruments=INSTRUMENTS,
    label=(labels, label_names),
    learn_processors=learn_processors,
    infer_processors=infer_processors,
)
handler_conf = dict(
    [
        ("class", "Alpha158"),
        ("module_path", "qlib.contrib.data.handler"),
        ("kwargs", handler_kwargs),
    ]
)

In [18]:
hd = init_instance_by_config(handler_conf)

[94379:MainThread](2023-02-18 20:35:28,330) INFO - qlib.timer - [log.py:128] - Time cost: 95.256s | Loading data Done
[94379:MainThread](2023-02-18 20:35:29,979) INFO - qlib.timer - [log.py:128] - Time cost: 1.252s | Fillna Done
[94379:MainThread](2023-02-18 20:36:58,113) INFO - qlib.timer - [log.py:128] - Time cost: 88.132s | CSRankNorm Done
[94379:MainThread](2023-02-18 20:37:02,655) INFO - qlib.timer - [log.py:128] - Time cost: 3.849s | DropnaLabel Done
[94379:MainThread](2023-02-18 20:37:02,976) INFO - qlib.timer - [log.py:128] - Time cost: 0.320s | CSRankNorm Done
[94379:MainThread](2023-02-18 20:37:03,073) INFO - qlib.timer - [log.py:128] - Time cost: 94.741s | fit & process data Done
[94379:MainThread](2023-02-18 20:37:03,074) INFO - qlib.timer - [log.py:128] - Time cost: 190.002s | Init data Done


In [19]:
df = hd.fetch()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [20]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,...,VSUMN10,VSUMN20,VSUMN30,VSUMN60,VSUMD5,VSUMD10,VSUMD20,VSUMD30,VSUMD60,LABEL0
datetime,instrument,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2008-01-02,SH600000,-0.311400,0.899600,-0.438267,1.637733,1.603133,1.153333,0.461333,-0.703533,-0.807333,0.519000,...,1.038000,1.014933,0.657400,0.173000,0.219133,-1.026467,-0.957267,-0.611267,-0.161467,-0.038642
2008-01-02,SH600001,0.795800,1.372467,0.034600,1.591600,1.257133,1.026467,-0.103800,0.265267,-0.346000,-0.588200,...,-1.522400,-1.326333,-1.510867,-0.530533,1.568533,1.591600,1.384000,1.557000,0.565133,0.145259
2008-01-02,SH600004,1.395533,0.841933,1.672333,-1.038000,-1.118733,-1.435900,-1.435900,1.372467,1.545467,-1.187933,...,-1.430133,-1.280200,-1.268667,0.495933,1.591600,1.499333,1.337867,1.314800,-0.484400,0.000451
2008-01-02,SH600005,-1.580067,-0.023067,-1.487800,-0.588200,-0.542067,0.415200,0.311400,-1.453200,-1.407067,1.591600,...,0.299867,1.591600,1.557000,1.499333,-1.245600,-0.230667,-1.580067,-1.545467,-1.487800,0.071758
2008-01-02,SH600006,-0.207600,-0.334467,-0.115333,-0.011533,0.103800,0.899600,1.118733,0.196067,0.530533,0.415200,...,0.795800,-0.622800,0.876533,0.899600,0.023067,-0.784267,0.680467,-0.830400,-0.888067,0.029483
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-02-17,SZ300896,-0.173000,0.334467,0.242200,1.164867,0.853467,-0.622800,-0.634333,-0.807333,-0.911133,0.184533,...,0.046133,1.349400,1.257133,0.299867,-0.738133,-0.034600,-1.337867,-1.245600,-0.288333,
2023-02-17,SZ300919,-0.276800,-0.369067,-0.103800,0.357533,0.415200,-1.343633,-1.343633,-0.380600,-1.453200,0.288333,...,0.265267,1.545467,1.038000,1.107200,-0.369067,-0.253733,-1.533933,-1.026467,-1.095667,
2023-02-17,SZ300957,1.441667,1.441667,1.176400,1.718467,1.084133,1.533933,0.945733,-0.230667,1.107200,-1.430133,...,-1.418600,-0.149933,-1.164867,-0.507467,1.141800,1.430133,0.161467,1.176400,0.519000,
2023-02-17,SZ300979,1.234067,0.173000,1.130267,1.153333,0.888067,1.407067,1.251367,0.980333,1.257133,-1.222533,...,1.453200,1.649267,1.430133,1.337867,-1.176400,-1.441667,-1.637733,-1.418600,-1.326333,


In [21]:
hd.data_loader


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



<qlib.data.dataset.loader.QlibDataLoader at 0x7fbea00f7400>

In [22]:
hd.data_loader.fields

{'feature': (['($close-$open)/$open',
   '($high-$low)/$open',
   '($close-$open)/($high-$low+1e-12)',
   '($high-Greater($open, $close))/$open',
   '($high-Greater($open, $close))/($high-$low+1e-12)',
   '(Less($open, $close)-$low)/$open',
   '(Less($open, $close)-$low)/($high-$low+1e-12)',
   '(2*$close-$high-$low)/$open',
   '(2*$close-$high-$low)/($high-$low+1e-12)',
   '$open/$close',
   '$high/$close',
   '$low/$close',
   '$vwap/$close',
   'Ref($close, 5)/$close',
   'Ref($close, 10)/$close',
   'Ref($close, 20)/$close',
   'Ref($close, 30)/$close',
   'Ref($close, 60)/$close',
   'Mean($close, 5)/$close',
   'Mean($close, 10)/$close',
   'Mean($close, 20)/$close',
   'Mean($close, 30)/$close',
   'Mean($close, 60)/$close',
   'Std($close, 5)/$close',
   'Std($close, 10)/$close',
   'Std($close, 20)/$close',
   'Std($close, 30)/$close',
   'Std($close, 60)/$close',
   'Slope($close, 5)/$close',
   'Slope($close, 10)/$close',
   'Slope($close, 20)/$close',
   'Slope($close, 30)/

In [23]:
hd.learn_processors

[<qlib.data.dataset.processor.DropnaLabel at 0x7fbea0101c10>,
 <qlib.data.dataset.processor.CSRankNorm at 0x7fbea0101070>]

In [24]:
hd.infer_processors

[<qlib.data.dataset.processor.Fillna at 0x7fbea00f7a90>,
 <qlib.data.dataset.processor.CSRankNorm at 0x7fbea01018b0>]

In [25]:
hd

<qlib.contrib.data.handler.Alpha158 at 0x7fbea00f7b50>

In [26]:
hd.process_type # appending type

'append'

In [27]:
hd.fetch(col_set="label", data_key=hd.DK_L)

Unnamed: 0_level_0,Unnamed: 1_level_0,LABEL0
datetime,instrument,Unnamed: 2_level_1
2008-01-02,SH600000,-1.658660
2008-01-02,SH600001,1.730000
2008-01-02,SH600004,-0.731237
2008-01-02,SH600005,1.587320
2008-01-02,SH600006,0.517216
...,...,...
2023-02-15,SZ300896,0.968800
2023-02-15,SZ300919,-1.649267
2023-02-15,SZ300957,-0.715067
2023-02-15,SZ300979,0.438267


In [28]:
hd.fetch(col_set="label", data_key=hd.DK_I)


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



Unnamed: 0_level_0,Unnamed: 1_level_0,LABEL0
datetime,instrument,Unnamed: 2_level_1
2008-01-02,SH600000,-0.038642
2008-01-02,SH600001,0.145259
2008-01-02,SH600004,0.000451
2008-01-02,SH600005,0.071758
2008-01-02,SH600006,0.029483
...,...,...
2023-02-17,SZ300896,
2023-02-17,SZ300919,
2023-02-17,SZ300957,
2023-02-17,SZ300979,


In [29]:
# Dataset configuration: wraps the already-initialised handler `hd` and slices
# it into train / valid / test segments using the (start, end) pairs from
# DATE_CONFIG for the configured region.
dataset_conf = {
    "class": "DatasetH",
    "module_path": "qlib.data.dataset",
    "kwargs": {
        "handler": hd,
        "segments": {
            "train": (DATE_CONFIG[REGION]["train"]["start"], DATE_CONFIG[REGION]["train"]["end"]),
            "valid": (DATE_CONFIG[REGION]["valid"]["start"], DATE_CONFIG[REGION]["valid"]["end"]),
            "test": (DATE_CONFIG[REGION]["test"]["start"], DATE_CONFIG[REGION]["test"]["end"]),
        },
    },
}


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.



In [30]:
dataset = init_instance_by_config(dataset_conf)

In [31]:
# Train frame: features and label fetched with the DK_L data key.
# Test frame: features only, fetched with the DK_I data key.
# presumably DK_L / DK_I select the learn- / infer-processed views of the handler — confirm against qlib's DataHandlerLP.
df_train = dataset.prepare("train", col_set=["feature", "label"], data_key=DataHandlerLP.DK_L)
df_test = dataset.prepare("test", col_set=["feature"], data_key=DataHandlerLP.DK_I)

In [32]:
np.isfinite(df_train["feature"].values).mean(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.])

In [33]:
np.isfinite(df_test["feature"].values).mean(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.])

In [34]:
# Features handed to model.fit for neutralisation; stays None when
# ENABLE_NEUTRALIZE is off.
# NOTE(review): get_riskest_features (sic) is imported from qlib.contrib.data.utils.neutralize;
# its selection criterion is not visible in this notebook.
riskiest_features = None
if ENABLE_NEUTRALIZE:
    riskiest_features = get_riskest_features(df_train)

# Model Training & Inference

In [35]:
def objective(trial):
    """Optuna objective: fit one LGBModel and score it on the validation set.

    Hyper-parameters are sampled from ``trial``. The model is fitted on the
    notebook-level ``dataset`` with the notebook-level ``riskiest_features``
    and ``reweighter``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        The value to minimise: the best (lowest) validation metric, or the
        negated best (highest) metric for ``lambdarank``. A trial that fails
        for any reason is reported and scored with the penalty value 100 so
        that the study keeps running.
    """
    model_config = {
        "class": "LGBModel",
        "module_path": "qlib.contrib.model.gbdt",
        "kwargs": {
            # Same mapping as the final model: "mse_log" only changes the
            # label definition, the booster itself is trained with "mse".
            "loss": "mse" if MODEL_LOSS == "mse_log" else MODEL_LOSS,
            "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1),
            "learning_rate": trial.suggest_uniform("learning_rate", 0, 1),
            "subsample": trial.suggest_uniform("subsample", 0.5, 1),
            "lambda_l1": trial.suggest_loguniform("lambda_l1", 1e-8, 1e4),
            "lambda_l2": trial.suggest_loguniform("lambda_l2", 1e-8, 1e4),
            "num_leaves": trial.suggest_int("num_leaves", 100, 1024),
            "max_depth": 8,
            "enable_neutralize": ENABLE_NEUTRALIZE,
        },
    }
    try:
        evals_result = dict()
        model = init_instance_by_config(model_config)
        model.fit(dataset, riskiest_features=riskiest_features, reweighter=reweighter, evals_result=evals_result)
        valid_scores = evals_result["valid"][MODEL_LOSS_KEY_DICT[MODEL_LOSS]]
        if MODEL_LOSS == "lambdarank":
            # ndcg is better when larger; negate it because the study minimises.
            return -max(valid_scores)
        return min(valid_scores)
    except Exception as exc:
        # Best effort: keep the study alive, but make the failure visible.
        # KeyboardInterrupt / SystemExit are deliberately not caught so the
        # search can still be stopped by the user.
        print(f"Trial {getattr(trial, 'number', '?')} failed and is scored 100: {exc!r}")
        return 100

In [36]:
# Run the hyper-parameter search only when stored parameters must not be
# reused, or when there is nothing stored for this configuration yet.
if (
    (not USE_BEST_MODEL_PARAMS)
    or (not os.path.exists(MODEL_PARAMS_FILE))
    or (get_params_from_file(MODEL_PARAMS_FILE, ALL_CONFIG_KEY) is None)
):
    study = optuna.create_study(
        study_name=f"lightgbm_alpha158_{ALL_CONFIG_KEY}_{timestamp()}",
        storage="sqlite:///db.sqlite3",
        # The study name only changes once per day; resume the stored study
        # instead of failing when this cell is re-run on the same day.
        load_if_exists=True,
    )
    study.optimize(objective, n_jobs=1, n_trials=50)
    update_params_to_file(MODEL_PARAMS_FILE, ALL_CONFIG_KEY, study.best_params)

In [37]:
# Hyper-parameters stored for this configuration (None if the key is absent).
model_params = get_params_from_file(MODEL_PARAMS_FILE, ALL_CONFIG_KEY)

In [38]:
# Add the settings that are fixed rather than tuned. "mse_log" is trained with
# the "mse" loss; only its label definition differs.
# NOTE(review): num_threads is hard-coded to 20 — confirm it suits the machine running this.
model_params.update({
    "loss": "mse" if MODEL_LOSS == "mse_log" else MODEL_LOSS,
    "max_depth": 8,
    "enable_neutralize": ENABLE_NEUTRALIZE,
    "num_threads": 20,
})

In [39]:
model_params

{'colsample_bytree': 0.8879,
 'lambda_l1': 205.6999,
 'lambda_l2': 580.9768,
 'learning_rate': 0.2,
 'num_leaves': 210,
 'subsample': 0.8789,
 'loss': 'mse',
 'max_depth': 8,
 'enable_neutralize': True,
 'num_threads': 20}

In [40]:
# Instantiate the final model from the merged (tuned + fixed) parameters.
model = init_instance_by_config({
    "class": "LGBModel",
    "module_path": "qlib.contrib.model.gbdt",
    "kwargs": model_params
})

ModuleNotFoundError. CatBoostModel are skipped. (optional: maybe installing CatBoostModel can fix it.)


In [41]:
# start exp to train model
# End any MLflow run left active (e.g. by an interrupted cell) before a new
# recorder is started.
mlflow.end_run()
with R.start(experiment_name=EXP_NAME):
    model.fit(dataset, riskiest_features=riskiest_features, reweighter=reweighter)
    R.save_objects(trained_model=model)

    rec = R.get_recorder()
    rid = rec.id # save the record id

    # Inference and saving signal
    sr = SignalRecord(model, dataset, rec)
    sr.generate()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.

[94379:MainThread](2023-02-18 20:38:36,014) INFO - qlib.workflow - [exp.py:258] - Experiment 1 starts running ...
[94379:MainThread](2023-02-18 20:38:36,152) INFO - qlib.workflow - [recorder.py:341] - Recorder ba2c1dac727a44f18772e7b7144486de starts running under Experiment 1 ...


Training until validation scores don't improve for 50 rounds
[20]	train's l2: 0.976249	valid's l2: 0.99058
[40]	train's l2: 0.967297	valid's l2: 0.988505
[60]	train's l2: 0.959167	valid's l2: 0.987635
[80]	train's l2: 0.952344	valid's l2: 0.987426
[100]	train's l2: 0.94609	valid's l2: 0.987194
[120]	train's l2: 0.940499	valid's l2: 0.987466
[140]	train's l2: 0.935013	valid's l2: 0.987456
Early stopping, best iteration is:
[102]	train's l2: 0.945501	valid's l2: 0.987111


[94379:MainThread](2023-02-18 20:39:11,875) INFO - qlib.workflow - [record_temp.py:196] - Signal record 'pred.pkl' has been saved as the artifact of the Experiment 1


'The following are prediction results of the LGBModel model.'
                          score
datetime   instrument          
2019-01-02 SH600000    0.400908
           SH600004    0.698008
           SH600009    2.114469
           SH600010   -0.159841
           SH600011   -0.551085


[94379:MainThread](2023-02-18 20:39:13,900) INFO - qlib.timer - [log.py:128] - Time cost: 0.000s | waiting `async_log` Done


# Evaluation:
- Signal-based
- Portfolio-based: backtest 

In [None]:
###################################
# prediction, backtest & analysis
###################################
def get_port_analysis_config(topk, n_drop):
    """Build the portfolio-analysis (backtest) configuration for one strategy.

    The backtest window, benchmark and exchange settings are read from the
    notebook-level DATE_CONFIG / REGION_CONFIG entries of the current REGION.

    Args:
        topk: ``topk`` argument of the TopkDropoutStrategy.
        n_drop: ``n_drop`` argument of the TopkDropoutStrategy.

    Returns:
        A dict with the "executor", "strategy" and "backtest" sections.
    """
    backtest_window = DATE_CONFIG[REGION]["backtest"]
    region_settings = REGION_CONFIG[REGION]

    executor_section = {
        "class": "SimulatorExecutor",
        "module_path": "qlib.backtest.executor",
        "kwargs": {
            "time_per_step": "day",
            "generate_portfolio_metrics": True,
        },
    }
    strategy_section = {
        "class": "TopkDropoutStrategy",
        "module_path": "qlib.contrib.strategy.signal_strategy",
        "kwargs": {
            "signal": "<PRED>",
            "topk": topk,
            "n_drop": n_drop,
        },
    }
    backtest_section = {
        "start_time": backtest_window["start"],
        "end_time": backtest_window["end"],
        "account": 100000000,
        "benchmark": region_settings["benchmark"],
        "exchange_kwargs": region_settings["exchange_kwargs"],
    }
    return {
        "executor": executor_section,
        "strategy": strategy_section,
        "backtest": backtest_section,
    }


def get_best_topk_n_drop(rec, topk_list, n_drop_list, strategy="information_ratio"):
    """Backtest a grid of (topk, n_drop) pairs and pick the best one.

    For every pair with ``topk >= n_drop`` a portfolio analysis is generated on
    ``rec`` and the "excess_return_with_cost" row of the resulting analysis is
    recorded. Skipped pairs stay NaN.

    Args:
        rec: Recorder that holds the prediction signal and receives the
            portfolio-analysis artifacts.
        topk_list: Candidate ``topk`` values (rows of the grid).
        n_drop_list: Candidate ``n_drop`` values (columns of the grid).
        strategy: Name of the metric whose maximum selects the winner.

    Returns:
        ``(return_df, strategy_params)`` where ``return_df`` stacks one
        topk-by-n_drop table per metric and ``strategy_params`` is
        ``{"topk": ..., "n_drop": ...}`` of the winning pair.

    Raises:
        ValueError: from ``np.nanargmax`` when no pair was evaluated.
    """
    keys = ["mean", "std", "annualized_return", "information_ratio", "max_drawdown"]
    grid_shape = (len(topk_list), len(n_drop_list))
    excess_returns = {metric: np.full(grid_shape, np.nan) for metric in keys}

    for row, topk in enumerate(topk_list):
        for col, n_drop in enumerate(n_drop_list):
            if topk < n_drop:
                # Cannot drop more names than are held.
                continue
            PortAnaRecord(rec, get_port_analysis_config(topk, n_drop), "day").generate()
            analysis_df = rec.load_object("portfolio_analysis/port_analysis_1day.pkl")
            for metric in keys:
                cell = analysis_df.loc["excess_return_with_cost", metric]
                excess_returns[metric][row, col] = float(cell)

    target = excess_returns[strategy]
    flat_best = np.nanargmax(target, axis=None)
    ind = np.unravel_index(flat_best, target.shape)

    per_metric_tables = {}
    for metric in keys:
        per_metric_tables[metric] = pd.DataFrame(
            excess_returns[metric], index=topk_list, columns=n_drop_list
        )
    return_df = pd.concat(per_metric_tables, axis=0)

    strategy_params = {"topk": topk_list[ind[0]], "n_drop": n_drop_list[ind[1]]}
    return return_df, strategy_params


# backtest and analysis
with R.start(experiment_name=EXP_NAME, recorder_id=rid, resume=True):

    # signal-based analysis
    rec = R.get_recorder()
    sar = SigAnaRecord(rec)
    sar.generate()

    # portfolio-based analysis: backtest
    strategy_params = None
    if USE_BEST_STRATEGY_PARAMS and os.path.exists(STRATEGY_PARAMS_FILE):
        # Reuse the stored parameters (this used to call the undefined name
        # `get_paras_from_file`, which raised NameError).
        strategy_params = get_params_from_file(STRATEGY_PARAMS_FILE, ALL_CONFIG_KEY)
    if strategy_params is None:
        # Either a fresh search was requested or nothing is stored for this
        # configuration yet: search the grid and persist the outcome.
        return_df, strategy_params = get_best_topk_n_drop(rec, TOPK_LIST, N_DROP_LIST)
        update_report_df(REPORT_DF_FOLDER, ALL_CONFIG_KEY, return_df)
        update_params_to_file(STRATEGY_PARAMS_FILE, ALL_CONFIG_KEY, strategy_params)

    # Re-run the analysis with the selected parameters so the recorder's
    # portfolio artifacts correspond to them and not to the last grid point.
    port_analysis_config = get_port_analysis_config(strategy_params["topk"], strategy_params["n_drop"])
    par = PortAnaRecord(rec, port_analysis_config, "day")
    par.generate()


`should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.

[94379:MainThread](2023-02-18 20:39:13,926) INFO - qlib.workflow - [exp.py:258] - Experiment 1 starts running ...
[94379:MainThread](2023-02-18 20:39:13,942) INFO - qlib.workflow - [recorder.py:341] - Recorder ba2c1dac727a44f18772e7b7144486de starts running under Experiment 1 ...
[94379:MainThread](2023-02-18 20:39:15,399) INFO - qlib.backtest caller - [__init__.py:93] - Create new exchange


{'IC': 0.01338027244430197,
 'ICIR': 0.21248286570800232,
 'Rank IC': 0.01623229880991675,
 'Rank ICIR': 0.26133439728036456}




backtest loop:   0%|          | 0/1000 [00:00<?, ?it/s]


Mean of empty slice

[94379:MainThread](2023-02-18 20:39:31,041) INFO - qlib.workflow - [record_temp.py:505] - Portfolio analysis record 'port_analysis_1day.pkl' has been saved as the artifact of the Experiment 1
[94379:MainThread](2023-02-18 20:39:31,063) INFO - qlib.workflow - [record_temp.py:530] - Indicator analysis record 'indicator_analysis_1day.pkl' has been saved as the artifact of the Experiment 1
[94379:MainThread](2023-02-18 20:39:31,120) INFO - qlib.backtest caller - [__init__.py:93] - Create new exchange


'The following are analysis results of benchmark return(1day).'
                       risk
mean               0.000402
std                0.012789
annualized_return  0.095626
information_ratio  0.484671
max_drawdown      -0.473380
'The following are analysis results of the excess return without cost(1day).'
                       risk
mean               0.000762
std                0.020282
annualized_return  0.181273
information_ratio  0.579326
max_drawdown      -0.412156
'The following are analysis results of the excess return with cost(1day).'
                       risk
mean              -0.000066
std                0.020340
annualized_return -0.015684
information_ratio -0.049984
max_drawdown      -0.651548
'The following are analysis results of indicators(1day).'
     value
ffr    1.0
pa     0.0
pos    0.0




In [None]:
strategy_params

# Loading results & Analysis

## loading data
Qlib uses MLflow to store models and data,
so all of the saved artifacts can be browsed with `mlflow ui`.

In [None]:
# load recorder
# Looked up by the id saved during training, within the same experiment.
recorder = R.get_recorder(recorder_id=rid, experiment_name=EXP_NAME)

In [None]:
# load previous results
# pred.pkl holds the prediction signal; the portfolio_analysis/* artifacts come
# from the most recent PortAnaRecord.generate() call on this recorder.
pred_df = recorder.load_object("pred.pkl")
report_normal_df = recorder.load_object("portfolio_analysis/report_normal_1day.pkl")
positions = recorder.load_object("portfolio_analysis/positions_normal_1day.pkl")
analysis_df = recorder.load_object("portfolio_analysis/port_analysis_1day.pkl")

In [None]:
analysis_df

In [None]:
# The previously trained model can be loaded back from the recorder; it is
# shown here for reference only and is not used in the cells below.
loaded_model = recorder.load_object("trained_model")
loaded_model

## analysis position

### report

In [None]:
# Plot the backtest report loaded from the recorder.
analysis_position.report_graph(report_normal_df)

### risk analysis

In [None]:
# Plot the risk metrics of the portfolio analysis alongside the report.
analysis_position.risk_analysis_graph(analysis_df, report_normal_df)

## analysis model

In [None]:
# Labels of the test segment, with the single column renamed to "label" so it
# can be joined with the prediction scores.
label_df = dataset.prepare("test", col_set="label")
label_df.columns = ['label']

### score IC

In [None]:
# Put label and prediction side by side, keeping exactly the rows (and row
# order) of label_df.
pred_label = pd.concat([label_df, pred_df], axis=1, sort=True).reindex(label_df.index)
analysis_position.score_ic_graph(pred_label)

### model performance

In [None]:
# Plot the model-performance charts for the joined label / score frame.
analysis_model.model_performance_graph(pred_label)

# Trading

In [None]:
def get_pred_score(pred_df):
    """Return the scores of the most recent prediction date, best first.

    Args:
        pred_df: Prediction frame indexed by (datetime, instrument) with a
            "score" column; its last row determines the date that is used.

    Returns:
        The first column of that date's rows as a Series indexed by
        instrument, sorted by "score" in descending order.
    """
    latest_date = pred_df.index[-1][0]
    latest_rows = pred_df.loc[latest_date]
    ranked = latest_rows.sort_values(by="score", ascending=False)
    return ranked.iloc[:, 0]

In [None]:
pred_df.index[-1][0]

In [None]:
pred_score = get_pred_score(pred_df)

In [None]:
class TopkDropoutStrategy:
    """Print a manual trading plan from the latest scores and current holdings.

    Attributes are plain values set in ``__init__``:
    ``topk`` is the number of buy candidates to list and ``n_drop`` is the
    number of lowest-scored holdings to list for selling.
    """

    def __init__(self, topk, n_drop):
        self.topk = topk
        self.n_drop = n_drop

    def excute(self, pred_score, current_stock_list):
        """Print the holdings, the sell list and the buy candidates.

        Args:
            pred_score: Series of scores indexed by instrument.
            current_stock_list: Instruments currently held. Holdings that have
                no score are kept and shown with NaN; they sort as the
                lowest-ranked holdings and are therefore sold first.

        The LAST section lists the holdings from lowest to highest score, the
        SELL section the ``n_drop`` lowest-scored holdings (lowest first) and
        the BUY section the ``topk`` best-scored instruments not held yet.
        """
        # Reindexing (instead of indexing pred_score by label afterwards)
        # keeps holdings without a score from raising KeyError.
        held = pred_score.reindex(current_stock_list).sort_values(ascending=False)
        candidates = pred_score[~pred_score.index.isin(held.index)].sort_values(ascending=False)

        # A plain `held[-n_drop:]` would select *everything* for n_drop == 0.
        first_sold = max(len(held) - self.n_drop, 0)
        sell = held.iloc[first_sold:][::-1]
        buy = candidates.iloc[:self.topk]

        print("-"*15+"LAST"+"-"*15)
        print(held[::-1])

        print("-"*15+"SELL"+"-"*15)
        print(sell)

        print("-"*15+"BUY"+"-"*15)
        print(buy)
        print("-"*34)

    # Correctly spelled alias; `excute` is kept for existing callers.
    execute = excute

In [None]:
# Instruments currently held; edit by hand before producing the trading plan.
current_stock_list = ["SZ002821", "SH603087", "SH603486","SH603806"]

In [None]:
# NOTE(review): the number of buy candidates is twice the selected topk; the
# reason is not visible in this notebook — confirm the factor is intended.
strategy = TopkDropoutStrategy(2*strategy_params["topk"], strategy_params["n_drop"])
strategy.excute(pred_score, current_stock_list)