<a href="https://colab.research.google.com/github/microsoft/qlib/blob/main/examples/workflow_by_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
#  Copyright (c) Microsoft Corporation.
#  Licensed under the MIT License.

In [11]:
from copy import deepcopy
import qlib
import pandas as pd
from qlib.constant import REG_US
from qlib.data import D
from qlib.utils import exists_qlib_data, init_instance_by_config, flatten_dict
from qlib.workflow import R
from qlib.workflow.record_temp import SignalRecord, PortAnaRecord
from qlib.tests.data import GetData
from qlib.contrib.report import analysis_model, analysis_position

In [12]:
# Name of the experiment under which the prediction/backtest run is recorded.
EXP_NAME = "nested"

# Instrument universe passed to the data handler.
MARKET = "nasdaq100"
# Benchmark symbol passed to the backtest configuration.
BENCHMARK = "QQQ"

## initialize qlib

In [13]:
###################################
# initialize qlib
###################################
# Location of the custom qlib data directory used by this notebook.
provider_uri = "/root/onethingai-tmp/.qlib/qlib_data/us_data"  # target_dir

# A missing data directory is only reported; qlib.init is still called afterwards.
data_available = exists_qlib_data(provider_uri)
if not data_available:
    print(f"Qlib data is not found in {provider_uri}")

qlib.init(provider_uri=provider_uri, region=REG_US)

[6706:MainThread](2024-06-20 16:54:29,407) INFO - qlib.Initialization - [config.py:416] - default_conf: client.
[6706:MainThread](2024-06-20 16:54:29,412) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[6706:MainThread](2024-06-20 16:54:29,414) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/root/onethingai-tmp/.qlib/qlib_data/us_data')}


In [14]:
# Sanity-check the data provider: fetch a few raw and derived fields for QQQ
# and show the first rows. `D` comes from the import cell at the top of the
# notebook, so it is not imported again here.
instruments = ['QQQ']
fields = ['$close', '$volume', 'Ref($close, 1)', 'Mean($close, 3)', '$high-$low']

features_preview = (
    D.features(instruments, fields, start_time='2020-04-29', end_time='2020-06-18', freq='day')
    .head()
    .to_string()
)
print(features_preview)
# Cell result: number of characters in the rendered preview text.
len(features_preview)

                         $close       $volume  Ref($close, 1)  Mean($close, 3)  $high-$low
instrument datetime                                                                       
QQQ        2020-04-29  2.646204  4.031772e+09        2.555581         2.602141    0.058362
           2020-04-30  2.645117  3.588434e+09        2.646204         2.615634    0.032503
           2020-05-01  2.570564  4.038276e+09        2.645117         2.620629    0.060295
           2020-05-04  2.600530  2.717923e+09        2.570564         2.605404    0.052320
           2020-05-05  2.630013  3.126931e+09        2.600530         2.600369    0.039753


636

## train model

In [15]:
###################################
# train model
###################################
# Handler settings: overall data range, the window the handler is fitted on,
# and the instrument universe.
data_handler_config = dict(
    start_time="2008-01-01",
    end_time="2020-08-01",
    fit_start_time="2008-01-01",
    fit_end_time="2014-12-31",
    instruments=MARKET,
)

# Keyword arguments forwarded to the LGBModel constructor.
lgb_kwargs = dict(
    loss="mse",
    colsample_bytree=0.8879,
    learning_rate=0.0421,
    subsample=0.8789,
    lambda_l1=205.6999,
    lambda_l2=580.9768,
    max_depth=8,
    num_leaves=210,
    num_threads=20,
)
model_config = {
    "class": "LGBModel",
    "module_path": "qlib.contrib.model.gbdt",
    "kwargs": lgb_kwargs,
}

handler_config = {
    "class": "Alpha158",
    "module_path": "qlib.contrib.data.handler",
    "kwargs": data_handler_config,
}
# Note: the test segment starts in 2020, so 2017-2019 belongs to no segment.
segment_ranges = dict(
    train=("2008-01-01", "2014-12-31"),
    valid=("2015-01-01", "2016-12-31"),
    test=("2020-01-01", "2020-08-01"),
)
dataset_config = {
    "class": "DatasetH",
    "module_path": "qlib.data.dataset",
    "kwargs": {
        "handler": handler_config,
        "segments": segment_ranges,
    },
}

# Full task description; it is also what gets logged as run parameters below.
task = {
    "model": model_config,
    "dataset": dataset_config,
}

# model initialization
model = init_instance_by_config(task["model"])
dataset = init_instance_by_config(task["dataset"])

# Train inside a "train_model" experiment run, persist the fitted model, and
# keep the recorder id so later cells can load the model back.
with R.start(experiment_name="train_model"):
    R.log_params(**flatten_dict(task))
    model.fit(dataset)
    R.save_objects(trained_model=model)
    rid = R.get_recorder().id

[6706:MainThread](2024-06-20 16:54:42,605) INFO - qlib.timer - [log.py:127] - Time cost: 13.102s | Loading data Done
[6706:MainThread](2024-06-20 16:54:42,909) INFO - qlib.timer - [log.py:127] - Time cost: 0.090s | DropnaLabel Done
[6706:MainThread](2024-06-20 16:54:50,753) INFO - qlib.timer - [log.py:127] - Time cost: 7.842s | CSZScoreNorm Done
[6706:MainThread](2024-06-20 16:54:50,756) INFO - qlib.timer - [log.py:127] - Time cost: 8.147s | fit & process data Done
[6706:MainThread](2024-06-20 16:54:50,758) INFO - qlib.timer - [log.py:127] - Time cost: 21.256s | Init data Done
[6706:MainThread](2024-06-20 16:54:50,763) INFO - qlib.workflow - [exp.py:258] - Experiment 2 starts running ...
[6706:MainThread](2024-06-20 16:54:50,778) INFO - qlib.workflow - [recorder.py:341] - Recorder aa84bb15ae35429e88d7e77fde50be86 starts running under Experiment 2 ...
usage: git diff --no-index [<options>] <path> <path>

Diff output format options
    -p, --patch           generate patch
    -s, --no-pa

Training until validation scores don't improve for 50 rounds
[20]	train's l2: 0.981829	valid's l2: 0.986552
[40]	train's l2: 0.979978	valid's l2: 0.986656


[6706:MainThread](2024-06-20 16:54:54,667) INFO - qlib.timer - [log.py:127] - Time cost: 0.089s | waiting `async_log` Done


Early stopping, best iteration is:
[7]	train's l2: 0.983135	valid's l2: 0.986524


## prediction, backtest & analysis

In [16]:
###################################
# prediction, backtest & analysis
###################################
executor_config = {
    "class": "SimulatorExecutor",
    "module_path": "qlib.backtest.executor",
    "kwargs": dict(time_per_step="day", generate_portfolio_metrics=True),
}

strategy_config = {
    "class": "TopkDropoutStrategy",
    "module_path": "qlib.contrib.strategy.signal_strategy",
    "kwargs": dict(model=model, dataset=dataset, topk=50, n_drop=5),
}

# NOTE(review): confirm that a 0.095 limit_threshold is intended for this
# US-market (REG_US) backtest.
exchange_settings = dict(
    freq="day",
    limit_threshold=0.095,
    deal_price="close",
    open_cost=0.0005,
    close_cost=0.0015,
    min_cost=5,
)
backtest_config = dict(
    start_time="2020-01-01",
    end_time="2020-01-08",
    account=100000000,
    benchmark=BENCHMARK,
    exchange_kwargs=exchange_settings,
)

port_analysis_config = {
    "executor": executor_config,
    "strategy": strategy_config,
    "backtest": backtest_config,
}

# backtest and analysis
with R.start(experiment_name=EXP_NAME, resume=True):
    # Load the model object saved by the training run.
    trained_recorder = R.get_recorder(recorder_id=rid, experiment_name="train_model")
    model = trained_recorder.load_object("trained_model")

    # prediction: recorded on the currently active recorder, whose id is kept
    # so the analysis cells can reload the artifacts.
    recorder = R.get_recorder()
    ba_rid = recorder.id
    sr = SignalRecord(model, dataset, recorder)
    sr.generate()

    # backtest & analysis
    par = PortAnaRecord(recorder, port_analysis_config, "day")
    par.generate()

[6706:MainThread](2024-06-20 16:54:54,695) INFO - qlib.workflow - [exp.py:258] - Experiment 1 starts running ...
[6706:MainThread](2024-06-20 16:54:54,720) INFO - qlib.workflow - [recorder.py:341] - Recorder 4548e927668c4433bd1931365f726875 starts running under Experiment 1 ...
usage: git diff --no-index [<options>] <path> <path>

Diff output format options
    -p, --patch           generate patch
    -s, --no-patch        suppress diff output
    -u                    generate patch
    -U, --unified[=<n>]   generate diffs with <n> lines context
    -W, --function-context
                          generate diffs with <n> lines context
    --raw                 generate the diff in raw format
    --patch-with-raw      synonym for '-p --raw'
    --patch-with-stat     synonym for '-p --stat'
    --numstat             machine friendly --stat
    --shortstat           output only the last line of --stat
    -X, --dirstat[=<param1,param2>...]
                          output the distributio

'The following are prediction results of the LGBModel model.'
                          score
datetime   instrument          
2020-01-02 AAL        -0.001310
           AAPL       -0.007562
           ADBE       -0.002970
           ADI        -0.002825
           ADP         0.004594


[6706:MainThread](2024-06-20 16:55:00,566) INFO - qlib.timer - [log.py:127] - Time cost: 0.000s | waiting `async_log` Done


AttributeError: 'float' object has no attribute 'lower'

# analyze graphs

In [None]:
# load recorder
# The backtest recorder identified by `ba_rid` was created under EXP_NAME, so
# the lookup has to use that same experiment name; a hard-coded
# "backtest_analysis" would search an experiment this notebook never wrote to.
recorder = R.get_recorder(recorder_id=ba_rid, experiment_name=EXP_NAME)

# load previous results
pred_df = recorder.load_object("pred.pkl")
report_normal_df = recorder.load_object("portfolio_analysis/report_normal_1day.pkl")
positions = recorder.load_object("portfolio_analysis/positions_normal_1day.pkl")
analysis_df = recorder.load_object("portfolio_analysis/port_analysis_1day.pkl")

## analysis position

### report

In [None]:
# Render the report graph from the backtest report frame loaded from the recorder.
analysis_position.report_graph(report_normal_df)

### risk analysis

In [None]:
# Render the risk-analysis graph from the port-analysis frame and the backtest report frame.
analysis_position.risk_analysis_graph(analysis_df, report_normal_df)

## analysis model

In [None]:
# Fetch the label column set for the "test" segment and rename its single
# column to "label" so it is easy to tell apart from the prediction column.
label_df = dataset.prepare("test", col_set="label")
label_df.columns = ["label"]

### score IC

In [None]:
# Put labels and predictions side by side, then restrict and order the rows
# to those present in the label frame before plotting the score IC.
label_and_pred = pd.concat([label_df, pred_df], axis=1, sort=True)
pred_label = label_and_pred.reindex(label_df.index)
analysis_position.score_ic_graph(pred_label)

### model performance

In [None]:
# Render the model-performance graph from the joined label/prediction frame.
analysis_model.model_performance_graph(pred_label)