In [1]:
import pandas as pd
import math
from scipy import stats
import time
import numpy as np
import os
import plotly.express as px
import matplotlib.pyplot as plt
import xgboost as xgb

import wandb
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from sklearn import preprocessing
import os
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
import gc 
from wandb.xgboost import wandb_callback
from wandb.keras import WandbCallback

In [2]:
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class GroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.
    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.
    This cross-validation object is a variation of :class:`KFold`.
    In the kth split, it returns first k folds as train set and the
    (k+1)th fold as test set.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    Note that unlike standard cross-validation methods, successive
    training sets are supersets of those that come before them.
    Read more in the :ref:`User Guide <cross_validation>`.
    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_size : int, default=None
        Maximum size for a single training set.
    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import GroupTimeSeriesSplit
    >>> groups = np.array(['a', 'a', 'a', 'a', 'a', 'a',\
                           'b', 'b', 'b', 'b', 'b',\
                           'c', 'c', 'c', 'c',\
                           'd', 'd', 'd'])
    >>> gtss = GroupTimeSeriesSplit(n_splits=3)
    >>> for train_idx, test_idx in gtss.split(groups, groups=groups):
    ...     print("TRAIN:", train_idx, "TEST:", test_idx)
    ...     print("TRAIN GROUP:", groups[train_idx],\
                  "TEST GROUP:", groups[test_idx])
    TRAIN: [0, 1, 2, 3, 4, 5] TEST: [6, 7, 8, 9, 10]
    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a']\
    TEST GROUP: ['b' 'b' 'b' 'b' 'b']
    TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] TEST: [11, 12, 13, 14]
    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b']\
    TEST GROUP: ['c' 'c' 'c' 'c']
    TRAIN: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]\
    TEST: [15, 16, 17]
    TRAIN GROUP: ['a' 'a' 'a' 'a' 'a' 'a' 'b' 'b' 'b' 'b' 'b' 'c' 'c' 'c' 'c']\
    TEST GROUP: ['d' 'd' 'd']
    """
    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_size=None
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_size = max_train_size

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        n_folds = n_splits + 1
        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_samples = _num_samples(X)
        n_groups = _num_samples(unique_groups)
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))
        group_test_size = n_groups // n_folds
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []
            for train_group_idx in unique_groups[:group_test_start]:
                train_array_tmp = group_dict[train_group_idx]
                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)
            train_end = train_array.size
            if self.max_train_size and self.max_train_size < train_end:
                train_array = train_array[train_end -
                                          self.max_train_size:train_end]
            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)
            yield [int(i) for i in train_array], [int(i) for i in test_array]
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

# modified code for group gaps; source
# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.
    Allows for a gap in groups to avoid potentially leaking info from
    train into test if the model has windowed or lag features.
    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.
    This cross-validation object is a variation of :class:`KFold`.
    In the kth split, it returns first k folds as train set and the
    (k+1)th fold as test set.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    Note that unlike standard cross-validation methods, successive
    training sets are supersets of those that come before them.
    Read more in the :ref:`User Guide <cross_validation>`.
    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_group_size : int, default=Inf
        Maximum group size for a single training set.
    group_gap : int, default=None
        Gap between train and test
    max_test_group_size : int, default=Inf
        We discard this number of groups from the end of each train split
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size
        self.verbose = verbose

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        group_gap = self.group_gap
        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1
        group_dict = {}
        u, ind = np.unique(groups, return_index=True)
        unique_groups = u[np.argsort(ind)]
        n_samples = _num_samples(X)
        n_groups = _num_samples(unique_groups)
        for idx in np.arange(n_samples):
            if (groups[idx] in group_dict):
                group_dict[groups[idx]].append(idx)
            else:
                group_dict[groups[idx]] = [idx]
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            train_array = []
            test_array = []

            group_st = max(0, group_test_start - group_gap - max_train_group_size)
            for train_group_idx in unique_groups[group_st:(group_test_start - group_gap)]:
                train_array_tmp = group_dict[train_group_idx]

                train_array = np.sort(np.unique(
                                      np.concatenate((train_array,
                                                      train_array_tmp)),
                                      axis=None), axis=None)

            train_end = train_array.size

            for test_group_idx in unique_groups[group_test_start:
                                                group_test_start +
                                                group_test_size]:
                test_array_tmp = group_dict[test_group_idx]
                test_array = np.sort(np.unique(
                                              np.concatenate((test_array,
                                                              test_array_tmp)),
                                     axis=None), axis=None)

            test_array  = test_array[group_gap:]


            if self.verbose > 0:
                    pass

            yield [int(i) for i in train_array], [int(i) for i in test_array]

In [3]:
wandb.login()

True

In [4]:
import datetime
class CONFIG:
    nfold = 5
    is_local = True
    local_path = '/Users/dylan/DylanLi/XJTLU/期权项目/Data/Output_data/option/'
    local_log_path = '/Users/dylan/DylanLi/XJTLU/期权项目/Code/model_nn/log/'
    colab_path = '/content/'
    seed = 1128
    log_dir = local_log_path + datetime.datetime.now().strftime(r"%Y%m%d-%H%M%S")

    

In [5]:
import datetime
class CONFIG:
    nfold = 5
    is_local = True
    local_path = '/Users/dylan/DylanLi/XJTLU/期权项目/Data/Output_data/option/'
    local_log_path = '/Users/dylan/DylanLi/XJTLU/期权项目/Code/model_nn/log/'
    colab_path = '/content/'
    seed = 42
    log_dir = local_log_path + datetime.datetime.now().strftime(r"%Y%m%d-%H%M%S")

    

In [6]:
if CONFIG.is_local:
    data = pd.read_csv(os.path.join(CONFIG.local_path, '2020_whole.csv'))
    
else:
    data = pd.read_csv(os.path.join(CONFIG.colab_path, '2020_whole.csv'))

    

In [7]:
if CONFIG.is_local:
    data = pd.read_csv(os.path.join(CONFIG.local_path, '2020_whole.csv'))
    
else:
    data = pd.read_csv(os.path.join(CONFIG.colab_path, '2020_whole.csv'))

In [8]:
import datetime
class CONFIG:
    nfold = 5
    is_local = True
    local_path = '/Users/dylan/iCollections/桌面文件/Repo/Option_Pricing/Data/Output_data/option/'
    local_log_path = '/Users/dylan/iCollections/桌面文件/Repo/Option_Pricing/Code/model_nn/log'
    colab_path = '/content/'
    seed = 42
    log_dir = local_log_path + datetime.datetime.now().strftime(r"%Y%m%d-%H%M%S")

    

In [9]:
if CONFIG.is_local:
    data = pd.read_csv(os.path.join(CONFIG.local_path, '2020_whole.csv'))
    
else:
    data = pd.read_csv(os.path.join(CONFIG.colab_path, '2020_whole.csv'))

In [10]:
# 只用 SP 试试 
data_C = data[data['call_put'] == 'C']]
data_P = data[data['call_put' == 'P']]

In [11]:
# 只用 SP 试试 
data_C = data[data['call_put'] == 'C']
data_P = data[data['call_put'] == 'P']

In [12]:
data_C

Empty DataFrame
Columns: [Unnamed: 0, Date, underlying, exchange, root_symbol, futures_symbol, fut_expiration_date, futures_close, opt_expiration_date, strike, call_put, style, bid, ask, settlement, volume, open_interest]
Index: []

In [13]:
# 只用 SP 试试 
data_C = data[data['call_put'] == 'c']
data_P = data[data['call_put'] == 'p']

In [14]:
c

In [15]:
data_C

         Unnamed: 0        Date underlying exchange root_symbol  \
1                 1  2020-02-05         NG    NYMEX          LN   
2                 2  2020-02-05         NG    NYMEX          LN   
3                 3  2020-02-05         NG    NYMEX          LN   
4                 4  2020-02-05         NG    NYMEX          LN   
5                 5  2020-02-05         NG    NYMEX          LN   
...             ...         ...        ...      ...         ...   
2103136     2103136  2020-09-28        MNQ      CME         MQ1   
2103137     2103137  2020-09-28        MNQ      CME         MQ1   
2103138     2103138  2020-09-28        MNQ      CME         MQ1   
2103139     2103139  2020-09-28        MNQ      CME         MQ1   
2103140     2103140  2020-09-28        MNQ      CME         MQ1   

        futures_symbol fut_expiration_date  futures_close opt_expiration_date  \
1            NG/20X.NX          10/28/2020          2.237          2020-10-27   
2            NG/20X.NX          1

In [16]:
def setup_cv(df, fold):
    """return a dataframe with fold index
    Use two method here, one with PurgedGroupTimeSeriesSplit(self designed GoupKFold API in scikit-learn), one with TimeSeriesSplit
    
    To avoid the problem of differents fold with high homogeneity, should use small n_splits:
    - If your purpose is performance estimation, you need models with low bias estimates (which means no systematic distortion of estimates). 
        You can achieve this by using a higher number of folds, usually between 10 and 20.
    - If your aim is parameter tuning, you need a mix of bias and variance, so it is advisable to use a medium number of folds, usually between 5 and 7.
    - If your purpose is just to apply variable selection and simplify your dataset, you need models with low variance estimates (or you will have disagreement). 
        Hence, a lower number of folds will suffice, usually between 3 and 5.
    
    To simulate the generation of time window, use large n_splits, the step can be numbers of sample(n) // n_splits
    e.g. when n_splits = n, step = n // n = 1
    
    Args:
        df (dataframe): dataframe with time window
        fold (int): fold number

    Returns:
        dataframe: dataframe with fold index
    """
    df.reset_index(drop = True, inplace = True)
    df.sort_values(by='Date', inplace = True)
    df['fold'] = -1
    if len(data.underlying.unique()) > 1:
        cv = PurgedGroupTimeSeriesSplit(n_splits=fold, group_gap=0)
        step = df.shape[0]//fold
        print(f"{'=' * 20} step : {step} {'=' * 20}")
        for index, (train_, val_) in enumerate(cv.split(df, groups=df.underlying)):
            print(f"{'=' * 20} val_{index} {'=' * 20}")
            print(val_)
            df.iloc[val_, -1] = index
    else :
        cv = TimeSeriesSplit(n_splits=fold)
        step = df.shape[0]//fold
        print(f"{'=' * 20} step : {step} {'=' * 20}")
        for index, (train_, val_) in enumerate(cv.split(df)):
            print(f"{'=' * 20} val_{index} {'=' * 20}")
            print(val_)
            df.iloc[val_, -1] = index
    return df
        

In [17]:
data_C = setup_cv(data, CONFIG.nfold)

In [18]:
def setup_cv(df, fold):
    """return a dataframe with fold index
    Use two method here, one with PurgedGroupTimeSeriesSplit(self designed GoupKFold API in scikit-learn), one with TimeSeriesSplit
    
    To avoid the problem of differents fold with high homogeneity, should use small n_splits:
    - If your purpose is performance estimation, you need models with low bias estimates (which means no systematic distortion of estimates). 
        You can achieve this by using a higher number of folds, usually between 10 and 20.
    - If your aim is parameter tuning, you need a mix of bias and variance, so it is advisable to use a medium number of folds, usually between 5 and 7.
    - If your purpose is just to apply variable selection and simplify your dataset, you need models with low variance estimates (or you will have disagreement). 
        Hence, a lower number of folds will suffice, usually between 3 and 5.
    
    To simulate the generation of time window, use large n_splits, the step can be numbers of sample(n) // n_splits
    e.g. when n_splits = n, step = n // n = 1
    
    Args:
        df (dataframe): dataframe with time window
        fold (int): fold number

    Returns:
        dataframe: dataframe with fold index
    """
    df.reset_index(drop = True, inplace = True)
    df.sort_values(by='Date', inplace = True)
    df['fold'] = -1
    if len(data.underlying.unique()) > 1:
        cv = PurgedGroupTimeSeriesSplit(n_splits=fold, group_gap=0)
        step = df.shape[0]//fold
        print(f"{'=' * 20} step : {step} {'=' * 20}")
        for index, (train_, val_) in enumerate(cv.split(df, groups=df.underlying)):
            print(f"{'=' * 20} val_{index} {'=' * 20}")
            print(val_[-50:])
            df.iloc[val_, -1] = index
    else :
        cv = TimeSeriesSplit(n_splits=fold)
        step = df.shape[0]//fold
        print(f"{'=' * 20} step : {step} {'=' * 20}")
        for index, (train_, val_) in enumerate(cv.split(df)):
            print(f"{'=' * 20} val_{index} {'=' * 20}")
            print(val_[-50:])
            df.iloc[val_, -1] = index
    return df
        

In [19]:
data_C = setup_cv(data, CONFIG.nfold)

In [20]:
data.fold.unique()

array([-1,  0,  1,  2,  3,  4])

In [21]:
data_C.fold.unique()

array([-1,  0,  1,  2,  3,  4])

In [22]:
data_C.columns

Index(['Unnamed: 0', 'Date', 'underlying', 'exchange', 'root_symbol',
       'futures_symbol', 'fut_expiration_date', 'futures_close',
       'opt_expiration_date', 'strike', 'call_put', 'style', 'bid', 'ask',
       'settlement', 'volume', 'open_interest', 'fold'],
      dtype='object')

In [23]:
if CONFIG.is_local:
    data = pd.read_csv(os.path.join(CONFIG.local_path, '2020_whole.csv'))
    
else:
    data = pd.read_csv(os.path.join(CONFIG.colab_path, '2020_whole.csv'))

In [24]:
# 只用 SP 试试 
data_C = data[data['call_put'] == 'c']
data_P = data[data['call_put'] == 'p']

In [25]:
def setup_cv(df, fold):
    """return a dataframe with fold index
    Use two method here, one with PurgedGroupTimeSeriesSplit(self designed GoupKFold API in scikit-learn), one with TimeSeriesSplit
    
    To avoid the problem of differents fold with high homogeneity, should use small n_splits:
    - If your purpose is performance estimation, you need models with low bias estimates (which means no systematic distortion of estimates). 
        You can achieve this by using a higher number of folds, usually between 10 and 20.
    - If your aim is parameter tuning, you need a mix of bias and variance, so it is advisable to use a medium number of folds, usually between 5 and 7.
    - If your purpose is just to apply variable selection and simplify your dataset, you need models with low variance estimates (or you will have disagreement). 
        Hence, a lower number of folds will suffice, usually between 3 and 5.
    
    To simulate the generation of time window, use large n_splits, the step can be numbers of sample(n) // n_splits
    e.g. when n_splits = n, step = n // n = 1
    
    Args:
        df (dataframe): dataframe with time window
        fold (int): fold number

    Returns:
        dataframe: dataframe with fold index
    """
    df.reset_index(drop = True, inplace = True)
    df.sort_values(by='Date', inplace = True)
    df['fold'] = -1
    if len(data.underlying.unique()) > 1:
        cv = PurgedGroupTimeSeriesSplit(n_splits=fold, group_gap=0)
        step = df.shape[0]//fold
        print(f"{'=' * 20} step : {step} {'=' * 20}")
        for index, (train_, val_) in enumerate(cv.split(df, groups=df.underlying)):
            print(f"{'=' * 20} val_{index} {'=' * 20}")
            print(val_[-50:])
            df.iloc[val_, -1] = index
    else :
        cv = TimeSeriesSplit(n_splits=fold)
        step = df.shape[0]//fold
        print(f"{'=' * 20} step : {step} {'=' * 20}")
        for index, (train_, val_) in enumerate(cv.split(df)):
            print(f"{'=' * 20} val_{index} {'=' * 20}")
            print(val_[-50:])
            df.iloc[val_, -1] = index
    return df
        

In [26]:
data_C = setup_cv(data, CONFIG.nfold)

In [27]:
data_C.fold.unique()

array([-1,  1,  2,  0,  3,  4])

In [28]:
data_C.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'Date', 'underlying', 'exchange',
       'root_symbol', 'futures_symbol', 'fut_expiration_date', 'futures_close',
       'opt_expiration_date', 'strike', 'call_put', 'style', 'bid', 'ask',
       'settlement', 'volume', 'open_interest', '1M', '3M', '6M', '12M',
       'fold'],
      dtype='object')

In [29]:
# 计算每个标的的RV
def cal_rv(df):
    """return rv of each unerlying

    Args:
        df (dataframe): df with rv
    """
    df_time = df.drop_duplicates(subset=['Date'], keep='first', inplace=False)
    df_time['sigma'] = np.std(df_time.futures_close.pct_change(1))
    dict_ = df_time[['Date', 'sigma']].set_index('Date').to_dict()['sigma']
    
    df['sigma'] = df['Date'].map(dict_)
    return df

In [30]:
# features and target

data_C = data_C.groupby('underlying').apply(cal_rv)
data_C['S_K'] = data_C.strike / data_C.futures_close
data_C.Date = pd.to_datetime(data_C.Date)
data_C.opt_expiration_date = pd.to_datetime(data_C.opt_expiration_date)
data_C['days'] = (data_C.opt_expiration_date - data_C.Date).dt.days/365

features = ['S_K', 'days', 'sigma']
target = ['settlement']

In [31]:
data_C

         Unnamed: 0.1  Unnamed: 0       Date underlying exchange root_symbol  \
1569641       1569641     1569641 2020-01-02         DA      CME          DA   
1572300       1572300     1572300 2020-01-02         ES      CME          EW   
1572299       1572299     1572299 2020-01-02         ES      CME          EW   
1572298       1572298     1572298 2020-01-02         ES      CME          EW   
1572297       1572297     1572297 2020-01-02         ES      CME          EW   
...               ...         ...        ...        ...      ...         ...   
1018733       1018733     1018733 2020-11-24         ES      CME         EW2   
1018734       1018734     1018734 2020-11-24         ES      CME         EW2   
1018735       1018735     1018735 2020-11-24         ES      CME         EW2   
1018725       1018725     1018725 2020-11-24         ES      CME         EW2   
1014297       1014297     1014297 2020-11-24         NG    NYMEX          NG   

        futures_symbol fut_expiration_d

In [32]:
data_C.reset_index(drop=True, inplace=True)

In [33]:
data_C.reset_index(drop=True, inplace=True)
data_C

         Unnamed: 0.1  Unnamed: 0       Date underlying exchange root_symbol  \
0             1569641     1569641 2020-01-02         DA      CME          DA   
1             1572300     1572300 2020-01-02         ES      CME          EW   
2             1572299     1572299 2020-01-02         ES      CME          EW   
3             1572298     1572298 2020-01-02         ES      CME          EW   
4             1572297     1572297 2020-01-02         ES      CME          EW   
...               ...         ...        ...        ...      ...         ...   
2103137       1018733     1018733 2020-11-24         ES      CME         EW2   
2103138       1018734     1018734 2020-11-24         ES      CME         EW2   
2103139       1018735     1018735 2020-11-24         ES      CME         EW2   
2103140       1018725     1018725 2020-11-24         ES      CME         EW2   
2103141       1014297     1014297 2020-11-24         NG    NYMEX          NG   

        futures_symbol fut_expiration_d

In [34]:
# features and target

data_C = data_C.groupby('underlying').apply(cal_rv)
data_C['S_K'] = data_C.strike / data_C.futures_close
data_C.Date = pd.to_datetime(data_C.Date)
data_C.opt_expiration_date = pd.to_datetime(data_C.opt_expiration_date)
data_C['days'] = (data_C.opt_expiration_date - data_C.Date).dt.days/365

features = ['S_K', 'days', 'sigma', '12M']
target = ['settlement']

In [35]:
data_C.reset_index(drop=True, inplace=True)
data_C

         Unnamed: 0.1  Unnamed: 0       Date underlying exchange root_symbol  \
0             1569641     1569641 2020-01-02         DA      CME          DA   
1             1572300     1572300 2020-01-02         ES      CME          EW   
2             1572299     1572299 2020-01-02         ES      CME          EW   
3             1572298     1572298 2020-01-02         ES      CME          EW   
4             1572297     1572297 2020-01-02         ES      CME          EW   
...               ...         ...        ...        ...      ...         ...   
2103137       1018733     1018733 2020-11-24         ES      CME         EW2   
2103138       1018734     1018734 2020-11-24         ES      CME         EW2   
2103139       1018735     1018735 2020-11-24         ES      CME         EW2   
2103140       1018725     1018725 2020-11-24         ES      CME         EW2   
2103141       1014297     1014297 2020-11-24         NG    NYMEX          NG   

        futures_symbol fut_expiration_d

In [36]:
def xgb_model(data,nfold):
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'eta': 0.01,
        'random_state': CONFIG.seed,
    }
    scores = []
    models = []
    
    for f in range(nfold):
        data.reset_index(drop=True, inplace=True)
        train_idx = data[data.fold != f].index
        val_idx = data[data.fold == f].index
        min_max_scaler = preprocessing.StandardScaler()

        train_x = min_max_scaler.fit_transform(data.iloc[train_idx, :][features])
        train_y = data.iloc[train_idx, :][target]
        
        val_x = min_max_scaler.fit_transform(data.iloc[val_idx, :][features])
        val_y = data.iloc[val_idx, :]['settlement']
        if data.underlying.value_counts().shape[0] == 1:
            run = wandb.init(project="Option-project", entity="dylanli", reinit=True, name=f'xgb_single_{f}')
        else : 
            run = wandb.init(project="Option-project", entity="dylanli", reinit=True, name=f'xgb_multi_{f}')
        
        callback = wandb_callback()
        
        model = xgb.XGBRegressor()
        model.set_params(**params)
        model.fit(train_x, train_y, early_stopping_rounds=10, eval_set=[(val_x, val_y)], callbacks=[callback])
        model.save_model(f'/Users/dylan/DylanLi/XJTLU/期权项目/Code/model_xgb/model_{f}.json')
        # plot
        ax = xgb.plot_importance(model)
        plt.show(ax)
        wandb.log({"chart" : ax})

        print(f"{'='*20}{f} model trained{'='*20}")
        off_pred = model.predict(val_x, ntree_limit=model.best_ntree_limit)
        off_score = np.sqrt(mean_squared_error(val_y, off_pred))
        scores.append(off_score)
        models.append(model)
        print(f"{'=' * 20} fold_{f} {'=' * 20}")
        print(f'rmse: {off_score}')
        
        
        del train_x, train_y, val_x, val_y
        gc.collect()
        
        run.finish()
    
    return scores, models
        
        

In [37]:
scores, models = xgb_model(data, CONFIG.nfold)

In [38]:
scores, models = xgb_model(data_C, CONFIG.nfold)

In [39]:
data_C[features]

              S_K      days     sigma   12M
0        0.937866  0.093151  0.126342  1.44
1        1.080086  0.326027  0.022691  1.44
2        1.077017  0.326027  0.022691  1.44
3        1.073949  0.326027  0.022691  1.44
4        1.070881  0.326027  0.022691  1.44
...           ...       ...       ...   ...
2103137  0.919414  0.046575  0.022691  0.09
2103138  0.920790  0.046575  0.022691  0.09
2103139  0.922166  0.046575  0.022691  0.09
2103140  0.867112  0.046575  0.022691  0.09
2103141  0.936937  0.000000  0.177440  0.09

[2103142 rows x 4 columns]

In [40]:
data_C[features] == np.inf

           S_K   days  sigma    12M
0        False  False  False  False
1        False  False  False  False
2        False  False  False  False
3        False  False  False  False
4        False  False  False  False
...        ...    ...    ...    ...
2103137  False  False  False  False
2103138  False  False  False  False
2103139  False  False  False  False
2103140  False  False  False  False
2103141  False  False  False  False

[2103142 rows x 4 columns]

In [41]:
data_C[data_C[features] == np.inf]

         Unnamed: 0.1  Unnamed: 0 Date underlying exchange root_symbol  \
0                 NaN         NaN  NaT        NaN      NaN         NaN   
1                 NaN         NaN  NaT        NaN      NaN         NaN   
2                 NaN         NaN  NaT        NaN      NaN         NaN   
3                 NaN         NaN  NaT        NaN      NaN         NaN   
4                 NaN         NaN  NaT        NaN      NaN         NaN   
...               ...         ...  ...        ...      ...         ...   
2103137           NaN         NaN  NaT        NaN      NaN         NaN   
2103138           NaN         NaN  NaT        NaN      NaN         NaN   
2103139           NaN         NaN  NaT        NaN      NaN         NaN   
2103140           NaN         NaN  NaT        NaN      NaN         NaN   
2103141           NaN         NaN  NaT        NaN      NaN         NaN   

        futures_symbol fut_expiration_date  futures_close opt_expiration_date  \
0                  NaN        

In [42]:
data_C[data_C[features] == np.inf].any()

Unnamed: 0.1           False
Unnamed: 0             False
Date                   False
underlying             False
exchange               False
root_symbol            False
futures_symbol         False
fut_expiration_date    False
futures_close          False
opt_expiration_date    False
strike                 False
call_put               False
style                  False
bid                    False
ask                    False
settlement             False
volume                 False
open_interest          False
1M                     False
3M                     False
6M                     False
12M                    False
fold                   False
sigma                  False
S_K                     True
days                   False
dtype: bool

In [43]:
data_C['S/K'] == np.inf

In [44]:
data_C['S_K'][data_C['S_K'] == np.inf]

521164    inf
564215    inf
567445    inf
Name: S_K, dtype: float64

In [45]:
data_C.drop(data_C.S_K == np.inf)

In [46]:
data_C.drop(data_C.S_K != np.inf)

In [47]:
data_C.replace([np.inf, -np.inf], np.nan, inplace=True)

In [48]:
data_C.dropna(inplace=True)

In [49]:
scores, models = xgb_model(data_C, CONFIG.nfold)

In [50]:
def xgb_model(data,nfold):
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'eta': 0.01,
        'random_state': CONFIG.seed,
    }
    scores = []
    models = []
    
    for f in range(nfold):
        data.reset_index(drop=True, inplace=True)
        train_idx = data[data.fold != f].index
        val_idx = data[data.fold == f].index
        min_max_scaler = preprocessing.StandardScaler()

        train_x = min_max_scaler.fit_transform(data.iloc[train_idx, :][features])
        train_y = min_max_scaler.fit_transform(data.iloc[train_idx, :][target])
        
        val_x = min_max_scaler.fit_transform(data.iloc[val_idx, :][features])
        val_y = min_max_scaler.fit_transform(data.iloc[val_idx, :]['settlement'])
        if data.underlying.value_counts().shape[0] == 1:
            run = wandb.init(project="Option-project", entity="dylanli", reinit=True, name=f'xgb_single_{f}')
        else : 
            run = wandb.init(project="Option-project", entity="dylanli", reinit=True, name=f'xgb_multi_{f}')
        
        callback = wandb_callback()
        
        model = xgb.XGBRegressor()
        model.set_params(**params)
        model.fit(train_x, train_y, early_stopping_rounds=10, eval_set=[(val_x, val_y)], callbacks=[callback])
        model.save_model(f'/Users/dylan/iCollections/桌面文件/Repo/Option_Pricing/Code/model_xgb/model_{f}.json')
        # plot
        ax = xgb.plot_importance(model)
        plt.show(ax)
        wandb.log({"chart" : ax})

        print(f"{'='*20}{f} model trained{'='*20}")
        off_pred = model.predict(val_x, ntree_limit=model.best_ntree_limit)
        off_score = np.sqrt(mean_squared_error(val_y, off_pred))
        scores.append(off_score)
        models.append(model)
        print(f"{'=' * 20} fold_{f} {'=' * 20}")
        print(f'rmse: {off_score}')
        
        
        del train_x, train_y, val_x, val_y
        gc.collect()
        
        run.finish()
    
    return scores, models
        
        

In [51]:
scores, models = xgb_model(data_C, CONFIG.nfold)

In [52]:
def xgb_model(data,nfold):
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'eta': 0.01,
        'random_state': CONFIG.seed,
    }
    scores = []
    models = []
    
    for f in range(nfold):
        data.reset_index(drop=True, inplace=True)
        train_idx = data[data.fold != f].index
        val_idx = data[data.fold == f].index
        min_max_scaler = preprocessing.StandardScaler()

        train_x = min_max_scaler.fit_transform(data.iloc[train_idx, :][features])
        train_y = min_max_scaler.fit_transform(data.iloc[train_idx, :][target])
        
        val_x = min_max_scaler.fit_transform(data.iloc[val_idx, :][features])
        val_y = min_max_scaler.fit_transform(data.iloc[val_idx, :][target])
        if data.underlying.value_counts().shape[0] == 1:
            run = wandb.init(project="Option-project", entity="dylanli", reinit=True, name=f'xgb_single_{f}')
        else : 
            run = wandb.init(project="Option-project", entity="dylanli", reinit=True, name=f'xgb_multi_{f}')
        
        callback = wandb_callback()
        
        model = xgb.XGBRegressor()
        model.set_params(**params)
        model.fit(train_x, train_y, early_stopping_rounds=10, eval_set=[(val_x, val_y)], callbacks=[callback])
        model.save_model(f'/Users/dylan/iCollections/桌面文件/Repo/Option_Pricing/Code/model_xgb/model_{f}.json')
        # plot
        ax = xgb.plot_importance(model)
        plt.show(ax)
        wandb.log({"chart" : ax})

        print(f"{'='*20}{f} model trained{'='*20}")
        off_pred = model.predict(val_x, ntree_limit=model.best_ntree_limit)
        off_score = np.sqrt(mean_squared_error(val_y, off_pred))
        scores.append(off_score)
        models.append(model)
        print(f"{'=' * 20} fold_{f} {'=' * 20}")
        print(f'rmse: {off_score}')
        
        
        del train_x, train_y, val_x, val_y
        gc.collect()
        
        run.finish()
    
    return scores, models
        
        

In [53]:
scores, models = xgb_model(data_C, CONFIG.nfold)

In [54]:
sweep_config = {
    'method' : 'bayes',
    'metric' : {
        'name' : 'rmse',
        'goal' : 'minimize',
    },
    'parameters' : {
        'epochs': {
            'values' : [50, 100 ,200]
        },
        'dropout' : {
            'values': [0.8, 0.6 , 0.5, 0.4]
        },
        'learning_rate' : {
            'values': [0.01, 0.001, 0.003, 0.1]
        },
        'optimizer' : {
            'values' : ['adam',  'sgd']
        },
        'activation' : {
            'values' : ['relu', 'swish', 'selu']
        },
        'nn_nodes': {
            'values' : [[128,64],[128,64,32],[128,64,16],[128,32,16],[64,16], []]
        },
        'early_terminate' : {
            'type': 'hyperband',
            'max_iter' : 10,
            's' : 2,
        }
    }
}

In [55]:
# Initialize a new sweep
# Arguments:
#     – sweep_config: the sweep config dictionary defined above
#     – entity: Set the username for the sweep
#     – project: Set the project name for the sweep
sweep_id = wandb.sweep(sweep_config, entity="dylanli", project="Option-project")

In [56]:
sweep_config = {
    'method' : 'bayes',
    'metric' : {
        'name' : 'rmse',
        'goal' : 'minimize',
    },
    'parameters' : {
        'epochs': {
            'values' : [50, 100 ,200]
        },
        'dropout' : {
            'values': [0.8, 0.6 , 0.5, 0.4]
        },
        'learning_rate' : {
            'values': [0.01, 0.001, 0.003, 0.1]
        },
        'optimizer' : {
            'values' : ['adam',  'sgd']
        },
        'activation' : {
            'values' : ['relu', 'swish', 'selu']
        },
        'nn_nodes': {
            'values' : [[128,64],[128,64,32],[128,64,16],[128,32,16],[64,16], []]
        },
        'early_terminate' : {
            'type': 'hyperband',
            'max_iter' : 10,
            's' : 2,
        }
    }
}

In [57]:
# Initialize a new sweep
# Arguments:
#     – sweep_config: the sweep config dictionary defined above
#     – entity: Set the username for the sweep
#     – project: Set the project name for the sweep
sweep_id = wandb.sweep(sweep_config, entity="dylanli", project="Option-project")

In [58]:
sweep_config = {
    'method' : 'bayes',
    'metric' : {
        'name' : 'rmse',
        'goal' : 'minimize',
    },
    'parameters' : {
        'epochs': {
            'values' : [50, 100 ,200]
        },
        'dropout' : {
            'values': [0.8, 0.6 , 0.5, 0.4]
        },
        'learning_rate' : {
            'values': [0.01, 0.001, 0.003, 0.1]
        },
        'optimizer' : {
            'values' : ['adam',  'sgd']
        },
        'activation' : {
            'values' : ['relu', 'swish', 'selu']
        },
        'nn_nodes': {
            'values' : [[128,64],[128,64,32],[128,64,16],[128,32,16],[64,16], []]
        }
    },
    'early_terminate' : {
        'type' : 'hyperband',
        'max_iter' : 10,
        's' : 3
    }
}

In [59]:
# Initialize a new sweep
# Arguments:
#     – sweep_config: the sweep config dictionary defined above
#     – entity: Set the username for the sweep
#     – project: Set the project name for the sweep
sweep_id = wandb.sweep(sweep_config, entity="dylanli", project="Option-project")

In [60]:

def nn_model(data=data, nfold=CONFIG.nfold,log=CONFIG.log_dir, is_train=True):
    
    config_defaults = {
    'batch_size': 32,
    'epochs': 100,
    'dropout': 0.5,
    'activation_type': 'swish',
    'optimizer': 'adam',
    'seed': 42,
    'learning_rate': 1e-3,
    'nn_nodes' : [32, 32, 32],
    }

    if data.underlying.value_counts().shape[0] == 1:
        run = wandb.init(config=config_defaults, project="Option-project", entity="dylanli", reinit=True, name=f'nn_single')
    else : 
        run = wandb.init(config=config_defaults, project="Option-project", entity="dylanli", reinit=True, name=f'nn_multi')
    
    
    config = wandb.config
    
    # Deine the optimizer
    if config.optimizer=='sgd':
        optimizer = keras.optimizers.SGD(lr=config.learning_rate, decay=1e-5, momentum=0.9, nesterov=True)
    elif config.optimizer=='adam':
        optimizer = keras.optimizers.Adam(lr=config.learning_rate, beta_1=0.9, beta_2=0.999, clipnorm=1.0)
        
    def dense_block(x, n_nodes, p=0.5, activation='swish'):
        x = layers.Dropout(p)(x)
        x = layers.BatchNormalization()(x)
        x = layers.Dense(n_nodes, activation=activation)(x)
        return x
    
    def get_nn(dense_blocks, optimizer, dropout):
        input_ = layers.Input(shape=(len(features),))
        x = layers.BatchNormalization()(input_)
        x = layers.Dense(256, activation='swish')(x)
        
        #TODO 多层连接
        if len(dense_blocks) >= 1:
            p = dropout
            for units in dense_blocks:
                x = dense_block(x, units, p)
                p -= 0.05
            
        output = layers.Dense(1)(x)
        
        model = keras.Model(input_, output)
            
        model.compile(optimizer, loss=tf.keras.losses.MeanAbsoluteError(name='val_loss'), metrics=[tf.keras.losses.MeanAbsoluteError(name='val_loss')])
        
        return model
    
    models = []
    scores = []
    if is_train:
        for f in range(nfold):
            data.reset_index(drop=True, inplace=True)
            
            train_idx = data[data.fold != f].index
            val_idx = data[data.fold == f].index
            scaler = preprocessing.StandardScaler()
            
            train_x = scaler.fit_transform(data.iloc[train_idx, :][features])
            train_y = data.iloc[train_idx, :]['settlement']
            
            val_x = scaler.fit_transform(data.iloc[val_idx, :][features])
            val_y = np.array(data.iloc[val_idx, :]['settlement'])
            
            checkpoint = keras.callbacks.ModelCheckpoint(filepath=f'/Users/dylan/DylanLi/XJTLU/期权项目/Code/model_nn/model_nn_{f}.hdf5', save_best_only=True)
            early_stop = keras.callbacks.EarlyStopping(patience=config.epochs//4)
            tesnsorboard = keras.callbacks.TensorBoard(log, histogram_freq=1)
            model = get_nn(config.nn_nodes, optimizer, config.dropout)
            print(f"{'=' * 20} fold_{f} Training {'=' * 20}")
            history = model.fit(train_x, train_y, epochs=config.epochs, batch_size=config.batch_size, validation_data=(val_x, val_y), callbacks=[checkpoint, early_stop, WandbCallback()], validation_freq=[1,1,1])
            
            pd.DataFrame(history.history, columns=['loss', 'val_loss']).plot()
            plt.title("MSE")
            plt.show()
            #loss, mae, mse = model.evaluate(val_x, val_y)
            #print(f"evaluation: loss: {loss}, mae: {mae}, mse: {mse}")
            
            model.load_weights(f'/Users/dylan/DylanLi/XJTLU/期权项目/Code/model_nn/model_nn_{f}.hdf5')
            off_pred = model.predict(val_x)
            off_score = np.sqrt(mean_squared_error(val_y, off_pred))
            #off_score = 0
            
            scores.append(off_score)
            print(f"{'=' * 20} fold_{f} score {'=' * 20}")
            print(f"rmse: {off_score}")
            
            del train_x, train_y, val_x, val_y
            gc.collect()
            
            run.finish()
            break
        
    #wandb.log({"Fold scores": scores})
    #return scores, models
        

In [61]:
wandb.agent(sweep_id, function=nn_model)