In [1]:
import numpy as np
# import numba 
# from numba import jit
import gc
import pandas as pd
import time
import seaborn as sns

import warnings
from tqdm import tqdm 
warnings.filterwarnings('ignore')


In [2]:
# Shrink a DataFrame's memory footprint by downcasting its numeric columns
def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype whose range fits.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered. For each of them the column minimum and maximum are compared
    (strictly) against the limits of successively wider dtypes and the first
    dtype that contains both is used.

    Notes:
        * Columns whose min or max is null (e.g. all-NaN columns) are left untouched.
        * Only the value *range* is checked; converting floats to float16/float32
          can lose precision.
        * The frame is modified in place and also returned.

    Args:
        df: pandas DataFrame to compress.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isnull(c_min) or pd.isnull(c_max):
            continue

        if str(col_type)[:3] == 'int':
            # Narrowest integer type first; if none matches the column is kept as is.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range (including +/-inf): keep full precision.
                # `np.float` was removed in NumPy 1.24, so name float64 explicitly.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    elapsed_min = (time.time() - starttime) / 60
    reduction_pct = 100 * (start_mem - end_mem) / start_mem
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(
        end_mem, reduction_pct, elapsed_min))
    return df

In [3]:
# Load the raw daily bars and make sure `date` is stored as datetime64
df = pd.read_pickle('Bar_1d_raw.pkl')
df = df.assign(date=pd.to_datetime(df['date']))
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9322640 entries, 0 to 9322639
Data columns (total 11 columns):
 #   Column         Dtype         
---  ------         -----         
 0   turn           float16       
 1   adjust_factor  float16       
 2   instrument     object        
 3   date           datetime64[ns]
 4   open           float32       
 5   volume         float32       
 6   low            float32       
 7   deal_number    float32       
 8   close          float32       
 9   high           float32       
 10  amount         float32       
dtypes: datetime64[ns](1), float16(2), float32(7), object(1)
memory usage: 497.9+ MB


In [5]:
# Columns that get lagged copies and rolling statistics.
_BASE_FEATURES = ['turn', 'amount', 'open', 'close', 'high', 'low', 'ret', 'range', 'night_jump']

# Column pairs for the rolling correlations; the order fixes the order of the
# resulting ``corr_<a>_<b>_<window>`` columns.
_CORR_PAIRS = [
    ['volume', 'ret'],
    ['volume', 'open'],
    ['volume', 'high'],
    ['volume', 'low'],
    ['volume', 'close'],
    ['volume', 'turn'],
    ['volume', 'range'],
    ['volume', 'night_jump'],

    ['ret', 'open'],
    ['ret', 'high'],
    ['ret', 'low'],
    ['ret', 'close'],
    ['ret', 'turn'],
    ['ret', 'range'],
    ['ret', 'night_jump'],

    ['high', 'low'],
    ['high', 'open'],
    ['high', 'close'],
    ['high', 'turn'],
    ['high', 'range'],
    ['high', 'night_jump'],

    ['low', 'open'],
    ['low', 'close'],
    ['low', 'turn'],
    ['low', 'range'],
    ['low', 'night_jump'],

    ['open', 'close'],
    ['open', 'turn'],
    ['open', 'range'],
    ['open', 'night_jump'],

    ['turn', 'close'],
    ['turn', 'range'],
    ['turn', 'night_jump'],

    ['close', 'range'],
    ['close', 'night_jump'],

    ['night_jump', 'range'],
]


def _last_argmin(values):
    """Return the position of the smallest element of a window (the last one on ties)."""
    return float(len(values) - 1 - int(np.argmin(values[::-1])))


def _downcast_numeric_columns(df):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype whose range fits.

    Columns whose min or max is null are skipped. Only the value range is
    checked, so float precision may be reduced. Returns ``df``.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isnull(c_min) or pd.isnull(c_max):
            continue
        if str(col_type)[:3] == 'int':
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # `np.float` no longer exists in NumPy >= 1.24; float64 is what it meant.
                df[col] = df[col].astype(np.float64)
    return df


def factor_cal(stk, window=5, n_lags=5):
    """Build the label and feature table for one instrument's bar history.

    The input frame is not modified; all work happens on a copy.

    Derived series (row ``t`` refers to the t-th row of ``stk``):
        * ``ret``        = close / open - 1
        * ``range``      = high / low - 1
        * ``night_jump`` = open / previous close - 1
        * ``label``      = close[t+2] / open[t+1] - 1, set to NaN when the next
          row's low equals its high (presumably a one-price day on which the
          entry could not be traded -- confirm).

    Output columns, in order:
        * ``date``, ``instrument``, ``label``
        * ``<feature>_<lag>`` for lag in ``0 .. n_lags-1``
        * ``mean_``, ``ts_max_``, ``ts_min_``, ``std_``, ``ts_rank_`` +
          ``<window><feature>`` rolling statistics (the name has no separator
          between the window and the feature, e.g. ``mean_5turn``; kept as is
          so existing column names do not change)
        * ``corr_<a>_<b>_<window>`` rolling correlations for the pairs in ``_CORR_PAIRS``

    NOTE(review): ``ts_rank`` is the position (0 .. window-1) of the window's
    minimum value, last occurrence on ties -- not a rank of the latest value.
    The computation is kept unchanged to preserve existing feature values.

    Args:
        stk: DataFrame with columns ``date``, ``instrument``, ``open``, ``close``,
            ``high``, ``low``, ``volume``, ``turn`` and ``amount``. Rows are
            assumed to be in chronological order for a single instrument --
            verify against the caller.
        window: Rolling window length (default 5, the original hard-coded value).
        n_lags: Number of lagged copies per feature, lags ``0 .. n_lags-1``
            (default 5, the original hard-coded value).

    Returns:
        A new DataFrame indexed by the values of ``date`` (the ``date`` column is
        kept as well) whose numeric columns have been downcast.
    """
    # Work on a private copy so the caller's frame keeps its index and columns.
    stk = stk.copy()
    stk.index = stk['date']
    stk['ret'] = stk['close'] / stk['open'] - 1
    stk['range'] = stk['high'] / stk['low'] - 1
    stk['night_jump'] = stk['open'] / stk['close'].shift(1) - 1
    stk['label'] = stk['close'].shift(-2) / stk['open'].shift(-1) - 1

    # Single .loc assignment instead of chained indexing, which may write to a
    # temporary and silently do nothing.
    flat_next_bar = (stk['low'].shift(-1) == stk['high'].shift(-1)).to_numpy()
    stk.loc[flat_next_bar, 'label'] = np.nan

    # Explicit copy: `result` is extended below and must not alias `stk`.
    result = stk.loc[:, ['date', 'instrument', 'label']].copy()

    # Plain lags
    for lag in range(n_lags):
        columns = [name + '_' + str(lag) for name in _BASE_FEATURES]
        result[columns] = stk[_BASE_FEATURES].shift(lag)

    # Rolling mean / max / min / std / position-of-minimum
    rolling = stk[_BASE_FEATURES].rolling(window=window)
    rolling_stats = (
        ('mean', rolling.mean),
        ('ts_max', rolling.max),
        ('ts_min', rolling.min),
        ('std', rolling.std),
        # raw=True hands each window over as an ndarray, so positions are plain
        # array offsets rather than integer look-ups on a date-indexed Series.
        ('ts_rank', lambda: rolling.apply(_last_argmin, raw=True)),
    )
    for prefix, compute in rolling_stats:
        columns = [prefix + '_' + str(window) + name for name in _BASE_FEATURES]
        result[columns] = compute()

    # Rolling pairwise correlations
    for left, right in _CORR_PAIRS:
        column = 'corr_' + left + '_' + right + '_' + str(window)
        result[column] = stk[left].rolling(window=window).corr(stk[right])

    return _downcast_numeric_columns(result)

由于后续的因子生成过于耗时，需要采用 multiprocessing 进行 CPU 并行运算，因此这部分放在 CNN_factor_generate.py 文件中实现；
标准化和缩尾处理也统一在该 py 文件中实现。

In [6]:
# Only the finished output is shown here: load the raw factor table from
# 'Factor_raw.pkl', print its schema summary, then display the frame itself
# (the bare last expression is what renders the table below the cell).
# NOTE(review): presumably this file is written by CNN_factor_generate.py -- confirm.
final_result = pd.read_pickle('Factor_raw.pkl')
final_result.info()
final_result

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9322640 entries, 0 to 9322639
Columns: 129 entries, date to corr_night_jump_range_5
dtypes: float16(65), float32(41), float64(21), object(2)
memory usage: 4.2+ GB


Unnamed: 0,date,instrument,label,turn_0,amount_0,open_0,close_0,high_0,low_0,ret_0,...,corr_open_close_5,corr_open_turn_5,corr_open_range_5,corr_open_night_jump_5,corr_turn_close_5,corr_turn_range_5,corr_turn_night_jump_5,corr_close_range_5,corr_close_night_jump_5,corr_night_jump_range_5
0,2009-12-01,000001.SZA,-0.002884,1.826172,1.268305e+09,867.118591,868.554810,868.913879,836.598877,0.001657,...,,,,,,,,,,
1,2009-12-02,000001.SZA,-0.010330,1.166992,8.360752e+08,871.427246,875.376892,888.302856,871.068176,0.004532,...,,,,,,,,,,
2,2009-12-03,000001.SZA,0.046753,1.160156,8.128908e+08,868.913879,859.937500,875.017822,850.961121,-0.010330,...,,,,,,,,,,
3,2009-12-04,000001.SZA,0.014359,2.705078,1.960035e+09,859.937500,908.409973,910.205261,846.293396,0.056366,...,,,,,,,,,,
4,2009-12-07,000001.SZA,-0.028702,1.442383,1.068545e+09,900.151672,910.564270,926.003662,894.047729,0.011566,...,0.416106,-0.443809,-0.370469,,0.554395,0.974121,,0.616102,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9322635,2022-06-24,873223.BJA,0.008087,3.447266,5.216233e+06,3.720000,3.710000,3.750000,3.680000,-0.002687,...,0.733098,0.480479,0.111848,0.878463,0.854137,0.138672,0.294189,0.303526,0.486387,-0.365967
9322636,2022-06-27,873223.BJA,0.122986,4.437500,6.718233e+06,3.710000,3.730000,3.740000,3.680000,0.005390,...,0.756181,0.262901,0.246210,0.798268,0.456216,-0.720703,0.570801,0.194470,0.535276,-0.353027
9322637,2022-06-28,873223.BJA,-0.042847,34.656250,5.816583e+07,3.740000,4.320000,4.480000,3.720000,0.155029,...,0.272853,0.277577,0.275182,0.733629,0.999809,0.998535,0.457031,0.998974,0.442018,0.425781
9322638,2022-06-29,873223.BJA,,18.875000,3.149140e+07,4.200000,4.020000,4.280000,3.940000,-0.042847,...,0.329016,0.321369,0.196706,-0.746067,0.999781,0.990723,0.066895,0.989742,0.049030,0.155884


In [3]:
# Only the finished output is shown here: load the processed factor table from
# 'Factor_final.pkl', print its schema summary, then display the frame itself.
# NOTE(review): this rebinds `final_result`, which an earlier cell uses for
# 'Factor_raw.pkl', and the execution count (In [3]) is out of order -- cells
# below may have been run against either file.
final_result = pd.read_pickle('Factor_final.pkl')
final_result.info()
final_result

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8772991 entries, 0 to 8772990
Columns: 131 entries, date to corr_night_jump_range_5
dtypes: float16(129), object(2)
memory usage: 2.2+ GB


Unnamed: 0,date,instrument,label_raw,group,label,turn_0,amount_0,open_0,close_0,high_0,...,corr_open_close_5,corr_open_turn_5,corr_open_range_5,corr_open_night_jump_5,corr_turn_close_5,corr_turn_range_5,corr_turn_night_jump_5,corr_close_range_5,corr_close_night_jump_5,corr_night_jump_range_5
0,2009-12-01,000001.SZA,-0.002884,2.0,-0.539551,-0.715820,3.767578,0.355469,0.344727,0.341797,...,,,,,,,,,,
1,2009-12-01,000002.SZA,0.017349,7.0,0.346924,-0.682129,6.734375,0.567383,0.546875,0.546387,...,,,,,,,,,,
2,2009-12-01,000005.SZA,-0.044342,0.0,-2.355469,2.421875,2.607422,-0.059052,-0.056061,-0.056366,...,,,,,,,,,,
3,2009-12-01,000006.SZA,0.007633,5.0,-0.078735,-0.451416,-0.018265,-0.016052,-0.016113,-0.016251,...,,,,,,,,,,
4,2009-12-01,000007.SZA,0.009033,5.0,-0.017380,-0.464111,-0.537109,-0.066956,-0.066895,-0.067017,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8772986,2022-06-14,871970.BJA,0.000000,5.0,0.089417,0.282471,-0.418945,-0.129517,-0.131836,-0.127319,...,1.492188,-1.597656,-1.115234,0.888672,-2.474609,0.986816,-2.029297,-2.113281,1.657227,-1.918945
8772987,2022-06-14,871981.BJA,0.011681,7.0,0.505859,-0.446777,-0.422607,-0.106323,-0.108643,-0.104919,...,1.153320,-0.517578,0.083130,-1.022461,-0.573730,0.740723,-1.633789,-0.263916,-0.808105,-1.965820
8772988,2022-06-14,872925.BJA,-0.003778,4.0,-0.045288,-0.696289,-0.430664,-0.119690,-0.121399,-0.117676,...,-0.069702,-0.292725,0.836426,1.281250,-0.812988,-3.501953,0.235596,-0.000726,0.766113,0.584961
8772989,2022-06-14,873169.BJA,-0.004784,4.0,-0.081116,-0.125122,-0.424561,-0.134155,-0.136230,-0.131714,...,1.453125,-1.747070,1.269531,0.352783,-2.550781,-3.972656,-0.503418,0.814453,0.101868,1.404297


In [9]:
# Summary statistics of the `ret_0` column.
# NOTE(review): the printed count (9,322,640) equals the row count reported for
# 'Factor_raw.pkl', not 'Factor_final.pkl', so this output presumably comes from
# the raw table -- confirm which frame `final_result` held when this ran.
final_result['ret_0'].describe()

count    9.322640e+06
mean     1.676559e-03
std      2.754211e-02
min     -3.520508e-01
25%     -1.191711e-02
50%      0.000000e+00
75%      1.388550e-02
max      1.269531e+00
Name: ret_0, dtype: float64

In [8]:
# Number of rows whose `label` is strictly greater than 0.02 (NaN labels never match)
int((final_result['label'] > 0.02).sum())

1677842