In [1]:
import pandas as pd
import numpy as np
import itertools
import lightgbm as lgbm
import matplotlib.pyplot as plt

from script import *
from feature_engineering import *
import db_operations as dbop
from constants import *
import data_process as dp

import time

from IPython.core.interactiveshell import InteractiveShell
# Render the value of every top-level expression in a cell, not only the last
# one. Cells below rely on this to show several results from a single cell.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Notebook-wide display setting and small shared helpers.
pd.options.display.max_rows = 200  # allow up to 200 rows when a frame is rendered
idx = pd.IndexSlice  # shorthand used for MultiIndex label slicing
# Names of aggregation functions; the consumer is not visible in this section.
agg_operations = "mean median max min std size".split()

In [3]:
# Load the daily bar table from `start` onwards and derive the `pct` column.
start = 20130101  # lower date bound passed to the loader (looks like YYYYMMDD)
cursor = dbop.connect_db("sqlite3").cursor()  # also used by the next loading cell

df_d = dp.proc_stock_d(
    dp.prepare_stock_d(dbop.create_df(cursor, STOCK_DAY[TABLE], start=start))
).drop(columns=['open0', 'high0', 'low0', 'vol0'])

df_d = df_d.assign(
    # Per-code close-to-close change in percent; sorted first so that
    # consecutive rows within a code are consecutive dates.
    pct=lambda d: d.sort_index().groupby('code')['close'].pct_change() * 100,
    # Rescale the amount column (units are not visible from this notebook).
    amt=lambda d: d['amt'] / 1e5,
).astype('float32')  # single precision to keep the large frame small in memory

print(df_d.columns, df_d.shape, sep="\n")
df_d.info(memory_usage='deep')
df_d.tail()

select * from stock_day where date>=20130101
Index(['open', 'high', 'low', 'close', 'vol', 'amt', 'adj_factor', 'avg',
       'close0', 'avg0', 'pct'],
      dtype='object')
(5391073, 11)
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 5391073 entries, (000001.SZ, 2013-01-04 00:00:00) to (688399.SH, 2020-04-22 00:00:00)
Data columns (total 11 columns):
open          float32
high          float32
low           float32
close         float32
vol           float32
amt           float32
adj_factor    float32
avg           float32
close0        float32
avg0          float32
pct           float32
dtypes: float32(11)
memory usage: 407.2 MB


Unnamed: 0_level_0,Unnamed: 1_level_0,open,high,low,close,vol,amt,adj_factor,avg,close0,avg0,pct
code,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
688399.SH,2020-04-16,83.400002,96.0,82.440002,94.900002,31260.830078,2.815096,1.0,90.051849,94.900002,90.051849,15.731705
688399.SH,2020-04-17,95.949997,95.949997,88.879997,92.0,24047.009766,2.208113,1.0,91.82486,92.0,91.82486,-3.055853
688399.SH,2020-04-20,95.790001,109.970001,95.199997,108.730003,34636.898438,3.590274,1.0,103.654602,108.730003,103.654602,18.18478
688399.SH,2020-04-21,108.0,118.199997,102.0,114.440002,33081.828125,3.580782,1.0,108.240166,114.440002,108.240166,5.251539
688399.SH,2020-04-22,108.0,118.5,107.489998,109.089996,21584.169922,2.432959,1.0,112.719612,109.089996,112.719612,-4.674941


In [4]:
# Load the daily-basic table from `start` onwards and add two product columns.
df_d_basic = dp.prepare_stock_d_basic(
    dbop.create_df(cursor, STOCK_DAILY_BASIC[TABLE], start=start)
)

# Column names contain '*', so they are passed through a dict instead of keywords.
df_d_basic = df_d_basic.assign(**{
    "pb*pe_ttm": lambda d: d["pb"] * d["pe_ttm"],
    "pb*pe": lambda d: d["pb"] * d["pe"],
}).astype('float32')

print(df_d_basic.columns, df_d_basic.shape, sep="\n")
df_d_basic.info(memory_usage='deep')
# NOTE(review): the saved output of this cell ends with
# "AttributeError: 'MultiIndex' object has no attribute 'tail'", which a
# DataFrame should not raise here -- re-run on a fresh kernel to confirm.
df_d_basic.tail()

select * from stock_daily_basic where date>=20130101
Index(['close', 'turnover_rate', 'turnover_rate_f', 'volume_ratio', 'pe',
       'pe_ttm', 'pb', 'ps', 'ps_ttm', 'total_share', 'float_share',
       'free_share', 'total_mv', 'circ_mv', 'pb*pe_ttm', 'pb*pe'],
      dtype='object')
(5067549, 16)
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 5067549 entries, (000001.SZ, 2013-01-04 00:00:00) to (688399.SH, 2020-04-22 00:00:00)
Data columns (total 16 columns):
close              float32
turnover_rate      float32
turnover_rate_f    float32
volume_ratio       float32
pe                 float32
pe_ttm             float32
pb                 float32
ps                 float32
ps_ttm             float32
total_share        float32
float_share        float32
free_share         float32
total_mv           float32
circ_mv            float32
pb*pe_ttm          float32
pb*pe              float32
dtypes: float32(16)
memory usage: 328.9 MB


AttributeError: 'MultiIndex' object has no attribute 'tail'

In [5]:
# Moving-average features on the daily bars.
#
# For every window length k in `periods` this cell adds:
#   '{k}ma'      - rolling mean of `close` per code
#   '{k}ma_avg'  - rolling sum(amt) / rolling sum(vol) * 1e6 per code
# and then the ratios 'close/{k}ma' and 'close/{k}ma_avg'.
#
# NOTE: `df` is an alias of `df_d`, not a copy, so every column added here
# also appears on `df_d`.
df = df_d
periods = [5,10,20,30,40,50,60,70,80,120,250]

# The per-code grouping does not depend on k, so build it once instead of
# three times per period. Rows are sorted first (as is done for `pct`)
# because rolling windows are only meaningful on chronologically ordered
# rows within each code.
by_code = df.sort_index().reset_index('code').groupby('code')
for k in periods:
    # The rolling results are indexed by (code, date) and are aligned to
    # `df` by index on assignment.
    df['{:d}ma'.format(k)] = by_code['close'].rolling(k).mean()
    # Scale factor 1e6 kept as in the original; the units of amt/vol are not
    # visible from this notebook.
    df['{:d}ma_avg'.format(k)] = (
        by_code['amt'].rolling(k).sum() / by_code['vol'].rolling(k).sum() * 1e6
    )
del by_code  # release the temporary copy of the frame held by the groupby
print(df.columns)

# Ratio of close to each moving average: all '{k}ma' ratios first, then all
# '{k}ma_avg' ratios, which keeps the original column order.
ma_columns = ["{}ma".format(k) for k in periods] + ["{}ma_avg".format(k) for k in periods]
for ma_col in ma_columns:
    df['close/{}'.format(ma_col)] = df['close'] / df[ma_col]

print(df.columns)

Index(['open', 'high', 'low', 'close', 'vol', 'amt', 'adj_factor', 'avg',
       'close0', 'avg0', 'pct', '5ma', '5ma_avg', '10ma', '10ma_avg', '20ma',
       '20ma_avg', '30ma', '30ma_avg', '40ma', '40ma_avg', '50ma', '50ma_avg',
       '60ma', '60ma_avg', '70ma', '70ma_avg', '80ma', '80ma_avg', '120ma',
       '120ma_avg', '250ma', '250ma_avg'],
      dtype='object')
Index(['open', 'high', 'low', 'close', 'vol', 'amt', 'adj_factor', 'avg',
       'close0', 'avg0', 'pct', '5ma', '5ma_avg', '10ma', '10ma_avg', '20ma',
       '20ma_avg', '30ma', '30ma_avg', '40ma', '40ma_avg', '50ma', '50ma_avg',
       '60ma', '60ma_avg', '70ma', '70ma_avg', '80ma', '80ma_avg', '120ma',
       '120ma_avg', '250ma', '250ma_avg', 'close/5ma', 'close/10ma',
       'close/20ma', 'close/30ma', 'close/40ma', 'close/50ma', 'close/60ma',
       'close/70ma', 'close/80ma', 'close/120ma', 'close/250ma',
       'close/5ma_avg', 'close/10ma_avg', 'close/20ma_avg', 'close/30ma_avg',
       'close/40ma_avg', 'close/

In [None]:
# Assess the close/moving-average ratio features against two return labels.

# ISO date string for the integer `start`, e.g. 20130101 -> '2013-01-01'.
start_date = '-'.join([str(start)[:4],str(start)[4:6],str(start)[6:8]])

# Sort BEFORE slicing: label-range slicing on a MultiIndex requires a
# lexsorted index, so sorting only afterwards would be too late for unsorted
# input. Forward slashes keep the relative paths valid on every platform.
df_r_spl = pd.read_parquet("database/return_spl_5%_10%_20_8_15%").sort_index().loc[idx[:,start_date:],:]
df_r_spc = pd.read_parquet("database/return_spc_5%_10%_20_8_15%").sort_index().loc[idx[:,start_date:],:]

# Same names and order as the ratio columns created from `periods`:
# every 'close/{k}ma' first, then every 'close/{k}ma_avg'.
features = (
    ['close/{}ma'.format(k) for k in periods]
    + ['close/{}ma_avg'.format(k) for k in periods]
)
print(features)

# NOTE(review): `ml` is not imported by name in the visible import cell;
# presumably it comes from one of the star imports -- verify.
result_1000_spl = ml.assess_feature3(df[features],df_r_spl.r,q_bin=1000)
result_1000_spc = ml.assess_feature3(df[features],df_r_spc.r,q_bin=1000)

# Both summaries are displayed, ordered by the spread of the per-bin median
# and then of the per-bin mean.
result_1000_spl.sort_values(['median_std','mean_std'],ascending=False)
result_1000_spc.sort_values(['median_std','mean_std'],ascending=False)

['close/5ma', 'close/10ma', 'close/20ma', 'close/30ma', 'close/40ma', 'close/50ma', 'close/60ma', 'close/70ma', 'close/80ma', 'close/120ma', 'close/250ma', 'close/5ma_avg', 'close/10ma_avg', 'close/20ma_avg', 'close/30ma_avg', 'close/40ma_avg', 'close/50ma_avg', 'close/60ma_avg', 'close/70ma_avg', 'close/80ma_avg', 'close/120ma_avg', 'close/250ma_avg']
close/10ma 
                                    mean    median       max       min
bin                                                                  
(0.5809000000000001, 0.71942]  0.082183  0.093023  1.162069 -0.761686
(0.71942, 0.75458]             0.007593  0.018323  1.997348 -0.785517
(0.75458, 0.77781]            -0.000889  0.003470  1.807018 -0.469574
(0.77781, 0.79441]            -0.032175 -0.018285  2.296864 -0.948107
(0.79441, 0.80857]            -0.014689 -0.013783  0.577887 -0.806952
...                                 ...       ...       ...       ...
(1.2017, 1.21753]             -0.008147 -0.019685  1.358182 -0.537433
(1

close/40ma 
                       mean    median       max       min
bin                                                     
(0.3109, 0.6004]  0.122181  0.129930  1.162069 -0.735319
(0.6004, 0.653]   0.018931  0.039173  0.660514 -0.761686
(0.653, 0.6809]   0.018329  0.033333  1.204987 -0.656250
(0.6809, 0.6997]  0.018934  0.028182  0.689553 -0.671642
(0.6997, 0.7134]  0.016878  0.022495  0.571330 -0.785517
...                    ...       ...       ...       ...
(1.407, 1.4328]  -0.004429 -0.015169  0.708018 -0.569543
(1.4328, 1.4659] -0.007504 -0.018761  1.143802 -0.515532
(1.4659, 1.5142] -0.006890 -0.021000  0.684800 -0.624032
(1.5142, 1.6041] -0.010399 -0.023244  0.948618 -0.519029
(1.6041, 8.7121] -0.015785 -0.028213  0.771374 -0.521228

[1000 rows x 4 columns] 

close/40ma_avg 
                       mean    median       max       min
bin                                                     
(0.3552, 0.5965]  0.121149  0.134301  1.162069 -0.735319
(0.5965, 0.6438]  0.028241  0.0

close/80ma 
                       mean    median       max       min
bin                                                     
(0.2937, 0.5811]  0.090657  0.070000  0.839938 -0.735319
(0.5811, 0.6174]  0.047594  0.040382  0.944298 -0.690141
(0.6174, 0.6403]  0.033407  0.028571  1.532468 -0.761686
(0.6403, 0.6565]  0.026239  0.022648  1.531114 -0.725000
(0.6565, 0.6694]  0.021602  0.021069  0.920454 -0.641791
...                    ...       ...       ...       ...
(1.5928, 1.6296] -0.005469 -0.016418  0.914486 -0.485616
(1.6296, 1.6783] -0.006466 -0.015645  0.948618 -0.514138
(1.6783, 1.7493] -0.008122 -0.018393  0.863799 -0.488662
(1.7493, 1.884]  -0.007389 -0.020222  0.771374 -0.624032
(1.884, 9.668]   -0.011638 -0.023601  0.679706 -0.489167

[1000 rows x 4 columns] 

close/80ma_avg 
                       mean    median       max       min
bin                                                     
(0.3614, 0.5711]  0.096697  0.077543  0.839938 -0.735319
(0.5711, 0.6032]  0.049176  0.0

close/30ma_avg 
                       mean    median       max       min
bin                                                     
(0.3539, 0.5896]  0.133665  0.178641  1.162069 -0.735319
(0.5896, 0.6466]  0.018934  0.024546  1.000690 -0.761686
(0.6466, 0.675]   0.027173  0.029254  1.204987 -0.620690
(0.675, 0.6939]   0.023836  0.016133  0.943485 -0.568826
(0.6939, 0.708]   0.022067  0.011693  1.282143 -0.785517
...                    ...       ...       ...       ...
(1.2898, 1.3077] -0.005756 -0.030350  1.314306 -0.576585
(1.3077, 1.3315] -0.006720 -0.030303  1.288807 -0.624032
(1.3315, 1.3663] -0.011979 -0.034575  1.299060 -0.623921
(1.3663, 1.4303] -0.009402 -0.035574  1.218391 -0.618421
(1.4303, 2.2896] -0.016039 -0.040356  0.771374 -0.521228

[1000 rows x 4 columns] 

close/40ma 
                       mean    median       max       min
bin                                                     
(0.3109, 0.6004]  0.140810  0.169749  1.162069 -0.735319
(0.6004, 0.653]   0.033716  0.0

close/70ma_avg 
                       mean    median       max       min
bin                                                     
(0.3615, 0.58]    0.117937  0.094930  0.844978 -0.735319
(0.58, 0.6125]    0.052248  0.028777  1.204987 -0.761686
(0.6125, 0.6324]  0.040383  0.023601  0.944298 -0.671642
(0.6324, 0.6469]  0.035303  0.017038  0.638773 -0.606557
(0.6469, 0.6584]  0.032952  0.014744  0.983276 -0.690141
...                    ...       ...       ...       ...
(1.4462, 1.4757] -0.004510 -0.030267  1.314306 -0.479094
(1.4757, 1.5124] -0.005706 -0.031391  1.299060 -0.504948
(1.5124, 1.5678] -0.007535 -0.031160  1.218391 -0.514138
(1.5678, 1.6654] -0.011891 -0.035765  0.771374 -0.576585
(1.6654, 2.7187] -0.013825 -0.035045  0.679706 -0.624032

[1000 rows x 4 columns] 

close/80ma 
                       mean    median       max       min
bin                                                     
(0.2937, 0.5811]  0.092991  0.055318  0.839938 -0.735319
(0.5811, 0.6174]  0.050400  0.0

Unnamed: 0,mean_std,median_std,max_std,min_std,mean_q96%,median_q96%,max_q96%,min_q96%,mean_q4%,median_q4%,max_q4%,min_q4%
close/5ma,0.016904,0.007607,1.041629,0.157992,0.005829,-0.003515,3.688887,-0.264684,-0.000888,-0.011524,0.495808,-0.822227
close/40ma_avg,0.004501,0.006183,0.88961,0.154584,0.006471,-0.002301,3.221329,-0.256727,-0.000498,-0.011513,0.440352,-0.810003
close/30ma_avg,0.004036,0.006173,0.88928,0.156275,0.005335,-0.002969,3.235219,-0.253525,-0.00029,-0.011439,0.463407,-0.814359
close/40ma,0.004558,0.006114,0.877299,0.155433,0.007034,-0.001837,3.200595,-0.256663,-0.000603,-0.011611,0.457116,-0.814359
close/30ma,0.004056,0.006009,0.878874,0.156571,0.00578,-0.002543,3.221329,-0.25642,-0.000304,-0.011434,0.466384,-0.814359
close/50ma_avg,0.004792,0.005985,0.895512,0.155345,0.006869,-0.001692,3.235219,-0.254192,-0.000668,-0.011742,0.442842,-0.807067
close/50ma,0.004681,0.00576,0.882969,0.155885,0.007368,-0.001446,3.207666,-0.254908,-0.000648,-0.011733,0.431895,-0.814359
close/60ma_avg,0.004768,0.005487,0.889979,0.152058,0.006774,-0.00219,3.207666,-0.258712,-0.000327,-0.011509,0.437593,-0.802431
close/20ma,0.003496,0.005444,0.865005,0.157688,0.005199,-0.002666,3.193819,-0.257492,-0.000289,-0.011696,0.474988,-0.820575
close/20ma_avg,0.003438,0.00542,0.876146,0.156494,0.005325,-0.003753,3.221329,-0.262553,-0.000299,-0.011576,0.48246,-0.822455


Unnamed: 0,mean_std,median_std,max_std,min_std,mean_q96%,median_q96%,max_q96%,min_q96%,mean_q4%,median_q4%,max_q4%,min_q4%
close/5ma,0.017072,0.008798,1.139876,0.157166,0.008137,-0.011024,4.627586,-0.277932,-0.001342,-0.020704,0.655135,-0.840702
close/40ma_avg,0.005797,0.007956,0.952313,0.154238,0.009775,-0.008363,3.789437,-0.271695,-0.000629,-0.021901,0.560265,-0.836245
close/50ma_avg,0.006286,0.007909,0.975276,0.154517,0.010407,-0.007519,3.891422,-0.270915,-0.000584,-0.0223,0.541696,-0.833141
close/40ma,0.005899,0.007893,0.956193,0.153609,0.010732,-0.007547,3.879257,-0.272771,-0.000776,-0.022585,0.586418,-0.839117
close/50ma,0.006158,0.007785,0.961145,0.15378,0.011318,-0.006739,3.879257,-0.27387,-0.000808,-0.022575,0.556947,-0.839117
close/30ma_avg,0.00502,0.007523,0.956549,0.153672,0.007786,-0.010334,3.891422,-0.275055,-0.00049,-0.022526,0.581235,-0.839117
close/30ma,0.00515,0.00743,0.959533,0.154938,0.00903,-0.009278,3.891422,-0.275282,-0.00045,-0.022431,0.59049,-0.839117
close/60ma_avg,0.006037,0.007342,0.967131,0.153372,0.010072,-0.007338,3.789437,-0.273802,-0.000523,-0.022632,0.523191,-0.832003
close/60ma,0.005901,0.007118,0.963026,0.155379,0.011041,-0.006733,3.789437,-0.273797,-0.000826,-0.023307,0.56693,-0.836245
close/20ma,0.00455,0.007025,0.931877,0.155925,0.007275,-0.010326,3.735416,-0.273222,-0.000927,-0.022569,0.606259,-0.839117


由上可知，avg类变量在等频分箱下差异更小，区分度弱于非avg，但在末端(1%)下的收益率明显高于非avg。