In [1]:
import numpy as np
from tqdm import tqdm

In [80]:
#########################################
# Diversified import based on execution environment (notebook/standard interpreter)
#########################################
def is_notebook() -> bool:
    """Tell whether this code is running under a Jupyter kernel.

    The check looks for a ``get_ipython`` callable in this module's globals
    and compares the class name of the shell it returns against
    ``"ZMQInteractiveShell"`` (Jupyter notebook or qtconsole).

    Returns:
        True only for a ZMQ-based interactive shell. Any other shell
        (e.g. terminal IPython), or the absence of ``get_ipython``
        altogether, yields False.
    """
    module_globals = globals()
    if "get_ipython" not in module_globals:
        # No IPython entry point visible: treat as a non-notebook environment.
        return False
    try:
        shell_name = module_globals["get_ipython"]().__class__.__name__
    except NameError:
        return False  # Probably standard Python interpreter
    return shell_name == "ZMQInteractiveShell"


# Bind the names `Pool` and `Lock` for the current environment: inside a notebook
# kernel they are the thread-backed ThreadPool and threading.Lock, otherwise the
# process-backed multiprocessing equivalents.
# NOTE(review): presumably threads are used in notebooks because worker processes
# cannot import functions defined in notebook cells — confirm.
if is_notebook():
    from multiprocessing.pool import ThreadPool as Pool
    from threading import Lock
else:
    from multiprocessing.pool import Pool
    from multiprocessing import Lock

In [123]:
# Read every array stored in the .npz archive into a dict keyed by entry name.
# NOTE(review): later cells index `all_data` like a 2-D array (`all_data[:, col]`),
# which a dict does not support — presumably a single entry must be selected
# first; confirm which one.
with np.load(r"C:\Users\ASUS\rc_work\20230103.npz") as npz_file:
    stock_list = npz_file.files
    all_data = {stk: npz_file[stk] for stk in stock_list}

In [118]:
# Column names of the trade-tick array, in column order (English glosses on the right).
name_lst = [
    '交易所代码',  # exchange code
    '自然日',      # calendar date
    '时间',        # time
    '成交编号',    # trade number
    '成交代码',    # trade code
    '委托代码',    # order code
    'BS标志',      # buy/sell flag
    '成交价格',    # trade price
    '成交数量',    # trade volume
    '叫卖序号',    # sell (ask) order sequence number
    '叫买序号',    # buy (bid) order sequence number
    '空列',        # empty column
]

def get_second(_num):
    """Convert a clock time encoded as the number HHMMSS into seconds since midnight.

    Args:
        _num: time of day written as hours*1e4 + minutes*1e2 + seconds
            (no millisecond digits), e.g. 93000 for 09:30:00.

    Returns:
        Seconds since midnight, as a float (the arithmetic divides by float
        constants).
    """
    hours, minutes_and_seconds = divmod(_num, 1e4)
    minutes, seconds = divmod(minutes_and_seconds, 1e2)
    return seconds + minutes * 60 + hours * 60 * 60
  
def _hhmmss_to_seconds(_hhmmss):
    """Convert an HHMMSS-encoded number (no millisecond digits) to seconds since midnight."""
    _hour, _rest = divmod(_hhmmss, 1e4)
    _minute, _second = divmod(_rest, 1e2)
    return _second + _minute * 60 + _hour * 60 * 60


def get_transaction_time_sub(_transaction_time_data):
    """Return the trading-time span, in whole seconds, covered by one order's fills.

    Args:
        _transaction_time_data: 1-D array of fill timestamps encoded as
            HHMMSSmmm numbers (e.g. 93000000 for 09:30:00.000).

    Returns:
        0 when there are fewer than two timestamps or when the earliest and
        latest fall within the same second; otherwise the difference in seconds
        between the latest and earliest timestamp. If the span is not entirely
        at/after 13:00:00 or entirely at/before 11:30:00, 5400 seconds (the
        11:30-13:00 gap) are subtracted.
    """
    # Empty or single-fill groups have no duration; this also avoids np.min
    # raising on an empty array.
    if len(_transaction_time_data) <= 1:
        return 0

    # Drop the millisecond digits so the comparison is at one-second resolution.
    _first = np.min(_transaction_time_data) // 1e3
    _last = np.max(_transaction_time_data) // 1e3
    if _first == _last:
        return 0

    # The conversion is done by a private helper rather than the module-level
    # `get_second`, because that name is redefined later in this notebook for a
    # different encoding (HHMMSSmmm); re-running that cell would otherwise
    # silently change the result here.
    _elapsed = _hhmmss_to_seconds(_last) - _hhmmss_to_seconds(_first)

    # _first <= _last, so "both at/after 13:00" reduces to _first >= 130000 and
    # "both at/before 11:30" reduces to _last <= 113000.
    _within_one_session = _first >= 130000 or _last <= 113000
    if not _within_one_session:
        _elapsed -= 5400
    return _elapsed

class long_sell_vol_prop(object):
    """Long-sell-order volume proportion factor computed from a trade-tick array.

    The input rows are cleaned (NaN BS flag and pre-09:30 rows dropped), each
    sell order's fill duration is measured, orders whose duration exceeds the
    0.9 quantile are labelled "long", and the factor is the share of total
    traded volume that belongs to those long sell orders.
    """

    def __init__(self,data,name_lst = name_lst):
        """Resolve the needed column positions and clean the input.

        Args:
            data: 2-D numeric array with one row per trade; columns are ordered
                as in ``name_lst``.
            name_lst: list of column names used to locate the BS-flag, time,
                sell-order-number and volume columns.
        """
        self.id_BS = name_lst.index('BS标志')
        self.id_time = name_lst.index('时间')
        self.id_sell = name_lst.index('叫卖序号')
        self.id_vol = name_lst.index('成交数量')
        self.all_data = self.clean_data_BS_time(data)

    def clean_data_BS_time(self,_data):
        """Return the rows of ``_data`` that have a BS flag and a time >= 09:30:00.000."""
        # 1. Drop rows whose BS-flag column is NaN.
        _has_bs_flag = ~np.isnan(_data[:, self.id_BS])
        # 2. Use the time column (HHMMSSmmm) to drop everything before 09:30.
        _at_or_after_open = _data[:, self.id_time] >= 93000000
        # Combine the boolean masks with a logical AND.
        return _data[_has_bs_flag & _at_or_after_open, :]

    def get_transaction_time(self):
        """Return the sell-order numbers whose fill duration exceeds the 0.9 quantile.

        Returns:
            1-D array of sell-order numbers, ordered by ascending fill duration.
            Empty when there are no rows after cleaning.
        """
        # 3. For every sell order, find its first and last fill and compute the
        #    duration in seconds. Sorting by order number and splitting at the
        #    group boundaries is much faster than masking once per order.
        _data_sorted = self.all_data[np.argsort(self.all_data[:, self.id_sell]),:]
        # With return_index=True the second result holds the index of each
        # order's first row in the sorted data, i.e. the group start offsets.
        _sell_index, _group_starts = np.unique(_data_sorted[:, self.id_sell], return_index=True)
        if _sell_index.size == 0:
            # Nothing to rank; np.percentile is not meaningful on empty input.
            return _sell_index
        # Use the instance's own column index, not a notebook-level global.
        _time_groups = np.split(_data_sorted[:, self.id_time], _group_starts[1:])
        # The per-order calls are independent, so they could be dispatched
        # through Pool.imap if this loop becomes a bottleneck.
        _sales_order_transaction_time = np.array(
            [get_transaction_time_sub(_times) for _times in _time_groups]
        )
        # 4. Rank the durations, take the 0.9 quantile and mark orders above it
        #    as long sell orders.
        _sorted_indices = np.argsort(_sales_order_transaction_time)
        _durations_sorted = _sales_order_transaction_time[_sorted_indices]
        _sell_index_sorted = _sell_index[_sorted_indices]
        _quantile_90 = np.percentile(_durations_sorted, 90)
        # Original author's note: on the sample data the 0.9 quantile came out
        # as 0, so in practice every order with a duration > 0 is marked long.
        return _sell_index_sorted[_durations_sorted > _quantile_90]

    def get_factor(self):
        """Return long-sell-order volume divided by total volume of the cleaned data.

        Returns:
            The ratio as a float; NaN when the cleaned data has zero total volume.
        """
        _volumes = self.all_data[:, self.id_vol]
        # Total traded volume of the day (after cleaning).
        _all_vol = np.sum(_volumes)
        if _all_vol == 0:
            return np.nan
        # Sell-order numbers labelled as long.
        _long_sell_index = self.get_transaction_time()
        # Total volume of all rows belonging to a long sell order, selected with
        # a single membership mask instead of one full scan per order.
        _is_long_sell = np.isin(self.all_data[:, self.id_sell], _long_sell_index)
        _long_sell_all_vol = np.sum(_volumes[_is_long_sell])
        return _long_sell_all_vol/_all_vol

In [119]:
# Build the factor object and evaluate the long-sell volume proportion.
# NOTE(review): the loading cell stores `all_data` as a dict keyed by archive entry,
# while the class indexes its input as a 2-D array (`_data[:, col]`); presumably a
# single entry's array should be passed here — confirm which array was bound to
# `all_data` when the recorded output was produced.
example1 = long_sell_vol_prop(all_data)
example1.get_factor()

0.26048106217891753

In [7]:
# Step 1: drop the rows whose BS-flag column is NaN.
id_BS = name_lst.index('BS标志')
nan_index = np.isnan(all_data[:, id_BS])  # boolean mask: True where the BS flag is NaN
# Keep only the rows with a BS flag (original note: cleaned shape is N1 x 12).
all_data_BS_cleaned = all_data[np.logical_not(nan_index), :]

In [8]:
# Step 2: use the time column to drop everything before 09:30.
id_time = name_lst.index('时间')
# Times are compared against 93000000, i.e. 09:30:00.000 in HHMMSSmmm form.
sat_time_index = np.greater_equal(all_data_BS_cleaned[:, id_time], 93000000)
all_data_BS_time_cleaned = all_data_BS_cleaned[sat_time_index, :]

In [9]:
def get_second(_num):
    """Convert a clock time encoded as the number HHMMSSmmm into whole seconds since midnight.

    Note that this definition shares its name with an earlier ``get_second``
    in this notebook that expects HHMMSS (no millisecond digits); whichever
    cell ran last wins.

    Args:
        _num: time of day written as HHMMSSmmm, e.g. 93000000 for 09:30:00.000.
            It is truncated to an int before decoding.

    Returns:
        Seconds since midnight as an int; the millisecond digits are discarded.
    """
    hours, below_hours = divmod(int(_num), 10**7)
    minutes, below_minutes = divmod(below_hours, 10**5)
    return below_minutes // 1000 + minutes * 60 + hours * 60 * 60

def get_transaction_time(_name):
    """Return the fill duration, in seconds, of the sell order numbered ``_name``.

    Reads the module-level ``all_data_BS_time_cleaned`` array together with the
    ``id_sell`` and ``id_time`` column indices. Timestamps are HHMMSSmmm
    numbers. When the first and last fills are not both at/after 13:00 or both
    at/before 11:30, 5400 seconds are subtracted from the raw difference.
    """
    belongs_to_order = all_data_BS_time_cleaned[:, id_sell] == _name
    fill_times = all_data_BS_time_cleaned[belongs_to_order, id_time]
    earliest = np.min(fill_times)
    latest = np.max(fill_times)

    both_afternoon = earliest >= 130000000 and latest >= 130000000
    both_morning = earliest <= 113000000 and latest <= 113000000

    elapsed = get_second(latest) - get_second(earliest)
    if not (both_afternoon or both_morning):
        # Remove the 11:30-13:00 gap (90 minutes) from the span.
        elapsed = elapsed - 5400
    return elapsed

In [24]:
# Step 3: for every sell order of the day, take the start and end time of its fills
# and compute the fill duration, to one-second precision.
id_sell = name_lst.index('叫卖序号')
sell_index = np.unique(all_data_BS_time_cleaned[:, id_sell])
# One duration per distinct sell-order number, in the same order as `sell_index`.
sales_order_transaction_time = np.array(
    [get_transaction_time(_name) for _name in tqdm(sell_index)]
)

100%|██████████████████████████████████████████████████████████████████████████| 91619/91619 [01:03<00:00, 1448.80it/s]


In [41]:
# Step 4: sort the sell-order fill durations, compute the 0.9 quantile, and mark
# the orders whose duration exceeds it as long sell orders.

# i: sort ascending by duration and reorder sell_index the same way.
sorted_indices = np.argsort(sales_order_transaction_time)
sales_order_transaction_time_sorted = sales_order_transaction_time[sorted_indices]
sell_index_sorted = sell_index[sorted_indices]

# ii: 0.9 quantile by hand, using linear interpolation between the two
# neighbouring order statistics. The fractional position is (n - 1) * 0.9, and
# the weights are written as (1 - w, w) so that they still sum to 1 when the
# position is a whole number (floor == ceil).
quantile_90_index = (len(sales_order_transaction_time_sorted) - 1) * 0.9
floor_quantile_90_index = np.floor(quantile_90_index)
ceil_quantile_90_index = np.ceil(quantile_90_index)
upper_weight = quantile_90_index - floor_quantile_90_index
quantile_90_manual = (
    sales_order_transaction_time_sorted[int(floor_quantile_90_index)] * (1 - upper_weight)
    + sales_order_transaction_time_sorted[int(ceil_quantile_90_index)] * upper_weight
)

# The library routine is the value actually used; the manual result is only a cross-check.
quantile_90 = np.percentile(sales_order_transaction_time, 90)
assert np.isclose(quantile_90_manual, quantile_90), "manual and NumPy 0.9 quantiles disagree"

# iii: mark orders above the 0.9 quantile as long sell orders (True = long).
# Original author's note: on this data the 0.9 quantile came out as 0, so in
# practice every order with a duration > 0 is marked long.
sell_index_sorted_marked_long = (sales_order_transaction_time_sorted > quantile_90)

In [42]:
# Step 5: add up the day's traded volume of all long sell orders and divide it by
# the day's total traded volume (pre-09:30 rows already removed) to obtain the
# long-sell-order volume proportion factor.
id_vol = name_lst.index('成交数量')
long_sell_index_sorted = sell_index_sorted[sell_index_sorted_marked_long]
# Traded volume of the long sell orders: one row mask and one partial sum per order.
long_sell_all_vol = sum(
    np.sum(
        all_data_BS_time_cleaned[
            all_data_BS_time_cleaned[:, id_sell] == _long_sell_name, id_vol
        ]
    )
    for _long_sell_name in long_sell_index_sorted
)
# Total traded volume of the day.
all_vol = np.sum(all_data_BS_time_cleaned[:, id_vol])

long_sell_vol_prop_factor = long_sell_all_vol / all_vol