### 一、因子计算: 从低频到高频

In [1]:
# 引入需要的包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

这里都采用比较早的数据,仅在本框架中作为示例使用.

数据集price.csv为某期货从2019-01-02到2021-07-06的分钟级信息,包括开盘价,最高价,最低价,收盘价,成交量,成交额,未平仓合约数.其交易时间为09:00-10:15; 10:30-11:30; 13:30-15:00,没有夜盘数据.

In [27]:
# Read the minute-level futures bars and preview the first rows.
price_csv_path = 'price.csv'
prices = pd.read_csv(price_csv_path)
prices.head()

Unnamed: 0,Time,open,high,low,close,volume,amount,open_interest
0,2019-01-02 09:01:00,11087.89,11102.522,11058.91,11086.725,5998.0,677275760.0,268956
1,2019-01-02 09:02:00,11085.447,11098.557,11083.997,11085.915,1858.0,209742220.0,269140
2,2019-01-02 09:03:00,11082.769,11084.326,11047.499,11047.733,3696.0,416132420.0,269534
3,2019-01-02 09:04:00,11048.676,11054.233,11021.453,11032.088,4988.0,558616660.0,270420
4,2019-01-02 09:05:00,11033.408,11034.207,11010.193,11030.349,4250.0,477031180.0,271370


计算若干简单的低频因子: 动量因子和波动率因子(日级,先聚合再计算). 成交量已按日聚合,成交量因子可用同样的方式构造,下面的代码未单独计算.

In [28]:
# Lookback windows expressed in minutes, using 60 * 24 minutes per day.
# NOTE(review): none of these are referenced in the cells visible here — confirm
# whether later cells use them before removing.
windows_1d = 60 * 24 * 1
windows_7d = 60 * 24 * 7
windows_30d = 60 * 24 * 30
windows_60d = 60 * 24 * 60

# Use the bar timestamp as a DatetimeIndex. The guard keeps this cell re-runnable:
# after the first execution 'Time' is already the index, not a column, and an
# unconditional prices['Time'] lookup would raise KeyError.
if 'Time' in prices.columns:
    prices['Time'] = pd.to_datetime(prices['Time'])
    prices = prices.set_index('Time')

# Aggregate minute bars into daily OHLCV bars. Days without any bars produce
# NaN for open/high/low/close and are removed by dropna().
prices_daily = prices.resample('D').agg({
    "open": "first",
    "high": "max",
    "low": "min",
    "close": "last",
    "volume": "sum"
}).dropna()


In [None]:
# Daily factors are computed on the rows of prices_daily, so shift(n) and
# rolling(n) count daily bars (rows), not calendar days.
daily_close = prices_daily['close']
daily_return = daily_close / daily_close.shift(1) - 1

# Momentum: close-to-close return over the trailing n daily bars.
# (momentum_7 previously used shift(1), which is the 1-day return.)
for n_days in (7, 30, 60):
    prices_daily[f'momentum_{n_days}'] = daily_close / daily_close.shift(n_days) - 1

# Volatility: rolling standard deviation of the 1-day return.
# The 'votality_*' spelling is kept so existing column references keep working.
for n_days in (7, 30):
    prices_daily[f'votality_{n_days}'] = daily_return.rolling(n_days).std()

# Map the daily factors back onto the minute bars by calendar day. The columns
# are written directly so the DatetimeIndex of `prices` is preserved; merging on
# a 'date' column would replace it with a RangeIndex and break any later use of
# `prices.index.date`. Writing columns in place also makes the cell re-runnable.
# NOTE(review): every minute bar receives factors built from that same day's
# close, i.e. information not yet known intraday. Shift the daily factors by one
# row first if they feed an intraday signal.
factor_cols = ['momentum_7', 'momentum_30', 'momentum_60', 'votality_7', 'votality_30']
bar_day = prices.index.normalize()  # midnight timestamp of each bar's day
for col in factor_cols:
    prices[col] = prices_daily[col].reindex(bar_day).to_numpy()

当然,因为这里我们使用的是分钟级数据,也可以考虑计算更高频的因子.
示例: 计算尾盘15分钟的动量(相对强度可在此基础上进一步构造,下面的代码只计算动量).

In [18]:
# Momentum over the last 15 rows of each trading day

def calc_tail_15min_momentum(group):
    """
    Return the relative change in 'close' across the last 15 rows of `group`.

    Parameters:
        group: DataFrame with a 'close' column; if it has fewer than 15 rows,
               all of its rows are used.

    Returns:
        (last_close - first_close) / first_close, where first/last refer to the
        first and last row of that trailing slice.
    """
    closes = group['close'].iloc[-15:]
    first_close, last_close = closes.iloc[0], closes.iloc[-1]
    return (last_close - first_close) / first_close

# Evaluate the tail momentum once per calendar day of the minute bars
bars_by_day = prices.groupby(prices.index.date)
tail_momentum = bars_by_day.apply(calc_tail_15min_momentum)

# Convert the date labels to datetimes so the series aligns with prices_daily
tail_momentum = tail_momentum.set_axis(pd.to_datetime(tail_momentum.index))
prices_daily['tail_15min'] = tail_momentum

# Preview the result
prices_daily.head()

Unnamed: 0_level_0,open,high,low,close,volume,tail_15min
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-01-02,11087.89,11102.522,10671.298,10678.334,380920.0,-0.013104
2019-01-03,10696.405,10759.78,10454.518,10476.169,355554.0,-0.0048
2019-01-04,10405.123,10609.978,10405.123,10553.595,284602.0,0.0003
2019-01-07,10488.425,10521.046,10300.94,10371.413,322834.0,0.001643
2019-01-08,10373.536,10520.665,10373.278,10464.144,262926.0,-0.000881


进一步的,如果有逐笔委托(order),逐笔成交(trade)等数据,则可以计算一些高频量价因子.
这里以订单流不平衡因子(OFI, Order Flow Imbalance)为例进行计算,所用数据为2021年12月31日某支股票的逐笔委托,逐笔成交和实时盘口数据.

In [2]:
# Read the tick-level order-book snapshots and preview the first rows.
tick_csv_path = '600985_tick.csv'
tick = pd.read_csv(tick_csv_path)
tick.head()

Unnamed: 0,Time,BidPrice1,BidVolume1,BidPrice2,BidVolume2,BidPrice3,BidVolume3,BidPrice4,BidVolume4,BidPrice5,...,OfferPrice16,OfferVolume16,OfferPrice17,OfferVolume17,OfferPrice18,OfferVolume18,OfferPrice19,OfferVolume19,OfferPrice20,OfferVolume20
0,09:30:00.080,11.14,14500.0,11.13,8700.0,11.12,15000.0,11.11,19600.0,11.1,...,11.32,16800.0,11.33,4700.0,11.34,6000.0,11.35,35500.0,11.36,3200.0
1,09:30:00.240,11.14,14500.0,11.13,8700.0,11.12,15000.0,11.11,19600.0,11.1,...,11.32,16800.0,11.33,4700.0,11.34,6000.0,11.35,35500.0,11.36,3200.0
2,09:30:00.280,11.14,14500.0,11.13,8700.0,11.12,15000.0,11.11,19600.0,11.1,...,11.32,16800.0,11.33,4700.0,11.34,6000.0,11.35,35500.0,11.36,3200.0
3,09:30:00.280,11.14,14500.0,11.13,8700.0,11.12,15000.0,11.11,19600.0,11.1,...,11.32,16800.0,11.33,4700.0,11.34,6000.0,11.35,35500.0,11.36,3200.0
4,09:30:00.320,11.14,14500.0,11.13,8700.0,11.12,15000.0,11.11,19600.0,11.1,...,11.32,16800.0,11.33,4700.0,11.34,6000.0,11.35,35500.0,11.36,3200.0


In [3]:
# Light preprocessing: collapse the tick snapshots into one row per second.
BASE_DATE = '20211231'
print(f"原始数据行数: {len(tick)}")

# Strip the ':' and '.' separators from 'HH:MM:SS.mmm'. regex=False makes both
# replacements literal regardless of the pandas version's default, so '.' can
# never be interpreted as the regex wildcard.
time_str = (
    tick['Time'].astype(str)
    .str.replace(':', '', regex=False)
    .str.replace('.', '', regex=False)
)

# The integer arithmetic below relies on exactly 9 digits (HHMMSSmmm); a value
# without milliseconds would silently produce a wrong per-second key.
if not time_str.str.fullmatch(r'\d{9}').all():
    raise ValueError("tick['Time'] must be formatted as HH:MM:SS.mmm")

# Prefix the date to build a 17-digit timestamp, e.g. 20211231093000080
tick['Timestamp'] = (BASE_DATE + time_str).astype('int64')

# Per-second key: drop the trailing 3 millisecond digits
tick['TimestampSecond'] = tick['Timestamp'] // 1000

# One row per second. GroupBy.last() takes the last non-null value of each
# column within the second, which is the last snapshot when no values are missing.
trade_1s = tick.groupby('TimestampSecond').last().reset_index()

# Drop the helper key
trade_1s = trade_1s.drop(columns=['TimestampSecond'])

# Put Time and Timestamp first, keeping the remaining columns in their order
lead_cols = ['Time', 'Timestamp']
cols = lead_cols + [col for col in trade_1s.columns if col not in lead_cols]
trade_1s = trade_1s[cols]

print(f"\n聚合后数据行数: {len(trade_1s)}")

原始数据行数: 8583

聚合后数据行数: 595


In [15]:
# Display the 1-second aggregated frame.
trade_1s

Unnamed: 0,Time,Timestamp,BidPrice1,BidVolume1,BidPrice2,BidVolume2,BidPrice3,BidVolume3,BidPrice4,BidVolume4,...,OfferPrice16,OfferVolume16,OfferPrice17,OfferVolume17,OfferPrice18,OfferVolume18,OfferPrice19,OfferVolume19,OfferPrice20,OfferVolume20
0,09:30:00.920,20211231093000920,11.15,2100.0,11.13,8700.0,11.12,15000.0,11.11,21700.0,...,11.31,6000.0,11.32,16800.0,11.33,4700.0,11.34,6000.0,11.35,35500.0
1,09:30:01.990,20211231093001990,11.13,2900.0,11.12,15000.0,11.11,23300.0,11.10,34600.0,...,11.31,6000.0,11.32,14300.0,11.33,4700.0,11.34,6000.0,11.35,41300.0
2,09:30:02.960,20211231093002960,11.13,11400.0,11.12,26000.0,11.11,25600.0,11.10,39000.0,...,11.30,19400.0,11.31,6000.0,11.32,14300.0,11.33,4700.0,11.34,6600.0
3,09:30:03.900,20211231093003900,11.13,11400.0,11.12,26000.0,11.11,28600.0,11.10,39000.0,...,11.30,21400.0,11.31,6000.0,11.32,14300.0,11.33,4700.0,11.34,6600.0
4,09:30:04.920,20211231093004920,11.13,11400.0,11.12,26000.0,11.11,28900.0,11.10,41200.0,...,11.29,6000.0,11.30,21400.0,11.31,6000.0,11.32,14300.0,11.33,5200.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590,09:39:55.810,20211231093955810,11.09,49100.0,11.08,105500.0,11.07,136600.0,11.06,162600.0,...,11.26,19500.0,11.27,13000.0,11.28,22900.0,11.29,13100.0,11.30,39700.0
591,09:39:56.880,20211231093956880,11.09,51900.0,11.08,105500.0,11.07,136600.0,11.06,162600.0,...,11.26,19500.0,11.27,13000.0,11.28,22900.0,11.29,13100.0,11.30,39700.0
592,09:39:57.900,20211231093957900,11.09,51900.0,11.08,106700.0,11.07,136600.0,11.06,162600.0,...,11.26,19500.0,11.27,13000.0,11.28,22900.0,11.29,13100.0,11.30,39700.0
593,09:39:58.710,20211231093958710,11.09,51900.0,11.08,106700.0,11.07,136600.0,11.06,162600.0,...,11.25,35800.0,11.26,19500.0,11.27,13000.0,11.28,22900.0,11.29,13100.0


In [None]:
# Compute the OFI factor
# OFI_t = ΔV_t^B - ΔV_t^A

def calculate_ofi(df):
    """
    Compute the level-1 Order Flow Imbalance (OFI) factor.

    Each row's best bid / best ask is compared with the previous row:
      - bid side (Delta_VB): price down -> -previous volume,
        price unchanged -> volume change, price up -> current volume;
      - ask side (Delta_VA): price down -> current volume,
        price unchanged -> volume change, price up -> -previous volume.
    Rows where none of the comparisons holds (e.g. the first row, which has
    no predecessor) get 0 on that side.

    Parameters:
        df: DataFrame containing the columns
            - BidPrice1, BidVolume1 (best bid price and volume)
            - OfferPrice1, OfferVolume1 (best ask price and volume)

    Returns:
        A copy of `df` with three added columns: Delta_VB, Delta_VA and
        OFI = Delta_VB - Delta_VA. The input frame is not modified.
    """

    def _side_delta(price, volume, on_down, on_up):
        # Pick the per-row contribution from the price move relative to the
        # previous row; `on_down` / `on_up` receive (current, previous) volume.
        prev_price = price.shift(1)
        prev_volume = volume.shift(1)
        conditions = [
            (price < prev_price).to_numpy(),
            (price == prev_price).to_numpy(),
            (price > prev_price).to_numpy(),
        ]
        choices = [
            on_down(volume, prev_volume).to_numpy(),
            (volume - prev_volume).to_numpy(),
            on_up(volume, prev_volume).to_numpy(),
        ]
        return np.select(conditions, choices, default=0.0)

    # Work on a copy so the caller's frame stays untouched
    result = df.copy()

    bid_delta = _side_delta(
        result['BidPrice1'], result['BidVolume1'],
        on_down=lambda cur, prev: -prev,
        on_up=lambda cur, prev: cur,
    )
    ask_delta = _side_delta(
        result['OfferPrice1'], result['OfferVolume1'],
        on_down=lambda cur, prev: cur,
        on_up=lambda cur, prev: -prev,
    )

    result['Delta_VB'] = bid_delta
    result['Delta_VA'] = ask_delta
    result['OFI'] = bid_delta - ask_delta

    return result



# Compute OFI on the 1-second aggregated data
trade_1s = calculate_ofi(trade_1s)

print(f"数据行数: {len(trade_1s)}")
print(f"\nOFI统计信息:")
# Print the summary statistics announced by the header line above
print(trade_1s['OFI'].describe())



数据行数: 595

OFI统计信息:
count       595.000000
mean      -1117.310924
std       14661.769826
min     -177800.000000
25%       -3200.000000
50%           0.000000
75%        2250.000000
max      106000.000000
Name: OFI, dtype: float64


In [None]:
# Display the frame again, now including the Delta_VB, Delta_VA and OFI columns.
trade_1s

Unnamed: 0,Time,Timestamp,BidPrice1,BidVolume1,BidPrice2,BidVolume2,BidPrice3,BidVolume3,BidPrice4,BidVolume4,...,OfferVolume17,OfferPrice18,OfferVolume18,OfferPrice19,OfferVolume19,OfferPrice20,OfferVolume20,Delta_VB,Delta_VA,OFI
0,09:30:00.920,20211231093000920,11.15,2100.0,11.13,8700.0,11.12,15000.0,11.11,21700.0,...,16800.0,11.33,4700.0,11.34,6000.0,11.35,35500.0,0.0,0.0,0.0
1,09:30:01.990,20211231093001990,11.13,2900.0,11.12,15000.0,11.11,23300.0,11.10,34600.0,...,14300.0,11.33,4700.0,11.34,6000.0,11.35,41300.0,-2100.0,5800.0,-7900.0
2,09:30:02.960,20211231093002960,11.13,11400.0,11.12,26000.0,11.11,25600.0,11.10,39000.0,...,6000.0,11.32,14300.0,11.33,4700.0,11.34,6600.0,8500.0,10800.0,-2300.0
3,09:30:03.900,20211231093003900,11.13,11400.0,11.12,26000.0,11.11,28600.0,11.10,39000.0,...,6000.0,11.32,14300.0,11.33,4700.0,11.34,6600.0,0.0,-2200.0,2200.0
4,09:30:04.920,20211231093004920,11.13,11400.0,11.12,26000.0,11.11,28900.0,11.10,41200.0,...,21400.0,11.31,6000.0,11.32,14300.0,11.33,5200.0,0.0,-1900.0,1900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590,09:39:55.810,20211231093955810,11.09,49100.0,11.08,105500.0,11.07,136600.0,11.06,162600.0,...,13000.0,11.28,22900.0,11.29,13100.0,11.30,39700.0,-700.0,-4700.0,4000.0
591,09:39:56.880,20211231093956880,11.09,51900.0,11.08,105500.0,11.07,136600.0,11.06,162600.0,...,13000.0,11.28,22900.0,11.29,13100.0,11.30,39700.0,2800.0,0.0,2800.0
592,09:39:57.900,20211231093957900,11.09,51900.0,11.08,106700.0,11.07,136600.0,11.06,162600.0,...,13000.0,11.28,22900.0,11.29,13100.0,11.30,39700.0,0.0,0.0,0.0
593,09:39:58.710,20211231093958710,11.09,51900.0,11.08,106700.0,11.07,136600.0,11.06,162600.0,...,19500.0,11.27,13000.0,11.28,22900.0,11.29,13100.0,0.0,300.0,-300.0
