In [1]:
# NOTE: importing the submodule `matplotlib.rcsetup` also binds the top-level
# `matplotlib` name, which the `matplotlib.rcParams` assignment below relies on.
import matplotlib.rcsetup
import pandas as pd
import numpy as np
# Project-local module providing the factor-processing helpers used in later cells
# (mad_outlier, md_fill_black, ind_mkt_neutralization, wind_standardize).
import helper
# Set the default matplotlib font family to 'Heiti TC'.
# NOTE(review): presumably chosen so CJK text renders in figures — confirm the font is installed.
matplotlib.rcParams['font.family'] = ['Heiti TC']

In [10]:
# Wind leverage-factor computation: load every input consumed by the cells below.
factors = ['fa_ltdebttoasset', 'fa_debttoasset', 'fa_cashrecovratio_ttm', 'fa_equityassetradio']
industries = pd.read_excel('../申万一级权重/申银万国一级行业指数_copy.xlsx')

# Index names carry a 4-character suffix (e.g. '银行(申万)'); the per-industry
# data files are named after the industry without that suffix.
NAME_SUFFIX_LEN = 4
# Industries (short names) that are left out of the universe entirely.
SKIPPED_INDUSTRIES = ('银行', '非银金融')

# Per-industry inputs, both keyed by the full index name (suffix included).
all_ind_data = {}  # name -> {factor name -> DataFrame}, one sheet per factor
all_weights = {}   # name -> weights DataFrame with a DatetimeIndex
for ind_name in industries['名称']:
    sliced_ind_name = ind_name[:-NAME_SUFFIX_LEN]
    if sliced_ind_name in SKIPPED_INDUSTRIES:
        continue

    # Passing a list as sheet_name makes read_excel return a dict keyed by sheet name.
    all_ind_data[ind_name] = pd.read_excel(
        './个股分类因子/' + sliced_ind_name + '.xlsx',
        sheet_name=factors,
        index_col=0,
        skiprows=3,
    )

    weight = pd.read_csv('../申万一级权重/wind_weights/' + sliced_ind_name + '_weights.csv', index_col=0)
    weight.index = pd.to_datetime(weight.index)
    all_weights[ind_name] = weight

# Float market cap per stock; the index is parsed from YYYYMMDD values.
mkt_data = pd.read_csv('../A_流通市值/A股流通市值.csv', index_col=0)
mkt_data.index = pd.to_datetime(mkt_data.index, format='%Y%m%d')

# Wind industry classification per stock; gaps are filled forward, then backward.
wind_ind_data = pd.read_csv('../A_流通市值/A股wind行业分类.csv', index_col=0).ffill().bfill()
wind_ind_data.index = pd.to_datetime(wind_ind_data.index, format='%Y%m%d')
# NOTE(review): floor-dividing by 1,000,000 presumably truncates the classification
# code to a coarser industry level — confirm against the code format.
wind_ind_data = wind_ind_data // 1000000

print('数据读取完毕')

数据读取完毕


In [None]:
# Allocate one (date x stock) output frame per factor; rows are filled in date by date.
save_data = {}
for f in factors:
    save_data[f] = pd.DataFrame(None, index=wind_ind_data.index, columns=wind_ind_data.columns)

# Industries left out of the universe (no data is loaded for them either).
excluded_industries = ('银行(申万)', '非银金融(申万)')

# Process one cross-section (date) at a time.
for date in mkt_data.index:
    print(date)
    factor_parts = {f: [] for f in factors}  # factor -> list of per-industry Series
    code_parts = []                          # list of Series of stock codes, concatenated below

    for ind in industries['名称']:  # loop over the Shenwan industries
        if ind in excluded_industries:
            continue
        # Index constituents on this date (original note: BJ excluded).
        weights_row = all_weights[ind].loc[date].dropna()
        # Keep every stock whose weight is neither missing nor zero. Selection is by
        # label only: de-duplicating on the weight *values* would wrongly discard
        # stocks that merely share the same weight.
        stocks_codes = list(weights_row[weights_row != 0].index)

        # Pull this date's cross-section of every factor for the selected stocks.
        for f in factors:
            factor_parts[f].append(all_ind_data[ind][f].loc[date, stocks_codes])
        code_parts.append(pd.Series(stocks_codes))

    # Unique stock codes (first occurrence wins), then the matching industry / market-cap rows.
    series_all_codes = pd.concat(code_parts).drop_duplicates()
    series_wind_ind = wind_ind_data.loc[date, list(series_all_codes)]
    series_mkt = mkt_data.loc[date, list(series_all_codes)]

    for f in factors:
        s = pd.concat(factor_parts[f])
        # De-duplicate by stock code (the index), not by factor value: dropping duplicate
        # values would remove distinct stocks with equal readings and all but one NaN,
        # leaving the factor misaligned with series_wind_ind / series_mkt.
        s = s[~s.index.duplicated(keep='first')]
        s = helper.mad_outlier(s)
        s = helper.md_fill_black(s, series_wind_ind)
        s = helper.ind_mkt_neutralization(s, series_mkt, series_wind_ind)
        s = helper.wind_standardize(s, series_mkt)
        save_data[f].loc[date] = s

# Persist one CSV per factor.
for f in factors:
    save_data[f].to_csv('./处理后个股二级因子/处理后' + f + '.csv')

画图，看分布，正式跑的时候可略过

In [None]:
import matplotlib.pyplot as plt

def plot_histogram(data: pd.Series):
    """Draw and show a 30-bin histogram of ``data`` in a new 8x6 figure.

    Args:
        data: Values to bin; passed straight through to ``plt.hist``.

    Returns:
        None. The figure is displayed via ``plt.show()``.
    """
    plt.figure(figsize=(8, 6))

    # Semi-transparent bars with black edges.
    bar_style = dict(bins=30, edgecolor='k', alpha=0.7)
    plt.hist(data, **bar_style)

    # Title first, then the x and y axis labels.
    labels = (('title', 'Histogram'), ('xlabel', 'Value'), ('ylabel', 'Frequency'))
    for setter_name, text in labels:
        getattr(plt, setter_name)(text)

    plt.show()

# Spot-check the saved output: for each of the first two factors, print the mean/std
# and plot the histogram of the first date's cross-section only.
for f in factors[0:2]:
    print(f)
    data = pd.read_csv('./处理后个股二级因子/处理后' + f + '.csv', index_col=0)
    data.index = pd.to_datetime(data.index)
    if len(data.index) == 0:
        # Nothing saved for this factor; there is no row to inspect.
        continue

    date = data.index[0]  # only the first date is inspected
    print(date)
    values = data.loc[date].dropna()
    print(values.mean(), values.std())
    plot_histogram(values)

检验处理过程，正式跑的时候可略过

In [None]:
# Cross-check the processing pipeline against a reference factor report for a single
# date and factor. The date is taken from t_date_ below.
t_date_ = '20220228'
t_factor_name = '股东权益比率'

t_file_path = './对比计算结果/' + t_date_ + '/'
t_report_path = t_file_path + '因子分析报告_' + t_factor_name + '_' + t_date_ + '.xlsx'
t_testing_path = t_file_path + t_date_ + '_testing.csv'

# Reference report: read once and reused for both the raw input and the comparison.
t_factor = pd.read_excel(t_report_path, index_col=0)
t_mkt = pd.read_excel(t_file_path + t_date_ + '.xlsx', index_col=0)
t_mkt.columns = ['name', 'wind_ind', 'mkt', 'mkt_w']

# Inner join: t_mkt has no rows for some delisted stocks.
t_joined_data = pd.concat([t_factor.loc[:, ['原始数据']], t_mkt], axis=1, join='inner')

# Run the same four helper steps as the main pipeline on the report's raw data.
t_s = t_joined_data['原始数据'].copy()
t_series_wind_ind = t_joined_data['wind_ind']
t_series_mkt = t_joined_data['mkt_w']
t_s = helper.mad_outlier(t_s)
t_s = helper.md_fill_black(t_s, t_series_wind_ind)
t_s = helper.ind_mkt_neutralization(t_s, t_series_mkt, t_series_wind_ind)
t_s = helper.wind_standardize(t_s, t_series_mkt)

# Inner join: the report contains some delisted stocks whose data cannot be obtained.
t_output = pd.concat([t_factor['原始数据'], t_s], axis=1, join='inner')
t_output.to_csv(t_testing_path)
print(t_output)

# Mean / std analysis of the recomputed factor, read back from the CSV just written.
test = pd.read_csv(t_testing_path, index_col=0)
# NOTE(review): the 'new_factor' column name presumably comes from the name of the
# Series returned by helper.wind_standardize — confirm.
print(test['new_factor'].mean(), test['new_factor'].std())

# Compare with the report's own processed values on the same stocks.
actual = t_factor.loc[test.index]
print('corr:', test['new_factor'].corr(actual['处理后数据']))
test['处理后数据'] = actual['处理后数据']
print(test.isnull().sum())
test.to_csv(t_testing_path)