## 调整文件目录

In [2]:
""" 由于笔记本被收纳到单独的文件夹里，运行时 需且仅需一次 运行本模块来调整工作目录到项目目录下 """
import os


# Remember the directory the kernel was in the first time this cell ran.
# Re-running the cell then reuses that value instead of climbing one more
# directory level on every execution, so the cell is safe to run repeatedly.
if '_notebook_start_dir' not in globals():
    _notebook_start_dir = os.getcwd()

current_dir = _notebook_start_dir
print("Current directory:", current_dir)

# Switch to the parent folder so the relative paths used by later cells
# (e.g. 'dataset/...') resolve from there.
parent_dir = os.path.dirname(current_dir)
os.chdir(parent_dir)
print("New working directory:", os.getcwd())

Current directory: e:\VSCodeFiles\quant\best\notebooks
New working directory: e:\VSCodeFiles\quant\best


## 正文

### 获取数据与数据预处理

In [15]:
import qstock as qs
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from WindPy import w

import tushare as ts
import os

# SECURITY: the API token used to be hard-coded on this line. It is now read
# from the TUSHARE_TOKEN environment variable so that it is never saved with
# the notebook. The token that was previously committed should be revoked and
# re-issued.
token = os.environ.get('TUSHARE_TOKEN')
if token:
    ts.set_token(token)
# When no token is supplied here, pro_api() is expected to fall back to the
# token stored by an earlier set_token() call on this machine
# -- TODO confirm for the installed tushare version.
pro = ts.pro_api()
# Start the WindPy connection; its return value is shown as the cell output.
w.start()

.ErrorCode=0
.Data=[Already connected!]

In [3]:


# Download daily open/high/low/close bars for the target index from 2005-01-01
# onwards; the rest of this cell treats the frame's index as the date.
target_code = 'HS300'

data = qs.get_data(code_list=[target_code], start='20050101', freq='d')[['open', 'high', 'low', 'close']]
# Persist the download before any cleaning is applied.
data.to_csv(f'dataset/test_data/{target_code}_2005_2024.csv')

# Sort by date, forward-fill gaps, then drop rows that still contain NaN
# (after a forward fill these can only sit at the start of the series).
# DataFrame.ffill() replaces fillna(method='ffill'), which recent pandas deprecates.
data = data.sort_index().ffill().dropna()
# Expose the index as a leading 'date' column.
data.insert(0, 'date', data.index)
# Convert the dates to 'YYYY-MM-DD' strings.
data['date'] = data['date'].apply(lambda x: x.strftime('%Y-%m-%d'))
# Daily return computed from consecutive closing prices.
data['pct'] = data['close'] / data['close'].shift(1) - 1.0
# The first row has no previous close, so its return is NaN and the row is dropped.
data = data.dropna().reset_index(drop=True)

                                             

### Barra


In [6]:
def get_daily_data(stock_code, start_date, periods=10):
    """Fetch per-day basic indicators plus the daily percentage change.

    For every calendar day in the range, one ``pro.daily_basic`` request and
    one ``pro.daily`` request are issued and their results are joined on
    ``ts_code``.

    Args:
        stock_code: Passed through unchanged as ``ts_code`` to both API calls
            (the notebook passes a comma-separated string of codes).
        start_date: First calendar day to query; anything accepted by
            ``pd.date_range`` as ``start``.
        periods: Number of consecutive calendar days to query. Days for which
            the API returns no rows add nothing to the result, so the output
            may cover fewer than ``periods`` distinct dates.

    Returns:
        One DataFrame with the rows of all queried days stacked in date order.
        If no day returned any row, an empty DataFrame is returned (carrying
        the merged column layout whenever at least one day was queried).
    """
    # Consecutive calendar days, formatted the way the API expects them.
    date_range = pd.date_range(start=start_date, periods=periods, freq='D')

    data_frames = []
    for date in date_range.strftime('%Y%m%d'):
        basics = pro.daily_basic(ts_code=stock_code, trade_date=date, fields='')
        changes = pro.daily(ts_code=stock_code, trade_date=date, fields='ts_code,pct_chg')
        data_frames.append(pd.merge(basics, changes, on='ts_code'))

    # Nothing was queried at all (periods == 0): pd.concat([]) would raise.
    if not data_frames:
        return pd.DataFrame()

    # Keep only the days that produced rows; concatenating empty frames is
    # pointless and triggers dtype warnings in recent pandas.
    non_empty = [frame for frame in data_frames if not frame.empty]
    if not non_empty:
        # Return an empty frame that still has the merged columns.
        return data_frames[0]

    return pd.concat(non_empty, ignore_index=True)

In [7]:
# Query window: the start day and the calendar day 60 days later, both as
# 'YYYYMMDD' strings.
start_date = '20230810'
end_date = (pd.to_datetime(start_date) + pd.Timedelta(days=60)).strftime('%Y%m%d')

# Two tickers passed as one comma-separated string; 10 consecutive calendar
# days starting at start_date are requested.
stock_code = '600519.SH,600230.SH'
daily_data = get_daily_data(stock_code, start_date, 10)


In [8]:
# Display the fetched frame as the cell output.
daily_data

Unnamed: 0,ts_code,trade_date,close,turnover_rate,turnover_rate_f,volume_ratio,pe,pe_ttm,pb,ps,ps_ttm,dv_ratio,dv_ttm,total_share,float_share,free_share,total_mv,circ_mv,pct_chg
0,600230.SH,20230810,15.3,0.5822,1.0831,0.57,15.156,16.1554,1.6363,1.2958,1.2476,0.3942,2.3681,41614.4936,41186.3502,22139.6078,636701.8,630151.2,0.5256
1,600519.SH,20230810,1875.0,0.1468,0.3357,0.87,37.5559,34.1839,11.7232,18.9796,17.3114,2.3245,2.5505,125619.78,125619.78,54913.6536,235537100.0,235537100.0,-0.8083
2,600230.SH,20230811,14.94,1.0598,1.9715,1.34,14.7994,15.7753,1.5978,1.2653,1.2183,0.4037,2.4252,41614.4936,41186.3502,22139.6078,621720.5,615324.1,-2.3529
3,600519.SH,20230811,1834.0,0.2132,0.4878,1.3,36.7347,33.4364,11.4668,18.5646,16.9329,2.3765,2.6075,125619.78,125619.78,54913.6536,230386700.0,230386700.0,-2.1867
4,600230.SH,20230814,14.97,0.6855,1.2752,0.9,14.8291,15.8069,1.601,1.2679,1.2207,0.4029,2.4203,41614.4936,41186.3502,22139.6078,622969.0,616559.7,0.2008
5,600519.SH,20230814,1803.0,0.1889,0.4322,1.13,36.1137,32.8712,11.273,18.2508,16.6466,2.4174,2.6523,125619.78,125619.78,54913.6536,226492500.0,226492500.0,-1.6903
6,600230.SH,20230815,14.71,0.767,1.4268,1.1,14.5716,15.5324,1.5732,1.2458,1.1995,0.4101,2.4631,41614.4936,41186.3502,22139.6078,612149.2,605851.2,-1.7368
7,600519.SH,20230815,1812.98,0.118,0.2698,0.7,36.3136,33.0532,11.3354,18.3518,16.7388,2.4041,2.6377,125619.78,125619.78,54913.6536,227746100.0,227746100.0,0.5535
8,600230.SH,20230816,14.73,0.3652,0.6795,0.51,14.5914,15.5535,1.5754,1.2475,1.2011,0.4095,2.4597,41614.4936,41186.3502,22139.6078,612981.5,606674.9,0.136
9,600519.SH,20230816,1820.0,0.105,0.2401,0.64,36.4542,33.1812,11.3793,18.4229,16.8036,2.3948,2.6275,125619.78,125619.78,54913.6536,228628000.0,228628000.0,0.3872


In [4]:
from typing import Any
from Modules.BaseModules import BaseModule
import numpy as np
from numpy.lib.stride_tricks import as_strided as strided
import pandas as pd
from statsmodels.regression.linear_model import OLS
from statsmodels.tools.tools import add_constant
import statsmodels.api as sm
from typing import List, Dict
from tqdm import tqdm
import pandas as pd
from Modules import Helper


class BaseFactor(BaseModule):
    """Common base for factor objects; calling an instance forwards to ``calculate``."""

    def __init__(self) -> None:
        super().__init__()

    def calculate(self, *args: Any, **kwds: Any) -> Any:
        """Default implementation: hand the received arguments back as ``(args, kwds)``."""
        return args, kwds

    def __call__(self, *args: Any, **kwds: Any) -> Any:
        """Run ``calculate`` with exactly the arguments given to the call."""
        return self.calculate(*args, **kwds)


class BarraBaseFactor(BaseFactor):
    """Shared helpers for the Barra factor classes."""

    def __init__(self) -> None:
        super().__init__()

    @staticmethod
    def pivot_df(df: pd.DataFrame, pivot_dict: Dict) -> pd.DataFrame:
        """Reshape ``df`` with ``DataFrame.pivot``.

        Args:
            df: Frame holding the data to reshape.
            pivot_dict: Keyword arguments forwarded to ``df.pivot`` (for
                example ``index``, ``columns`` and ``values``).

        Returns:
            The pivoted DataFrame.

        Raises:
            ValueError: If ``df`` is None.
        """
        if df is None:
            raise ValueError("Data DataFrame cannot be None.")

        return df.pivot(**pivot_dict)

    @staticmethod
    def get_risk_free_rate() -> float:
        """Return the risk-free rate.

        The intended source is the five-year government bond yield of the
        latest trading day. No data source is wired in yet: the previous body
        was a copy of ``pivot_df`` that referenced undefined names and failed
        with NameError on every call, so the gap is now reported explicitly.

        Raises:
            NotImplementedError: Always, until a yield source is implemented.
        """
        raise NotImplementedError(
            "get_risk_free_rate is not implemented yet; pass the risk-free rate explicitly."
        )


class BarraSizeFactor(BarraBaseFactor):
    """Barra Size factor, defined as 1.0 * LNCAP.

    LNCAP is the natural logarithm of a company's total market value.

    The input is a long-format DataFrame that is reshaped with ``pivot_dict``.
    With the default mapping the pivoted frame has one row per ``ts_code`` and
    one column per ``trade_date``, holding ``total_mv``.

    Methods:
        calculate(df: pd.DataFrame) -> pd.DataFrame:
            Pivot ``df`` and return the element-wise natural log.
    """

    def __init__(self, pivot_dict: Dict = None) -> None:
        """Store the pivot mapping, falling back to ts_code x trade_date -> total_mv."""
        super().__init__()
        if pivot_dict is None:
            pivot_dict = {'index': 'ts_code',
                          'columns': 'trade_date',
                          'values': 'total_mv'
                          }
        self.pivot_dict = pivot_dict

    def calculate(self,
                  df: pd.DataFrame,
                  ) -> pd.DataFrame:
        """Return the natural log of the pivoted market values.

        Args:
            df: Long-format frame containing the columns named in
                ``self.pivot_dict``.

        Returns:
            A new DataFrame with the same index and columns as the pivoted
            input, holding ``log(value)`` for every cell.

        Raises:
            ValueError: If ``df`` is None.
        """
        if df is None:
            raise ValueError("Data DataFrame cannot be None.")
        market_values = self.pivot_df(df, self.pivot_dict)

        # Build a fresh frame for the result; the pivoted input is left untouched.
        return pd.DataFrame(
            np.log(market_values.values),
            index=market_values.index,
            columns=market_values.columns,
        )
    
    
class BarraBetaFactor(BarraBaseFactor):
    """Placeholder for the Barra Beta factor (definition: 1.0 * Beta).

    Beta reflects how an individual stock moves relative to the overall
    market. In CNE5 it is defined from the stock's excess return relative to
    the market-weighted excess return.

    This stub implements nothing beyond construction. The notebook defines
    ``BarraBetaFactor`` again in a later cell, and that definition replaces
    this one once it runs.

    Planned inputs (none of them is consumed by this stub):
        stock_returns: asset returns (daily / monthly / yearly); index is the
            asset code, columns are trade dates.
        market_returns: described in the original notes as asset market
            values with the same layout; it should start at least T-1 periods
            before ``self.start``.
            NOTE(review): the name and the description disagree -- confirm.
        ewam_window (int): length of the regression window.
        half_life (int): half-life of the exponentially weighted moving average.
    """

    def __init__(self) -> None:
        super().__init__()
    
    

In [157]:
class BarraBetaFactor(BarraBaseFactor):
    """Barra Beta factor (definition: 1.0 * Beta).

    Beta describes how a stock's excess return moves with the market's excess
    return.

    Both inputs of ``calculate`` are long-format DataFrames that are reshaped
    with ``pivot_dict``; with the default mapping each pivoted frame has one
    row per ``ts_code`` and one column per ``trade_date``, holding ``returns``.
    The market frame is looked up with the same row labels as the stock frame,
    so it must contain one row of market returns for every stock code.

    Methods:
        calculate(stock_returns_df, market_returns_df, risk_free_rate, window, halflife) -> pd.DataFrame:
            Rolling, exponentially weighted beta for every stock and date.
    """

    def __init__(self, pivot_dict: dict = None) -> None:
        """Store the pivot mapping, falling back to ts_code x trade_date -> returns."""
        super().__init__()
        if pivot_dict is None:
            self.pivot_dict = {'index': 'ts_code',
                               'columns': 'trade_date',
                               'values': 'returns'}
        else:
            self.pivot_dict = pivot_dict

    def _calculate_beta(self, stock_returns, market_returns, risk_free_rate, window=252, halflife=63):
        """Exponentially weighted beta over the last ``window`` observations.

        Args:
            stock_returns: 1-D sequence of stock returns, oldest first.
            market_returns: 1-D sequence of market returns for the same periods.
            risk_free_rate: Value subtracted from both series.
            window: Number of trailing observations used.
            halflife: Number of periods after which a weight has halved.

        Returns:
            ``sum(w * s * m) / sum(w * m**2)`` on the excess returns, or NaN
            when the denominator is zero.

        Raises:
            ValueError: If either series has fewer than ``window`` observations.
        """
        stock_tail = np.asarray(stock_returns, dtype=float)[-window:]
        market_tail = np.asarray(market_returns, dtype=float)[-window:]
        if len(stock_tail) != window or len(market_tail) != window:
            raise ValueError("At least `window` observations are required to compute beta.")

        excess_stock_returns = stock_tail - risk_free_rate
        excess_market_returns = market_tail - risk_free_rate

        # Decay factor chosen so that a weight halves every `halflife` periods;
        # reversing puts the largest weight on the most recent observation.
        lambda_ = np.exp(np.log(0.5) / halflife)
        weights = (lambda_ ** np.arange(window))[::-1]
        weights /= weights.sum()

        # NOTE(review): these are weighted second moments WITHOUT subtracting
        # the weighted means, i.e. the slope of a regression through the
        # origin. Confirm whether a regression with intercept is intended.
        cov = np.dot(weights, excess_stock_returns * excess_market_returns)
        var = np.dot(weights, excess_market_returns ** 2)

        if var == 0:
            return np.nan
        return cov / var

    def calculate(self,
                  stock_returns_df: pd.DataFrame,
                  market_returns_df: pd.DataFrame,
                  risk_free_rate: float = 0.001,
                  window: int = 252,
                  halflife: int = 63) -> pd.DataFrame:
        """Compute a rolling beta for every stock and date.

        The value at (stock, date) uses the ``window`` observations ending at
        that date, taken in column order of the pivoted frame; dates with
        fewer than ``window`` observations before and including them are NaN.
        Columns are assumed to be in chronological order after the pivot.

        Args:
            stock_returns_df: Long-format stock returns.
            market_returns_df: Long-format market returns keyed by the same
                codes and dates as the stock frame.
            risk_free_rate: Value subtracted from both return series.
            window: Number of trailing observations per beta estimate.
            halflife: Half-life of the exponential weights, in periods.

        Returns:
            DataFrame of betas with the pivoted stock frame's index and columns.

        Raises:
            ValueError: If an input frame is None or ``window`` is below 1.
            KeyError: If the market frame lacks a code or date present in the
                stock frame.
        """
        if stock_returns_df is None or market_returns_df is None:
            raise ValueError("Stock and Market Returns DataFrames cannot be None.")
        if window < 1:
            raise ValueError("window must be a positive integer.")

        stock_returns_df = self.pivot_df(stock_returns_df, self.pivot_dict)
        market_returns_df = self.pivot_df(market_returns_df, self.pivot_dict)

        # Selecting by label lists puts the market frame in the stock frame's
        # row/column order and raises KeyError if a code or date is missing.
        market_aligned = market_returns_df.loc[stock_returns_df.index, stock_returns_df.columns]

        stock_matrix = stock_returns_df.to_numpy(dtype=float)
        market_matrix = market_aligned.to_numpy(dtype=float)

        n_assets, n_dates = stock_matrix.shape
        betas = np.full((n_assets, n_dates), np.nan)
        for row in range(n_assets):
            # `end` is exclusive: the estimate for column end-1 uses the
            # history up to and including that column.
            for end in range(window, n_dates + 1):
                betas[row, end - 1] = self._calculate_beta(
                    stock_matrix[row, :end],
                    market_matrix[row, :end],
                    risk_free_rate,
                    window,
                    halflife,
                )

        return pd.DataFrame(betas, index=stock_returns_df.index, columns=stock_returns_df.columns)

In [180]:
# Size-factor helper built with its default pivot mapping.
bsf = BarraSizeFactor()

In [33]:
import QuantLib as ql

In [34]:
# Let the date on which we want to value an instrument be
today = ql.Date(15,6,2020)

In [183]:
# Reshape daily_data using the size factor's pivot mapping.
test_data = bsf.pivot_df(daily_data, bsf.pivot_dict)

In [191]:
# All stock codes of test_data joined into one comma-separated string.
test_stocks_code = ','.join(test_data.index)
# One w.wss "compindex2" request per index number 1..19, all for the same trade date.
_market_res = [
    w.wss(test_stocks_code, "compindex2", f"index={i}; tradeDate=20240826")
    for i in range(1, 20)
]


In [293]:
# For every stock, collect the names of the indexes whose query result contains
# '是' for that stock, then let Helper.prior_index choose one of them.
index_target = {}
for _i, _code in enumerate(test_data.index):
    _belong_list = [
        index_map[_idx + 1]  # index_map keys follow the 1-based index numbers of the query
        for _idx, _item in enumerate(_market_res)
        if '是' in _item.Data[0][_i]
    ]
    print(_code, '属于', _belong_list)
    index_target[_code] = Helper.prior_index(_belong_list)

index_target

600230.SH 属于 ['国证2000', '中证2000']
600519.SH 属于 ['上证50指数', '上证180指数', '沪深300指数', '中证100指数', '中证800指数']


{'600230.SH': '中证2000', '600519.SH': '沪深300指数'}

In [331]:
# Distinct index names assigned to the stocks, and the date columns of test_data.
index_names = set(index_target.values())
trade_date = test_data.columns

# Read each index's CSV and reshape it to ts_code rows by trade_date columns
# holding pct_chg; column labels are converted to strings.
_index_frames = []
for _index_name in index_names:
    _temp_df = pd.read_csv(f'dataset/index_data/{_index_name}.csv').pivot(
        index='ts_code', columns='trade_date', values='pct_chg'
    )
    _temp_df.columns = _temp_df.columns.astype(str)
    _index_frames.append(_temp_df)

index_data = pd.concat(_index_frames)

In [339]:
# List the column names of daily_data.
# NOTE(review): the saved output below does not list 'pct_chg', although
# get_daily_data merges that column in -- the output looks stale; re-run to confirm.
daily_data.columns

Index(['ts_code', 'trade_date', 'close', 'turnover_rate', 'turnover_rate_f',
       'volume_ratio', 'pe', 'pe_ttm', 'pb', 'ps', 'ps_ttm', 'dv_ratio',
       'dv_ttm', 'total_share', 'float_share', 'free_share', 'total_mv',
       'circ_mv'],
      dtype='object')

In [9]:
# Wide tables (rows: ts_code, columns: trade_date) of total market value and
# daily percentage change.
_pivot_axes = {'index': 'ts_code', 'columns': 'trade_date'}
market_value_df = daily_data.pivot(values='total_mv', **_pivot_axes)
returns_df = daily_data.pivot(values='pct_chg', **_pivot_axes)

In [24]:
def calculate_market_excess_return(df_market_cap, df_excess_returns):
    """Compute the capitalisation-weighted average excess return of every row.

    Both frames are expected to have one row per period and one column per
    asset.
    NOTE(review): this layout is inferred from the row-wise (axis=1)
    aggregation; the pivots elsewhere in this notebook put assets on the rows,
    so such frames would need transposing first -- confirm against callers.

    Args:
        df_market_cap: Market capitalisation per row and asset.
        df_excess_returns: Excess return per row and asset.

    Returns:
        A Series indexed like ``df_excess_returns`` holding, for each row,
        ``sum(cap * excess_return) / sum(cap)`` over the assets.
    """
    # Align rows AND columns with the returns frame so that the weights are
    # normalised over exactly the assets that have a return column.
    df_market_cap = df_market_cap.reindex(
        index=df_excess_returns.index, columns=df_excess_returns.columns
    )

    # Total capitalisation of each row.
    total_market_cap = df_market_cap.sum(axis=1)

    # Divide row by row (axis=0). A plain `df / series` would match the
    # Series' labels against the column names and yield NaN everywhere.
    weights = df_market_cap.div(total_market_cap, axis=0)

    market_excess_return = (weights * df_excess_returns).sum(axis=1)

    return market_excess_return