<font size=5>策略设置（研报）：
    
    根据龙头股筛选规则，每周进行观测，
    若当时存在羊群效应，并且龙头股的个数比较适宜，则该龙头股所在行业进入候选阶段，
    若最终挑出的行业数大于买进限制数，则按该行业当时的p值升序排序挑选，以（1-p值）为行业权重。
    但是期间如果观测到之前组合不包括的新的行业也适合买进，并且又超出限制数，则依然根据p值规则考虑是否用新行业替换持有的旧行业。
思路：
    
    获取申万一级行业指数；
    对每个行业指数用CSAD模型检测是否存在羊群效应；
    若存在羊群效应，对行业选择龙头股；
    若龙头股数量合适，行业加入备选池，记录p值，准备买入

In [42]:
import pandas as pd
import numpy as np
import akshare as ak
import statsmodels.formula as smFormula
import statsmodels.formula.api as smf
import statsmodels.api as smApi
from operator import methodcaller
import datetime
import math

In [14]:
# 获取申万一级行业指数
sector_index_list = ak.sw_index_spot()['指数代码']
# 获取所有股票的名称数据
stock_name = ak.stock_zh_a_spot()
stock_name = stock_name.applymap((lambda x: "".join(x.split()) if type(x) is str else x))

Please wait for a moment: 100%|████████████████████████████████████████████████████████| 55/55 [00:25<00:00,  2.12it/s]


In [68]:
def Herd_Effect(sector_code, beginning, ending):
    """
    对于给定的行业指数sector_code，时间区间beginning至ending，以CSAD检测是否存在羊群效应
    """
    """
    1. Data Processing
    """
    # 获取该index量价数据
    index_data = ak.sw_index_daily_indicator(index_code=sector_code, start_date=beginning, end_date=ending, data_type="Day")
    # index量价数据转为收益率，每日的涨跌幅作为收益率
    index_chg = pd.DataFrame(index_data[['chg_pct','date']]).astype('str')
    index_chg.set_index(["date"], inplace=True)
    index_chg = index_chg.astype(np.float32)
    index_chg = index_chg.sort_index()
    index_chg['close_change'] = index_chg['chg_pct']
    index_chg = (index_chg.drop(['chg_pct'],axis=1))[1:]
    # 获取该index成份数据
    stock_list = pd.read_csv('D:\\Python\\Flies\\Guanyun\\A股数据\\申万一级行业成份\\'+ sector_code +'.csv',dtype=str)
    # 获取该index成分股的量价数据，并且新建空dataframe：stock_of_index_data，将所有成分股的涨跌幅数据合并到这一张dataframe里
    df_index = pd.date_range(beginning, periods = 30)
    df_index = [datetime.datetime.strftime(x,'%F') for x in df_index]
    stock_of_index_data = pd.DataFrame(index = df_index)
    for stock_ in stock_list['stock_code'].values:
        if stock_[0:1] == '6':
            stock = 'sh' + stock_
        else: stock = 'sz' + stock_
        stock_list.loc[stock_list['stock_code'] == stock_,'stock_code'] = stock
        
        stock_data_all = pd.read_csv('D:\\Python\\Flies\\Guanyun\\A股数据\\股票量价\\'+ stock +'.csv')
        stock_data_all = stock_data_all[(stock_data_all['date']>=beginning) & (stock_data_all['date']<=ending)]

        stock_data = stock_data_all[['date','close']]
        stock_data.set_index(["date"], inplace=True)
        stock_data.rename(columns={'close':str(stock)},inplace=True)    

        stock_of_index_data = pd.concat([stock_of_index_data, stock_data], axis=1)
    # 缺失值处理
    stock_of_index_data = stock_of_index_data.dropna(axis=0, how='all') # 处理横纵向的整段空白
    stock_of_index_data = stock_of_index_data.dropna(axis=1, how='all')

    try: # 处理零散的空值
        for j in range(1,len(stock_of_index_data.columns.tolist())):
            for i in range(len(stock_of_index_data.index.values.tolist())-1,0,-1):
                if math.isnan(stock_of_index_data.iat[i,j]) and math.isnan(stock_of_index_data.iat[i-1,j]):
                    stock_of_index_data.iat[i,j]= stock_of_index_data.iat[i+1,j]
                elif math.isnan(stock_of_index_data.iat[i,j]) and math.isnan(stock_of_index_data.iat[i-1,j])==False:
                    stock_of_index_data.iat[i,j]=stock_of_index_data.iat[i-1,j]
    except:
        print(len(stock_of_index_data.columns.tolist()),len(stock_of_index_data.index.values.tolist()),i,j)

    #将成分股量价数据转为收益率，并将index行业指数的收益率数据index_chg也合并进来
    chg_stock_of_index_data = stock_of_index_data.astype(np.float32)
    chg_stock_of_index_data = (chg_stock_of_index_data.pct_change())[1:]
    chg_stock_of_index_data = pd.concat([chg_stock_of_index_data, index_chg], axis=1)

    """
    2. CSAD模型判断羊群效应
    """
    # 新建CSAD列准备存储生成的CSAD，将指数收益率close_change和其二次项close_change_2两列准备好，用于回归分析
    chg_stock_of_index_data = chg_stock_of_index_data.fillna(0)
    chg_stock_of_index_data['CSAD'] = 0
    chg_stock_of_index_data['close_change'] = abs(chg_stock_of_index_data['close_change'])
    chg_stock_of_index_data['close_change_2'] = chg_stock_of_index_data['close_change'] ** 2
    # 对每个date，生成CSAD = 1/N * sum(abs(R_i - R_m)), R_i为股票i收益率， R_m为指数收益率
    for date_ in chg_stock_of_index_data.index.values:
        CSAD = 0.0
        for stock in chg_stock_of_index_data.columns : 
            CSAD += abs(chg_stock_of_index_data.loc[date_][stock] - chg_stock_of_index_data.loc[date_]['close_change']) / len(stock_list[['stock_code','weight']].values)
        chg_stock_of_index_data.loc[date_,'CSAD'] = CSAD
#         print(CSAD)

    # 回归分析，判断羊群效应
    olsResult = smf.ols(formula='CSAD~close_change+close_change_2', data=chg_stock_of_index_data).fit()
    print(olsResult.summary()) #回归结果
    print(olsResult.params) #系数
#     chg_stock_of_index_data.to_csv('final.csv')

    p_value = 0.05
    if olsResult.params[2] < 0 and olsResult.pvalues[2] <= p_value : 
        print('有羊群效应')
        return True, olsResult.params[2],olsResult.pvalues[2]
    else:
        print('无羊群效应',olsResult.params[2],olsResult.pvalues[2])
        return False, olsResult.params[2],olsResult.pvalues[2]

Herd_Effect('801120', '2020-01-01', '2021-01-01')

                            OLS Regression Results                            
Dep. Variable:                   CSAD   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 7.592e+05
Date:                Sun, 30 May 2021   Prob (F-statistic):               0.00
Time:                        19:46:27   Log-Likelihood:                 684.57
No. Observations:                 242   AIC:                            -1363.
Df Residuals:                     239   BIC:                            -1353.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.0073      0.002      4.

(False, 0.01092398833488054, 1.865659488649823e-96)

In [58]:
# 从2020-05-24到2021-05-24一年里每个月测试一次，首先生成每个日期节点的list
ending = '2021-05-24'
date_list = [ending]
for i in range(6):
    date_list.append((datetime.datetime.strptime(date_list[-1], '%Y-%m-%d') - datetime.timedelta(days = 62)).strftime('%Y-%m-%d'))

In [66]:
# 对于每个月，判断每个行业是否存在羊群效应
res = []
for i in range(len(date_list)):
    for index in sector_index_list : 
        print(index, date_list[i+1], date_list[i])
        res.append(Herd_Effect(index, date_list[i+1], date_list[i]))

801010 2021-03-23 2021-05-24
                            OLS Regression Results                            
Dep. Variable:                   CSAD   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 1.173e+05
Date:                Sat, 29 May 2021   Prob (F-statistic):           4.57e-71
Time:                        23:48:19   Log-Likelihood:                 135.22
No. Observations:                  40   AIC:                            -264.4
Df Residuals:                      37   BIC:                            -259.4
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept      

                            OLS Regression Results                            
Dep. Variable:                   CSAD   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 1.627e+05
Date:                Sat, 29 May 2021   Prob (F-statistic):           1.08e-73
Time:                        23:48:50   Log-Likelihood:                 110.36
No. Observations:                  40   AIC:                            -214.7
Df Residuals:                      37   BIC:                            -209.7
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.0064      0.006      1.

                            OLS Regression Results                            
Dep. Variable:                   CSAD   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 8.773e+04
Date:                Sat, 29 May 2021   Prob (F-statistic):           9.84e-69
Time:                        23:49:18   Log-Likelihood:                 136.72
No. Observations:                  40   AIC:                            -267.4
Df Residuals:                      37   BIC:                            -262.4
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.0012      0.003      0.

IndexError: list index out of range

3.2龙头股筛选规则

我们选用了三种筛选规则，主要是从价、量、还有规模上去考虑的。

    （1）基本限制：所考查的行业内的个股在过去几周的涨幅必须大于某个阈值，并且该行业指数同期的涨幅的绝对值在这个阈值一半的范围内。

    （2）成交量限制：所考查的行业内的个股在最近一周的成交量要大于过去两周成交量的和，保证最近有持续放量，这是为了防止有价无市的个股混进龙头股行列，保证筛选的龙头股质量。

    （3）规模限制：所考查的行业内的个股当时的流通市值是位于全市场个股流通市值的某个下分位和上分位之间的，这能保证筛选出来的龙头股基本上是当时市值比较适中的个股。

筛选龙头股

In [69]:
def head_stock( stock_code, index_code, beginning, ending ):
    """
    这一段还是用绝对值判断的，正在从绝对值改成相对值或者范围
    """
    # 数据处理
    stock_data_2 = pd.read_csv('D:\\Python\\Flies\\Guanyun\\A股数据\\股票量价\\'+ stock_code +'.csv')
    stock_data_2 = stock_data_2[(stock_data_2['date']>=beginning) & (stock_data_2['date']<=ending)]
#     stock_data_2 = ak.stock_zh_a_daily(symbol=stock_code, start_date=beginning, end_date=ending, adjust="qfq")
    sw_index_df = ak.sw_index_daily_indicator(index_code=index_code, start_date=beginning, end_date=ending, data_type="Day")
    one_week_ago = (datetime.datetime.strptime(ending, '%Y-%m-%d') - datetime.timedelta(days = 7)).strftime('%Y-%m-%d') # 一周之前的日期
    three_week_ago = (datetime.datetime.strptime(ending, '%Y-%m-%d') - datetime.timedelta(days = 21)).strftime('%Y-%m-%d') # 三周之前的日期

    # （1）基本限制：该股过去三周的涨幅大于20%，行业指数同期的涨幅的绝对值小于10%
    week_3_stock = stock_data_2[((stock_data_2['date'])>=three_week_ago)]
    wekk_3_index = sw_index_df[((sw_index_df['date'])>=three_week_ago)]
    chg_3_week_stock = float(week_3_stock.iloc[-1]['close']) / float(week_3_stock.iloc[0]['close']) - 1
    chg_3_week_index = float(wekk_3_index.iloc[0]['close']) / float(wekk_3_index.iloc[-1]['close']) - 1
    if chg_3_week_stock >= 0.2 and abs(chg_3_week_index) <= 0.1 :
        sign_1 = True
    else : sign_1 = False

    # （2）成交量限制
    two_week_volume = stock_data_2[((stock_data_2['date'])>=three_week_ago) & ((stock_data_2['date'])<=one_week_ago)]['volume'].sum()
    one_week_volume = stock_data_2[((stock_data_2['date'])>=one_week_ago) & ((stock_data_2['date'])<=ending)]['volume'].sum()
    if two_week_volume <= one_week_volume :
        sign_2 = True
    else : sign_2 = False

    # （3）规模限制
    # print(sw_index_df)
    stock_float_mv = stock_data_2.iloc[-1]['outstanding_share'] * stock_data_2.iloc[-1]['close'] / 10**8
    avg_float_mv = float(sw_index_df.iloc[0]['avg_float_mv'])
    if stock_float_mv >= 0.8 * avg_float_mv :   # 这里需要根据规则继续修改
        sign_3 = True
    else : sign_3 = False

    if sign_1 * sign_2 * sign_3 == 1 : 
        print('龙头股')
    else :
        print('非龙头股')



In [70]:
beginning = '2020-03-24'
ending = '2020-05-24'
head_stock( "sz000002", "801010", beginning, ending )

非龙头股


In [None]:
# 对指数801120判断哪些股票是龙头股(跑出来结果是没有龙头股，可能是绝对值的问题)
sector_code = "801120"
stock_list = pd.read_csv('D:\\Python\\Flies\\Guanyun\\A股数据\\申万一级行业成份\\'+ sector_code +'.csv',dtype=str)
for stock in stock_list['stock_code'] : 
    if stock[0:1] == '6':
        stock = 'sh' + stock
    else: stock = 'sz' + stock
    head_stock( stock, sector_code, beginning, ending )