In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

# Daily - data structure adjustment

In [91]:
price_file_path = 'Data/PRC/日度数据-股票价格.xlsx'
volume_file_path = 'Data/PRC/日度数据-股票交易量.xlsx'
return_file_path = 'Data/PRC/日度数据-股票回报率.xlsx'

price_df = pd.read_excel(price_file_path, sheet_name=0)
volume_df = pd.read_excel(volume_file_path, sheet_name=0)
return_df = pd.read_excel(return_file_path, sheet_name=0)

In [92]:
def transform_to_panel(df, value_name):
    # Extract company names and stock codes
    company_names = df.iloc[0, 1:].values
    stock_codes = df.iloc[1, 1:].values
    date_column = df.iloc[2:, 0].values

    # Prepare a list to store the long format data
    long_data = []

    # Loop over the columns (excluding the first date column)
    for i, (name, code) in enumerate(zip(company_names, stock_codes), start=1):
        # Extract the values for the company
        values = df.iloc[2:, i].values
        # Combine date, company name, stock code, and values
        company_data = list(zip([name] * len(date_column), [code] * len(date_column), date_column, values))
        long_data.extend(company_data)

    # Convert the list to a DataFrame
    long_df = pd.DataFrame(long_data, columns=['公司名称', '股票代码', '日期', value_name])

    # Convert the date column to datetime
    long_df['日期'] = pd.to_datetime(long_df['日期'], errors='coerce')

    # Drop rows with invalid dates
    long_df.dropna(subset=['日期'], inplace=True)

    return long_df


In [93]:
price_panel = transform_to_panel(price_df, '股票价格')
volume_panel = transform_to_panel(volume_df, '股票交易量')
return_panel = transform_to_panel(return_df, '股票回报率')

In [94]:
print(price_panel)

              公司名称  股票代码         日期  股票价格
0        000776.SZ  股票价格 2004-01-02  4.58
1        000776.SZ  股票价格 2004-01-05  4.50
2        000776.SZ  股票价格 2004-01-06  4.24
3        000776.SZ  股票价格 2004-01-07  4.34
4        000776.SZ  股票价格 2004-01-08  4.47
...            ...   ...        ...   ...
1457695  603993.SH  股票价格 2023-12-25  4.93
1457696  603993.SH  股票价格 2023-12-26  4.92
1457697  603993.SH  股票价格 2023-12-27  5.08
1457698  603993.SH  股票价格 2023-12-28  5.16
1457699  603993.SH  股票价格 2023-12-29  5.20

[1457700 rows x 4 columns]


In [95]:
print(volume_panel)

              公司名称   股票代码         日期        股票交易量
0        000776.SZ  股票交易量 2004-01-02     257179.0
1        000776.SZ  股票交易量 2004-01-05     345250.0
2        000776.SZ  股票交易量 2004-01-06     868450.0
3        000776.SZ  股票交易量 2004-01-07     528968.0
4        000776.SZ  股票交易量 2004-01-08     735885.0
...            ...    ...        ...          ...
1457695  603993.SH  股票交易量 2023-12-25   76115676.0
1457696  603993.SH  股票交易量 2023-12-26   56692336.0
1457697  603993.SH  股票交易量 2023-12-27  151966596.0
1457698  603993.SH  股票交易量 2023-12-28  163415379.0
1457699  603993.SH  股票交易量 2023-12-29   98730274.0

[1457700 rows x 4 columns]


In [96]:
print(return_panel)

              公司名称   股票代码         日期     股票回报率
0        000776.SZ  股票回报率 2004-01-02  1.777778
1        000776.SZ  股票回报率 2004-01-05 -1.746725
2        000776.SZ  股票回报率 2004-01-06 -5.777778
3        000776.SZ  股票回报率 2004-01-07  2.358491
4        000776.SZ  股票回报率 2004-01-08  2.995392
...            ...    ...        ...       ...
1457695  603993.SH  股票回报率 2023-12-25  0.612245
1457696  603993.SH  股票回报率 2023-12-26 -0.202840
1457697  603993.SH  股票回报率 2023-12-27  3.252033
1457698  603993.SH  股票回报率 2023-12-28  1.574803
1457699  603993.SH  股票回报率 2023-12-29  0.775194

[1457700 rows x 4 columns]


In [97]:
price_panel.drop(columns=['股票代码'], inplace=True)
volume_panel.drop(columns=['股票代码'], inplace=True)
return_panel.drop(columns=['股票代码'], inplace=True)

price_panel.rename(columns={'公司名称': '股票代码'}, inplace=True)
volume_panel.rename(columns={'公司名称': '股票代码'}, inplace=True)
return_panel.rename(columns={'公司名称': '股票代码'}, inplace=True)

In [98]:
print("Price Panel dtypes:\n", price_panel.dtypes)
print("Volume Panel dtypes:\n", volume_panel.dtypes)
print("Return Panel dtypes:\n", return_panel.dtypes)

price_panel['股票代码'] = price_panel['股票代码'].astype(str)
volume_panel['股票代码'] = volume_panel['股票代码'].astype(str)
return_panel['股票代码'] = return_panel['股票代码'].astype(str)
price_panel['日期'] = pd.to_datetime(price_panel['日期'])
volume_panel['日期'] = pd.to_datetime(volume_panel['日期'])
return_panel['日期'] = pd.to_datetime(return_panel['日期'])

Price Panel dtypes:
 股票代码            object
日期      datetime64[ns]
股票价格           float64
dtype: object
Volume Panel dtypes:
 股票代码             object
日期       datetime64[ns]
股票交易量           float64
dtype: object
Return Panel dtypes:
 股票代码             object
日期       datetime64[ns]
股票回报率           float64
dtype: object


In [99]:
merged_panel = pd.merge(price_panel, volume_panel, on=['股票代码', '日期'])

In [100]:
merged_panel = pd.merge(merged_panel, return_panel, on=['股票代码', '日期'])

In [101]:
print(merged_panel)

              股票代码         日期  股票价格        股票交易量     股票回报率
0        000776.SZ 2004-01-02  4.58     257179.0  1.777778
1        000776.SZ 2004-01-05  4.50     345250.0 -1.746725
2        000776.SZ 2004-01-06  4.24     868450.0 -5.777778
3        000776.SZ 2004-01-07  4.34     528968.0  2.358491
4        000776.SZ 2004-01-08  4.47     735885.0  2.995392
...            ...        ...   ...          ...       ...
1457695  603993.SH 2023-12-25  4.93   76115676.0  0.612245
1457696  603993.SH 2023-12-26  4.92   56692336.0 -0.202840
1457697  603993.SH 2023-12-27  5.08  151966596.0  3.252033
1457698  603993.SH 2023-12-28  5.16  163415379.0  1.574803
1457699  603993.SH 2023-12-29  5.20   98730274.0  0.775194

[1457700 rows x 5 columns]


## Daily - Bid & Ask

In [102]:
file_path = 'Data/PRC/沪深300 ask price&bid price.xlsx'
sheets = pd.read_excel(file_path, sheet_name=['bid price', 'ask price'])

bid_df = sheets['bid price']
ask_df = sheets['ask price']

In [103]:
bid_panel = transform_to_panel(bid_df, 'bid price')
ask_panel = transform_to_panel(ask_df, 'ask price')

In [112]:
print(bid_panel)

              股票代码         日期  bid price
0        000776.SZ 2004-01-02       8.65
1        000776.SZ 2004-01-05       8.98
2        000776.SZ 2004-01-06       9.55
3        000776.SZ 2004-01-07       9.53
4        000776.SZ 2004-01-08       9.47
...            ...        ...        ...
1458293        NaN        NaT      55.00
1458294        NaN        NaT      53.23
1458295        NaN        NaT      52.29
1458296        NaN        NaT      53.45
1458297        NaN        NaT      53.50

[1457700 rows x 3 columns]


In [113]:
print(ask_panel)

              股票代码         日期  ask price
0        000776.SZ 2004-01-02       8.41
1        000776.SZ 2004-01-05       8.60
2        000776.SZ 2004-01-06       9.00
3        000776.SZ 2004-01-07       9.25
4        000776.SZ 2004-01-08       9.23
...            ...        ...        ...
1458293        NaN        NaT      52.85
1458294        NaN        NaT      51.80
1458295        NaN        NaT      51.18
1458296        NaN        NaT      51.68
1458297        NaN        NaT      52.88

[1457700 rows x 3 columns]


In [106]:
bid_panel.drop(columns=['股票代码'], inplace=True)
ask_panel.drop(columns=['股票代码'], inplace=True)

bid_panel.rename(columns={'公司名称': '股票代码'}, inplace=True)
ask_panel.rename(columns={'公司名称': '股票代码'}, inplace=True)

In [107]:
bid_panel['股票代码'] = price_panel['股票代码'].astype(str)
ask_panel['股票代码'] = volume_panel['股票代码'].astype(str)
bid_panel['日期'] = pd.to_datetime(price_panel['日期'])
ask_panel['日期'] = pd.to_datetime(volume_panel['日期'])

In [108]:
bid_ask_merged_panel = pd.merge(bid_panel, ask_panel, on=['股票代码', '日期'])

In [109]:
print(bid_ask_merged_panel)

              股票代码         日期  bid price  ask price
0        000776.SZ 2004-01-02       8.65       8.41
1        000776.SZ 2004-01-05       8.98       8.60
2        000776.SZ 2004-01-06       9.55       9.00
3        000776.SZ 2004-01-07       9.53       9.25
4        000776.SZ 2004-01-08       9.47       9.23
...            ...        ...        ...        ...
1814701        NaN        NaT      53.50      52.85
1814702        NaN        NaT      53.50      51.80
1814703        NaN        NaT      53.50      51.18
1814704        NaN        NaT      53.50      51.68
1814705        NaN        NaT      53.50      52.88

[1814706 rows x 4 columns]


In [110]:
merged_panel = pd.merge(merged_panel, bid_ask_merged_panel, on=['股票代码', '日期'])

In [111]:
print(merged_panel)

              股票代码         日期  股票价格        股票交易量     股票回报率  bid price  \
0        000776.SZ 2004-01-02  4.58     257179.0  1.777778       8.65   
1        000776.SZ 2004-01-05  4.50     345250.0 -1.746725       8.98   
2        000776.SZ 2004-01-06  4.24     868450.0 -5.777778       9.55   
3        000776.SZ 2004-01-07  4.34     528968.0  2.358491       9.53   
4        000776.SZ 2004-01-08  4.47     735885.0  2.995392       9.47   
...            ...        ...   ...          ...       ...        ...   
1457097  603993.SH 2023-12-25  4.93   76115676.0  0.612245      58.00   
1457098  603993.SH 2023-12-26  4.92   56692336.0 -0.202840      57.36   
1457099  603993.SH 2023-12-27  5.08  151966596.0  3.252033      57.74   
1457100  603993.SH 2023-12-28  5.16  163415379.0  1.574803      57.25   
1457101  603993.SH 2023-12-29  5.20   98730274.0  0.775194      56.15   

         ask price  
0             8.41  
1             8.60  
2             9.00  
3             9.25  
4             9.23

In [115]:
merged_panel.rename(columns={'股票代码': 'TICKER'}, inplace=True)
merged_panel.rename(columns={'日期': 'date'}, inplace=True)
merged_panel.rename(columns={'股票价格': 'PRC'}, inplace=True)
merged_panel.rename(columns={'股票交易量': 'VOL'}, inplace=True)
merged_panel.rename(columns={'股票回报率': 'RET'}, inplace=True)

In [117]:
missing_values_after = merged_panel.isnull().sum()
print(missing_values_after)

TICKER            0
date              0
PRC          483918
VOL          485310
RET          485109
bid price         0
ask price         0
dtype: int64


In [120]:
identification_file_path = 'Data/PRC/公司标识数据.xlsx'
identification_df = pd.read_excel(identification_file_path, sheet_name=0)

print(identification_df.head())

      TICKER COMNAM CSRCIC-1 所属证监会行业名称\n[行业级别] 门类行业 CSRCIC-2  \
0  000776.SZ   广发证券        J                    金融业      J67   
1  000999.SZ   华润三九        C                    制造业      C27   
2  002415.SZ   海康威视        C                    制造业      C39   
3  002603.SZ   以岭药业        C                    制造业      C27   
4  300308.SZ   中际旭创        C                    制造业      C39   

  所属证监会行业代码\n[行业级别] 大类行业 上市地点  上市板  
0                 资本市场服务   深圳   主板  
1                  医药制造业   深圳   主板  
2       计算机、通信和其他电子设备制造业   深圳   主板  
3                  医药制造业   深圳   主板  
4       计算机、通信和其他电子设备制造业   深圳  创业板  


In [121]:
identification_df = identification_df[['TICKER', 'COMNAM', 'CSRCIC-1', 'CSRCIC-2']]
merged_panel = pd.merge(merged_panel, identification_df, left_on='TICKER', right_on='TICKER', how='left')

In [122]:
print(merged_panel)

            TICKER       date   PRC          VOL       RET  bid price  \
0        000776.SZ 2004-01-02  4.58     257179.0  1.777778       8.65   
1        000776.SZ 2004-01-05  4.50     345250.0 -1.746725       8.98   
2        000776.SZ 2004-01-06  4.24     868450.0 -5.777778       9.55   
3        000776.SZ 2004-01-07  4.34     528968.0  2.358491       9.53   
4        000776.SZ 2004-01-08  4.47     735885.0  2.995392       9.47   
...            ...        ...   ...          ...       ...        ...   
1457097  603993.SH 2023-12-25  4.93   76115676.0  0.612245      58.00   
1457098  603993.SH 2023-12-26  4.92   56692336.0 -0.202840      57.36   
1457099  603993.SH 2023-12-27  5.08  151966596.0  3.252033      57.74   
1457100  603993.SH 2023-12-28  5.16  163415379.0  1.574803      57.25   
1457101  603993.SH 2023-12-29  5.20   98730274.0  0.775194      56.15   

         ask price COMNAM CSRCIC-1 CSRCIC-2  
0             8.41   广发证券        J      J67  
1             8.60   广发证券      

In [127]:
merged_panel['date'] = pd.to_datetime(merged_panel['date'])

merged_panel['year'] = merged_panel['date'].dt.year
merged_panel['month'] = merged_panel['date'].dt.month

def fill_missing_values(df, column):
    df[column] = df.groupby(['CSRCIC-2', 'year', 'month'])[column].transform(lambda x: x.fillna(x.median()))
    return df
## TICKER/COMNAM/CSRCIC-1/CSRCIC-2 are all used

columns_to_fill = ['PRC', 'VOL', 'RET']

for column in columns_to_fill:
    merged_panel = fill_missing_values(merged_panel, column)

merged_panel.drop(columns=['year', 'month'], inplace=True)

In [128]:
missing_values_after = merged_panel.isnull().sum()
print(missing_values_after)

TICKER           0
date             0
PRC          15310
VOL          15427
RET          15310
bid price        0
ask price        0
COMNAM           0
CSRCIC-1         0
CSRCIC-2         0
dtype: int64


In [129]:
merged_panel.fillna(0, inplace=True)

In [130]:
missing_values_after = merged_panel.isnull().sum()
print(missing_values_after)

TICKER       0
date         0
PRC          0
VOL          0
RET          0
bid price    0
ask price    0
COMNAM       0
CSRCIC-1     0
CSRCIC-2     0
dtype: int64


In [132]:
merged_panel.rename(columns={'bid price': 'BID'}, inplace=True)
merged_panel.rename(columns={'ask price': 'ASK'}, inplace=True)

In [133]:
desired_column_order = ['date', 'TICKER', 'COMNAM', 'CSRCIC-1', 'CSRCIC-2', 'PRC', 'VOL', 'RET', 'BID', 'ASK']
merged_panel = merged_panel[desired_column_order]
print(merged_panel.head())

        date     TICKER COMNAM CSRCIC-1 CSRCIC-2   PRC       VOL       RET  \
0 2004-01-02  000776.SZ   广发证券        J      J67  4.58  257179.0  1.777778   
1 2004-01-05  000776.SZ   广发证券        J      J67  4.50  345250.0 -1.746725   
2 2004-01-06  000776.SZ   广发证券        J      J67  4.24  868450.0 -5.777778   
3 2004-01-07  000776.SZ   广发证券        J      J67  4.34  528968.0  2.358491   
4 2004-01-08  000776.SZ   广发证券        J      J67  4.47  735885.0  2.995392   

    BID   ASK  
0  8.65  8.41  
1  8.98  8.60  
2  9.55  9.00  
3  9.53  9.25  
4  9.47  9.23  


In [134]:
output_file_path = 'Data/PRC/Daily1.csv'
merged_panel.to_csv(output_file_path, index=False)

# Daily - Liquidity

In [135]:
merged_panel['date'] = pd.to_datetime(merged_panel['date'])

In [136]:
merged_panel['DV'] = merged_panel['VOL'] * merged_panel['PRC']
merged_panel['BAS'] = merged_panel['ASK'] - merged_panel['BID']
merged_panel['year_month'] = merged_panel['date'].dt.to_period('M')
monthly_dv = merged_panel.groupby(['TICKER', 'year_month'])['DV'].sum().reset_index()
monthly_bas = merged_panel.groupby(['TICKER', 'year_month'])['BAS'].mean().reset_index()
monthly_df = pd.merge(monthly_dv, monthly_bas, on=['TICKER', 'year_month'])

In [137]:
print(monthly_df.head())

      TICKER year_month            DV       BAS
0  000001.SZ    2004-01  1.528854e+09 -0.030769
1  000001.SZ    2004-02  4.379420e+09 -0.031000
2  000001.SZ    2004-03  3.903233e+09 -0.022609
3  000001.SZ    2004-04  1.848258e+09 -0.027273
4  000001.SZ    2004-05  5.555516e+08 -0.029375


In [138]:
missing_values = monthly_df.isnull().sum()
print(missing_values)

TICKER        0
year_month    0
DV            0
BAS           0
dtype: int64


In [139]:
monthly_df.to_csv('Data/PRC/Monthly - Liquidity1.csv', index=False)

# Total Return Volatility - Standard Deviation

In [140]:
print(merged_panel)

              date     TICKER COMNAM CSRCIC-1 CSRCIC-2   PRC          VOL  \
0       2004-01-02  000776.SZ   广发证券        J      J67  4.58     257179.0   
1       2004-01-05  000776.SZ   广发证券        J      J67  4.50     345250.0   
2       2004-01-06  000776.SZ   广发证券        J      J67  4.24     868450.0   
3       2004-01-07  000776.SZ   广发证券        J      J67  4.34     528968.0   
4       2004-01-08  000776.SZ   广发证券        J      J67  4.47     735885.0   
...            ...        ...    ...      ...      ...   ...          ...   
1457097 2023-12-25  603993.SH   洛阳钼业        B      B09  4.93   76115676.0   
1457098 2023-12-26  603993.SH   洛阳钼业        B      B09  4.92   56692336.0   
1457099 2023-12-27  603993.SH   洛阳钼业        B      B09  5.08  151966596.0   
1457100 2023-12-28  603993.SH   洛阳钼业        B      B09  5.16  163415379.0   
1457101 2023-12-29  603993.SH   洛阳钼业        B      B09  5.20   98730274.0   

              RET    BID    ASK            DV   BAS year_month  
0        1

In [141]:
merged_panel = merged_panel[pd.to_numeric(merged_panel['RET'], errors='coerce').notnull()]
merged_panel['RET'] = merged_panel['RET'].astype(float)

In [142]:
monthly_volatility = merged_panel.groupby(['TICKER', 'year_month'])['RET'].std().reset_index()
monthly_volatility.rename(columns={'RET': 'TRV'}, inplace=True)

In [143]:
monthly_volatility['year'] = monthly_volatility['year_month'].dt.year
annual_volatility = monthly_volatility.groupby(['TICKER', 'year'])['TRV'].mean().reset_index()
annual_volatility.rename(columns={'TRV': 'annual_TRV'}, inplace=True)

In [144]:
missing_values_after = annual_volatility.isnull().sum()
print(missing_values_after)

TICKER        0
year          0
annual_TRV    0
dtype: int64


In [145]:
annual_volatility = annual_volatility[['TICKER', 'year', 'annual_TRV']]

annual_volatility.to_csv('/Users/yuyangdou/RSM/BAM/11Model/Data/PRC/annual_volatility.csv', index=False)

In [146]:
print(annual_volatility)

         TICKER  year  annual_TRV
0     000001.SZ  2004    1.803115
1     000001.SZ  2005    2.299663
2     000001.SZ  2006    2.446014
3     000001.SZ  2007    3.476504
4     000001.SZ  2008    4.149569
...         ...   ...         ...
5995  688981.SH  2019    0.000000
5996  688981.SH  2020    4.726529
5997  688981.SH  2021    2.210705
5998  688981.SH  2022    1.650728
5999  688981.SH  2023    2.202027

[6000 rows x 3 columns]


# Regression - Idiosyncratic Volatility & Market Beta

In [147]:
merged_panel2 = pd.read_csv('Data/PRC/Daily1.csv')

In [148]:
market_return_path = '/Users/yuyangdou/RSM/BAM/11Model/Data/PRC/市场回报率.xlsx'
market_return_df = pd.read_excel(market_return_path, sheet_name=0)

merged_panel2['date'] = pd.to_datetime(merged_panel2['date'])
market_return_df['date'] = pd.to_datetime(market_return_df['date'])

merged_panel2 = pd.merge(merged_panel2, market_return_df, on='date', how='left')

In [149]:
merged_panel2['date'] = pd.to_datetime(merged_panel2['date'])
merged_panel2['year'] = merged_panel2['date'].dt.year
merged_panel2['year_month'] = merged_panel2['date'].dt.to_period('M')

merged_panel2['RET'] = pd.to_numeric(merged_panel2['RET'], errors='coerce')
merged_panel2['sprtrn'] = pd.to_numeric(merged_panel2['sprtrn'], errors='coerce')

In [150]:
print(merged_panel2)

              date     TICKER COMNAM CSRCIC-1 CSRCIC-2   PRC          VOL  \
0       2004-01-02  000776.SZ   广发证券        J      J67  4.58     257179.0   
1       2004-01-05  000776.SZ   广发证券        J      J67  4.50     345250.0   
2       2004-01-06  000776.SZ   广发证券        J      J67  4.24     868450.0   
3       2004-01-07  000776.SZ   广发证券        J      J67  4.34     528968.0   
4       2004-01-08  000776.SZ   广发证券        J      J67  4.47     735885.0   
...            ...        ...    ...      ...      ...   ...          ...   
1457097 2023-12-25  603993.SH   洛阳钼业        B      B09  4.93   76115676.0   
1457098 2023-12-26  603993.SH   洛阳钼业        B      B09  4.92   56692336.0   
1457099 2023-12-27  603993.SH   洛阳钼业        B      B09  5.08  151966596.0   
1457100 2023-12-28  603993.SH   洛阳钼业        B      B09  5.16  163415379.0   
1457101 2023-12-29  603993.SH   洛阳钼业        B      B09  5.20   98730274.0   

              RET    BID    ASK    sprtrn  year year_month  
0        1.777

In [153]:
merged_panel2['IV'] = np.nan
merged_panel2['MB'] = np.nan

companies = merged_panel2['TICKER'].unique()
for company in companies:
    company_data = merged_panel2[merged_panel2['TICKER'] == company]
    
    
    company_data = company_data.dropna(subset=['RET', 'sprtrn'])
    
    if len(company_data) > 1:  
        X = sm.add_constant(company_data['sprtrn'])
        y = company_data['RET']
        
        model = sm.OLS(y, X).fit()
        
        merged_panel2.loc[merged_panel2['TICKER'] == company, 'MB'] = model.params['sprtrn']
        merged_panel2.loc[merged_panel2['TICKER'] == company, 'IV'] = model.resid.std()

annual_iv_mb = merged_panel2.groupby(['TICKER', 'year']).agg({
    'IV': 'mean',
    'MB': 'mean'
}).reset_index()

In [154]:
missing_values_after = annual_iv_mb.isnull().sum()
print(missing_values_after)

TICKER    0
year      0
IV        0
MB        0
dtype: int64


In [155]:
annual_iv_mb.to_csv('Data/PRC/annual_iv_mb.csv', index=False)

In [156]:
print(annual_iv_mb)

         TICKER  year        IV        MB
0     000001.SZ  2004  1.724663  1.047493
1     000001.SZ  2005  1.724663  1.047493
2     000001.SZ  2006  1.724663  1.047493
3     000001.SZ  2007  1.724663  1.047493
4     000001.SZ  2008  1.724663  1.047493
...         ...   ...       ...       ...
5995  688981.SH  2019  3.066400  0.009212
5996  688981.SH  2020  3.066400  0.009212
5997  688981.SH  2021  3.066400  0.009212
5998  688981.SH  2022  3.066400  0.009212
5999  688981.SH  2023  3.066400  0.009212

[6000 rows x 4 columns]


# Monthly

In [3]:
monthly_df = pd.read_excel('Data/PRC/月度数据.xlsx', sheet_name='Monthly')
identity_df = pd.read_excel('Data/PRC/公司标识数据.xlsx', sheet_name='万得')

merged_df = pd.merge(monthly_df, identity_df, on=['TICKER', 'COMNAM'], how='inner')
merged_df.to_csv('Data/PRC/M合并1.csv', index=False)

In [6]:
columns_to_drop = ['上市地点', '上市板']
merged_df.drop(columns=columns_to_drop, inplace=True)

merged_df['日期'] = pd.to_datetime(merged_df['日期']).dt.to_period('M')

columns_to_rename = {
    '日期': 'date',
    '月度回报率': 'RET',
    '股票价格': 'PRC',
    '股票交易量': 'VOL',
    '流通股数量': 'SHROUT'
}
merged_df.rename(columns=columns_to_rename, inplace=True)

In [7]:
print(merged_df)

      COMNAM     TICKER     date        RET   PRC           VOL        SHROUT  \
0       广发证券  000776.SZ  2004-01  12.222222  5.05  1.197356e+07  9.318396e+07   
1       广发证券  000776.SZ  2004-02   9.108911  5.51  2.938734e+07  9.318396e+07   
2       广发证券  000776.SZ  2004-03  -7.078040  5.12  4.805419e+07  9.318396e+07   
3       广发证券  000776.SZ  2004-04 -14.648438  4.37  2.873926e+07  9.318396e+07   
4       广发证券  000776.SZ  2004-05 -15.789474  3.68  1.019008e+07  9.318396e+07   
...      ...        ...      ...        ...   ...           ...           ...   
71995   洛阳钼业  603993.SH  2023-08  -7.131280  5.73  1.963778e+09  2.159924e+10   
71996   洛阳钼业  603993.SH  2023-09   3.141361  5.91  1.771370e+09  2.159924e+10   
71997   洛阳钼业  603993.SH  2023-10  -6.260575  5.54  1.438451e+09  2.159924e+10   
71998   洛阳钼业  603993.SH  2023-11  -5.956679  5.21  1.826941e+09  2.159924e+10   
71999   洛阳钼业  603993.SH  2023-12  -0.191939  5.20  2.445170e+09  2.159924e+10   

      CSRCIC-1 所属证监会行业名称\n[

In [8]:
missing_values = merged_df.isnull().sum()
print(missing_values)

COMNAM                        0
TICKER                        0
date                          0
RET                       23652
PRC                       23852
VOL                       24520
SHROUT                    23744
CSRCIC-1                      0
所属证监会行业名称\n[行业级别] 门类行业        0
CSRCIC-2                      0
所属证监会行业代码\n[行业级别] 大类行业        0
dtype: int64


In [9]:
num_cols = ['PRC', 'SHROUT', 'VOL', 'RET']
for col in num_cols:
    merged_df[col] = pd.to_numeric(merged_df[col], errors='coerce')

In [10]:
for col in num_cols:
    median_value = merged_df[col].median()
    merged_df[col].fillna(median_value, inplace=True)

In [11]:
missing_values = merged_df.isnull().sum()
print(missing_values)

COMNAM                    0
TICKER                    0
date                      0
RET                       0
PRC                       0
VOL                       0
SHROUT                    0
CSRCIC-1                  0
所属证监会行业名称\n[行业级别] 门类行业    0
CSRCIC-2                  0
所属证监会行业代码\n[行业级别] 大类行业    0
dtype: int64


In [12]:
merged_df.to_csv('Data/PRC/Monthly_Merged.csv', index=False)

# Macro Economics

In [13]:
macro_df = pd.read_excel('Data/PRC/中国国库券利率_3个月.xlsx', sheet_name='Macro')

In [22]:
macro_df['date'] = pd.to_datetime(macro_df['date']).dt.to_period('M')

In [23]:
merged2_df = pd.merge(merged_df, macro_df, on=['date'], how='left')

In [24]:
print(merged2_df)

      COMNAM     TICKER     date        RET   PRC           VOL        SHROUT  \
0       广发证券  000776.SZ  2004-01  12.222222  5.05  1.197356e+07  9.318396e+07   
1       广发证券  000776.SZ  2004-02   9.108911  5.51  2.938734e+07  9.318396e+07   
2       广发证券  000776.SZ  2004-03  -7.078040  5.12  4.805419e+07  9.318396e+07   
3       广发证券  000776.SZ  2004-04 -14.648438  4.37  2.873926e+07  9.318396e+07   
4       广发证券  000776.SZ  2004-05 -15.789474  3.68  1.019008e+07  9.318396e+07   
...      ...        ...      ...        ...   ...           ...           ...   
71995   洛阳钼业  603993.SH  2023-08  -7.131280  5.73  1.963778e+09  2.159924e+10   
71996   洛阳钼业  603993.SH  2023-09   3.141361  5.91  1.771370e+09  2.159924e+10   
71997   洛阳钼业  603993.SH  2023-10  -6.260575  5.54  1.438451e+09  2.159924e+10   
71998   洛阳钼业  603993.SH  2023-11  -5.956679  5.21  1.826941e+09  2.159924e+10   
71999   洛阳钼业  603993.SH  2023-12  -0.191939  5.20  2.445170e+09  2.159924e+10   

      CSRCIC-1 所属证监会行业名称\n[

In [25]:
missing_values = merged2_df.isnull().sum()
print(missing_values)

COMNAM                    0
TICKER                    0
date                      0
RET                       0
PRC                       0
VOL                       0
SHROUT                    0
CSRCIC-1                  0
所属证监会行业名称\n[行业级别] 门类行业    0
CSRCIC-2                  0
所属证监会行业代码\n[行业级别] 大类行业    0
10Y                       0
tbl                       0
T10Y3MM                   0
gpce                      0
dtype: int64


In [26]:
merged2_df.to_csv('Data/PRC/macro.csv', index=False)

# Balanced Panel???

In [27]:
company_counts = merged2_df['COMNAM'].value_counts()
unbalanced_companies = company_counts[company_counts < 240]

print("Companies with less than 240 records:")
print(unbalanced_companies)

Companies with less than 240 records:
Series([], Name: count, dtype: int64)


In [28]:
company_counts = merged2_df['COMNAM'].value_counts()
complete_companies = company_counts[company_counts == 240]

print(f"The number of companies with exactly 240 records is: {complete_companies.shape[0]}")
print("Companies with exactly 240 records are:")
print(complete_companies.index.tolist())

The number of companies with exactly 240 records is: 300
Companies with exactly 240 records are:
['广发证券', '中国广核', 'TCL中环', '浪潮信息', '中信特钢', '申万宏源', '中国交建', '国投电力', '大华股份', '恒生电子', '汇川技术', '海天味业', '成都银行', '中海油服', '三六零', '广汽集团', '山西汾酒', '北方华创', '亿纬锂能', '恒力石化', '亿联网络', '泸州老窖', '云南白药', '徐工机械', '潍柴动力', '中联重科', 'TCL科技', '华侨城A', '中兴通讯', '万科A', '平安银行', '今世缘', '京沪高铁', '邮储银行', '恒立液压', '闻泰科技', '通威股份', '中国巨石', '华润三九', '三峡能源', '爱美客', '卓胜微', '迈瑞医疗', '康龙化成', '宁德时代', '晶盛机电', '天赐材料', '天齐锂业', '晶澳科技', '东方盛虹', '传音控股', '中微公司', '澜起科技', '万泰生物', '公牛集团', '金龙鱼', '中国电信', '华能水电', '福莱特', '蓝思科技', '三环集团', '荣盛石化', '三花智控', '北新建材', '中国核电', '国泰君安', '中芯国际', '天合光能', '奇安信-U', '东鹏饮料', '科沃斯', '合盛硅业', '中煤能源', '中国能建', '长安汽车', '格力电器', '中航西飞', '华泰证券', '中国人寿', '中国中冶', '中国太保', '中国铝业', '工商银行', '中国中铁', '交通银行', '中国平安', '农业银行', '中国铁建', '北京银行', '兴业银行', '中国国航', '中国神华', '南京银行', '中国建筑', '中国中车', '五粮液', '中国石油', '白云山', '华东医药', '方正证券', '中国电建', '长城汽车', '新华保险', '歌尔股份', '京东方A', '兴业证券', '中信银行', '中国重工', '中国银行', '建设银行', '紫金矿业', '中国中免', '大秦铁路', '招商证券

# Momentum - Monthly

In [29]:
merged2_df = merged2_df.sort_values(by=['TICKER', 'date'])
merged2_df['12m_momentum'] = merged2_df.groupby('TICKER')['RET'].rolling(12).apply(lambda x: (x + 1).prod() - 1).reset_index(drop=True)

In [30]:
industry_groups = merged2_df.groupby(['CSRCIC-2', 'date'])
merged2_df['industry_momentum'] = industry_groups['12m_momentum'].transform('mean')

In [31]:
merged2_df['1m_reversal'] = merged2_df.groupby('TICKER')['RET'].shift(1)

In [32]:
missing_values = merged2_df.isnull().sum()
print(missing_values)

COMNAM                       0
TICKER                       0
date                         0
RET                          0
PRC                          0
VOL                          0
SHROUT                       0
CSRCIC-1                     0
所属证监会行业名称\n[行业级别] 门类行业       0
CSRCIC-2                     0
所属证监会行业代码\n[行业级别] 大类行业       0
10Y                          0
tbl                          0
T10Y3MM                      0
gpce                         0
12m_momentum              3300
industry_momentum         3300
1m_reversal                300
dtype: int64


In [36]:
def fill_missing_with_monthly_median(df, column):
    df[column] = df.groupby('TICKER')[column].transform(lambda x: x.fillna(x.median()))
    return df

merged2_df = fill_missing_with_monthly_median(merged2_df, '12m_momentum')
merged2_df = fill_missing_with_monthly_median(merged2_df, 'industry_momentum')
merged2_df = fill_missing_with_monthly_median(merged2_df, '1m_reversal')

missing_values = merged2_df.isnull().sum()
print(missing_values)

COMNAM                    0
TICKER                    0
date                      0
RET                       0
PRC                       0
VOL                       0
SHROUT                    0
CSRCIC-1                  0
所属证监会行业名称\n[行业级别] 门类行业    0
CSRCIC-2                  0
所属证监会行业代码\n[行业级别] 大类行业    0
10Y                       0
tbl                       0
T10Y3MM                   0
gpce                      0
12m_momentum              0
industry_momentum         0
1m_reversal               0
dtype: int64


# Market Value & Excess Return(target variable)

In [37]:
merged2_df['EXRET'] = merged2_df['RET'] - merged2_df['tbl']
merged2_df['MV'] = merged2_df['PRC'] * merged2_df['SHROUT']

In [38]:
merged2_df.to_csv('Data/PRC/merged2_with_EXRET_MV.csv', index=False)
print(merged2_df.head())

      COMNAM     TICKER     date        RET    PRC          VOL        SHROUT  \
51360   平安银行  000001.SZ  2004-01   9.048179   9.28  165652539.0  1.409362e+09   
51361   平安银行  000001.SZ  2004-02  11.637931  10.36  416409885.0  1.409362e+09   
51362   平安银行  000001.SZ  2004-03   2.895753  10.66  359113615.0  1.409362e+09   
51363   平安银行  000001.SZ  2004-04 -12.476548   9.33  185640796.0  1.409362e+09   
51364   平安银行  000001.SZ  2004-05   3.108253   9.62   59981735.0  1.409362e+09   

      CSRCIC-1 所属证监会行业名称\n[行业级别] 门类行业 CSRCIC-2 所属证监会行业代码\n[行业级别] 大类行业  \
51360        J                    金融业      J66                 货币金融服务   
51361        J                    金融业      J66                 货币金融服务   
51362        J                    金融业      J66                 货币金融服务   
51363        J                    金融业      J66                 货币金融服务   
51364        J                    金融业      J66                 货币金融服务   

            10Y       tbl   T10Y3MM      gpce  12m_momentum  \
51360  3.78

In [39]:
missing_values = merged2_df.isnull().sum()
print(missing_values)

COMNAM                    0
TICKER                    0
date                      0
RET                       0
PRC                       0
VOL                       0
SHROUT                    0
CSRCIC-1                  0
所属证监会行业名称\n[行业级别] 门类行业    0
CSRCIC-2                  0
所属证监会行业代码\n[行业级别] 大类行业    0
10Y                       0
tbl                       0
T10Y3MM                   0
gpce                      0
12m_momentum              0
industry_momentum         0
1m_reversal               0
EXRET                     0
MV                        0
dtype: int64


# Final Merge - excluding Financial Ratio

In [72]:
final_df = pd.read_csv('Data/PRC/merged2_with_EXRET_MV.csv')

In [73]:
print(final_df)

      COMNAM     TICKER     date        RET    PRC          VOL        SHROUT  \
0       平安银行  000001.SZ  2004-01   9.048179   9.28  165652539.0  1.409362e+09   
1       平安银行  000001.SZ  2004-02  11.637931  10.36  416409885.0  1.409362e+09   
2       平安银行  000001.SZ  2004-03   2.895753  10.66  359113615.0  1.409362e+09   
3       平安银行  000001.SZ  2004-04 -12.476548   9.33  185640796.0  1.409362e+09   
4       平安银行  000001.SZ  2004-05   3.108253   9.62   59981735.0  1.409362e+09   
...      ...        ...      ...        ...    ...          ...           ...   
71995   中芯国际  688981.SH  2023-08   5.607477  54.24  504143339.0  7.943158e+09   
71996   中芯国际  688981.SH  2023-09  -5.696903  51.15  734782312.0  7.945499e+09   
71997   中芯国际  688981.SH  2023-10  11.925709  57.25  695973804.0  7.945886e+09   
71998   中芯国际  688981.SH  2023-11  -6.200873  53.70  528386389.0  7.946542e+09   
71999   中芯国际  688981.SH  2023-12  -1.266294  53.02  367673738.0  7.946556e+09   

      CSRCIC-1 所属证监会行业名称\n[

In [74]:
monthly_liquidity = pd.read_csv('/Users/yuyangdou/RSM/BAM/11Model/Data/PRC/Monthly - Liquidity1.csv')
monthly_liquidity.rename(columns={'year_month': 'date'}, inplace=True)

annual_volatility = pd.read_csv('/Users/yuyangdou/RSM/BAM/11Model/Data/PRC/annual_volatility.csv')
annual_iv_mb = pd.read_csv('/Users/yuyangdou/RSM/BAM/11Model/Data/PRC/annual_iv_mb.csv')

In [75]:
final_df['date'] = pd.to_datetime(final_df['date']).dt.to_period('M').astype(str)
monthly_liquidity['date'] = pd.to_datetime(monthly_liquidity['date']).dt.to_period('M').astype(str)
annual_volatility['year'] = annual_volatility['year'].astype(str)
annual_iv_mb['year'] = annual_iv_mb['year'].astype(str)

In [76]:
final_df['year'] = final_df['date'].str[:4]

In [77]:
final_df = pd.merge(final_df, monthly_liquidity[['TICKER', 'date', 'DV', 'BAS']], on=['TICKER', 'date'], how='left')
final_df = pd.merge(final_df, annual_volatility[['TICKER', 'year', 'annual_TRV']], on=['TICKER', 'year'], how='left')
final_df = pd.merge(final_df, annual_iv_mb[['TICKER', 'year', 'IV', 'MB']], on=['TICKER', 'year'], how='left')

In [78]:
print(final_df.info())
print(final_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72000 entries, 0 to 71999
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   COMNAM                 72000 non-null  object 
 1   TICKER                 72000 non-null  object 
 2   date                   72000 non-null  object 
 3   RET                    72000 non-null  float64
 4   PRC                    72000 non-null  float64
 5   VOL                    72000 non-null  float64
 6   SHROUT                 72000 non-null  float64
 7   CSRCIC-1               72000 non-null  object 
 8   所属证监会行业名称
[行业级别] 门类行业  72000 non-null  object 
 9   CSRCIC-2               72000 non-null  object 
 10  所属证监会行业代码
[行业级别] 大类行业  72000 non-null  object 
 11  10Y                    72000 non-null  float64
 12  tbl                    72000 non-null  float64
 13  T10Y3MM                72000 non-null  float64
 14  gpce                   72000 non-null  float64
 15  12

In [79]:
missing_values_after = final_df.isnull().sum()
print(missing_values_after)

COMNAM                    0
TICKER                    0
date                      0
RET                       0
PRC                       0
VOL                       0
SHROUT                    0
CSRCIC-1                  0
所属证监会行业名称\n[行业级别] 门类行业    0
CSRCIC-2                  0
所属证监会行业代码\n[行业级别] 大类行业    0
10Y                       0
tbl                       0
T10Y3MM                   0
gpce                      0
12m_momentum              0
industry_momentum         0
1m_reversal               0
EXRET                     0
MV                        0
year                      0
DV                        0
BAS                       0
annual_TRV                0
IV                        0
MB                        0
dtype: int64


In [80]:
final_df.to_csv('Data/PRC/final_merged_dataset1.csv', index=False)

# Final Final Merge - including Financial Ratio

In [81]:
financial_ratio_df = pd.read_excel('/Users/yuyangdou/RSM/BAM/11Model/Data/PRC/年度数据.xlsx', sheet_name='Financial Ratios')

In [82]:
financial_ratio_df.rename(columns={
    '市盈率': 'Book-to-Market',
    '市净率': 'Earnings-Price',
    '市销率': 'Sales-to-Price',
    '股票代码': 'TICKER',
    '股票简称': 'COMNAM'
}, inplace=True)

In [83]:
final_df['year'] = final_df['year'].astype(int)
financial_ratio_df['year'] = financial_ratio_df['year'].astype(int)

In [84]:
final_df['TICKER'] = final_df['TICKER'].astype(str)
financial_ratio_df['TICKER'] = financial_ratio_df['TICKER'].astype(str)

In [85]:
final_df['COMNAM'] = final_df['COMNAM'].astype(str)
financial_ratio_df['COMNAM'] = financial_ratio_df['COMNAM'].astype(str)

In [86]:
final_df = pd.merge(final_df, financial_ratio_df[['year', 'TICKER', 'COMNAM', 'Book-to-Market', 'Earnings-Price', 'Sales-to-Price']], on=['year', 'TICKER', 'COMNAM'], how='left')

In [87]:
missing_values_after_merge = final_df.isnull().sum()
print(missing_values_after_merge)

COMNAM                        0
TICKER                        0
date                          0
RET                           0
PRC                           0
VOL                           0
SHROUT                        0
CSRCIC-1                      0
所属证监会行业名称\n[行业级别] 门类行业        0
CSRCIC-2                      0
所属证监会行业代码\n[行业级别] 大类行业        0
10Y                           0
tbl                           0
T10Y3MM                       0
gpce                          0
12m_momentum                  0
industry_momentum             0
1m_reversal                   0
EXRET                         0
MV                            0
year                          0
DV                            0
BAS                           0
annual_TRV                    0
IV                            0
MB                            0
Book-to-Market            22920
Earnings-Price            22920
Sales-to-Price            22920
dtype: int64


In [88]:
final_df['year'] = final_df['year'].astype(int)
columns_to_fill = ['Book-to-Market', 'Earnings-Price', 'Sales-to-Price']

for col in columns_to_fill:
    final_df[col] = final_df.groupby('TICKER')[col].transform(lambda x: x.fillna(x.median()))

In [89]:
missing_values_after_merge = final_df.isnull().sum()
print(missing_values_after_merge)

COMNAM                      0
TICKER                      0
date                        0
RET                         0
PRC                         0
VOL                         0
SHROUT                      0
CSRCIC-1                    0
所属证监会行业名称\n[行业级别] 门类行业      0
CSRCIC-2                    0
所属证监会行业代码\n[行业级别] 大类行业      0
10Y                         0
tbl                         0
T10Y3MM                     0
gpce                        0
12m_momentum                0
industry_momentum           0
1m_reversal                 0
EXRET                       0
MV                          0
year                        0
DV                          0
BAS                         0
annual_TRV                  0
IV                          0
MB                          0
Book-to-Market            240
Earnings-Price            240
Sales-to-Price            240
dtype: int64


In [90]:
missing_values_df = final_df[final_df[['Book-to-Market', 'Earnings-Price', 'Sales-to-Price']].isnull().any(axis=1)]
print(missing_values_df[['TICKER', 'COMNAM']].drop_duplicates())
print(missing_values_df)

          TICKER COMNAM
62400  601988.SH   中国银行
      COMNAM     TICKER     date       RET     PRC           VOL  \
62400   中国银行  601988.SH  2004-01  0.000000  15.615  2.951812e+08   
62401   中国银行  601988.SH  2004-02  0.194099  15.615  2.951812e+08   
62402   中国银行  601988.SH  2004-03  0.194099  15.615  2.951812e+08   
62403   中国银行  601988.SH  2004-04  0.194099  15.615  2.951812e+08   
62404   中国银行  601988.SH  2004-05  0.194099  15.615  2.951812e+08   
...      ...        ...      ...       ...     ...           ...   
62635   中国银行  601988.SH  2023-08 -4.092072   3.750  3.607017e+09   
62636   中国银行  601988.SH  2023-09  0.533333   3.770  3.194611e+09   
62637   中国银行  601988.SH  2023-10  3.448276   3.900  4.197437e+09   
62638   中国银行  601988.SH  2023-11  2.307692   3.990  3.401007e+09   
62639   中国银行  601988.SH  2023-12  0.000000   3.990  4.158012e+09   

             SHROUT CSRCIC-1 所属证监会行业名称\n[行业级别] 门类行业 CSRCIC-2  ...  \
62400  1.502802e+09        J                    金融业      J66  ... 

In [95]:
data_to_fill = {
    2004: {'Book-to-Market': None, 'Earnings-Price': None, 'Sales-to-Price': None},
    2005: {'Book-to-Market': None, 'Earnings-Price': None, 'Sales-to-Price': None},
    2006: {'Book-to-Market': None, 'Earnings-Price': None, 'Sales-to-Price': None},
    2007: {'Book-to-Market': None, 'Earnings-Price': None, 'Sales-to-Price': None},
    2008: {'Book-to-Market': None, 'Earnings-Price': None, 'Sales-to-Price': None},
    2009: {'Book-to-Market': None, 'Earnings-Price': None, 'Sales-to-Price': None},
    2010: {'Book-to-Market': None, 'Earnings-Price': None, 'Sales-to-Price': None},
    2011: {'Book-to-Market': None, 'Earnings-Price': None, 'Sales-to-Price': None},
    2012: {'Book-to-Market': None, 'Earnings-Price': None, 'Sales-to-Price': None},
    2013: {'Book-to-Market': None, 'Earnings-Price': None, 'Sales-to-Price': None},
    2014: {'Book-to-Market': None, 'Earnings-Price': None, 'Sales-to-Price': None},
    2015: {'Book-to-Market': None, 'Earnings-Price': None, 'Sales-to-Price': None},
    2016: {'Book-to-Market': None, 'Earnings-Price': None, 'Sales-to-Price': None},
    2017: {'Book-to-Market': 26.7646, 'Earnings-Price': 1.6515, 'Sales-to-Price': 9.3918},
    2018: {'Book-to-Market': 23.9463, 'Earnings-Price': 1.0478, 'Sales-to-Price': 6.9656},
    2019: {'Book-to-Market': 22.5103, 'Earnings-Price': 1.6601, 'Sales-to-Price': 6.9066},
    2020: {'Book-to-Market': 17.5073, 'Earnings-Price': 1.6623, 'Sales-to-Price': 5.3399},
    2021: {'Book-to-Market': 10.8757, 'Earnings-Price': 1.3493, 'Sales-to-Price': 3.1524},
    2022: {'Book-to-Market': 12.1351, 'Earnings-Price': 1.0868, 'Sales-to-Price': 2.7993},
    2023: {'Book-to-Market': 16.7234, 'Earnings-Price': 1.3092, 'Sales-to-Price': 3.9163}
}

for year, values in data_to_fill.items():
    final_df.loc[(final_df['TICKER'] == '601881.SH') & (final_df['year'] == year), 'Book-to-Market'] = values['Book-to-Market']
    final_df.loc[(final_df['TICKER'] == '601881.SH') & (final_df['year'] == year), 'Earnings-Price'] = values['Earnings-Price']
    final_df.loc[(final_df['TICKER'] == '601881.SH') & (final_df['year'] == year), 'Sales-to-Price'] = values['Sales-to-Price']

In [96]:
missing_values_after_fill = final_df.isnull().sum()
print(missing_values_after_fill)

COMNAM                      0
TICKER                      0
date                        0
RET                         0
PRC                         0
VOL                         0
SHROUT                      0
CSRCIC-1                    0
所属证监会行业名称\n[行业级别] 门类行业      0
CSRCIC-2                    0
所属证监会行业代码\n[行业级别] 大类行业      0
10Y                         0
tbl                         0
T10Y3MM                     0
gpce                        0
12m_momentum                0
industry_momentum           0
1m_reversal                 0
EXRET                       0
MV                          0
year                        0
DV                          0
BAS                         0
annual_TRV                  0
IV                          0
MB                          0
Book-to-Market            396
Earnings-Price            396
Sales-to-Price            396
dtype: int64


In [97]:
company_medians = final_df[final_df['TICKER'] == '601881.SH'][['Book-to-Market', 'Earnings-Price', 'Sales-to-Price']].median()

final_df['Book-to-Market'].fillna(company_medians['Book-to-Market'], inplace=True)
final_df['Earnings-Price'].fillna(company_medians['Earnings-Price'], inplace=True)
final_df['Sales-to-Price'].fillna(company_medians['Sales-to-Price'], inplace=True)

In [98]:
missing_values_after_fill = final_df.isnull().sum()
print(missing_values_after_fill)

COMNAM                    0
TICKER                    0
date                      0
RET                       0
PRC                       0
VOL                       0
SHROUT                    0
CSRCIC-1                  0
所属证监会行业名称\n[行业级别] 门类行业    0
CSRCIC-2                  0
所属证监会行业代码\n[行业级别] 大类行业    0
10Y                       0
tbl                       0
T10Y3MM                   0
gpce                      0
12m_momentum              0
industry_momentum         0
1m_reversal               0
EXRET                     0
MV                        0
year                      0
DV                        0
BAS                       0
annual_TRV                0
IV                        0
MB                        0
Book-to-Market            0
Earnings-Price            0
Sales-to-Price            0
dtype: int64


In [99]:
final_df.to_csv('Data/PRC/FINAL_DF.csv', index=False)

# Balanced Panel Filter: 300 = 300 (no missing value)

In [100]:
company_counts = final_df['COMNAM'].value_counts()

complete_companies = company_counts[company_counts == 240]

print(f"The number of companies with exactly 240 records is: {complete_companies.shape[0]}")
print("Companies with exactly 240 records are:")
print(complete_companies.index.tolist())

The number of companies with exactly 240 records is: 300
Companies with exactly 240 records are:
['平安银行', '北京银行', '新城控股', '工业富联', '中国化学', '中国国航', '恒立液压', '中国神华', '中信建投', '信达证券', '春秋航空', '隆基绿能', '南京银行', '大秦铁路', '招商证券', '宝丰能源', '东方证券', '兴业银行', '中国铁建', '中国海油', '国泰君安', '中国太保', '中国铝业', '工商银行', '中国中铁', '兴业证券', '三六零', '新华保险', '交通银行', '中国人保', '中国平安', '农业银行', '广汽集团', '红塔证券', '上海银行', '陕西煤业', '中国移动', '杭州银行', '万科A', '海尔智家', '福耀玻璃', '绿地控股', '青岛啤酒', '用友网络', '海螺水泥', '长电科技', '恒生电子', '山东黄金', '贵州茅台', '海南机场', '中金黄金', '士兰微', '通威股份', '片仔癀', '华鲁恒升', '川投能源', '爱旭股份', '江苏银行', '华域汽车', '中泰证券', '三峡能源', '长江电力', '航发动力', '伊利股份', '国投电力', '东方电气', '宝信软件', '海通证券', '山西汾酒', '新奥股份', '国电电力', '中航沈飞', '锦江酒店', '闻泰科技', '上海医药', '明阳智能', '中国中冶', '德业股份', '兆易创新', '晨光股份', '欧派家居', '福斯特', '华友钴业', '璞泰来', '韦尔股份', '科沃斯', '万泰生物', '今世缘', '斯达半导', '海天味业', '合盛硅业', '药明康德', '公牛集团', '洛阳钼业', '东鹏饮料', '中国人寿', '澜起科技', '天合光能', '奇安信-U', '华润微', '华熙生物', '大全能源', '联影医疗', '寒武纪-U', '晶科能源', '时代电气', '沪硅产业', '金山办公', '凯赛生物', '海光信息', '传音控股', '中微公司', '中科曙光', '中信银行