### Data Processing

In [2]:
import akshare as ak
import numpy as np
import pandas as pd
from datetime import date, timedelta
from tqdm import tqdm
from typing import List, Dict, Literal

# Raw quote data saved as a pickled NumPy object; later cells call data.item()
# and index the result by ticker.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects, so only
# load this file if it comes from a trusted source.
data = np.load('data/hs300.npy', allow_pickle=True)
# Trading calendar: a header-less text file read as a single string column named 'date'.
trade_date = pd.read_csv('data/交易日.txt', names=['date'], header=None, dtype=str)['date']
# Ticker list: first column of a header-less CSV, read as strings (dtype=str).
stocks = pd.read_csv('data/tickers.csv', header=None, dtype=str)[0]

In [4]:
#### 缺失值填补

def fillna(df: pd.DataFrame, method: Literal['mean', 'median', 'pad', 'ffill', 'bfill', 'interpolate']) -> pd.DataFrame:
    """
    Wrapper function to fill NaN in data.

    :param df: a pandas DataFrame.
    :param method: the method to fill NaN.
        ``'mean'`` / ``'median'`` fill each column with its own mean / median;
        ``'pad'`` and ``'ffill'`` propagate the last valid value forward;
        ``'bfill'`` propagates the next valid value backward;
        ``'interpolate'`` calls ``DataFrame.interpolate`` with its defaults.
    :return: a new pandas DataFrame with NaN filled by the chosen method
        (leading/trailing NaN may remain for the directional methods).
    :raises ValueError: if ``method`` is not one of the supported names.
    """
    if method in ('mean', 'median'):
        # Column-wise statistic, aligned back onto the columns by fillna.
        return df.fillna(getattr(df, method)())
    if method in ('pad', 'ffill'):
        # 'pad' is an alias of forward fill. DataFrame.fillna(method=...) is
        # deprecated in recent pandas, so call ffill() directly.
        return df.ffill()
    if method == 'bfill':
        return df.bfill()
    if method == 'interpolate':
        return df.interpolate()

    raise ValueError('Method is not defined')


#### 归一化

def minmax_normalize(df: pd.DataFrame) -> pd.DataFrame:
    """
    Rescale every column of a DataFrame with min-max normalization.

    :param df: a pandas DataFrame.
    :return: a DataFrame where each column has its minimum subtracted and is
        divided by that column's (max - min).
    """
    column_min = df.min()
    column_range = df.max() - column_min
    return (df - column_min) / column_range


def zscore_normalization(df: pd.DataFrame) -> pd.DataFrame:
    """
    Z-score normalization for a DataFrame.

    :param df: a pandas DataFrame.
    :return: a DataFrame where each column has its mean subtracted and is
        divided by that column's standard deviation (pandas default ``std``).
    """
    column_mean = df.mean()
    column_std = df.std()
    centered = df - column_mean
    return centered / column_std


#### 异常极值去除

#### MAD
def mad_filter(df: pd.DataFrame, quantile, axis=0) -> pd.DataFrame:
    """
    Clip extreme values using the median absolute deviation (MAD).

    Values are limited to ``median ± quantile * MAD``, where the median and
    the MAD are computed along ``axis``.

    :param df: a pandas DataFrame.
    :param quantile: multiplier applied to the MAD to get the half-width of
        the allowed interval.
    :param axis: integer axis along which the statistics are computed:
        0 gives one median/MAD per column, 1 gives one per row.
    :return: a clipped copy of ``df``.
    """
    # The statistics are indexed by the *other* axis, so both the subtraction
    # and the clip have to align on that one.
    align_axis = axis ^ 1

    median = df.quantile(0.5, axis=axis)
    deviation_median = df.sub(median, axis=align_axis).abs().quantile(0.5, axis=axis)
    interval = quantile * deviation_median

    return df.clip(median - interval, median + interval, axis=align_axis)

#### 3sigma
def three_sigma_filter(df: pd.DataFrame, n=3, axis=0) -> pd.DataFrame:
    """
    Clip values that lie more than ``n`` standard deviations from the mean.

    :param df: a pandas DataFrame.
    :param n: number of standard deviations allowed on each side of the mean.
    :param axis: integer axis along which mean and std are computed
        (0: per column, 1: per row).
    :return: a clipped copy of ``df``.
    """
    center = df.mean(axis=axis)
    half_width = n * df.std(axis=axis)
    lower_bound = center - half_width
    upper_bound = center + half_width

    # The bounds are indexed by the opposite axis, hence axis ^ 1 for alignment.
    return df.clip(lower=lower_bound, upper=upper_bound, axis=axis ^ 1)

#### percentile
def percentile_filter(df: pd.DataFrame, min, max, axis=0) -> pd.DataFrame:
    """
    Clip values to the interval between two quantiles.

    :param df: a pandas DataFrame.
    :param min: lower quantile in [0, 1] (name kept for caller compatibility;
        it shadows the builtin inside this function).
    :param max: upper quantile in [0, 1] (same remark as ``min``).
    :param axis: integer axis along which the quantiles are computed
        (0: per column, 1: per row).
    :return: a clipped copy of ``df``.
    """
    # One row per requested quantile, indexed along the opposite axis of `axis`.
    pos = df.quantile([min, max], axis=axis)

    return df.clip(pos.iloc[0], pos.iloc[1], axis=axis ^ 1)

#### 若要对单只股票时序数据归一化，需实现滚动归一化，上述未实现

In [5]:
# Full (ticker, date) grid: every stock crossed with every trading day.
index = pd.MultiIndex.from_product([
    stocks,         # tickers
    trade_date      # trading days
], names=['ticker', 'date'])

# Unwrap the loaded object once instead of on every loop iteration.
quotes_by_stock = data.item()

# Value columns: everything except the date and ticker-code columns.
columns = quotes_by_stock['000001'].columns
columns = columns.drop(['日期', '股票代码'])

# Collect one date-indexed frame per stock, then assemble the panel in a single
# concat/reindex instead of assigning row by row.
frames = {}
pbar = tqdm(total=len(stocks))
for stock in stocks:
    pbar.set_description_str(f"Processing => {stock}")

    df = quotes_by_stock[stock].drop(columns=['股票代码'])
    if not df.empty:
        df = df.set_index('日期')
        df.index = df.index.astype('str')
        df.index.name = 'date'
        frames[stock] = df
    # Advance the bar for skipped (empty) stocks too, so it reaches 100%.
    pbar.update(1)
pbar.close()

if frames:
    # Reindexing onto the full grid leaves NaN where a stock has no quote and
    # drops any quote whose date is not in the trading calendar.
    dt = pd.concat(frames, names=['ticker', 'date']).reindex(index=index, columns=columns)
else:
    dt = pd.DataFrame(index=index, columns=columns)

dt

Processing => 688981: 100%|██████████| 300/300 [00:46<00:00,  6.42it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,开盘,收盘,最高,最低,成交量,成交额,振幅,涨跌幅,涨跌额,换手率
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
000001,2010-01-04,24.52,23.71,24.58,23.68,241923.0,580249472.0,3.69,-2.71,-0.66,0.83
000001,2010-01-05,23.75,23.3,23.9,22.75,556500.0,1293476992.0,4.85,-1.73,-0.41,1.9
000001,2010-01-06,23.25,22.9,23.25,22.72,412143.0,944453696.0,2.27,-1.72,-0.4,1.41
000001,2010-01-07,22.9,22.65,23.05,22.4,355337.0,804166336.0,2.84,-1.09,-0.25,1.22
000001,2010-01-08,22.5,22.6,22.75,22.35,288543.0,650667392.0,1.77,-0.22,-0.05,0.99
...,...,...,...,...,...,...,...,...,...,...,...
688981,2024-12-25,96.48,97.99,99.8,96.38,985329.0,9679610219.0,3.53,1.17,1.13,4.96
688981,2024-12-26,98.0,96.73,98.88,96.15,711046.0,6914605464.0,2.79,-1.29,-1.26,3.58
688981,2024-12-27,96.78,97.51,102.37,96.49,1144716.0,11378943925.0,6.08,0.81,0.78,5.76
688981,2024-12-30,96.6,99.29,100.53,96.0,906573.0,8950494606.0,4.65,1.83,1.78,4.56


In [6]:
# Persist the assembled panel; index=True writes both index levels as leading columns.
dt.to_csv('data/hs300.csv', index=True)

In [7]:
# Reload the panel from disk, using the first two columns as the index and
# reading 'ticker' and 'date' as strings.
dt = pd.read_csv('data/hs300.csv', index_col=[0, 1], dtype={'ticker': str, 'date': str})
# Bare expression: display the frame as the cell output.
dt

Unnamed: 0_level_0,Unnamed: 1_level_0,开盘,收盘,最高,最低,成交量,成交额,振幅,涨跌幅,涨跌额,换手率
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
000001,2010-01-04,24.52,23.71,24.58,23.68,241923.0,5.802495e+08,3.69,-2.71,-0.66,0.83
000001,2010-01-05,23.75,23.30,23.90,22.75,556500.0,1.293477e+09,4.85,-1.73,-0.41,1.90
000001,2010-01-06,23.25,22.90,23.25,22.72,412143.0,9.444537e+08,2.27,-1.72,-0.40,1.41
000001,2010-01-07,22.90,22.65,23.05,22.40,355337.0,8.041663e+08,2.84,-1.09,-0.25,1.22
000001,2010-01-08,22.50,22.60,22.75,22.35,288543.0,6.506674e+08,1.77,-0.22,-0.05,0.99
...,...,...,...,...,...,...,...,...,...,...,...
688981,2024-12-25,96.48,97.99,99.80,96.38,985329.0,9.679610e+09,3.53,1.17,1.13,4.96
688981,2024-12-26,98.00,96.73,98.88,96.15,711046.0,6.914605e+09,2.79,-1.29,-1.26,3.58
688981,2024-12-27,96.78,97.51,102.37,96.49,1144716.0,1.137894e+10,6.08,0.81,0.78,5.76
688981,2024-12-30,96.60,99.29,100.53,96.00,906573.0,8.950495e+09,4.65,1.83,1.78,4.56


### Factor Data

In [3]:
import akshare as ak
import numpy as np
import pandas as pd
from datetime import date, timedelta
from tqdm import tqdm
from typing import List, Dict, Literal
from factor.Alpha101CN import Alphas, get_alpha
import warnings
warnings.filterwarnings("ignore")

# Panel written by the data-processing section: first two columns are the index,
# with 'ticker' and 'date' read as strings.
dt = pd.read_csv('data/hs300.csv', index_col=[0, 1], dtype={'ticker': str, 'date': str})
# Trading calendar: a header-less text file read as a single string column named 'date'.
trade_date = pd.read_csv('data/交易日.txt', names=['date'], header=None, dtype=str)['date']
# Ticker list: first column of a header-less CSV, read as strings.
stocks = pd.read_csv('data/tickers.csv', header=None, dtype=str)[0]

In [4]:
# Compute the factor table of every stock with get_alpha and write it to its
# own CSV file under data/factor_data/.
pbar = tqdm(stocks, total=len(stocks))
for stock in pbar:
    pbar.set_description_str(f"Processing => {stock}")
    stock_quotes = dt.loc[stock]
    alphas = get_alpha(stock_quotes)
    alphas.to_csv(f"data/factor_data/alphas_{stock}.csv")

Processing => 688981: 100%|██████████| 300/300 [29:07<00:00,  5.82s/it]


### Factor Analysis

In [5]:
import akshare as ak
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Literal
import warnings
warnings.filterwarnings("ignore")

In [6]:
# Factor names: every column after the first ten of a per-stock factor file.
# NOTE(review): the offset 10 presumably skips the raw quote columns — confirm
# against the output of get_alpha.
columns = pd.read_csv("data/factor_data/alphas_000001.csv", index_col=0).columns[10:]
trade_date = pd.read_csv('data/交易日.txt', names=['date'], header=None, dtype=str)['date']
stocks = pd.read_csv('data/tickers.csv', header=None, dtype=str)[0]

# Read each per-stock file exactly once and keep only the factor columns,
# instead of re-reading every file for every factor.
# This holds all factor values in memory at the same time.
exposures = {}
for stock in tqdm(stocks, desc="Loading factor files"):
    exp = pd.read_csv(f"data/factor_data/alphas_{stock}.csv", index_col=0)
    exposures[stock] = exp[columns]

# Write one ticker x date table per factor.
pbar = tqdm(total=len(columns))
for alpha in columns:
    pbar.set_description_str(f"Processing => {alpha}")

    # Rows are tickers, columns are dates; reindex onto the full grid so that
    # missing stocks/dates show up as NaN.
    alpha_tb = pd.DataFrame({stock: exposures[stock][alpha] for stock in stocks}).T
    alpha_tb = alpha_tb.reindex(index=stocks, columns=trade_date)

    alpha_tb.index.name = 'ticker'
    alpha_tb.to_csv(f"data/factor_exp/{alpha}.csv")
    pbar.update(1)
pbar.close()

Processing => alpha101: 100%|██████████| 82/82 [06:50<00:00,  5.00s/it]


In [14]:
trade_date = pd.read_csv('data/交易日.txt', names=['date'], header=None, dtype=str)['date']
stocks = pd.read_csv('data/tickers.csv', header=None, dtype=str)[0]

# Ticker x date table holding the '涨跌幅' column of every per-stock factor file.
pct_tb = pd.DataFrame(index=stocks, columns=trade_date)
pct_tb.index.name = 'ticker'

for stock in stocks:
    exp = pd.read_csv(f"data/factor_data/alphas_{stock}.csv", index_col=0)
    # Assigning a Series to a row aligns its date index with the table's columns.
    pct_tb.loc[stock] = exp['涨跌幅']

pct_tb.to_csv('data/pct_tb.csv')

In [7]:
# Ticker x date exposure table of a single factor (alpha001), indexed by string ticker.
alpha_tb = pd.read_csv('data/factor_exp/alpha001.csv', index_col='ticker', dtype={'ticker': str})
# Ticker x date table written by the pct_tb cell, indexed by string ticker.
pct_tb = pd.read_csv('data/pct_tb.csv', index_col='ticker', dtype={'ticker': str})

In [8]:
#### 计算IC/IR

def ic_calc(factor_table: pd.DataFrame, pctchange_table: pd.DataFrame, method: Literal['pearson', 'kendall', 'spearman'] = 'spearman', min_obs: int = 100) -> pd.DataFrame:
    '''
    Compute the information coefficient (IC) series of a factor.

    For each pair of consecutive columns, the factor values in column ``i`` are
    correlated with the values of ``pctchange_table`` in column ``i + 1``.

    :param factor_table: a pandas DataFrame; its columns define the periods.
    :param pctchange_table: a pandas DataFrame with the same column labels as
        ``factor_table``.
    :param method: the method to calculate information coefficient.
    :param min_obs: minimum number of non-NaN factor values a column needs;
        columns with fewer are skipped (default 100, the previous fixed value).
    :return: a DataFrame with a single column ``'ic'`` and one row per column
        that was not skipped, in column order.
    '''
    columns = factor_table.columns
    ic_values = []
    for current, following in zip(columns[:-1], columns[1:]):
        # astype(float) lets object-dtype tables through; dropna removes the
        # rows without a factor value.
        factor = factor_table[current].astype(float).dropna()
        if len(factor) < min_obs:
            continue
        # Series.corr aligns both series on their index, so rows dropped from
        # `factor` are ignored on the other side as well.
        pctchange = pctchange_table[following].astype(float)
        ic_values.append(factor.corr(pctchange, method=method))

    # Build the frame once instead of appending row by row.
    return pd.DataFrame({'ic': ic_values}, dtype=float)

def ir_calc_from_ic(ic: pd.DataFrame) -> pd.Series:
    '''
    Compute the information ratio from an IC table.

    :param ic: a pandas DataFrame of information coefficients.
    :return: column-wise mean of ``ic`` divided by its column-wise standard deviation.
    '''
    ic_mean = ic.mean()
    ic_std = ic.std()
    return ic_mean / ic_std

def ir_calc(factor_table: pd.DataFrame, pctchange_table: pd.DataFrame, method: Literal['pearson', 'kendall', 'spearman'] = 'spearman') -> pd.Series:
    '''
    Compute the information ratio of a factor directly from its tables.

    :param factor_table: a pandas DataFrame, passed on to ``ic_calc``.
    :param pctchange_table: a pandas DataFrame, passed on to ``ic_calc``.
    :param method: the correlation method used for the information coefficient.
    :return: mean of the IC series divided by its standard deviation.
    '''
    ic_series = ic_calc(factor_table=factor_table, pctchange_table=pctchange_table, method=method)

    return ir_calc_from_ic(ic_series)

In [14]:
pct_tb = pd.read_csv('data/pct_tb.csv', index_col='ticker', dtype={'ticker': str})

# The factor names must exist before dir_tb is created from them; defining
# `alphas` first keeps this cell runnable on a fresh kernel.
alphas = pd.read_csv("data/factor_data/alphas_000001.csv", index_col='date', dtype={'date': str}).columns[10:]
dir_tb = pd.DataFrame(index=alphas, columns=['direction'])

# Direction of a factor = sign of its mean IC (1, -1, or 0).
for alpha in alphas:
    alpha_tb = pd.read_csv(f"data/factor_exp/{alpha}.csv", index_col='ticker', dtype={'ticker': str})

    ic = ic_calc(alpha_tb, pct_tb)
    # Select the 'ic' column by label rather than by position.
    mean_ic = ic['ic'].astype(float).mean()

    if mean_ic > 0:
        dir_tb.loc[alpha, 'direction'] = 1
    elif mean_ic < 0:
        dir_tb.loc[alpha, 'direction'] = -1
    else:
        # Covers a mean IC of exactly zero as well as NaN (no usable IC values).
        dir_tb.loc[alpha, 'direction'] = 0

dir_tb.to_csv('data/dir_tb.csv')

In [None]:
pct_tb = pd.read_csv('data/pct_tb.csv', index_col='ticker', dtype={'ticker': str})
alphas = pd.read_csv("data/factor_data/alphas_000001.csv", index_col='date', dtype={'date': str}).columns[10:]

# Print the mean IC of every factor, one line per factor.
for alpha in alphas:
    alpha_tb = pd.read_csv(f"data/factor_exp/{alpha}.csv", index_col='ticker', dtype={'ticker': str})

    ic = ic_calc(alpha_tb, pct_tb)
    # Select the 'ic' column by label rather than by position.
    print(alpha, ic['ic'].astype(float).mean())

alpha001 -0.023495118087219064
alpha002 0.012253847962689345
alpha003 0.006247433181904451
alpha004 0.021701779839304703
alpha005 0.04940814230709758
alpha006 0.006392677995199583
alpha007 0.012647810982482318
alpha008 0.02465554043802264
alpha009 0.008645358831875484
alpha010 0.0021667262703009816
alpha011 0.01887124095006488
alpha012 0.004987243928420271
alpha013 0.004844412400433942
alpha014 0.0057561932459619805
alpha015 0.014525347026260632
alpha016 0.0029270323200556027
alpha017 0.01586369297811314
alpha018 0.02299900300857511
alpha019 0.026467018220976465
alpha020 0.0044691409232273235
alpha021 0.027802000894979
alpha022 -0.0030936645856968663
alpha023 0.024418608678143553
alpha024 0.024145700936942276
alpha025 0.032319758538691554
alpha026 0.02127163291637849
alpha027 0.01087349590699911
alpha028 0.005152534445520766
alpha029 0.005769886897327497
alpha030 0.022314617261100907
alpha031 0.010142395672394933
alpha032 -0.005783687401071542
alpha033 0.022471963912168526
alpha034 0.0

### PCA

In [27]:
import akshare as ak
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Literal
import warnings
warnings.filterwarnings("ignore")

In [6]:
stocks = pd.read_csv('data/tickers.csv', header=None, dtype=str)[0]
trade_date = pd.read_csv('data/交易日.txt', names=['date'], header=None, dtype=str)['date']
listing_date = pd.DataFrame(index=stocks, columns=['listing_date'])

# For every stock, record the first trading day (in calendar order) on which
# the '开盘' column is not NaN. Stocks with no valid value keep NaN.
for stock in stocks:
    # Local name chosen so the notebook-level `data` and the imported
    # `datetime.date` are not overwritten by this cell.
    factor_frame = pd.read_csv(f"data/factor_data/alphas_{stock}.csv", index_col=0, dtype={'ticker': str})
    first_day = factor_frame['开盘'].reindex(trade_date).first_valid_index()
    if first_day is not None:
        listing_date.loc[stock, 'listing_date'] = first_day

listing_date.index.name = 'ticker'
listing_date.to_csv('data/listing_date.csv')

In [7]:
# Reload the table written by the previous cell, indexed by string ticker.
listing_date = pd.read_csv('data/listing_date.csv', index_col='ticker', dtype={'ticker': str})

In [72]:
stocks = pd.read_csv('data/tickers.csv', header=None, dtype=str)[0].to_list()
columns = pd.read_csv("data/factor_data/alphas_000001.csv", index_col='date').columns[10:]
columns = columns.drop(['alpha068', 'alpha086'])
dir_tb = pd.read_csv('data/dir_tb.csv', index_col=0)

# Look the direction up by factor name so each sign is applied to its own
# column. The earlier positional match (non-zero rows of dir_tb against the
# hard-coded drop list above) only lined up when those two sets coincided.
# A retained factor whose direction is 0 is zeroed out by the multiplication.
direction = dir_tb.loc[columns, 'direction'].to_numpy()

# Read every per-stock file once and take both cross-sections from it.
train_rows = []
test_rows = []
for stock in stocks:
    alpha = pd.read_csv(f"data/factor_data/alphas_{stock}.csv", index_col=0, dtype={'ticker': str})
    train_rows.append(alpha.loc['2024-12-30', columns].to_numpy().flatten() * direction)
    # NOTE(review): assumes every factor file has a '2024-12-31' row — confirm
    # against the trading calendar.
    test_rows.append(alpha.loc['2024-12-31', columns].to_numpy().flatten() * direction)

# One row per stock, one column per retained factor.
train_data = np.array(train_rows)
test_data = np.array(test_rows)

In [76]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# 2. Preprocessing: impute missing values, then standardize.
# Missing entries are replaced by the column mean computed on train_data.
imputer = SimpleImputer(strategy='mean')
train_imputed = imputer.fit_transform(train_data)

# Standardize the imputed training matrix with StandardScaler's defaults.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_imputed)

# 3. Fit PCA, keeping three components.
pca = PCA(n_components=3)
# Bare expression: the fitted estimator is displayed as the cell output.
pca.fit(train_scaled)

# # 4. Apply PCA
    
# # Impute + standardize
# test_imputed = imputer.transform(test_data)
# test_scaled = scaler.transform(test_imputed)
    
# # Reuse the fitted PCA mapping
# test_pca = pca.transform(test_scaled)

# test_pca

0,1,2
,n_components,3
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


In [78]:
# Loadings of the first principal component, one value per input column of train_data.
weights = pd.Series(pca.components_[0])
# Bare expression: display the Series as the cell output.
weights

0     0.169836
1     0.137090
2     0.033853
3     0.191449
4     0.180748
        ...   
75   -0.013326
76   -0.028597
77   -0.093741
78   -0.015086
79    0.180415
Length: 80, dtype: float64