# 基于集成思想的多因子筛选

## 1. 导入依赖库

In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
import alphalens

## 2. 数据准备

In [None]:
# Load the factor panel indexed by (date, asset). The date column is named
# explicitly instead of passing parse_dates=True, which asks pandas to try
# date-parsing the index as a whole (i.e. the asset level as well).
factor_data = pd.read_csv(
    '../data/data.csv', parse_dates=['date'], index_col=['date', 'asset']
)
prices = pd.read_csv('../data/prices.csv', parse_dates=True, index_col=['date'])

# Slicing a MultiIndex by partial date strings needs a sorted index; sort once
# here so the year slices below work regardless of the row order in the CSV.
factor_data = factor_data.sort_index()

# Split the data set by calendar year: 2010-2018 for training, 2019-2020 for testing
train_data = factor_data.loc["2010":"2018"]
test_data = factor_data.loc["2019":"2020"]

## 3. 因子筛选技术

### 3.1 初始化因子重要性字典

In [None]:
# Container that maps each scoring method's name to the scores it produces
factor_importance = dict()

### 3.2 IC打分法

In [None]:
# IC scoring: Spearman rank correlation between each factor and the target.
# One cross-section per date (index level 0): on every date, correlate each
# factor with the target across assets, giving a (date x factor) table of ICs.
# Only the feature columns are used (every column but the last, matching the
# `iloc[:, :-1]` convention of the model cells), so the target's correlation
# with itself is not scored as if it were a factor.
ic_scores = train_data.groupby(level=0).apply(
    lambda x: x.iloc[:, :-1].corrwith(x['target_variable'], method='spearman')
)

# Collapse to one score per factor: the absolute value of the mean IC over all
# dates. A factor whose IC is undefined on every date gets 0 so that NaN cannot
# propagate into the ensemble average.
factor_importance['ic_scores'] = np.abs(ic_scores.mean()).fillna(0.0)

### 3.3 SVM

In [None]:
# SVM
# Imported here so this cell is self-contained.
from sklearn.inspection import permutation_importance

# Features are every column but the last; the target is read by name.
svm_model = SVC(probability=True)
svm_model.fit(train_data.iloc[:, :-1], train_data['target_variable'])

# Class-1 probabilities for the test rows. These are per-sample predictions
# (one value per test row), so they are kept for evaluation but are not a
# per-factor importance and are no longer stored as one.
svm_scores = svm_model.predict_proba(test_data.iloc[:, :-1])[:, 1]

# SVC with its default RBF kernel exposes no per-feature weights, so each
# factor is scored by how much the model's score on the training data drops
# when that factor's values are shuffled.
svm_importance = permutation_importance(
    svm_model,
    train_data.iloc[:, :-1],
    train_data['target_variable'],
    n_repeats=5,
    random_state=0,
)
# Shuffling noise can produce slightly negative values; floor them at zero so
# the score stays non-negative like the other methods' scores.
factor_importance['svm_scores'] = np.clip(svm_importance.importances_mean, 0.0, None)

### 3.4 随机森林

In [None]:
# Random forest
random_forest = RandomForestClassifier(n_estimators=100, random_state=0)
random_forest.fit(train_data.iloc[:, :-1], train_data['target_variable'])

# Class-1 probabilities for the test rows: per-sample predictions, kept for
# evaluation only (they have one entry per test row, not one per factor).
rf_scores = random_forest.predict_proba(test_data.iloc[:, :-1])[:, 1]

# The fitted forest's own importances give one value per feature column, in
# the same order as `train_data.columns[:-1]`.
factor_importance['rf_scores'] = random_forest.feature_importances_

### 3.5 梯度提升决策树

In [None]:
# Gradient boosting decision trees
gradient_boosting = GradientBoostingClassifier(n_estimators=100, random_state=0)
gradient_boosting.fit(train_data.iloc[:, :-1], train_data['target_variable'])

# Class-1 probabilities for the test rows: per-sample predictions, kept for
# evaluation only (they have one entry per test row, not one per factor).
gb_scores = gradient_boosting.predict_proba(test_data.iloc[:, :-1])[:, 1]

# The fitted model's own importances give one value per feature column, in
# the same order as `train_data.columns[:-1]`.
factor_importance['gb_scores'] = gradient_boosting.feature_importances_

## 4. 集成因子重要性

In [None]:
# Weighted average of the per-factor importance vectors.
# One weight per scoring method, applied in the insertion order of
# `factor_importance`; adjust to taste.
weights = [0.25, 0.25, 0.25, 0.25]

# Every entry must hold exactly one score per candidate factor (the columns
# ranked afterwards are `train_data.columns[:-1]`). Check this up front so a
# mis-shaped entry is reported by name instead of failing inside NumPy.
n_factors = len(train_data.columns[:-1])
for method_name, method_scores in factor_importance.items():
    if np.shape(method_scores) != (n_factors,):
        raise ValueError(
            f"factor_importance[{method_name!r}] has shape "
            f"{np.shape(method_scores)}; expected ({n_factors},), "
            "one score per factor"
        )
if len(weights) != len(factor_importance):
    raise ValueError(
        f"got {len(weights)} weights for {len(factor_importance)} scoring methods"
    )

# NOTE(review): the scoring methods may be on different scales; consider
# rescaling each row before averaging if one method should not dominate.
importance_matrix = np.vstack(
    [np.asarray(method_scores, dtype=float) for method_scores in factor_importance.values()]
)
ensemble_importance = np.average(importance_matrix, axis=0, weights=weights)


## 5. 因子筛选

In [None]:
# Order the candidate factor columns (all but the last column) from the
# highest to the lowest ensemble importance.
descending_order = np.argsort(ensemble_importance)[::-1]
candidate_factors = train_data.columns[:-1]
selected_factors = candidate_factors[descending_order]

In [None]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Needed for the SVR importance below; imported here so this section is
# self-contained.
from sklearn.inspection import permutation_importance

# 1. Data preparation
# The factor data is read from "factor_data.csv" with a two-row column header
# and a two-level row index.
# NOTE(review): the original note describes stocks as rows and factors as
# columns; confirm that two header rows / two index columns match the file,
# since `factor_data['Close']` must be a single column for the code below.
factor_data = pd.read_csv("factor_data.csv", header=[0, 1], index_col=[0, 1])

# The year slices below rely on partial-string date indexing, which only
# applies to a datetime index level. Left as strings, "2010":"2018" is a
# lexicographic slice that drops dates such as "2018-03-01" because they sort
# after the bare string "2018".
# assumes index level 0 holds the dates -- TODO confirm
factor_data.index = pd.MultiIndex.from_arrays(
    [
        pd.to_datetime(factor_data.index.get_level_values(0)),
        factor_data.index.get_level_values(1),
    ],
    names=factor_data.index.names,
)
factor_data = factor_data.sort_index()

# 2. Split the data set by calendar year
train_data = factor_data.loc["2010":"2018"]
test_data = factor_data.loc["2019":"2020"]

# Closing prices of the training and test sets (the regression target)
train_closes = train_data['Close']
test_closes = test_data['Close']

# Features are every column but the last.
# NOTE(review): this excludes the target only if 'Close' is the last column;
# otherwise the target leaks into the features -- confirm the column order.
train_features = train_data.iloc[:, :-1]
test_features = test_data.iloc[:, :-1]

# 3. Factor scoring
# Maps each scoring method's name to one importance value per feature column.
factor_importance = {}

# IC scoring: per-date (index level 0) Spearman rank correlation between each
# feature column and 'Close', giving a (date x factor) table of ICs.
ic_scores = train_data.groupby(level=0).apply(
    lambda x: x.iloc[:, :-1].corrwith(x['Close'], method='spearman')
)
# One score per factor: |mean IC| over dates; undefined ICs count as 0 so NaN
# cannot propagate into the ensemble average.
factor_importance['ic_scores'] = np.abs(ic_scores.mean()).fillna(0.0)

# SVM regression
svm_model = SVR()
svm_model.fit(train_features, train_closes)
# Test-set predictions: one value per test row, kept for evaluation only.
svm_predictions = svm_model.predict(test_features)
# SVR with its default RBF kernel exposes no per-feature weights, so each
# factor is scored by the drop in the model's training score when that
# factor's values are shuffled. Negative values (shuffling noise) are floored
# at zero.
svm_importance = permutation_importance(
    svm_model, train_features, train_closes, n_repeats=5, random_state=0
)
factor_importance['svm_predictions'] = np.clip(
    svm_importance.importances_mean, 0.0, None
)

# Random forest regression
random_forest = RandomForestRegressor(n_estimators=100, random_state=0)
random_forest.fit(train_features, train_closes)
# Test-set predictions: one value per test row, kept for evaluation only.
rf_predictions = random_forest.predict(test_features)
# The fitted forest's own importances: one value per feature column.
factor_importance['rf_predictions'] = random_forest.feature_importances_

# Gradient boosting regression
gradient_boosting = GradientBoostingRegressor(n_estimators=100, random_state=0)
gradient_boosting.fit(train_features, train_closes)
# Test-set predictions: one value per test row, kept for evaluation only.
gb_predictions = gradient_boosting.predict(test_features)
# The fitted model's own importances: one value per feature column.
factor_importance['gb_predictions'] = gradient_boosting.feature_importances_

# 4. Ensemble the factor importances
# One weight per scoring method, in the insertion order of `factor_importance`;
# adjust to taste.
weights = [0.25, 0.25, 0.25, 0.25]
# Stack into a (method x factor) matrix; vstack raises if any method did not
# produce exactly one score per factor.
importance_matrix = np.vstack(
    [np.asarray(method_scores, dtype=float) for method_scores in factor_importance.values()]
)
ensemble_importance = np.average(importance_matrix, axis=0, weights=weights)

# 5. Factor selection
# Order the feature columns from highest to lowest ensemble importance.
selected_factors = train_data.columns[:-1][np.argsort(ensemble_importance)[::-1]]

# Print the ranked factors
print("Selected Factors:")
for factor in selected_factors:
    print(factor)
