In [12]:
import akshare as ak
import pandas as pd

# 定义需要下载的股票代码列表
stock_list = ['sh600519', 'sz000858', 'sz000651']  # 示例：贵州茅台、五粮液、格力电器

# 中文列名到英文列名的映射
column_mapping = {
    '报告日': 'report_date',
    '基本每股收益': 'earnings_per_share',  # 每股收益 (EPS)
    '归属于母公司所有者的净利润': 'net_profit',  # 净利润
    '总资产': 'net_assets',  # 总资产
    '每股净资产': 'net_assets_per_share'  # 每股净资产
}

# 下载每只股票的历史数据并保存为 CSV 文件
for stock_code in stock_list:
    try:
        # 下载股票历史数据
        stock_data = ak.stock_zh_a_daily(symbol=stock_code, adjust="hfq")

        # 下载财务指标数据
        stock_financial_data = ak.stock_financial_report_sina(stock=stock_code, symbol="利润表")

        # 动态重命名列
        rename_mapping = {key: value for key, value in column_mapping.items() if key in stock_financial_data.columns}
        stock_financial_data = stock_financial_data.rename(columns=rename_mapping)

        # 合并历史数据和财务数据
        stock_data = stock_data.merge(stock_financial_data, left_on='date', right_on='report_date', how='left')

        # 保存到 CSV 文件
        output_file = f'csv/{stock_code}_data.csv'
        stock_data.to_csv(output_file, index=False)

        print(f"股票 {stock_code} 数据已成功下载并保存到 {output_file}！")
    except Exception as e:
        print(f"处理股票 {stock_code} 时出错: {e}")

股票 sh600519 数据已成功下载并保存到 csv/sh600519_data.csv！
股票 sz000858 数据已成功下载并保存到 csv/sz000858_data.csv！
股票 sz000651 数据已成功下载并保存到 csv/sz000651_data.csv！


In [None]:
import akshare as ak
import pandas as pd

# 下载股票历史数据和财务指标
def download_stock_data(stock_code):
    try:
        # 获取复权后的历史数据
        stock_history = ak.stock_zh_a_daily(symbol=stock_code, adjust="hfq")

        # 获取财务指标数据
        stock_financial = ak.stock_financial_report_sina(stock=stock_code, symbol="利润表")

        # 合并历史数据和财务数据
        stock_data = stock_history.merge(stock_financial, left_on='date', right_on='报告日', how='left')

        return stock_data
    except Exception as e:
        print(f"下载股票 {stock_code} 数据时出错: {e}")
        return None

In [16]:
download_stock_data('sh600519')

Unnamed: 0,date,open,high,low,close,volume,amount,outstanding_share,turnover,报告日,...,归属于母公司所有者的综合收益总额,归属于少数股东的综合收益总额,基本每股收益,稀释每股收益,数据源,是否审计,公告日期,币种,类型,更新日期
0,2001-08-27,34.51,37.78,32.85,35.55,40631800.0,1.410347e+09,7.150000e+07,0.568277,,...,,,,,,,,,,
1,2001-08-28,34.99,37.00,34.61,36.86,12964779.0,4.634631e+08,7.150000e+07,0.181326,,...,,,,,,,,,,
2,2001-08-29,36.98,37.00,36.10,36.38,5325275.0,1.946896e+08,7.150000e+07,0.074479,,...,,,,,,,,,,
3,2001-08-30,36.28,37.51,36.00,37.10,4801306.0,1.775586e+08,7.150000e+07,0.067151,,...,,,,,,,,,,
4,2001-08-31,37.15,37.62,36.80,37.01,2323148.0,8.623124e+07,7.150000e+07,0.032492,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5673,2025-05-12,13371.87,13547.01,13360.24,13426.26,2473533.0,3.967786e+09,1.256198e+09,0.001969,,...,,,,,,,,,,
5674,2025-05-13,13463.25,13463.25,13264.01,13307.44,2125829.0,3.386618e+09,1.256198e+09,0.001692,,...,,,,,,,,,,
5675,2025-05-14,13304.93,13765.16,13289.70,13681.40,3946012.0,6.394735e+09,1.256198e+09,0.003141,,...,,,,,,,,,,
5676,2025-05-15,13679.81,13753.36,13590.52,13656.46,2473285.0,4.043403e+09,1.256198e+09,0.001969,,...,,,,,,,,,,


In [15]:

# 计算选股因子
def calculate_factors(df):
    if df is None or df.empty:
        return df

    # 计算每日收益率
    df['daily_return'] = df['close'].pct_change()

    # 计算波动率（过去 20 日的标准差）
    df['volatility'] = df['daily_return'].rolling(window=20).std()

    # 计算市盈率（PE）和市净率（PB），假设财务数据中包含每股收益和每股净资产
    if '基本每股收益' in df.columns and 'close' in df.columns:
        df['PE'] = df['close'] / df['基本每股收益']
    else:
        df['PE'] = None

    if '每股净资产' in df.columns and 'close' in df.columns:
        df['PB'] = df['close'] / df['每股净资产']
    else:
        df['PB'] = None

    return df

# 主函数：下载多只股票数据并计算因子
def main():
    # 定义需要下载的股票代码列表
    stock_list = ['sh600519', 'sz000858', 'sz000651']  # 示例：贵州茅台、五粮液、格力电器

    all_stock_data = []

    for stock_code in stock_list:
        print(f"正在处理股票 {stock_code}...")
        stock_data = download_stock_data(stock_code)
        if stock_data is not None:
            stock_data_with_factors = calculate_factors(stock_data)
            stock_data_with_factors['stock_code'] = stock_code  # 添加股票代码列
            all_stock_data.append(stock_data_with_factors)

    # 合并所有股票数据
    if all_stock_data:
        combined_df = pd.concat(all_stock_data, ignore_index=True)
        print("已成功合并所有股票数据！")

        # 保存到 CSV 文件
        output_file = 'combined_stock_factors.csv'
        combined_df.to_csv(output_file, index=False)
        print(f"所有股票数据已保存到 {output_file}！")
    else:
        print("未成功下载任何股票数据。")

# 执行主函数
if __name__ == "__main__":
    main()

正在处理股票 sh600519...
正在处理股票 sz000858...
正在处理股票 sz000651...
已成功合并所有股票数据！
所有股票数据已保存到 combined_stock_factors.csv！


In [13]:
import pandas as pd
import os

# 读取 csv 文件夹中的所有股票数据文件
csv_folder = 'csv'
all_stock_data = []

# 遍历 csv 文件夹中的所有文件
for file_name in os.listdir(csv_folder):
    if file_name.endswith('.csv'):  # 确保只处理 CSV 文件
        file_path = os.path.join(csv_folder, file_name)
        try:
            # 读取单个股票数据
            stock_data = pd.read_csv(file_path)

            # 添加股票代码列（从文件名中提取）
            stock_code = file_name.split('_')[0]  # 假设文件名为 "sh600519_data.csv"
            stock_data['stock_code'] = stock_code

            # 将当前股票数据添加到总数据列表中
            all_stock_data.append(stock_data)
        except Exception as e:
            print(f"读取文件 {file_name} 时出错: {e}")

# 合并所有股票数据为一个 DataFrame
if all_stock_data:
    combined_df = pd.concat(all_stock_data, ignore_index=True)
    print("已成功合并所有股票数据！")
else:
    combined_df = pd.DataFrame()
    print("未找到任何股票数据文件。")

# 打印合并后的数据结构以确认
print("合并后的数据结构:")
print(combined_df.head())

已成功合并所有股票数据！
合并后的数据结构:
         date   open   high    low  close      volume        amount  \
0  2001-08-27  34.51  37.78  32.85  35.55  40631800.0  1.410347e+09   
1  2001-08-28  34.99  37.00  34.61  36.86  12964779.0  4.634631e+08   
2  2001-08-29  36.98  37.00  36.10  36.38   5325275.0  1.946896e+08   
3  2001-08-30  36.28  37.51  36.00  37.10   4801306.0  1.775586e+08   
4  2001-08-31  37.15  37.62  36.80  37.01   2323148.0  8.623124e+07   

   outstanding_share  turnover      code  ... 归属于母公司所有者的综合收益总额  \
0         71500000.0  0.568277  sh600519  ...              NaN   
1         71500000.0  0.181326  sh600519  ...              NaN   
2         71500000.0  0.074479  sh600519  ...              NaN   
3         71500000.0  0.067151  sh600519  ...              NaN   
4         71500000.0  0.032492  sh600519  ...              NaN   

   归属于少数股东的综合收益总额  earnings_per_share  稀释每股收益  数据源  是否审计  公告日期  币种  类型  更新日期  
0             NaN                 NaN     NaN  NaN   NaN   NaN NaN NaN   N

In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# 假设我们已经下载了多支股票的历史数据
df = pd.read_csv('csv/multi_stock_data.csv')

# 计算替代因子
df['price_change_rate'] = df['close'].pct_change() if 'close' in df.columns else None  # 收盘价变化率
df['volume_change_rate'] = df['volume'].pct_change() if 'volume' in df.columns else None  # 成交量变化率
df['net_profit_growth_rate'] = df['net_profit'].pct_change() if 'net_profit' in df.columns else None  # 净利润增长率
df['revenue_growth_rate'] = df['revenue'].pct_change() if 'revenue' in df.columns else None  # 营业收入增长率

# 打印前几行数据，确认新因子是否成功计算
print("更新后的数据结构:")
print(df[['date', 'price_change_rate', 'volume_change_rate', 'net_profit_growth_rate', 'revenue_growth_rate']].head())

# 准备特征因子和目标变量
available_factors = ['price_change_rate', 'volume_change_rate', 'net_profit_growth_rate', 'revenue_growth_rate']
X = df[[factor for factor in available_factors if factor in df.columns]]  # 动态选择可用因子
y = df['Returns']  # 目标变量（假设已定义）

# 拆分训练数据和测试数据
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 标准化处理
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 构建线性回归模型
model = LinearRegression()
model.fit(X_train, y_train)

# 查看模型系数，确定因子权重
print('Factor weights:', model.coef_)

# 使用模型预测测试数据的收益
y_pred = model.predict(X_test)

# 创建一个 DataFrame 来存储股票的预测收益
predicted_returns = pd.DataFrame({
    'Stock': X_test.index,
    'Predicted return': y_pred
})

# 根据预测的收益选择股票
selected_stocks = predicted_returns[predicted_returns['Predicted return'] > 0.1]

print('Selected stocks:', selected_stocks)


更新后的数据结构:
         date  price_change_rate  volume_change_rate net_profit_growth_rate  \
0  2001-08-27                NaN                 NaN                   None   
1  2001-08-28           0.036850           -0.680920                   None   
2  2001-08-29          -0.013022           -0.589251                   None   
3  2001-08-30           0.019791           -0.098393                   None   
4  2001-08-31          -0.002426           -0.516142                   None   

  revenue_growth_rate  
0                None  
1                None  
2                None  
3                None  
4                None  


KeyError: 'Returns'