In [2]:
# 正确的时间序列交叉验证（3 fold，数据分4份）
print("\n" + "="*60)
print("时间序列交叉验证（3 fold）- 30因子模型稳健性验证")
print("数据分4份：训练集递增，测试集为下一个1/4")
print("="*60)


data_cv = pd.read_csv('/Users/xiaoquanliu/Desktop/Book_DataCode1/第七章/DL_Data16_processed.csv')
data_cv[numeric_cols] = data_cv[numeric_cols].fillna(data_cv[numeric_cols].mean())

for i, j in combinations(range(len(original_features)), 2):
  col_name = f'interaction_{i}_{j}'
  data_cv[col_name] = data_cv[original_features[i]] * data_cv[original_features[j]]

X_cv = data_cv[all_features].values
y_cv = data_cv['Return'].values
time_cv = data_cv.iloc[:, 1].values

unique_times_cv = np.unique(time_cv)
n_times = len(unique_times_cv)
quarter_size = n_times // 4  # 每个1/4的大小

print(f"总时间点数: {n_times}")
print(f"每个1/4包含时间点数: {quarter_size}")
print(f"时间范围: {unique_times_cv[0]} 到 {unique_times_cv[-1]}")

cv_results = []
cv_detailed_results = []

for fold in range(3):
  print(f"\n--- 第 {fold+1} 折验证 ---")

  train_end_idx = quarter_size * (fold + 1)  # 训练集结束位置
  test_start_idx = train_end_idx             # 测试集开始位置
  test_end_idx = quarter_size * (fold + 2)   # 测试集结束位置
  
  # 最后一折的测试集包含所有剩余数据
  if fold == 2:
      test_end_idx = n_times
  
  train_times = unique_times_cv[:train_end_idx]
  test_times = unique_times_cv[test_start_idx:test_end_idx]
  
  print(f"训练时间范围: {train_times[0]} 到 {train_times[-1]} (共{len(train_times)}个时间点)")
  print(f"测试时间范围: {test_times[0]} 到 {test_times[-1]} (共{len(test_times)}个时间点)")
  print(f"训练集占比: {len(train_times)/n_times:.1%}, 测试集占比: {len(test_times)/n_times:.1%}")
  

  assert len(np.intersect1d(train_times, test_times)) == 0, f"第{fold+1}折: 训练集和测试集时间重叠!"
  assert len(train_times) > 0 and len(test_times) > 0, f"第{fold+1}折: 训练集或测试集为空!"
  assert train_times[-1] < test_times[0], f"第{fold+1}折: 训练集时间应该早于测试集时间!"
  
  train_mask = np.isin(time_cv, train_times)
  test_mask = np.isin(time_cv, test_times)
  
  X_train_cv, X_test_cv = X_cv[train_mask], X_cv[test_mask]
  y_train_cv, y_test_cv = y_cv[train_mask], y_cv[test_mask]
  time_train_cv, time_test_cv = time_cv[train_mask], time_cv[test_mask]
  
  print(f"训练样本数: {np.sum(train_mask)}, 测试样本数: {np.sum(test_mask)}")
  
  if np.sum(train_mask) < 100:
      print(f"警告: 第{fold+1}折训练样本数过少 ({np.sum(train_mask)})")
  if np.sum(test_mask) < 50:
      print(f"警告: 第{fold+1}折测试样本数过少 ({np.sum(test_mask)})")
  
  scaler_cv = StandardScaler()
  X_train_scaled_cv = scaler_cv.fit_transform(X_train_cv)
  X_test_scaled_cv = scaler_cv.transform(X_test_cv)
  
  model_cv = DualStructureNN(num_factors=30, num_features=X_train_scaled_cv.shape[1])
  optimizer_cv = keras.optimizers.Adam(learning_rate=0.001)
  
  train_time_data_cv, unique_times_train_cv = prepare_time_based_data(X_train_scaled_cv, y_train_cv, time_train_cv)
  
  training_losses = []
  best_loss = float('inf')
  patience_counter = 0
  patience = 10
  
  for epoch in range(200):
      epoch_losses = []
      for t in unique_times_train_cv:
          t_data = train_time_data_cv[t]
          if len(t_data['X']) < 2: continue
          x_weighted = np.dot(t_data['X'].T, t_data['y']) / len(t_data['y'])
          with tf.GradientTape() as tape:
              predictions, _, _ = model_cv([tf.constant(t_data['X'], dtype=tf.float32), tf.constant(x_weighted, dtype=tf.float32)], training=True)
              loss = tf.reduce_mean(tf.square(tf.constant(t_data['y'], dtype=tf.float32) - predictions))
          gradients = tape.gradient(loss, model_cv.trainable_variables)
          optimizer_cv.apply_gradients(zip(gradients, model_cv.trainable_variables))
          epoch_losses.append(loss.numpy())
      
      if epoch_losses:
          current_loss = np.mean(epoch_losses)
          training_losses.append(current_loss)
          
          # 早停机制
          if current_loss < best_loss:
              best_loss = current_loss
              patience_counter = 0
          else:
              patience_counter += 1
              
          if patience_counter >= patience:
              print(f"早停于第{epoch+1}轮，最佳损失: {best_loss:.6f}")
              break
  
  test_time_data_cv, unique_times_test_cv = prepare_time_based_data(X_test_scaled_cv, y_test_cv, time_test_cv)
  all_factors_cv = []
  test_indices_cv = []
  
  for t in unique_times_test_cv:
      t_data = test_time_data_cv[t]
      if len(t_data['X']) < 2: continue
      x_weighted = np.dot(t_data['X'].T, t_data['y']) / len(t_data['y'])
      _, _, factors = model_cv([tf.constant(t_data['X'], dtype=tf.float32), tf.constant(x_weighted, dtype=tf.float32)], training=False)
      factors_np = factors.numpy()
      if len(factors_np.shape) == 1:
          factors_expanded = np.tile(factors_np, (len(t_data['X']), 1))
      else:
          factors_expanded = factors_np
      all_factors_cv.extend(factors_expanded)
      test_indices_cv.extend(t_data['indices'])
  
  all_factors_cv = np.array(all_factors_cv)
  y_test_subset_cv = y_test_cv[test_indices_cv]
  
  # 线性回归计算R²
  linear_reg_cv = LinearRegression()
  linear_reg_cv.fit(all_factors_cv, y_test_subset_cv)
  y_pred_cv = linear_reg_cv.predict(all_factors_cv)
  r2_cv = r2_score(y_test_subset_cv, y_pred_cv)
  
  # 计算调整R²和其他指标
  n_cv = len(y_test_subset_cv)
  adj_r2_cv = 1 - (1 - r2_cv) * (n_cv - 1) / (n_cv - 31)
  mse_cv = np.mean((y_test_subset_cv - y_pred_cv)**2)
  mae_cv = np.mean(np.abs(y_test_subset_cv - y_pred_cv))
  final_loss = training_losses[-1] if training_losses else np.nan
  

  residuals = y_test_subset_cv - y_pred_cv
  tracking_error = np.std(residuals) * np.sqrt(252)  # 年化跟踪误差
  info_ratio = (np.mean(residuals) * 252) / tracking_error if tracking_error > 0 else 0
  fold_result = {
      'Fold': fold + 1,
      'Train_Time_Start': str(train_times[0]),
      'Train_Time_End': str(train_times[-1]),
      'Test_Time_Start': str(test_times[0]),
      'Test_Time_End': str(test_times[-1]),
      'Train_Samples': int(np.sum(train_mask)),
      'Test_Samples': int(n_cv),
      'Train_Time_Periods': len(train_times),
      'Test_Time_Periods': len(test_times),
      'Train_Ratio': len(train_times)/n_times,
      'Test_Ratio': len(test_times)/n_times,
      'R_Squared': r2_cv,
      'Adjusted_R_Squared': adj_r2_cv,
      'MSE': mse_cv,
      'MAE': mae_cv,
      'Information_Ratio': info_ratio,
      'Tracking_Error': tracking_error,
      'Final_Training_Loss': final_loss,
      'Training_Epochs': len(training_losses),
      'Factors_Generated': all_factors_cv.shape[1]
  }
  
  cv_results.append(fold_result)
  cv_detailed_results.append({
      'fold': fold+1, 
      'factors': all_factors_cv, 
      'predictions': y_pred_cv, 
      'actual': y_test_subset_cv,
      'coefficients': linear_reg_cv.coef_,
      'residuals': residuals
  })
  
  print(f"第{fold+1}折结果:")
  print(f"  R²: {r2_cv:.4f}, 调整R²: {adj_r2_cv:.4f}")
  print(f"  MSE: {mse_cv:.6f}, MAE: {mae_cv:.6f}")
  print(f"  信息比率: {info_ratio:.4f}, 跟踪误差: {tracking_error:.4f}")
  print(f"  训练轮数: {len(training_losses)}")


cv_results_df = pd.DataFrame(cv_results)

summary_stats = {
  'Metric': ['R_Squared', 'Adjusted_R_Squared', 'MSE', 'MAE', 'Information_Ratio'],
  'Mean': [
      cv_results_df['R_Squared'].mean(),
      cv_results_df['Adjusted_R_Squared'].mean(),
      cv_results_df['MSE'].mean(),
      cv_results_df['MAE'].mean(),
      cv_results_df['Information_Ratio'].mean()
  ],
  'Std': [
      cv_results_df['R_Squared'].std(),
      cv_results_df['Adjusted_R_Squared'].std(),
      cv_results_df['MSE'].std(),
      cv_results_df['MAE'].std(),
      cv_results_df['Information_Ratio'].std()
  ],
  'Min': [
      cv_results_df['R_Squared'].min(),
      cv_results_df['Adjusted_R_Squared'].min(),
      cv_results_df['MSE'].min(),
      cv_results_df['MAE'].min(),
      cv_results_df['Information_Ratio'].min()
  ],
  'Max': [
      cv_results_df['R_Squared'].max(),
      cv_results_df['Adjusted_R_Squared'].max(),
      cv_results_df['MSE'].max(),
      cv_results_df['MAE'].max(),
      cv_results_df['Information_Ratio'].max()
  ]
}

summary_df = pd.DataFrame(summary_stats)


coefficients_summary = []
for i, result in enumerate(cv_detailed_results):
  for j, coef in enumerate(result['coefficients']):
      coefficients_summary.append({
          'Fold': i + 1,
          'Factor': f'Factor_{j+1}',
          'Coefficient': coef
      })

coefficients_df = pd.DataFrame(coefficients_summary)
coefficients_pivot = coefficients_df.pivot(index='Factor', columns='Fold', values='Coefficient')
coefficients_pivot['Mean'] = coefficients_pivot.mean(axis=1)
coefficients_pivot['Std'] = coefficients_pivot.std(axis=1)
coefficients_pivot['CV'] = coefficients_pivot['Std'] / np.abs(coefficients_pivot['Mean'])  # 变异系数


cv_results_df.to_csv(os.path.join(save_dir, 'cross_validation_results.csv'), index=False)
summary_df.to_csv(os.path.join(save_dir, 'cross_validation_summary.csv'), index=False)
coefficients_pivot.to_csv(os.path.join(save_dir, 'cross_validation_coefficients.csv'))


for i, result in enumerate(cv_detailed_results):
  fold_predictions_df = pd.DataFrame({
      'Actual_Return': result['actual'],
      'Predicted_Return': result['predictions'],
      'Residual': result['residuals'],
      'Abs_Residual': np.abs(result['residuals'])
  })
  fold_predictions_df.to_csv(os.path.join(save_dir, f'fold_{i+1}_predictions.csv'), index=False)

print(f"\n{'='*60}")
print(f"=== 交叉验证结果汇总 ===")
print(f"{'='*60}")
print(f"R² - 均值: {cv_results_df['R_Squared'].mean():.4f} ± {cv_results_df['R_Squared'].std():.4f}")
print(f"调整R² - 均值: {cv_results_df['Adjusted_R_Squared'].mean():.4f} ± {cv_results_df['Adjusted_R_Squared'].std():.4f}")
print(f"MSE - 均值: {cv_results_df['MSE'].mean():.6f} ± {cv_results_df['MSE'].std():.6f}")
print(f"信息比率 - 均值: {cv_results_df['Information_Ratio'].mean():.4f} ± {cv_results_df['Information_Ratio'].std():.4f}")


r2_cv_coef = cv_results_df['R_Squared'].std() / cv_results_df['R_Squared'].mean()
print(f"\n=== 稳健性评估 ===")
print(f"R²变异系数: {r2_cv_coef:.4f}")
print(f"R²稳健性评估: {'稳健' if r2_cv_coef < 0.1 else '中等' if r2_cv_coef < 0.2 else '不够稳健'}")

print(f"\n=== 性能趋势分析 ===")
for i in range(3):
  print(f"第{i+1}折 (训练集占{cv_results_df.iloc[i]['Train_Ratio']:.1%}): R² = {cv_results_df.iloc[i]['R_Squared']:.4f}")

print(f"\n=== 保存的CSV文件 ===")
print(f"1. 交叉验证主要结果: cross_validation_results.csv")
print(f"2. 统计汇总: cross_validation_summary.csv") 
print(f"3. 因子系数汇总: cross_validation_coefficients.csv")
print(f"4. 各折预测详情: fold_1_predictions.csv, fold_2_predictions.csv, fold_3_predictions.csv")


时间序列交叉验证（3 fold）- 30因子模型稳健性验证
数据分4份：训练集递增，测试集为下一个1/4
总时间点数: 482
每个1/4包含时间点数: 120
时间范围: 1/10/22 到 9/9/22

--- 第 1 折验证 ---
训练时间范围: 1/10/22 到 12/15/21 (共120个时间点)
测试时间范围: 12/15/22 到 4/10/23 (共120个时间点)
训练集占比: 24.9%, 测试集占比: 24.9%
训练样本数: 224183, 测试样本数: 225217
早停于第143轮，最佳损失: 0.423661
第1折结果:
  R²: 0.1722, 调整R²: 0.1721
  MSE: 0.780472, MAE: 0.593374
  信息比率: 0.0001, 跟踪误差: 14.0242
  训练轮数: 143

--- 第 2 折验证 ---
训练时间范围: 1/10/22 到 4/10/23 (共240个时间点)
测试时间范围: 4/11/22 到 7/11/23 (共120个时间点)
训练集占比: 49.8%, 测试集占比: 24.9%
训练样本数: 449400, 测试样本数: 224261
早停于第171轮，最佳损失: 0.427456
第2折结果:
  R²: 0.1933, 调整R²: 0.1932
  MSE: 0.932847, MAE: 0.663227
  信息比率: -0.0070, 跟踪误差: 15.3322
  训练轮数: 171

--- 第 3 折验证 ---
训练时间范围: 1/10/22 到 7/11/23 (共360个时间点)
测试时间范围: 7/12/22 到 9/9/22 (共122个时间点)
训练集占比: 74.7%, 测试集占比: 25.3%
训练样本数: 673661, 测试样本数: 228106
早停于第127轮，最佳损失: 0.516831
第3折结果:
  R²: 0.1814, 调整R²: 0.1813
  MSE: 0.739761, MAE: 0.581417
  信息比率: -0.0001, 跟踪误差: 13.6536
  训练轮数: 127

=== 交叉验证结果汇总 ===
R² - 均值: 0.1823 ± 0.0105
调整R² - 均值: 0.18

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from itertools import combinations
import warnings
import os
import json
warnings.filterwarnings('ignore')


data = pd.read_csv('/Users/xiaoquanliu/Desktop/Book_DataCode1/第七章/DL_Data16_processed.csv')
print(f"Original data shape: {data.shape}")


numeric_cols = data.columns[2:]  # 
data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].mean())


original_feature_names = list(data.columns[3:36])  # 


original_features = data.columns[3:36]  
interaction_features = []
interaction_feature_names = []

for i, j in combinations(range(len(original_features)), 2):
    col_name = f'interaction_{i}_{j}'
    interaction_name = f'{original_features[i]}_×_{original_features[j]}'
    data[col_name] = data[original_features[i]] * data[original_features[j]]
    interaction_features.append(col_name)
    interaction_feature_names.append(interaction_name)

print(f"Generated {len(interaction_features)} interaction features")


all_feature_names = original_feature_names + interaction_feature_names

all_features = list(original_features) + interaction_features
X = data[all_features].values
y = data['Return'].values
stock_ids = data.iloc[:, 0].values
time_ids = data.iloc[:, 1].values


train_idx = int(len(data) * 0.8)
X_train, X_test = X[:train_idx], X[train_idx:]
y_train, y_test = y[:train_idx], y[train_idx:]
time_train, time_test = time_ids[:train_idx], time_ids[train_idx:]
stock_train, stock_test = stock_ids[:train_idx], stock_ids[train_idx:]


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


class DualStructureNN(keras.Model):
    def __init__(self, num_factors=10, num_features=None,
                 factor_loading_layers=[256, 128], 
                 factor_extraction_layers=[256, 128]):
        super(DualStructureNN, self).__init__()
        self.num_factors = num_factors
        self.num_features = num_features
        
        # 因子载荷网络 g(z_i,t; θ)
        self.factor_loading_layers = []
        for i, units in enumerate(factor_loading_layers):
            self.factor_loading_layers.append(
                layers.Dense(units, activation='relu', name=f'g_layer_{i}')
            )
        self.factor_loading_output = layers.Dense(
            num_factors, activation=None, name='g_output'
        )
        
        # 因子提取网络 h(x_t+1; φ)
        self.factor_extraction_layers = []
        for i, units in enumerate(factor_extraction_layers):
            self.factor_extraction_layers.append(
                layers.Dense(units, activation='relu', name=f'h_layer_{i}')
            )
        self.factor_extraction_output = layers.Dense(
            num_factors, activation=None, name='h_output'
        )
        
    def compute_factor_loadings(self, z):
        """计算因子载荷 β_i,t = g(z_i,t; θ)"""
        x = z
        for layer in self.factor_loading_layers:
            x = layer(x)
        return self.factor_loading_output(x)
    
    def compute_factors(self, x_weighted):
        """计算因子 f_t+1 = h(x_t+1; φ)"""
        # 确保输入是二维的
        if len(x_weighted.shape) == 1:
            x_weighted = tf.expand_dims(x_weighted, 0)
        
        x = x_weighted
        for layer in self.factor_extraction_layers:
            x = layer(x)
        factors = self.factor_extraction_output(x)
        
        # 如果输入是单个样本，返回一维向量
        if factors.shape[0] == 1:
            factors = tf.squeeze(factors, axis=0)
            
        return factors
    
    def call(self, inputs, training=None):
        z_batch, x_weighted = inputs
        
        # 计算因子载荷
        factor_loadings = self.compute_factor_loadings(z_batch)
        
        # 计算因子
        factors = self.compute_factors(x_weighted)
        
        # 计算预测收益率
        # 如果factors是一维的，需要广播到batch size
        if len(factors.shape) == 1:
            factors = tf.expand_dims(factors, 0)
            factors = tf.tile(factors, [tf.shape(factor_loadings)[0], 1])
        
        predictions = tf.reduce_sum(factor_loadings * factors, axis=1)
        
        return predictions, factor_loadings, factors


def prepare_time_based_data(X, y, time_ids):
    """按时间组织数据"""
    unique_times = np.unique(time_ids)
    time_data = {}
    
    for t in unique_times:
        mask = time_ids == t
        time_data[t] = {
            'X': X[mask],
            'y': y[mask],
            'indices': np.where(mask)[0]
        }
    
    return time_data, unique_times


print("\n" + "="*60)
print("扩展功能1: 测试25-35个因子的双结构神经网络R平方")
print("="*60)

factor_numbers = list(range(25, 36))  # 25到35个因子
nn_results = {}

for num_factors in factor_numbers:
    print(f"\n训练因子数量为 {num_factors} 的模型...")
    
    model = DualStructureNN(
        num_factors=num_factors, 
        num_features=X_train_scaled.shape[1],
        factor_loading_layers=[256, 128, 64],
        factor_extraction_layers=[256, 128, 64]
    )
    
    optimizer = keras.optimizers.Adam(learning_rate=0.001)
    
    train_time_data, unique_times_train = prepare_time_based_data(
        X_train_scaled, y_train, time_train
    )
 
    @tf.function
    def train_step_time(z_batch, y_batch, x_weighted):
        with tf.GradientTape() as tape:
            predictions, _, _ = model([z_batch, x_weighted], training=True)
            loss = tf.reduce_mean(tf.square(y_batch - predictions))
        
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))
        return loss
 
    epochs = 300
    for epoch in range(epochs):
        epoch_losses = []
        
        for t in unique_times_train:
            t_data = train_time_data[t]
            if len(t_data['X']) < 2:
                continue
                
            x_weighted = np.dot(t_data['X'].T, t_data['y']) / len(t_data['y'])
            
            loss = train_step_time(
                tf.constant(t_data['X'], dtype=tf.float32),
                tf.constant(t_data['y'], dtype=tf.float32),
                tf.constant(x_weighted, dtype=tf.float32)
            )
            epoch_losses.append(loss.numpy())
        
        if (epoch + 1) % 50 == 0:
            print(f"  Epoch {epoch + 1}/{epochs}, Loss: {np.mean(epoch_losses):.6f}")
 
    X_full_scaled = np.vstack([X_train_scaled, X_test_scaled])
    y_full = np.concatenate([y_train, y_test])
    time_full = np.concatenate([time_train, time_test])
    
    all_factors_full = []
    full_factor_indices = []
    
    full_time_data, unique_times_full = prepare_time_based_data(
        X_full_scaled, y_full, time_full
    )
    
    for t in unique_times_full:
        t_data = full_time_data[t]
        if len(t_data['X']) < 2:
            continue
        
        x_weighted = np.dot(t_data['X'].T, t_data['y']) / len(t_data['y'])
        
        _, _, factors = model(
            [tf.constant(t_data['X'], dtype=tf.float32),
             tf.constant(x_weighted, dtype=tf.float32)],
            training=False
        )
        
        factors_np = factors.numpy()
        if len(factors_np.shape) == 1:
            factors_expanded = np.tile(factors_np, (len(t_data['X']), 1))
        else:
            factors_expanded = factors_np
        
        all_factors_full.extend(factors_expanded)
        full_factor_indices.extend(t_data['indices'])
    
    all_factors_full = np.array(all_factors_full)
    
 
    X_factors_full = all_factors_full
    y_returns_full = y_full[full_factor_indices]
    
    linear_reg = LinearRegression()
    linear_reg.fit(X_factors_full, y_returns_full)
    y_pred_full = linear_reg.predict(X_factors_full)
    r2_full = r2_score(y_returns_full, y_pred_full)
    
    n = len(y_returns_full)
    p = num_factors
    adj_r2_full = 1 - (1 - r2_full) * (n - 1) / (n - p - 1)
    
    nn_results[num_factors] = {
        'r2': r2_full,
        'adj_r2': adj_r2_full,
        'n_observations': n,
        'n_factors': p
    }
    
    print(f"  因子数量 {num_factors}: R² = {r2_full:.4f}, Adjusted R² = {adj_r2_full:.4f}")


print("\n" + "="*60)
print("25-35个因子的R²统计结果汇总")
print("="*60)


results_table = []
for num_factors in factor_numbers:
    result = nn_results[num_factors]
    results_table.append({
        'Factors': num_factors,
        'R_squared': result['r2'],
        'Adj_R_squared': result['adj_r2'],
        'N_observations': result['n_observations']
    })


results_df = pd.DataFrame(results_table)
print("\n详细结果表:")
print(f"{'因子数量':<8} {'R²':>12} {'调整R²':>12} {'观测数':>10}")
print("-" * 50)
for _, row in results_df.iterrows():
    print(f"{row['Factors']:<8} {row['R_squared']:>12.6f} {row['Adj_R_squared']:>12.6f} {row['N_observations']:>10.0f}")


best_r2_idx = results_df['R_squared'].idxmax()
best_adj_r2_idx = results_df['Adj_R_squared'].idxmax()

print(f"\n性能分析:")
print(f"最高R²: {results_df.loc[best_r2_idx, 'Factors']}个因子 (R² = {results_df.loc[best_r2_idx, 'R_squared']:.6f})")
print(f"最高调整R²: {results_df.loc[best_adj_r2_idx, 'Factors']}个因子 (调整R² = {results_df.loc[best_adj_r2_idx, 'Adj_R_squared']:.6f})")


r2_values = results_df['R_squared'].values
adj_r2_values = results_df['Adj_R_squared'].values

print(f"\nR²统计:")
print(f"  平均值: {np.mean(r2_values):.6f}")
print(f"  标准差: {np.std(r2_values):.6f}")
print(f"  最小值: {np.min(r2_values):.6f}")
print(f"  最大值: {np.max(r2_values):.6f}")

print(f"\n调整R²统计:")
print(f"  平均值: {np.mean(adj_r2_values):.6f}")
print(f"  标准差: {np.std(adj_r2_values):.6f}")
print(f"  最小值: {np.min(adj_r2_values):.6f}")
print(f"  最大值: {np.max(adj_r2_values):.6f}")


save_dir = '/Users/xiaoquanliu/Desktop/Book_DataCode1/第七章/factors_25_35_results'
os.makedirs(save_dir, exist_ok=True)


results_df.to_csv(os.path.join(save_dir, 'factors_25_35_r_squared_results.csv'), index=False)


complete_results = {
    'experiment_info': {
        'factor_range': [25, 35],
        'total_models_trained': len(factor_numbers),
        'training_epochs': 300,
        'train_test_split': 0.8
    },
    'results_summary': {
        'best_r2_factors': int(results_df.loc[best_r2_idx, 'Factors']),
        'best_r2_value': float(results_df.loc[best_r2_idx, 'R_squared']),
        'best_adj_r2_factors': int(results_df.loc[best_adj_r2_idx, 'Factors']),
        'best_adj_r2_value': float(results_df.loc[best_adj_r2_idx, 'Adj_R_squared']),
        'r2_statistics': {
            'mean': float(np.mean(r2_values)),
            'std': float(np.std(r2_values)),
            'min': float(np.min(r2_values)),
            'max': float(np.max(r2_values))
        },
        'adj_r2_statistics': {
            'mean': float(np.mean(adj_r2_values)),
            'std': float(np.std(adj_r2_values)),
            'min': float(np.min(adj_r2_values)),
            'max': float(np.max(adj_r2_values))
        }
    },
    'detailed_results': {
        str(k): {
            'r2': float(v['r2']),
            'adj_r2': float(v['adj_r2']),
            'n_observations': int(v['n_observations']),
            'n_factors': int(v['n_factors'])
        } for k, v in nn_results.items()
    }
}

with open(os.path.join(save_dir, 'factors_25_35_complete_results.json'), 'w') as f:
    json.dump(complete_results, f, indent=2, ensure_ascii=False)

print(f"\n" + "="*60)
print("实验完成")
print("="*60)
print(f"结果已保存至目录: {save_dir}")
print(f"- CSV文件: factors_25_35_r_squared_results.csv")
print(f"- JSON文件: factors_25_35_complete_results.json")
print(f"\n共训练了 {len(factor_numbers)} 个模型，因子数量范围: {min(factor_numbers)}-{max(factor_numbers)}")


Original data shape: (901767, 36)
Generated 528 interaction features

扩展功能1: 测试25-35个因子的双结构神经网络R平方

训练因子数量为 25 的模型...
  Epoch 50/300, Loss: 0.643653
  Epoch 100/300, Loss: 0.553692
  Epoch 150/300, Loss: 0.520674
  Epoch 200/300, Loss: 0.489024
  Epoch 250/300, Loss: 0.473093
  Epoch 300/300, Loss: 0.465716
  因子数量 25: R² = 0.2026, Adjusted R² = 0.2026

训练因子数量为 26 的模型...
  Epoch 50/300, Loss: 0.641906
  Epoch 100/300, Loss: 0.553808
  Epoch 150/300, Loss: 0.513077
  Epoch 200/300, Loss: 0.491131
  Epoch 250/300, Loss: 0.471027
  Epoch 300/300, Loss: 0.463428
  因子数量 26: R² = 0.2019, Adjusted R² = 0.2019

训练因子数量为 27 的模型...
  Epoch 50/300, Loss: 0.646159
  Epoch 100/300, Loss: 0.564456
  Epoch 150/300, Loss: 0.521822
  Epoch 200/300, Loss: 0.509633
  Epoch 250/300, Loss: 0.481696
  Epoch 300/300, Loss: 0.465961
  因子数量 27: R² = 0.2024, Adjusted R² = 0.2024

训练因子数量为 28 的模型...
  Epoch 50/300, Loss: 0.637951
  Epoch 100/300, Loss: 0.554683
  Epoch 150/300, Loss: 0.513736
  Epoch 200/300, Loss: