In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, TimeSeriesSplit
import matplotlib.pyplot as plt
import seaborn as sns
import logging
from itertools import combinations
import statsmodels.api as sm

# 设置日志
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# 1. 数据准备
def load_and_preprocess_data(file_path):
    try:
        data = pd.read_csv(file_path)
        logging.info(f"Successfully loaded data from {file_path}")
        
        Z = data.iloc[:, 4:36].values  
        # 特征扩展：两两交互
        expanded_features = []
        for i, j in combinations(range(Z.shape[1]), 2):
            expanded_features.append(Z[:, i] * Z[:, j])
        
        Z_expanded = np.column_stack([Z] + expanded_features)
        logging.info(f"Expanded features from {Z.shape[1]} to {Z_expanded.shape[1]}")

        r = data.iloc[:, 3].values  # 收益率，对应r_t

        # 使用均值填充NaN
        Z_expanded = np.nan_to_num(Z_expanded, nan=np.nanmean(Z_expanded))
        r = np.nan_to_num(r, nan=np.nanmean(r))
        logging.info("Filled NaN values with mean")

        # 使用StandardScaler来标准化特征
        scaler = StandardScaler()
        Z_scaled = scaler.fit_transform(Z_expanded)

        Z = Z_scaled.astype(np.float32)
        r = r.astype(np.float32)

        return Z, r
    except Exception as e:
        logging.error(f"Error loading or preprocessing data: {e}")
        raise

# 2. 模型设计
class ConditionalAutoencoder(tf.keras.Model):
    def __init__(self, input_dim, hidden_dim, latent_dim, dropout_rate=0.2):
        super(ConditionalAutoencoder, self).__init__()
        
        # 深度学习自解码因子荷载神经网络 B(Z_{t-1})
        self.beta_net = tf.keras.Sequential([
            tf.keras.layers.Dense(hidden_dim, activation='relu', kernel_initializer='he_normal', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(dropout_rate),
            tf.keras.layers.Dense(hidden_dim // 2, activation='relu', kernel_initializer='he_normal', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(dropout_rate),
            tf.keras.layers.Dense(latent_dim, kernel_initializer='he_normal', kernel_regularizer=tf.keras.regularizers.l2(0.01))
        ])
        
        # 深度学习自解码的因子提取网络 f_t
        self.factor_net = tf.keras.layers.Dense(latent_dim, use_bias=False, kernel_initializer='he_normal', kernel_regularizer=tf.keras.regularizers.l2(0.01))
        
    def call(self, inputs, training=False):
        Z = inputs
        beta = self.beta_net(Z, training=training)  # B(Z_{t-1})
        f = self.factor_net(tf.ones((tf.shape(Z)[0], 1)))  # f_t
        return tf.reduce_sum(beta * f, axis=1)  # r_t = B(Z_{t-1})f_t

# 3. 实验设计
def prepare_data(Z, r, test_size=0.2, val_size=0.2):
    Z_train_val, Z_test, r_train_val, r_test = train_test_split(Z, r, test_size=test_size, random_state=42)
    Z_train, Z_val, r_train, r_val = train_test_split(Z_train_val, r_train_val, test_size=val_size, random_state=42)
    return (Z_train, r_train), (Z_val, r_val), (Z_test, r_test)

def train_model(model, train_data, val_data, num_epochs=200, batch_size=128, patience=20):
    Z_train, r_train = train_data
    Z_val, r_val = val_data
    
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    
    train_losses = []
    val_losses = []
    
    best_val_loss = float('inf')
    patience_counter = 0
    
    for epoch in range(num_epochs):
        # Training
        train_loss = tf.keras.metrics.Mean()
        for i in range(0, len(Z_train), batch_size):
            batch_Z = Z_train[i:i+batch_size]
            batch_r = r_train[i:i+batch_size]
            with tf.GradientTape() as tape:
                predictions = model(batch_Z, training=True)
                loss = tf.reduce_mean(tf.square(batch_r - predictions))
                # 添加L2正则化损失
                l2_loss = sum(tf.nn.l2_loss(v) for v in model.trainable_variables if 'kernel' in v.name)
                total_loss = loss + 0.01 * l2_loss
            gradients = tape.gradient(total_loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            train_loss.update_state(loss)

        # Validation
        val_predictions = model(Z_val, training=False)
        val_loss = tf.reduce_mean(tf.square(r_val - val_predictions))

        train_losses.append(train_loss.result().numpy())
        val_losses.append(val_loss.numpy())

        logging.info(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss.result():.4f}, Val Loss: {val_loss:.4f}')

        # 早停
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                logging.info(f"Early stopping triggered at epoch {epoch+1}")
                break

        # 学习率调整
        if epoch % 10 == 0 and epoch > 0:
            optimizer.learning_rate = optimizer.learning_rate * 0.9

    return train_losses, val_losses

# 提取共同因子的函数
def extract_common_factors(model, Z):
    beta = model.beta_net(Z).numpy()
    factor_weights = model.factor_net.weights[0].numpy()
    common_factors = np.einsum('ij,kj->ik', beta, factor_weights.T)
    return common_factors

# 时间序列交叉验证
def time_series_cv(Z, r, n_splits=3):
    tscv = TimeSeriesSplit(n_splits=n_splits)
    cv_scores = []
    cv_results = []
    
    for fold, (train_index, test_index) in enumerate(tscv.split(Z)):
        Z_train, Z_test = Z[train_index], Z[test_index]
        r_train, r_test = r[train_index], r[test_index]
        
        model = ConditionalAutoencoder(input_dim=Z.shape[1], hidden_dim=128, latent_dim=30)
        train_losses, val_losses = train_model(model, (Z_train, r_train), (Z_test, r_test))
        
        # 评估模型
        common_factors = extract_common_factors(model, Z_test)
        scaler = StandardScaler()
        common_factors_scaled = scaler.fit_transform(common_factors)
        X = sm.add_constant(common_factors_scaled)
        y = r_test
        ols_model = sm.OLS(y, X)
        results = ols_model.fit()
        
        cv_scores.append(results.rsquared_adj)
        cv_results.append({
            'fold': fold + 1,
            'r_squared': results.rsquared,
            'adj_r_squared': results.rsquared_adj,
            'aic': results.aic,
            'bic': results.bic
        })
    
    return cv_scores, cv_results

# 样本外测试
def out_of_sample_test(Z, r, train_ratio=0.6, val_ratio=0.2):
    n = len(Z)
    train_end = int(n * train_ratio)
    val_end = int(n * (train_ratio + val_ratio))
    
    Z_train, r_train = Z[:train_end], r[:train_end]
    Z_val, r_val = Z[train_end:val_end], r[train_end:val_end]
    Z_test, r_test = Z[val_end:], r[val_end:]
    
    model = ConditionalAutoencoder(input_dim=Z.shape[1], hidden_dim=128, latent_dim=30)
    train_losses, val_losses = train_model(model, (Z_train, r_train), (Z_val, r_val))
    
    # 在样本外数据上评估模型
    common_factors = extract_common_factors(model, Z_test)
    scaler = StandardScaler()
    common_factors_scaled = scaler.fit_transform(common_factors)
    X = sm.add_constant(common_factors_scaled)
    y = r_test
    ols_model = sm.OLS(y, X)
    results = ols_model.fit()
    
    return results

# 可视化函数
def plot_cv_results(cv_results):
    df = pd.DataFrame(cv_results)
    
    plt.figure(figsize=(12, 8))
    sns.boxplot(data=df[['r_squared', 'adj_r_squared']])
    plt.title('Cross-Validation Results: R-squared and Adjusted R-squared')
    plt.savefig('cv_results_boxplot.png')
    plt.close()
    
    plt.figure(figsize=(12, 8))
    sns.scatterplot(data=df, x='fold', y='adj_r_squared')
    plt.title('Adjusted R-squared across CV Folds')
    plt.savefig('cv_results_scatter.png')
    plt.close()

def plot_oos_results(oos_results):
    plt.figure(figsize=(12, 8))
    plt.scatter(oos_results.fittedvalues, oos_results.resid)
    plt.xlabel('Fitted values')
    plt.ylabel('Residuals')
    plt.title('Out-of-Sample Test: Residuals vs Fitted')
    plt.savefig('oos_residuals_plot.png')
    plt.close()
    
    plt.figure(figsize=(12, 8))
    sm.graphics.plot_regress_exog(oos_results, 'x1', fig=plt.gcf())
    plt.tight_layout()
    plt.savefig('oos_regress_plot.png')
    plt.close()

# 主函数
def main():
    try:
        # 文件路径与数据导入
        file_path = '/Users/xiaoquanliu/Desktop/Book_DataCode1/第七章/DL_Data7.csv'
        Z, r = load_and_preprocess_data(file_path)

        # 原始模型训练和评估
        train_data, val_data, test_data = prepare_data(Z, r)
        Z_train, r_train = train_data
        Z_val, r_val = val_data
        Z_test, r_test = test_data

        model = ConditionalAutoencoder(input_dim=Z.shape[1], hidden_dim=128, latent_dim=30)
        train_losses, val_losses = train_model(model, (Z_train, r_train), (Z_val, r_val))

        # 提取测试集的共同因子
        common_factors = extract_common_factors(model, Z_test)
        scaler = StandardScaler()
        common_factors_scaled = scaler.fit_transform(common_factors)

        # 准备回归数据
        X = sm.add_constant(common_factors_scaled)
        y = r_test

        # 进行OLS回归
        ols_model = sm.OLS(y, X)
        results = ols_model.fit()

        # 输出回归结果
        print(results.summary())

        # 保存共同因子数据
        factor_df = pd.DataFrame(common_factors_scaled, columns=[f'Factor_{i+1}' for i in range(common_factors_scaled.shape[1])])
        factor_df['Return'] = r_test
        factor_df.to_csv('common_factors_and_returns.csv', index=False)

        print("共同因子数据已保存到 'common_factors_and_returns.csv'")

        # 计算调整后的R方
        adjusted_r_squared = results.rsquared_adj
        print(f"调整后的R方: {adjusted_r_squared:.4f}")

        # 绘制训练和验证损失
        plt.figure(figsize=(10, 6))
        plt.plot(train_losses, label='Train Loss')
        plt.plot(val_losses, label='Validation Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.title('Training and Validation Loss')
        plt.legend()
        plt.savefig('loss_plot.png')
        plt.close()

        print("损失图表已保存为 'loss_plot.png'")

        # 添加时间序列交叉验证
        cv_scores, cv_results = time_series_cv(Z, r)
        print(f"交叉验证 R² 分数: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")
        
        # 可视化交叉验证结果
        plot_cv_results(cv_results)
        print("交叉验证结果图表已保存")

        # 样本外测试
        oos_results = out_of_sample_test(Z, r)
        print(f"样本外测试 R²: {oos_results.rsquared:.4f}")
        print(f"样本外测试调整后 R²: {oos_results.rsquared_adj:.4f}")
        
        # 输出样本外测试的完整回归结果
        print("\n样本外测试回归结果:")
        print(oos_results.summary())

        # 可视化样本外测试结果
        plot_oos_results(oos_results)
        print("样本外测试结果图表已保存")

        # 保存交叉验证和样本外测试结果
        cv_df = pd.DataFrame(cv_results)
        cv_df.to_csv('cross_validation_results.csv', index=False)
        print("交叉验证结果已保存到 'cross_validation_results.csv'")

        oos_df = pd.DataFrame({
            'Actual': oos_results.model.endog,
            'Predicted': oos_results.fittedvalues,
            'Residuals': oos_results.resid
        })
        oos_df.to_csv('out_of_sample_results.csv', index=False)
        print("样本外测试结果已保存到 'out_of_sample_results.csv'")

    except Exception as e:
        logging.error(f"An error occurred: {e}")
        raise

if __name__ == "__main__":
    main()


2024-09-24 13:08:32.459884: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-09-24 13:08:36,891 - INFO - Successfully loaded data from /Users/xiaoquanliu/Desktop/Book_DataCode1/第七章/DL_Data7.csv
2024-09-24 13:08:41,076 - INFO - Expanded features from 32 to 528
2024-09-24 13:08:49,717 - INFO - Filled NaN values with mean
2024-09-24 13:08:58.796856: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-09-24 13:09:37,020 - INFO - Epoch [1/200], Train Los

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.846
Model:                            OLS   Adj. R-squared:                  0.846
Method:                 Least Squares   F-statistic:                 3.309e+04
Date:                Tue, 24 Sep 2024   Prob (F-statistic):               0.00
Time:                        13:26:44   Log-Likelihood:             2.8214e+05
No. Observations:              180354   AIC:                        -5.642e+05
Df Residuals:                  180323   BIC:                        -5.639e+05
Df Model:                          30                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.4533      0.000   3698.453      0.0

2024-09-24 13:27:06,477 - INFO - Epoch [1/200], Train Loss: 2.1506, Val Loss: 0.0513
2024-09-24 13:27:24,115 - INFO - Epoch [2/200], Train Loss: 0.0345, Val Loss: 0.5829
2024-09-24 13:27:41,822 - INFO - Epoch [3/200], Train Loss: 0.0713, Val Loss: 0.1660
2024-09-24 13:27:59,838 - INFO - Epoch [4/200], Train Loss: 0.0563, Val Loss: 0.1339
2024-09-24 13:28:17,845 - INFO - Epoch [5/200], Train Loss: 0.0365, Val Loss: 0.0558
2024-09-24 13:28:35,598 - INFO - Epoch [6/200], Train Loss: 0.0251, Val Loss: 0.0602
2024-09-24 13:28:53,232 - INFO - Epoch [7/200], Train Loss: 0.0200, Val Loss: 0.0252
2024-09-24 13:29:10,633 - INFO - Epoch [8/200], Train Loss: 0.0187, Val Loss: 0.0173
2024-09-24 13:29:28,152 - INFO - Epoch [9/200], Train Loss: 0.0187, Val Loss: 0.0107
2024-09-24 13:29:45,498 - INFO - Epoch [10/200], Train Loss: 0.0186, Val Loss: 0.0125
2024-09-24 13:30:02,903 - INFO - Epoch [11/200], Train Loss: 0.0187, Val Loss: 0.0132
2024-09-24 13:30:20,494 - INFO - Epoch [12/200], Train Loss: 0.

交叉验证 R² 分数: 0.4435 (±0.0892)


  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):
  if pd.api.types.is_categorical_dtype(vector):


交叉验证结果图表已保存


2024-09-24 15:16:54,341 - INFO - Epoch [1/200], Train Loss: 1.5816, Val Loss: 0.1055
2024-09-24 15:17:33,768 - INFO - Epoch [2/200], Train Loss: 0.0619, Val Loss: 0.1193
2024-09-24 15:18:12,931 - INFO - Epoch [3/200], Train Loss: 0.0266, Val Loss: 0.0164
2024-09-24 15:18:52,402 - INFO - Epoch [4/200], Train Loss: 0.0156, Val Loss: 0.0174
2024-09-24 15:19:31,614 - INFO - Epoch [5/200], Train Loss: 0.0153, Val Loss: 0.0211
2024-09-24 15:20:11,104 - INFO - Epoch [6/200], Train Loss: 0.0155, Val Loss: 0.0205
2024-09-24 15:20:50,393 - INFO - Epoch [7/200], Train Loss: 0.0154, Val Loss: 0.0205
2024-09-24 15:21:30,425 - INFO - Epoch [8/200], Train Loss: 0.0154, Val Loss: 0.0199
2024-09-24 15:22:10,484 - INFO - Epoch [9/200], Train Loss: 0.0155, Val Loss: 0.0205
2024-09-24 15:22:50,613 - INFO - Epoch [10/200], Train Loss: 0.0154, Val Loss: 0.0200
2024-09-24 15:23:29,900 - INFO - Epoch [11/200], Train Loss: 0.0154, Val Loss: 0.0184
2024-09-24 15:24:09,374 - INFO - Epoch [12/200], Train Loss: 0.

样本外测试 R²: 0.5712
样本外测试调整后 R²: 0.5711

样本外测试回归结果:
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.571
Model:                            OLS   Adj. R-squared:                  0.571
Method:                 Least Squares   F-statistic:                     8007.
Date:                Tue, 24 Sep 2024   Prob (F-statistic):               0.00
Time:                        16:49:27   Log-Likelihood:             1.8613e+05
No. Observations:              180354   AIC:                        -3.722e+05
Df Residuals:                  180323   BIC:                        -3.719e+05
Df Model:                          30                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
con

  fig = abline_plot(0, fitted_line.params[0], color='k', ax=ax)


样本外测试结果图表已保存
交叉验证结果已保存到 'cross_validation_results.csv'
样本外测试结果已保存到 'out_of_sample_results.csv'
