In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt

# ==============================
# 0. 模拟数据生成 (假设有3个理财产品)
# ==============================
def generate_mock_data(start_date='2021-01-01', periods=547, n_clusters=3):
    dates = pd.date_range(start=start_date, periods=periods)
    data = []

    for clus in range(n_clusters):
        # 模拟申购赎回数据：基函数 + 噪声 + 节假日效应
        base_in = 50 + 30 * np.sin(np.linspace(0, 10*np.pi, periods)) + np.random.normal(0, 10, periods)
        base_out = 45 + 25 * np.sin(np.linspace(0, 10*np.pi, periods)) + np.random.normal(0, 8, periods)

        # 添加月末效应（每月最后3天量增大）
        month_ends = dates.is_month_end
        base_in[month_ends] *= 1.5
        base_out[month_ends] *= 1.3

        # 添加节假日效应（假设每月第15天为节日）
        holidays = (dates.day == 15)
        base_in[holidays] *= 0.7  # 节日申购量下降
        base_out[holidays] *= 1.2 # 节日赎回量上升

        for i in range(periods):
            data.append([
                dates[i].strftime('%Y-%m-%d'),
                f"CLUS_{clus}",
                max(0, base_in[i]),
                max(0, base_out[i])
            ])

    return pd.DataFrame(data, columns=['date', 'clus', 'in', 'out'])

df = generate_mock_data()
print("模拟数据示例:")
print(df.head())

# ==============================
# 1. 数据预处理
# ==============================
# ==============================
# 修正版DataProcessor (关键修改)
# ==============================
class DataProcessor:
    def __init__(self, window_size=30, pred_days=14):
        self.window_size = window_size
        self.pred_days = pred_days
        self.scalers = {}  # 按产品和字段保存归一化器

    def process(self, df, is_predict=False):
        # 按产品分组处理
        grouped = df.groupby('clus')
        all_sequences = []

        for clus, group in grouped:
            # 时间特征工程
            df_clus = group.copy()
            # 特征工程：生成滞后特征和日期特征
            df_clus['date'] = pd.to_datetime(df_clus['date'])
            df_clus['weekday'] = df_clus['date'].dt.weekday
            df_clus['is_month_end'] = df_clus['date'].dt.is_month_end.astype(int)
            df_clus["Year"] = df_clus["date"].dt.year
            df_clus["Month"] = df_clus["date"].dt.month
            df_clus["Day"] = df_clus["date"].dt.day
            # 添加工作日和周末标记
            df_clus["DayOfWeek"] = df_clus["date"].dt.dayofweek
            df_clus["IsWeekend"] = df_clus["DayOfWeek"] >= 5
            # 生成滞后特征
            # （假设我们使用过去lag天的数据）
            for aim in ['in', 'out']:
              for i in range(1, 30):
                  df_clus.loc[:, f"Lag_{aim}_{i}"] = df_clus[aim].shift(i)

            # 生成滚动平均特征（3天窗口）
            df_clus.loc[:, f"Rolling_Mean_{aim}"] = df_clus[aim].rolling(window=3).mean()

            df_clus = df_clus.fillna(0)

            return df_clus

processor = DataProcessor(window_size=30, pred_days=14)
sequences = processor.process(df)



模拟数据示例:
         date    clus         in        out
0  2021-01-01  CLUS_0  52.452394  37.550255
1  2021-01-02  CLUS_0  43.203711  31.955574
2  2021-01-03  CLUS_0  72.309778  63.142757
3  2021-01-04  CLUS_0  45.957601  54.307569
4  2021-01-05  CLUS_0  52.805698  49.549932


In [6]:
sequences

Unnamed: 0,date,clus,in,out,weekday,is_month_end,Year,Month,Day,DayOfWeek,...,Lag_out_21,Lag_out_22,Lag_out_23,Lag_out_24,Lag_out_25,Lag_out_26,Lag_out_27,Lag_out_28,Lag_out_29,Rolling_Mean_out
0,2021-01-01,CLUS_0,52.452394,37.550255,4,0,2021,1,1,4,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,2021-01-02,CLUS_0,43.203711,31.955574,5,0,2021,1,2,5,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,2021-01-03,CLUS_0,72.309778,63.142757,6,0,2021,1,3,6,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,44.216195
3,2021-01-04,CLUS_0,45.957601,54.307569,0,0,2021,1,4,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,49.801967
4,2021-01-05,CLUS_0,52.805698,49.549932,1,0,2021,1,5,1,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,55.666752
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,2022-06-27,CLUS_0,45.250244,39.510539,0,0,2022,6,27,0,...,15.474151,22.977685,12.957434,17.817283,12.603675,4.343676,40.406161,18.357430,27.379669,35.447123
543,2022-06-28,CLUS_0,49.093008,40.725765,1,0,2022,6,28,1,...,18.348716,15.474151,22.977685,12.957434,17.817283,12.603675,4.343676,40.406161,18.357430,36.736956
544,2022-06-29,CLUS_0,66.328975,49.863188,2,0,2022,6,29,2,...,19.734235,18.348716,15.474151,22.977685,12.957434,17.817283,12.603675,4.343676,40.406161,43.366497
545,2022-06-30,CLUS_0,82.818093,51.690048,3,1,2022,6,30,3,...,16.483672,19.734235,18.348716,15.474151,22.977685,12.957434,17.817283,12.603675,4.343676,47.426334


In [5]:
sequences.to_parquet('train_data.parquet')