# Dow Jones Index Trading Signal Prediction
## Data Preprocessing

In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import talib, pandas as pd, numpy as np
from ucimlrepo import fetch_ucirepo
from sklearn.preprocessing import StandardScaler

# fetch dataset
dow_jones_index = fetch_ucirepo(id=312)

# data (as pandas dataframes)
print(dow_jones_index.data.features)
print("\n----\n")
print(dow_jones_index.data.targets)
print(dow_jones_index.variables)

df = pd.concat([dow_jones_index.data.features, dow_jones_index.data.targets], axis=1)
df.info()

     quarter stock       date    open    high     low   close     volume  \
0          1    AA   1/7/2011  $15.82  $16.72  $15.78  $16.42  239655616   
1          1    AA  1/14/2011  $16.71  $16.71  $15.64  $15.97  242963398   
2          1    AA  1/21/2011  $16.19  $16.38  $15.60  $15.79  138428495   
3          1    AA  1/28/2011  $15.87  $16.63  $15.82  $16.13  151379173   
4          1    AA   2/4/2011  $16.18  $17.39  $16.18  $17.14  154387761   
..       ...   ...        ...     ...     ...     ...     ...        ...   
745        2   XOM  5/27/2011  $80.22  $82.63  $80.07  $82.63   68230855   
746        2   XOM   6/3/2011  $83.28  $83.75  $80.18  $81.18   78616295   
747        2   XOM  6/10/2011  $80.93  $81.87  $79.72  $79.78   92380844   
748        2   XOM  6/17/2011  $80.00  $80.82  $78.33  $79.02  100521400   
749        2   XOM  6/24/2011  $78.65  $81.12  $76.78  $76.78  118679791   

     percent_change_price  percent_change_volume_over_last_wk  \
0                 3.79

In [100]:
# Preprocessing
df['date'] = pd.to_datetime(df['date'])
df.sort_values(by=['stock', 'date'], inplace=True)
# 读 CSV，清掉价格和成交量里的符号
for c in ['open', 'high', 'low', 'close', 'volume']:
    if c in df.columns:
        df[c] = df[c].replace('[\$,]', '', regex=True).astype(float)
        
# 缺失值：上一周无数据 -> 无交易量变化-> 变化为 0。
df['percent_change_volume_over_last_wk'] = df['percent_change_volume_over_last_wk'].fillna(0)
df['previous_weeks_volume'] = df['previous_weeks_volume'].fillna(0)


In [101]:
import pandas as pd
import numpy as np
import talib

def generate_features(df):
    df = df.copy()
    def construct_and_fill(group):
        group = group.copy()

        # ===== 构造技术指标 =====
        group['sma_3'] = talib.SMA(group['close'], timeperiod=3)
        group['sma_7'] = talib.SMA(group['close'], timeperiod=7)
        group['sma_diff'] = group['sma_3'] - group['sma_7']

        group['rsi_5'] = talib.RSI(group['close'], timeperiod=5)
        group['atr_5'] = talib.ATR(group['high'], group['low'], group['close'], timeperiod=5)

        macd, macdsignal, macdhist = talib.MACD(group['close'], fastperiod=3, slowperiod=7, signalperiod=3)
        group['macd'] = macd
        group['macd_signal'] = macdsignal
        group['macd_hist'] = macdhist

        group['momentum_3'] = group['close'].pct_change(periods=3)
        group['momentum_5'] = group['close'].pct_change(periods=5)

        group['close_z_5'] = (group['close'] - group['close'].rolling(5).mean()) / group['close'].rolling(5).std()
        group['volatility_5'] = group['percent_change_price'].rolling(5).std()

        group['vpt'] = (group['percent_change_price'] * group['volume']).cumsum()
        group['volume_z_5'] = (group['volume'] - group['volume'].rolling(5).mean()) / group['volume'].rolling(5).std()
        group['price_volume_ratio'] = group['percent_change_price'] / (group['percent_change_volume_over_last_wk'] + 1e-6)

        group['is_dividend_next_week'] = (group['days_to_next_dividend'] <= 7).astype(int)
        group['dividend_expected_return'] = group['percent_return_next_dividend'].fillna(0)

        # ===== 缺失值智能填充 =====
        indicator_cols = [
            'sma_3', 'sma_7', 'sma_diff',
            'rsi_5', 'atr_5',
            'macd', 'macd_signal', 'macd_hist',
            'momentum_3', 'momentum_5', 'close_z_5',
            'volatility_5', 'vpt', 'volume_z_5',
            'price_volume_ratio'
        ]

        # 1. 先前向填充
        group[indicator_cols] = group[indicator_cols].ffill()

        # 2. 用列均值填补（防止全是NaN时失败）
        for col in indicator_cols:
            group[col] = group[col].fillna(group[col].mean())

        # 3. 动量类直接置0（代表“无动量”）
        group[['momentum_3', 'momentum_5']] = group[['momentum_3', 'momentum_5']].fillna(0)

        # 4. 剩余的再兜底为0
        group[indicator_cols] = group[indicator_cols].fillna(0)

        return group

    # 应用于每个股票分组

    df = df.groupby('stock').apply(construct_and_fill).reset_index(drop=True)
    return df


In [102]:
df_features = generate_features(df)  # df 是原始数据集（包含 open, close, volume 等字段）
df_features.head()

  df = df.groupby('stock').apply(construct_and_fill).reset_index(drop=True)


Unnamed: 0,quarter,stock,date,open,high,low,close,volume,percent_change_price,percent_change_volume_over_last_wk,...,macd_hist,momentum_3,momentum_5,close_z_5,volatility_5,vpt,volume_z_5,price_volume_ratio,is_dividend_next_week,dividend_expected_return
0,1,AA,2011-01-07,15.82,16.72,15.78,16.42,239655616.0,3.79267,0.0,...,-0.064633,-0.006748,-0.010053,-0.200085,3.529632,908934700.0,-0.125416,3792670.0,0,0.182704
1,1,AA,2011-01-14,16.71,16.71,15.64,15.97,242963398.0,-4.42849,1.380223,...,-0.064633,-0.006748,-0.010053,-0.200085,3.529632,-167026300.0,-0.125416,-3.20853,0,0.187852
2,1,AA,2011-01-21,16.19,16.38,15.6,15.79,138428495.0,-2.47066,-43.024959,...,-0.064633,-0.006748,-0.010053,-0.200085,3.529632,-509036100.0,-0.125416,0.05742388,0,0.189994
3,1,AA,2011-01-28,15.87,16.63,15.82,16.13,151379173.0,1.63831,9.3555,...,-0.064633,-0.017661,-0.010053,-0.200085,3.529632,-261030000.0,-0.125416,0.1751173,1,0.185989
4,1,AA,2011-02-04,16.18,17.39,16.18,17.14,154387761.0,5.93325,1.987452,...,-0.064633,0.073262,-0.010053,1.608217,4.301167,654991100.0,-0.602205,2.985354,0,0.175029


## Model Trainning

In [103]:
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error

# 对代码进行封装
class BaseSignalModel:
    def __init__(self, model, feature_cols=None, threshold=0.02, name=None):
        self.model = model
        self.name = name or model.__class__.__name__
        self.scaler = StandardScaler()
        self.threshold = threshold
        self.feature_cols = feature_cols
        self.rmse = None

    def fit(self, df):
        df = df.copy()
        if self.feature_cols is None:
            self.feature_cols = [col for col in df.columns if col not in [
                'stock', 'date', 'percent_change_next_weeks_price', 
                'next_weeks_open', 'next_weeks_close'
            ]]

        X = df[self.feature_cols]
        y = df['percent_change_next_weeks_price']
        X_scaled = self.scaler.fit_transform(X)

        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, shuffle=False, test_size=0.2)
        self.model.fit(X_train, y_train)

        y_pred = self.model.predict(X_test)
        self.rmse = mean_squared_error(y_test, y_pred) ** 0.5
        print(f"[{self.name}] RMSE: {self.rmse:.4f}")

    def predict(self, df):
        df = df.copy()
        X_scaled = self.scaler.transform(df[self.feature_cols])
        df['predicted_return'] = self.model.predict(X_scaled)
        return df

    def generate_signal(self, df):
        df = df.copy()
        df['signal'] = 0
        df.loc[df['predicted_return'] > self.threshold, 'signal'] = 1
        df.loc[df['predicted_return'] < -self.threshold, 'signal'] = -1
        return df

    # 收益回测
    def backtest(self, df):
        df = df.copy()
        df['strategy_return'] = df['signal'] * df['percent_change_next_weeks_price']
        df['cumulative_strategy_return'] = (1 + df['strategy_return']).cumprod()
        df['cumulative_market_return'] = (1 + df['percent_change_next_weeks_price']).cumprod()

        final_strategy = df['cumulative_strategy_return'].iloc[-1]
        final_market = df['cumulative_market_return'].iloc[-1]

        print(f"[{self.name}] 策略累计收益率：{final_strategy - 1:.2%}，市场：{final_market - 1:.2%}")
        return df


In [104]:
# 支持加载多个模型，然后同时运行
class ModelRunner:
    def __init__(self):
        self.models = []

    def add_model(self, model: BaseSignalModel):
        self.models.append(model)

    def run_all(self, df):
        results = {}

        for model in self.models:
            print(f"\nModel: {model.name}")
            model.fit(df)

            df_pred = model.predict(df)
            df_pred = model.generate_signal(df_pred)
            df_bt = model.backtest(df_pred)

            results[model.name] = df_bt

        return results


In [105]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor

# 1. 特征工程
df_feat = generate_features(df)

# 2. 定义多个模型
rf_model = BaseSignalModel(RandomForestRegressor(n_estimators=100), threshold=0.02)
ridge_model = BaseSignalModel(Ridge(alpha=1.0), threshold=0.015)
xgb_model = BaseSignalModel(XGBRegressor(n_estimators=50, max_depth=3), threshold=0.02)

# 3. 添加到 runner
runner = ModelRunner()
runner.add_model(rf_model)
runner.add_model(ridge_model)
runner.add_model(xgb_model)

# 4. 一键运行所有模型
results = runner.run_all(df_feat)


  df = df.groupby('stock').apply(construct_and_fill).reset_index(drop=True)



Model: RandomForestRegressor
[RandomForestRegressor] RMSE: 2.0712
[RandomForestRegressor] 策略累计收益率：-1080580675593738763225343639013326291428858531294352582286011047157820564627750317704476526436906808369143694541186225207821084615719449361416445068550014657266427065601740843562928642750977476815577180824641473307589196929348516198939918848882311692288.00%，市场：-26021114707035573463167827634346710113256385689849700383989383942336811084919309496254428742689874604195840.00%

Model: Ridge
[Ridge] RMSE: 2.0551
[Ridge] 策略累计收益率：3147834044203432643551025163363179837452108552647960722528039690506426609026850251107432410854094048170144218319808502716654295049547284480.00%，市场：-26021114707035573463167827634346710113256385689849700383989383942336811084919309496254428742689874604195840.00%

Model: XGBRegressor
[XGBRegressor] RMSE: 2.1126
[XGBRegressor] 策略累计收益率：90463402220747217204690915489199474999952787623173382466999659087810141161220014716547543358126003736034251116773829539583452217735026545309533

**RMSE 在2.05 左右，代表模型预测误差为2%。策略累计收益这里有bug，先不管。还需要有更多的方法来评估模型，否则论文不好写。**