> https://archive.ics.uci.edu/dataset/312/dow+jones+index

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
dow_jones_index = fetch_ucirepo(id=312)

# data (as pandas dataframes)
print(dow_jones_index.data.features)
print("\n----\n")
print(dow_jones_index.data.targets)

# metadata
# print(dow_jones_index.metadata)

# variable information
print(dow_jones_index.variables)


     quarter stock       date    open    high     low   close     volume  \
0          1    AA   1/7/2011  $15.82  $16.72  $15.78  $16.42  239655616   
1          1    AA  1/14/2011  $16.71  $16.71  $15.64  $15.97  242963398   
2          1    AA  1/21/2011  $16.19  $16.38  $15.60  $15.79  138428495   
3          1    AA  1/28/2011  $15.87  $16.63  $15.82  $16.13  151379173   
4          1    AA   2/4/2011  $16.18  $17.39  $16.18  $17.14  154387761   
..       ...   ...        ...     ...     ...     ...     ...        ...   
745        2   XOM  5/27/2011  $80.22  $82.63  $80.07  $82.63   68230855   
746        2   XOM   6/3/2011  $83.28  $83.75  $80.18  $81.18   78616295   
747        2   XOM  6/10/2011  $80.93  $81.87  $79.72  $79.78   92380844   
748        2   XOM  6/17/2011  $80.00  $80.82  $78.33  $79.02  100521400   
749        2   XOM  6/24/2011  $78.65  $81.12  $76.78  $76.78  118679791   

     percent_change_price  percent_change_volume_over_last_wk  \
0                 3.79

In [3]:
df = pd.concat([dow_jones_index.data.features, dow_jones_index.data.targets], axis=1)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 16 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   quarter                             750 non-null    int64  
 1   stock                               750 non-null    object 
 2   date                                750 non-null    object 
 3   open                                750 non-null    object 
 4   high                                750 non-null    object 
 5   low                                 750 non-null    object 
 6   close                               750 non-null    object 
 7   volume                              750 non-null    int64  
 8   percent_change_price                750 non-null    float64
 9   percent_change_volume_over_last_wk  720 non-null    float64
 10  previous_weeks_volume               720 non-null    float64
 11  next_weeks_open                     750 non-n

In [4]:
# 根据目标变量建立新分类特征label: 上涨 -> 1; 下跌 -> 0
# df['label'] = df['percent_change_next_weeks_price'].astype(float).apply(lambda x: 1 if x > 0 else 0)

# 设置最小涨跌幅阈值（0.5%）(三分类)
def Classify_threshold(price_change):
    if price_change > 0.5:
        return 1  # 上涨
    elif price_change < -0.5:
        return -1  # 下跌
    else:
        return 0  # 横盘/无方向


# 原始二分类涨幅
def Classify_binary(price_change):
    if price_change > 0:
        return 1  # 上涨
    else:
        return 0  # 下跌


df["label"] = df["percent_change_next_weeks_price"].apply(Classify_binary)


In [5]:
df.head()


Unnamed: 0,quarter,stock,date,open,high,low,close,volume,percent_change_price,percent_change_volume_over_last_wk,previous_weeks_volume,next_weeks_open,next_weeks_close,days_to_next_dividend,percent_return_next_dividend,percent_change_next_weeks_price,label
0,1,AA,1/7/2011,$15.82,$16.72,$15.78,$16.42,239655616,3.79267,,,$16.71,$15.97,26,0.182704,-4.42849,0
1,1,AA,1/14/2011,$16.71,$16.71,$15.64,$15.97,242963398,-4.42849,1.380223,239655616.0,$16.19,$15.79,19,0.187852,-2.47066,0
2,1,AA,1/21/2011,$16.19,$16.38,$15.60,$15.79,138428495,-2.47066,-43.024959,242963398.0,$15.87,$16.13,12,0.189994,1.63831,1
3,1,AA,1/28/2011,$15.87,$16.63,$15.82,$16.13,151379173,1.63831,9.3555,138428495.0,$16.18,$17.14,5,0.185989,5.93325,1
4,1,AA,2/4/2011,$16.18,$17.39,$16.18,$17.14,154387761,5.93325,1.987452,151379173.0,$17.33,$17.37,97,0.175029,0.230814,1


In [6]:
df["date"] = pd.to_datetime(df["date"])
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750 entries, 0 to 749
Data columns (total 17 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   quarter                             750 non-null    int64         
 1   stock                               750 non-null    object        
 2   date                                750 non-null    datetime64[ns]
 3   open                                750 non-null    object        
 4   high                                750 non-null    object        
 5   low                                 750 non-null    object        
 6   close                               750 non-null    object        
 7   volume                              750 non-null    int64         
 8   percent_change_price                750 non-null    float64       
 9   percent_change_volume_over_last_wk  720 non-null    float64       
 10  previous_weeks_volume     

In [7]:
df.sort_values("date", inplace=True)

price_columns = ["open", "high", "low", "close", "next_weeks_open", "next_weeks_close"]
for col in price_columns:
    df[col] = pd.to_numeric(
        df[col].astype(str).str.replace(r"[\$,]", "", regex=True), errors="coerce"
    )

df.head()


Unnamed: 0,quarter,stock,date,open,high,low,close,volume,percent_change_price,percent_change_volume_over_last_wk,previous_weeks_volume,next_weeks_open,next_weeks_close,days_to_next_dividend,percent_return_next_dividend,percent_change_next_weeks_price,label
0,1,AA,2011-01-07,15.82,16.72,15.78,16.42,239655616,3.79267,,,16.71,15.97,26,0.182704,-4.42849,0
288,1,T,2011-01-07,29.68,30.1,28.66,28.85,157834347,-2.7965,,,28.54,28.43,30,1.49047,-0.385424,0
36,1,BAC,2011-01-07,13.85,14.69,13.8,14.25,1453438639,2.88809,,,14.17,15.25,54,0.070175,7.62174,1
276,1,PG,2011-01-07,64.39,65.08,64.0,64.5,52323352,0.170834,,,64.4,65.53,12,0.744186,1.75466,1
264,1,PFE,2011-01-07,17.7,18.38,17.62,18.34,386804789,3.61582,,,18.22,18.34,26,1.09051,0.658617,1


In [8]:
df.isnull().sum()


quarter                                0
stock                                  0
date                                   0
open                                   0
high                                   0
low                                    0
close                                  0
volume                                 0
percent_change_price                   0
percent_change_volume_over_last_wk    30
previous_weeks_volume                 30
next_weeks_open                        0
next_weeks_close                       0
days_to_next_dividend                  0
percent_return_next_dividend           0
percent_change_next_weeks_price        0
label                                  0
dtype: int64

In [9]:
# 缺失值：上一周无数据 -> 无交易量变化-> 变化为 0。
df['percent_change_volume_over_last_wk'] = df['percent_change_volume_over_last_wk'].fillna(0)
df['previous_weeks_volume'] = df['previous_weeks_volume'].fillna(0)

# 按股票代码分组，做前向填充
# df.sort_values(by=["stock", "date"], inplace=True)
# df["percent_change_volume_over_last_wk"] = (
#     df.groupby("stock")["percent_change_volume_over_last_wk"].ffill().fillna(0)
# )
# df["previous_weeks_volume"] = (
#     df.groupby("stock")["previous_weeks_volume"].ffill().fillna(0)
# )

# 对特征stock进行类别编码
df['stock'] = df['stock'].astype('category').cat.codes
df.head()


Unnamed: 0,quarter,stock,date,open,high,low,close,volume,percent_change_price,percent_change_volume_over_last_wk,previous_weeks_volume,next_weeks_open,next_weeks_close,days_to_next_dividend,percent_return_next_dividend,percent_change_next_weeks_price,label
0,1,0,2011-01-07,15.82,16.72,15.78,16.42,239655616,3.79267,0.0,0.0,16.71,15.97,26,0.182704,-4.42849,0
288,1,24,2011-01-07,29.68,30.1,28.66,28.85,157834347,-2.7965,0.0,0.0,28.54,28.43,30,1.49047,-0.385424,0
36,1,3,2011-01-07,13.85,14.69,13.8,14.25,1453438639,2.88809,0.0,0.0,14.17,15.25,54,0.070175,7.62174,1
276,1,23,2011-01-07,64.39,65.08,64.0,64.5,52323352,0.170834,0.0,0.0,64.4,65.53,12,0.744186,1.75466,1
264,1,22,2011-01-07,17.7,18.38,17.62,18.34,386804789,3.61582,0.0,0.0,18.22,18.34,26,1.09051,0.658617,1


In [10]:
# 滞后特征（1~2期）
for i in range(1, 3):
    df[f"price_change_lag_{i}"] = df.groupby("stock")["percent_change_price"].shift(i)
    df[f"volume_lag_{i}"] = df.groupby("stock")["volume"].shift(i)

# 滚动均值（3期）
# df["price_change_roll3"] = df.groupby("stock")["percent_change_price"].transform(
#     lambda x: x.rolling(window=3).mean()
# )
# df["volume_roll3"] = df.groupby("stock")["volume"].transform(
#     lambda x: x.rolling(window=3).mean()
# )

In [11]:
# 删除无关特征
# df = df.drop(
#     columns=[
#         "date",
#         "next_weeks_open",
#         "next_weeks_close",
#         "percent_change_next_weeks_price",
#     ]
# )
# 删除未来信息 + 保留时间列用于排序
df_model = df.drop(
    columns=["next_weeks_open", "next_weeks_close", "percent_change_next_weeks_price"]
)

# 删除因滞后而产生的缺失值
df.dropna(inplace=True)

from sklearn.preprocessing import MinMaxScaler

# 特征标准化
feature_cols = df_model.drop(columns=["label", "stock", "date"]).columns
scaler = MinMaxScaler()
df_model[feature_cols] = scaler.fit_transform(df_model[feature_cols])

# 拆分特征与目标变量
# X = df.drop("label", axis=1)
# y = df["label"]


In [12]:
# 构建滑动窗口序列数据
def create_lstm_sequences(df, seq_len=5):
    X, y = [], []
    grouped = df.groupby("stock")

    for _, group in grouped:
        group = group.sort_values("date")
        features = group[feature_cols].values
        labels = group["label"].values

        for i in range(len(group) - seq_len):
            X.append(features[i: i + seq_len])
            y.append(labels[i + seq_len])  # 预测第 seq_len+1 周的涨跌

    return np.array(X), np.array(y)


# 拆分特征与目标变量
X, y = create_lstm_sequences(df_model, seq_len=5)


In [13]:
# 划分训练集（前80%）和测试集（后20%）
split_index = int(len(X) * 0.8)
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

# 划分训练集/测试集 - 时序交叉验证
# from sklearn.model_selection import TimeSeriesSplit
#
# tscv = TimeSeriesSplit(n_splits=5)
# for train_index, test_index in tscv.split(X):
#     X_train, X_test = X.iloc[train_index], X.iloc[test_index]
#     y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# for fold, (train_idx, test_idx) in enumerate(tscv.split(X)):
#     print(f"Fold {fold + 1}")
#     X_train, X_test = X[train_idx], X[test_idx]
#     y_train, y_test = y[train_idx], y[test_idx]


## LSTM

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 构建 LSTM 模型
model = Sequential()
model.add(
    LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=False)
)
model.add(Dropout(0.2))
model.add(Dense(1, activation="sigmoid"))

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# 训练模型
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

# 评估模型
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Epoch 1/20


  super().__init__(**kwargs)


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.5365 - loss: nan - val_accuracy: 0.5417 - val_loss: nan
Epoch 2/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5040 - loss: nan - val_accuracy: 0.5417 - val_loss: nan
Epoch 3/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5176 - loss: nan - val_accuracy: 0.5417 - val_loss: nan
Epoch 4/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4761 - loss: nan - val_accuracy: 0.5417 - val_loss: nan
Epoch 5/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4601 - loss: nan - val_accuracy: 0.5417 - val_loss: nan
Epoch 6/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4779 - loss: nan - val_accuracy: 0.5417 - val_loss: nan
Epoch 7/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
