# LSTM + DNN 최종코드

In [None]:
import pandas as pd

# 파일 경로
stock_path = "./AAPL_1hour_data_365days.csv"
news_path = "./apple_finbert_finnhub.csv"

# 데이터 불러오기
stock_df = pd.read_csv(stock_path, parse_dates=["Datetime"])
news_df = pd.read_csv(news_path, parse_dates=["pubDate"])

# 타임존 제거
stock_df["Datetime"] = stock_df["Datetime"].dt.tz_localize(None)
news_df["pubDate"] = news_df["pubDate"].dt.tz_localize(None)

# 정렬
stock_df = stock_df.sort_values("Datetime").reset_index(drop=True)

# 제외할 열
exclude_cols = ['Is_Trading_Hours', 'Is_Market_Open', 'Is_Premarket', 'Is_Aftermarket', 'Is_Extended_Hours']
stock_df = stock_df.drop(columns=[col for col in exclude_cols if col in stock_df.columns])

# 병합 결과
rows = []

for _, news_row in news_df.iterrows():
    news_time = news_row['pubDate']

    # 뉴스 이후 가장 가까운 주가
    future_stock = stock_df[stock_df['Datetime'] > news_time].head(1)
    if future_stock.empty:
        continue

    target_row = future_stock.iloc[0]
    target_time = target_row['Datetime']
    target_close = target_row['Close']

    # 과거 3개 주가
    past_rows = stock_df[stock_df['Datetime'] < target_time].tail(3)
    if len(past_rows) < 3:
        continue

    past_last_close = past_rows.iloc[-1]['Close']

    # 상승률
    return_pct = (target_close - past_last_close) / past_last_close * 100
    label = 1 if return_pct >= 0.4 else (-1 if return_pct <= -0.4 else 0)

    # 병합 row 생성
    row = {
        "news_id": news_row['id'],
        "news_time": news_time,
        "target_close": target_close,
        "target_return_pct": return_pct,
        "target_multi_raw": label,
        "finbert_positive": news_row['finbert_positive'],
        "finbert_neutral": news_row['finbert_neutral'],
        "finbert_negative": news_row['finbert_negative'],
    }

    # 과거 3개 flatten
    for i, (_, stock_row) in enumerate(past_rows.iterrows(), 1):
        for col in stock_df.columns:
            if col == "Datetime":
                continue
            row[f"x{i}_{col}"] = stock_row[col]

    rows.append(row)

# 최종 DataFrame
merged_df = pd.DataFrame(rows)

# 클래스 0/1/2로 매핑 (XGBoost용)
label_map = {-1: 0, 0: 1, 1: 2}
merged_df["target_multi"] = merged_df["target_multi_raw"].map(label_map)

# 저장
merged_df.to_csv("news_stock_classification.csv", index=False)
print("병합 완료: news_stock_classification.csv 저장됨")


In [9]:

import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1. 데이터 불러오기
df = pd.read_csv("news_stock_classification.csv", parse_dates=["news_time"])

# 2. Feature 및 Label 준비
feature_cols = [col for col in df.columns if col.startswith("x") or col.startswith("finbert_")]
X = df[feature_cols].fillna(0)
y = df["target_multi"]

# 3. 시계열 데이터 3-step 생성 (x1_, x2_, x3_)
X_seq = []
for i in range(len(X)):
    X_seq.append([
        X.iloc[i][[col for col in X.columns if col.startswith("x1_")]].values,
        X.iloc[i][[col for col in X.columns if col.startswith("x2_")]].values,
        X.iloc[i][[col for col in X.columns if col.startswith("x3_")]].values
    ])
X_seq = np.array(X_seq)

# 4. FinBERT 피처 추가 (Broadcast across time steps)
finbert_feats = X[[c for c in X.columns if c.startswith("finbert_")]].values
finbert_feats = np.repeat(finbert_feats[:, np.newaxis, :], 3, axis=1)
X_seq = np.concatenate([X_seq, finbert_feats], axis=-1)

# 5. 정규화
n_samples, time_steps, n_features = X_seq.shape
X_reshaped = X_seq.reshape(-1, n_features)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_reshaped)
X_seq = X_scaled.reshape(n_samples, time_steps, n_features)

# 6. Tensor로 변환
X_tensor = torch.tensor(X_seq, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# 7. Train/Test 분리
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, shuffle=False)
train_dl = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
test_dl = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

# 8. LSTM 모델 정의
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, output_dim=3):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.fc(hn[-1])

# 9. 학습 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(input_dim=n_features).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 10. 학습 루프
for epoch in range(50):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} | Loss: {total_loss:.4f}")

Epoch 1 | Loss: 165.6778
Epoch 2 | Loss: 139.9572
Epoch 3 | Loss: 124.0224
Epoch 4 | Loss: 111.9599
Epoch 5 | Loss: 102.0316
Epoch 6 | Loss: 91.2283
Epoch 7 | Loss: 82.4791
Epoch 8 | Loss: 74.9245
Epoch 9 | Loss: 68.3347
Epoch 10 | Loss: 62.1140
Epoch 11 | Loss: 56.4879
Epoch 12 | Loss: 51.9471
Epoch 13 | Loss: 47.4376
Epoch 14 | Loss: 43.8949
Epoch 15 | Loss: 39.6787
Epoch 16 | Loss: 36.4289
Epoch 17 | Loss: 33.2387
Epoch 18 | Loss: 30.9987
Epoch 19 | Loss: 28.0263
Epoch 20 | Loss: 25.6803
Epoch 21 | Loss: 23.1786
Epoch 22 | Loss: 21.3123
Epoch 23 | Loss: 19.6293
Epoch 24 | Loss: 17.5688
Epoch 25 | Loss: 15.6692
Epoch 26 | Loss: 14.3345
Epoch 27 | Loss: 12.6775
Epoch 28 | Loss: 11.6631
Epoch 29 | Loss: 10.7014
Epoch 30 | Loss: 9.4993
Epoch 31 | Loss: 8.4388
Epoch 32 | Loss: 7.4168
Epoch 33 | Loss: 7.0659
Epoch 34 | Loss: 5.9982
Epoch 35 | Loss: 5.2894
Epoch 36 | Loss: 4.8408
Epoch 37 | Loss: 4.3750
Epoch 38 | Loss: 3.9858
Epoch 39 | Loss: 4.1333
Epoch 40 | Loss: 3.2084
Epoch 41 | Loss

In [10]:

# 11. 평가
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        xb = xb.to(device)
        preds = model(xb).argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(yb.numpy())

print("\n Accuracy:", accuracy_score(all_labels, all_preds))
print("\n Classification Report:\n", classification_report(all_labels, all_preds))
print("\n Confusion Matrix:\n", confusion_matrix(all_labels, all_preds))


📈 Accuracy: 0.5917297612114153

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.09      0.13      0.11       246
           1       0.77      0.78      0.77      1224
           2       0.25      0.11      0.15       247

    accuracy                           0.59      1717
   macro avg       0.37      0.34      0.34      1717
weighted avg       0.59      0.59      0.59      1717


🧱 Confusion Matrix:
 [[ 32 161  53]
 [237 957  30]
 [ 90 130  27]]
