# 1

In [1]:
import pandas as pd

# 파일 경로
stock_path = "./AAPL_1hour_data_365days.csv"
news_path = "./apple_finbert_finnhub.csv"

# 데이터 불러오기
stock_df = pd.read_csv(stock_path, parse_dates=["Datetime"])
news_df = pd.read_csv(news_path, parse_dates=["pubDate"])

# 타임존 제거
stock_df["Datetime"] = stock_df["Datetime"].dt.tz_localize(None)
news_df["pubDate"] = news_df["pubDate"].dt.tz_localize(None)

# 정렬
stock_df = stock_df.sort_values("Datetime").reset_index(drop=True)

# 제외할 열
exclude_cols = ['Is_Trading_Hours', 'Is_Market_Open', 'Is_Premarket', 'Is_Aftermarket', 'Is_Extended_Hours']
stock_df = stock_df.drop(columns=[col for col in exclude_cols if col in stock_df.columns])

# 병합 결과
rows = []

for _, news_row in news_df.iterrows():
    news_time = news_row['pubDate']

    # 뉴스 이후 가장 가까운 주가
    future_stock = stock_df[stock_df['Datetime'] > news_time].head(1)
    if future_stock.empty:
        continue

    target_row = future_stock.iloc[0]
    target_time = target_row['Datetime']
    target_close = target_row['Close']

    # 과거 3개 주가
    past_rows = stock_df[stock_df['Datetime'] < target_time].tail(3)
    if len(past_rows) < 3:
        continue

    past_last_close = past_rows.iloc[-1]['Close']

    # 상승률
    return_pct = (target_close - past_last_close) / past_last_close * 100
    label = 1 if return_pct >= 0.4 else (-1 if return_pct <= -0.4 else 0)

    # 병합 row 생성
    row = {
        "news_id": news_row['id'],
        "news_time": news_time,
        "target_close": target_close,
        "target_return_pct": return_pct,
        "target_multi_raw": label,
        "finbert_positive": news_row['finbert_positive'],
        "finbert_neutral": news_row['finbert_neutral'],
        "finbert_negative": news_row['finbert_negative'],
    }

    # 과거 3개 flatten
    for i, (_, stock_row) in enumerate(past_rows.iterrows(), 1):
        for col in stock_df.columns:
            if col == "Datetime":
                continue
            row[f"x{i}_{col}"] = stock_row[col]

    rows.append(row)

# 최종 DataFrame
merged_df = pd.DataFrame(rows)

# 클래스 0/1/2로 매핑 (XGBoost용)
label_map = {-1: 0, 0: 1, 1: 2}
merged_df["target_multi"] = merged_df["target_multi_raw"].map(label_map)

# 저장
merged_df.to_csv("news_stock_classification.csv", index=False)
print("✅ 병합 완료: news_stock_classification.csv 저장됨")


✅ 병합 완료: news_stock_classification.csv 저장됨


In [2]:

import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1. 데이터 불러오기
df = pd.read_csv("news_stock_classification.csv", parse_dates=["news_time"])

# 2. Feature 및 Label 준비
feature_cols = [col for col in df.columns if col.startswith("x") or col.startswith("finbert_")]
X = df[feature_cols].fillna(0)
y = df["target_multi"]

# 3. 시계열 데이터 3-step 생성 (x1_, x2_, x3_)
X_seq = []
for i in range(len(X)):
    X_seq.append([
        X.iloc[i][[col for col in X.columns if col.startswith("x1_")]].values,
        X.iloc[i][[col for col in X.columns if col.startswith("x2_")]].values,
        X.iloc[i][[col for col in X.columns if col.startswith("x3_")]].values
    ])
X_seq = np.array(X_seq)

# 4. FinBERT 피처 추가 (Broadcast across time steps)
finbert_feats = X[[c for c in X.columns if c.startswith("finbert_")]].values
finbert_feats = np.repeat(finbert_feats[:, np.newaxis, :], 3, axis=1)
X_seq = np.concatenate([X_seq, finbert_feats], axis=-1)

# 5. 정규화
n_samples, time_steps, n_features = X_seq.shape
X_reshaped = X_seq.reshape(-1, n_features)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_reshaped)
X_seq = X_scaled.reshape(n_samples, time_steps, n_features)

# 6. Tensor로 변환
X_tensor = torch.tensor(X_seq, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# 7. Train/Test 분리
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, shuffle=False)
train_dl = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
test_dl = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

# 8. LSTM 모델 정의
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, output_dim=3):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.fc(hn[-1])

# 9. 학습 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(input_dim=n_features).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 10. 학습 루프
for epoch in range(50):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} | Loss: {total_loss:.4f}")

Epoch 1 | Loss: 164.0788
Epoch 2 | Loss: 141.5729
Epoch 3 | Loss: 125.2798
Epoch 4 | Loss: 111.2254
Epoch 5 | Loss: 99.4632
Epoch 6 | Loss: 90.4368
Epoch 7 | Loss: 81.4495
Epoch 8 | Loss: 72.8773
Epoch 9 | Loss: 67.3597
Epoch 10 | Loss: 60.8399
Epoch 11 | Loss: 55.6103
Epoch 12 | Loss: 51.2171
Epoch 13 | Loss: 46.9381
Epoch 14 | Loss: 42.8304
Epoch 15 | Loss: 39.2854
Epoch 16 | Loss: 36.0406
Epoch 17 | Loss: 32.5115
Epoch 18 | Loss: 29.6463
Epoch 19 | Loss: 27.1152
Epoch 20 | Loss: 24.6855
Epoch 21 | Loss: 23.1054
Epoch 22 | Loss: 20.5943
Epoch 23 | Loss: 18.8145
Epoch 24 | Loss: 17.1873
Epoch 25 | Loss: 15.4179
Epoch 26 | Loss: 14.2074
Epoch 27 | Loss: 12.4770
Epoch 28 | Loss: 11.9293
Epoch 29 | Loss: 10.3284
Epoch 30 | Loss: 9.1492
Epoch 31 | Loss: 8.0573
Epoch 32 | Loss: 7.6288
Epoch 33 | Loss: 6.4673
Epoch 34 | Loss: 5.7015
Epoch 35 | Loss: 5.3819
Epoch 36 | Loss: 5.4640
Epoch 37 | Loss: 4.4068
Epoch 38 | Loss: 5.6787
Epoch 39 | Loss: 3.5228
Epoch 40 | Loss: 2.8861
Epoch 41 | Loss:

In [3]:

# 11. 평가
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        xb = xb.to(device)
        preds = model(xb).argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(yb.numpy())

print("\n📈 Accuracy:", accuracy_score(all_labels, all_preds))
print("\n📊 Classification Report:\n", classification_report(all_labels, all_preds))
print("\n🧱 Confusion Matrix:\n", confusion_matrix(all_labels, all_preds))


📈 Accuracy: 0.5701805474665114

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.10      0.13      0.12       246
           1       0.75      0.75      0.75      1224
           2       0.19      0.14      0.16       247

    accuracy                           0.57      1717
   macro avg       0.35      0.34      0.34      1717
weighted avg       0.58      0.57      0.57      1717


🧱 Confusion Matrix:
 [[ 33 136  77]
 [244 912  68]
 [ 46 167  34]]


# 2

In [8]:
import pandas as pd
import os

def load_stock_and_news(stock_path, news_path):
    stock = pd.read_csv(stock_path, parse_dates=["Datetime"])
    news = pd.read_csv(news_path, parse_dates=["pubDate"])

    stock["Datetime"] = stock["Datetime"].dt.tz_localize(None)
    news["pubDate"] = news["pubDate"].dt.tz_localize(None)

    # 정렬 및 제외 열 제거
    stock = stock.sort_values("Datetime").reset_index(drop=True)
    stock = stock.drop(columns=[col for col in stock.columns if col.startswith("Is_")])

    return stock, news


def make_binary_merged_df(stock_df, news_df, company):
    rows = []

    for _, news_row in news_df.iterrows():
        news_time = news_row["pubDate"]

        # 뉴스 이후 가장 가까운 주가
        future_row = stock_df[stock_df["Datetime"] > news_time].head(1)
        if future_row.empty:
            continue

        target_row = future_row.iloc[0]
        target_return = target_row.get("Returns", None)
        if pd.isna(target_return):
            continue

        # 과거 5개
        past_rows = stock_df[stock_df["Datetime"] < target_row["Datetime"]].tail(5)
        if len(past_rows) < 5:
            continue

        if target_return >= 0.01:
            label = 1
        elif target_return <= -0.01:
            label = 0
        else:
            continue  # 기준 미달인 경우는 무시

        row = {
            "company": company,
            "news_time": news_time,
            "target_return": target_return,
            "target": label,
            "finbert_positive": news_row["finbert_positive"],
            "finbert_neutral": news_row["finbert_neutral"],
            "finbert_negative": news_row["finbert_negative"]
        }

        for i, (_, p_row) in enumerate(past_rows.iterrows(), 1):
            for col in stock_df.columns:
                if col == "Datetime":
                    continue
                row[f"x{i}_{col}"] = p_row[col]

        rows.append(row)

    return pd.DataFrame(rows)

In [9]:
base_dir = "./"  # 압축 풀린 폴더 기준
companies = {
    "AAPL": ("AAPL_1hour_data_365days.csv", "apple_finbert_finnhub.csv"),
    "AMZN": ("AMZN_1hour_data_365days.csv", "amazon_finbert_finnhub.csv"),
    "GOOGL": ("GOOGL_1hour_data_365days.csv", "google_finbert_finnhub.csv"),
    "MSFT": ("MSFT_1hour_data_365days.csv", "microsoft_finbert_finnhub.csv"),
    "TSLA": ("TSLA_1hour_data_365days.csv", "tesla_finbert_finnhub.csv"),
}

dfs = []
for company, (stock_file, news_file) in companies.items():
    stock_path = os.path.join(base_dir, stock_file)
    news_path = os.path.join(base_dir, news_file)
    if not os.path.exists(stock_path) or not os.path.exists(news_path):
        continue

    stock_df, news_df = load_stock_and_news(stock_path, news_path)
    merged_df = make_binary_merged_df(stock_df, news_df, company)
    dfs.append(merged_df)

# 최종 병합
final_df = pd.concat(dfs, ignore_index=True)
final_df.to_csv("news_stock_binary_classification.csv", index=False)
print("news_stock_binary_classification.csv 저장 완료")


news_stock_binary_classification.csv 저장 완료


In [14]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1. Load data
df = pd.read_csv("news_stock_binary_classification.csv", parse_dates=["news_time"])
df = df.dropna()

# 2. Features & Labels
feature_cols = [col for col in df.columns if col.startswith("x") or col.startswith("finbert_")]
X = df[feature_cols].fillna(0)
y = df["target"]

# 3. Reshape to time-series: x1_ ~ x5_
X_seq = []
for i in range(len(X)):
    timestep_data = []
    for t in range(1, 6):
        timestep_data.append(X.iloc[i][[col for col in X.columns if col.startswith(f"x{t}_")]].values)
    X_seq.append(timestep_data)
X_seq = np.array(X_seq)

# 4. FinBERT features broadcast
finbert_feats = X[[c for c in X.columns if c.startswith("finbert_")]].values
finbert_feats = np.repeat(finbert_feats[:, np.newaxis, :], 5, axis=1)
X_seq = np.concatenate([X_seq, finbert_feats], axis=-1)

# 5. Normalize
n_samples, time_steps, n_features = X_seq.shape
X_reshaped = X_seq.reshape(-1, n_features)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_reshaped)
X_seq = X_scaled.reshape(n_samples, time_steps, n_features)

# 6. Tensor conversion
X_tensor = torch.tensor(X_seq, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# 7. Time-based split
split_idx = int(len(df) * 0.8)
X_train, X_test = X_tensor[:split_idx], X_tensor[split_idx:]
y_train, y_test = y_tensor[:split_idx], y_tensor[split_idx:]

train_dl = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
test_dl  = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

# 8. Model - Deep LSTM + MLP
class DeepLSTMClassifier(nn.Module):
    def __init__(self, input_dim, lstm_hidden=512, mlp_hidden=[512, 256, 128, 64], output_dim=2):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, lstm_hidden, num_layers=2, batch_first=True, dropout=0.3)
        self.mlp = nn.Sequential(
            nn.Linear(lstm_hidden, mlp_hidden[0]),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(mlp_hidden[0], mlp_hidden[1]),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(mlp_hidden[1], mlp_hidden[2]),
            nn.ReLU(),
            nn.Linear(mlp_hidden[2], mlp_hidden[3]),
            nn.ReLU(),
            nn.Linear(mlp_hidden[3], output_dim)
        )

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.mlp(hn[-1])

# 9. Focal Loss 정의
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=0.8):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha

    def forward(self, inputs, targets):
        ce_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-ce_loss)
        focal = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal.mean()

# 10. Training Setup + EarlyStopping
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = DeepLSTMClassifier(input_dim=n_features).to(device)

loss_fn = FocalLoss(gamma=2.0, alpha=2.0)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
patience = 12
best_loss = float("inf")
epochs_no_improve = 0

for epoch in range(100):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation loss
    val_loss = 0
    model.eval()
    with torch.no_grad():
        for xb, yb in test_dl:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = loss_fn(pred, yb)
            val_loss += loss.item()
    val_loss /= len(test_dl)

    print(f"Epoch {epoch+1:02d} | Train Loss: {total_loss:.4f} | Val Loss: {val_loss:.4f}")

    if val_loss < best_loss:
        best_loss = val_loss
        epochs_no_improve = 0
        best_model_state = model.state_dict()
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("⏹️ Early stopping triggered.")
            break

model.load_state_dict(best_model_state)

# 11. 평가
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        xb = xb.to(device)
        preds = model(xb).argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(yb.numpy())

print("\n📈 Accuracy:", accuracy_score(all_labels, all_preds))
print("\n📊 Classification Report:\n", classification_report(all_labels, all_preds))
print("\n🧱 Confusion Matrix:\n", confusion_matrix(all_labels, all_preds))


Epoch 01 | Train Loss: 41.3100 | Val Loss: 0.3419
Epoch 02 | Train Loss: 37.3130 | Val Loss: 0.4158
Epoch 03 | Train Loss: 32.4232 | Val Loss: 0.7833
Epoch 04 | Train Loss: 28.6629 | Val Loss: 0.7505
Epoch 05 | Train Loss: 22.6333 | Val Loss: 1.1208
Epoch 06 | Train Loss: 22.0089 | Val Loss: 0.9983
Epoch 07 | Train Loss: 17.0868 | Val Loss: 1.4601
Epoch 08 | Train Loss: 14.0339 | Val Loss: 2.3823
Epoch 09 | Train Loss: 14.4976 | Val Loss: 1.4230
Epoch 10 | Train Loss: 12.8186 | Val Loss: 1.4774
Epoch 11 | Train Loss: 9.4039 | Val Loss: 2.5632
Epoch 12 | Train Loss: 7.7556 | Val Loss: 2.0023
Epoch 13 | Train Loss: 6.9513 | Val Loss: 3.5774
⏹️ Early stopping triggered.

📈 Accuracy: 0.5678137651821862

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.47      0.50       449
           1       0.59      0.65      0.62       539

    accuracy                           0.57       988
   macro avg       0.56      0.56      0.56    

# 3

In [1]:
import pandas as pd
import numpy as np
import os

def load_stock_and_news(stock_path, news_path):
    stock = pd.read_csv(stock_path, parse_dates=["Datetime"])
    news = pd.read_csv(news_path, parse_dates=["pubDate"])

    stock["Datetime"] = stock["Datetime"].dt.tz_localize(None)
    news["pubDate"] = news["pubDate"].dt.tz_localize(None)

    stock = stock.sort_values("Datetime").reset_index(drop=True)
    stock = stock.drop(columns=[col for col in stock.columns if col.startswith("Is_")])

    return stock, news

def classify_context(timestamp):
    hour, minute = timestamp.hour, timestamp.minute
    if hour < 9 or (hour == 9 and minute < 30):
        return "premarket"
    elif 9 <= hour < 16:
        return "intraday"
    else:
        return "aftermarket"

def make_binary_merged_df(stock_df, news_df, company):
    rows = []

    for _, news_row in news_df.iterrows():
        news_time = news_row["pubDate"]
        context = classify_context(news_time)

        future_row = stock_df[stock_df["Datetime"] > news_time].head(1)
        if future_row.empty:
            continue

        target_row = future_row.iloc[0]
        target_return = target_row.get("Returns", None)
        if pd.isna(target_return):
            continue

        past_rows = stock_df[stock_df["Datetime"] < target_row["Datetime"]].tail(5)
        if len(past_rows) < 5:
            continue

        if target_return >= 0.01:
            label = 1
        elif target_return <= -0.01:
            label = 0
        else:
            continue

        row = {
            "company": company,
            "news_time": news_time,
            "context": context,
            "target_return": target_return,
            "target": label,
            "finbert_positive": news_row["finbert_positive"],
            "finbert_neutral": news_row["finbert_neutral"],
            "finbert_negative": news_row["finbert_negative"]
        }

        for i, (_, p_row) in enumerate(past_rows.iterrows(), 1):
            for col in stock_df.columns:
                if col == "Datetime":
                    continue
                row[f"x{i}_{col}"] = p_row[col]

        rows.append(row)

    return pd.DataFrame(rows)


In [2]:
base_dir = "./"
companies = {
    "AAPL": ("AAPL_1hour_data_365days.csv", "apple_finbert_finnhub.csv"),
    "AMZN": ("AMZN_1hour_data_365days.csv", "amazon_finbert_finnhub.csv"),
    "GOOGL": ("GOOGL_1hour_data_365days.csv", "google_finbert_finnhub.csv"),
    "MSFT": ("MSFT_1hour_data_365days.csv", "microsoft_finbert_finnhub.csv"),
    "TSLA": ("TSLA_1hour_data_365days.csv", "tesla_finbert_finnhub.csv"),
}

dfs = []
for company, (stock_file, news_file) in companies.items():
    stock_path = os.path.join(base_dir, stock_file)
    news_path = os.path.join(base_dir, news_file)
    if not os.path.exists(stock_path) or not os.path.exists(news_path):
        continue

    stock_df, news_df = load_stock_and_news(stock_path, news_path)
    merged_df = make_binary_merged_df(stock_df, news_df, company)
    dfs.append(merged_df)

final_df = pd.concat(dfs, ignore_index=True)
final_df.to_csv("news_stock_binary_classification.csv", index=False)


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate

# 1. 데이터 로드
df = pd.read_csv("news_stock_binary_classification.csv", parse_dates=["news_time"])
df["context"] = df["context"].astype(str)
df["company"] = df["company"].astype(str)

# 2. 입력 분리
feature_cols_seq = [col for col in df.columns if col.startswith("x")]
feature_cols_finbert = ["finbert_positive", "finbert_neutral", "finbert_negative"]
company_dummies = pd.get_dummies(df["company"], prefix="company")
context_dummies = pd.get_dummies(df["context"], prefix="context")
X_static = pd.concat([df[feature_cols_finbert], company_dummies, context_dummies], axis=1).astype(np.float32).values
X_seq = df[feature_cols_seq].values
y = df["target"].values

# 3. LSTM 시퀀스 리쉐이핑
X_seq = X_seq.reshape((-1, 5, len(feature_cols_seq)//5))

# 4. 시계열 스케일링
scaler = StandardScaler()
X_seq_scaled = scaler.fit_transform(X_seq.reshape(-1, X_seq.shape[-1])).reshape(X_seq.shape)

# 5. TimeSeriesSplit 적용
tscv = TimeSeriesSplit(n_splits=5)

for fold, (train_idx, test_idx) in enumerate(tscv.split(X_seq_scaled), 1):
    X_train_seq = X_seq_scaled[train_idx]
    X_test_seq = X_seq_scaled[test_idx]
    X_train_static = X_static[train_idx]
    X_test_static = X_static[test_idx]
    y_train = y[train_idx]
    y_test = y[test_idx]

    # 모델 정의
    tf.keras.backend.clear_session()
    np.random.seed(42)
    tf.random.set_seed(42)

    seq_input = Input(shape=(5, X_seq.shape[2]), name="seq_input")
    lstm_out = LSTM(64, activation="tanh")(seq_input)

    static_input = Input(shape=(X_static.shape[1],), name="static_input")
    x = Concatenate()([lstm_out, static_input])
    x = Dense(64, activation="relu")(x)
    x = Dense(32, activation="relu")(x)
    output = Dense(1, activation="sigmoid")(x)

    model = Model(inputs=[seq_input, static_input], outputs=output)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    # 학습
    model.fit([X_train_seq, X_train_static], y_train, epochs=50, batch_size=32, verbose=1)

    # 예측
    y_prob = model.predict([X_test_seq, X_test_static])
    y_pred = (y_prob.flatten() > 0.5).astype(int)

    # 출력
    print(f"\n📦 Fold {fold}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))



Epoch 1/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5527 - loss: 0.6813
Epoch 2/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7880 - loss: 0.5674 
Epoch 3/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8538 - loss: 0.3556 
Epoch 4/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9095 - loss: 0.2548 
Epoch 5/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9170 - loss: 0.2023 
Epoch 6/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9374 - loss: 0.1643 
Epoch 7/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9453 - loss: 0.1348 
Epoch 8/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.9616 - loss: 0.1085 
Epoch 9/50
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/50
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5480 - loss: 0.6853
Epoch 2/50
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5430 - loss: 0.6913
Epoch 3/50
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5430 - loss: 0.6906
Epoch 4/50
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5430 - loss: 0.6905
Epoch 5/50
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5430 - loss: 0.6905
Epoch 6/50
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5430 - loss: 0.6905
Epoch 7/50
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5430 - loss: 0.6905
Epoch 8/50
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5430 - loss: 0.6905
Epoch 9/50
[1m104/104[0m [32m━━━━━━━━

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Epoch 1/50
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.4961 - loss: 0.6934
Epoch 2/50
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5157 - loss: 0.6929
Epoch 3/50
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5157 - loss: 0.6929
Epoch 4/50
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5157 - loss: 0.6928
Epoch 5/50
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5157 - loss: 0.6928
Epoch 6/50
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5157 - loss: 0.6928
Epoch 7/50
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5157 - loss: 0.6928
Epoch 8/50
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5157 - loss: 0.6928
Epoch 9/50
[1m129/129[0m [32m━━━━━━━━

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1. Load data
base_df = pd.read_csv("news_stock_binary_classification.csv", parse_dates=["news_time"])
base_df = base_df.dropna().reset_index(drop=True)

# 2. Extract FinBERT, Time Context, Company info
finbert_cols = [c for c in base_df.columns if c.startswith("finbert_")]

context_dummies = pd.get_dummies(base_df["context"], prefix="context")
company_dummies = pd.get_dummies(base_df["company"], prefix="company")

static_feats = pd.concat([
    base_df[finbert_cols],
    context_dummies,
    company_dummies
], axis=1).astype(np.float32)

# 3. Extract and reshape sequence features (x1_~x5_)
x_cols = [c for c in base_df.columns if c.startswith("x") and not c.startswith("x_")]
X_seq = []
for i in range(len(base_df)):
    timestep_data = []
    for t in range(1, 6):
        timestep_data.append(base_df.iloc[i][[col for col in x_cols if col.startswith(f"x{t}_")]].values)
    X_seq.append(timestep_data)
X_seq = np.array(X_seq, dtype=np.float32)  # (n_samples, 5, num_features_per_time)

# 4. Normalize sequence part
n_samples, time_steps, n_features = X_seq.shape
scaler = StandardScaler()
X_seq_reshaped = X_seq.reshape(-1, n_features)
X_seq_scaled = scaler.fit_transform(X_seq_reshaped).reshape(n_samples, time_steps, n_features)

# 5. Concatenate static features to each timestep
finbert_feats = static_feats.values[:, np.newaxis, :].repeat(time_steps, axis=1)
X_combined = np.concatenate([X_seq_scaled, finbert_feats], axis=-1)

# 6. Labels
y = base_df["target"].values

# 7. Tensor conversion
X_tensor = torch.tensor(X_combined, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

# 8. Time-based split
split_idx = int(len(base_df) * 0.8)
X_train, X_test = X_tensor[:split_idx], X_tensor[split_idx:]
y_train, y_test = y_tensor[:split_idx], y_tensor[split_idx:]

train_dl = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
test_dl  = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

# 9. Model
class DeepLSTMClassifier(nn.Module):
    def __init__(self, input_dim, lstm_hidden=128, mlp_hidden=[128, 64], output_dim=2):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, lstm_hidden, num_layers=1, batch_first=True, dropout=0.2)
        self.mlp = nn.Sequential(
            nn.Linear(lstm_hidden, mlp_hidden[0]),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(mlp_hidden[0], mlp_hidden[1]),
            nn.ReLU(),
            nn.Linear(mlp_hidden[1], output_dim)
        )

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.mlp(hn[-1])

# 10. Focal Loss
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=0.8):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha

    def forward(self, inputs, targets):
        ce_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-ce_loss)
        return (self.alpha * (1 - pt) ** self.gamma * ce_loss).mean()

# 11. Train
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = DeepLSTMClassifier(input_dim=X_combined.shape[-1]).to(device)
loss_fn = FocalLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

best_loss = float('inf')
patience = 10
no_improve = 0

for epoch in range(100):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation
    val_loss = 0
    model.eval()
    with torch.no_grad():
        for xb, yb in test_dl:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = loss_fn(pred, yb)
            val_loss += loss.item()
    val_loss /= len(test_dl)

    print(f"Epoch {epoch+1:02d} | Train Loss: {total_loss:.4f} | Val Loss: {val_loss:.4f}")

    if val_loss < best_loss:
        best_loss = val_loss
        best_model_state = model.state_dict()
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= patience:
            print("⏹️ Early stopping triggered.")
            break

# Load best model
model.load_state_dict(best_model_state)
model.eval()

# 12. Evaluation
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        xb = xb.to(device)
        preds = model(xb).argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(yb.numpy())

print("\n📈 Accuracy:", accuracy_score(all_labels, all_preds))
print("\n📊 Classification Report:\n", classification_report(all_labels, all_preds))
print("\n🧱 Confusion Matrix:\n", confusion_matrix(all_labels, all_preds))




Epoch 01 | Train Loss: 15.6411 | Val Loss: 0.1352
Epoch 02 | Train Loss: 11.7393 | Val Loss: 0.2596
Epoch 03 | Train Loss: 8.3732 | Val Loss: 0.3408
Epoch 04 | Train Loss: 6.4889 | Val Loss: 0.4450
Epoch 05 | Train Loss: 5.2921 | Val Loss: 0.6433
Epoch 06 | Train Loss: 4.5460 | Val Loss: 0.6905
Epoch 07 | Train Loss: 3.9450 | Val Loss: 0.7545
Epoch 08 | Train Loss: 3.7066 | Val Loss: 0.5245
Epoch 09 | Train Loss: 3.2327 | Val Loss: 0.9912
Epoch 10 | Train Loss: 2.2145 | Val Loss: 1.2055
Epoch 11 | Train Loss: 1.7498 | Val Loss: 1.4031
⏹️ Early stopping triggered.

📈 Accuracy: 0.5668016194331984

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.38      0.44       449
           1       0.58      0.72      0.65       539

    accuracy                           0.57       988
   macro avg       0.56      0.55      0.54       988
weighted avg       0.56      0.57      0.55       988


🧱 Confusion Matrix:
 [[171 278]
 [150 389]]

In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Dense, Concatenate, LayerNormalization, Dropout, MultiHeadAttention, GlobalAveragePooling1D
)
from tensorflow.keras.optimizers import Adam

# 1. Load data
df = pd.read_csv("news_stock_binary_classification.csv", parse_dates=["news_time"])
df["context"] = df["context"].astype(str)
df["company"] = df["company"].astype(str)

# 2. Split features
feature_cols_seq = [col for col in df.columns if col.startswith("x")]
feature_cols_finbert = ["finbert_positive", "finbert_neutral", "finbert_negative"]
company_dummies = pd.get_dummies(df["company"], prefix="company")
context_dummies = pd.get_dummies(df["context"], prefix="context")
X_static = pd.concat([df[feature_cols_finbert], company_dummies, context_dummies], axis=1).astype(np.float32).values
X_seq = df[feature_cols_seq].values
y = df["target"].values

# 3. Reshape sequence (batch, timesteps, features)
X_seq = X_seq.reshape((-1, 5, len(feature_cols_seq) // 5))

# 4. Normalize
scaler = StandardScaler()
X_seq_scaled = scaler.fit_transform(X_seq.reshape(-1, X_seq.shape[-1])).reshape(X_seq.shape)

# 5. TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=5)

for fold, (train_idx, test_idx) in enumerate(tscv.split(X_seq_scaled), 1):
    X_train_seq, X_test_seq = X_seq_scaled[train_idx], X_seq_scaled[test_idx]
    X_train_static, X_test_static = X_static[train_idx], X_static[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    tf.keras.backend.clear_session()
    tf.random.set_seed(42)
    np.random.seed(42)

    # Transformer Encoder Layer
    def transformer_encoder(inputs, num_heads=4, ff_dim=128, dropout=0.1):
        attn_output = MultiHeadAttention(num_heads=num_heads, key_dim=inputs.shape[-1])(inputs, inputs)
        attn_output = Dropout(dropout)(attn_output)
        out1 = LayerNormalization(epsilon=1e-6)(inputs + attn_output)

        ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(inputs.shape[-1])
        ])
        ffn_output = ffn(out1)
        ffn_output = Dropout(dropout)(ffn_output)
        return LayerNormalization(epsilon=1e-6)(out1 + ffn_output)

    # Model definition
    seq_input = Input(shape=(5, X_seq.shape[2]), name="seq_input")
    x = transformer_encoder(seq_input)
    x = GlobalAveragePooling1D()(x)

    static_input = Input(shape=(X_static.shape[1],), name="static_input")
    x = Concatenate()([x, static_input])
    x = Dense(128, activation="relu")(x)
    x = Dropout(0.3)(x)
    x = Dense(64, activation="relu")(x)
    output = Dense(1, activation="sigmoid")(x)

    model = Model(inputs=[seq_input, static_input], outputs=output)
    model.compile(loss="binary_crossentropy", optimizer=Adam(1e-3), metrics=["accuracy"])

    # Train
    model.fit([X_train_seq, X_train_static], y_train, epochs=30, batch_size=32, verbose=1)

    # Evaluate
    y_prob = model.predict([X_test_seq, X_test_static])
    y_pred = (y_prob.flatten() > 0.5).astype(int)

    print(f"\n📦 Fold {fold}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

📦 Fold 1
Accuracy: 0.5121359223300971
Confusion Matrix:
 [[121 318]
 [ 84 301]]
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.28      0.38       439
           1       0.49      0.78      0.60       385

    accuracy                           0.51       824
   macro avg       0.54      0.53      0.49       824
weighted avg       0.54      0.51      0.48       824

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/3

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

📦 Fold 4
Accuracy: 0.3580097087378641
Confusion Matrix:
 [[295   0]
 [529   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.36      1.00      0.53       295
           1       0.00      0.00      0.00       529

    accuracy                           0.36       824
   macro avg       0.18      0.50      0.26       824
weighted avg       0.13      0.36      0.19       824

Epoch 1/30


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

📦 Fold 5
Accuracy: 0.49029126213592233
Confusion Matrix:
 [[404   0]
 [420   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.49      1.00      0.66       404
           1       0.00      0.00      0.00       420

    accuracy                           0.49       824
   macro avg       0.25      0.50      0.33       824
weighted avg       0.24      0.49      0.32       824



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
