# 1

In [1]:
import pandas as pd

# 파일 경로
stock_path = "./AAPL_1hour_data_365days.csv"
news_path = "./apple_finbert_finnhub.csv"

# 데이터 불러오기
stock_df = pd.read_csv(stock_path, parse_dates=["Datetime"])
news_df = pd.read_csv(news_path, parse_dates=["pubDate"])

# 타임존 제거
stock_df["Datetime"] = stock_df["Datetime"].dt.tz_localize(None)
news_df["pubDate"] = news_df["pubDate"].dt.tz_localize(None)

# 정렬
stock_df = stock_df.sort_values("Datetime").reset_index(drop=True)

# 제외할 열
exclude_cols = ['Is_Trading_Hours', 'Is_Market_Open', 'Is_Premarket', 'Is_Aftermarket', 'Is_Extended_Hours']
stock_df = stock_df.drop(columns=[col for col in exclude_cols if col in stock_df.columns])

# 병합 결과
rows = []

for _, news_row in news_df.iterrows():
    news_time = news_row['pubDate']

    # 뉴스 이후 가장 가까운 주가
    future_stock = stock_df[stock_df['Datetime'] > news_time].head(1)
    if future_stock.empty:
        continue

    target_row = future_stock.iloc[0]
    target_time = target_row['Datetime']
    target_close = target_row['Close']

    # 과거 3개 주가
    past_rows = stock_df[stock_df['Datetime'] < target_time].tail(3)
    if len(past_rows) < 3:
        continue

    past_last_close = past_rows.iloc[-1]['Close']

    # 상승률
    return_pct = (target_close - past_last_close) / past_last_close * 100
    label = 1 if return_pct >= 0.4 else (-1 if return_pct <= -0.4 else 0)

    # 병합 row 생성
    row = {
        "news_id": news_row['id'],
        "news_time": news_time,
        "target_close": target_close,
        "target_return_pct": return_pct,
        "target_multi_raw": label,
        "finbert_positive": news_row['finbert_positive'],
        "finbert_neutral": news_row['finbert_neutral'],
        "finbert_negative": news_row['finbert_negative'],
    }

    # 과거 3개 flatten
    for i, (_, stock_row) in enumerate(past_rows.iterrows(), 1):
        for col in stock_df.columns:
            if col == "Datetime":
                continue
            row[f"x{i}_{col}"] = stock_row[col]

    rows.append(row)

# 최종 DataFrame
merged_df = pd.DataFrame(rows)

# 클래스 0/1/2로 매핑 (XGBoost용)
label_map = {-1: 0, 0: 1, 1: 2}
merged_df["target_multi"] = merged_df["target_multi_raw"].map(label_map)

# 저장
merged_df.to_csv("news_stock_classification.csv", index=False)
print("✅ 병합 완료: news_stock_classification.csv 저장됨")


✅ 병합 완료: news_stock_classification.csv 저장됨


In [2]:

import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1. 데이터 불러오기
df = pd.read_csv("news_stock_classification.csv", parse_dates=["news_time"])

# 2. Feature 및 Label 준비
feature_cols = [col for col in df.columns if col.startswith("x") or col.startswith("finbert_")]
X = df[feature_cols].fillna(0)
y = df["target_multi"]

# 3. 시계열 데이터 3-step 생성 (x1_, x2_, x3_)
X_seq = []
for i in range(len(X)):
    X_seq.append([
        X.iloc[i][[col for col in X.columns if col.startswith("x1_")]].values,
        X.iloc[i][[col for col in X.columns if col.startswith("x2_")]].values,
        X.iloc[i][[col for col in X.columns if col.startswith("x3_")]].values
    ])
X_seq = np.array(X_seq)

# 4. FinBERT 피처 추가 (Broadcast across time steps)
finbert_feats = X[[c for c in X.columns if c.startswith("finbert_")]].values
finbert_feats = np.repeat(finbert_feats[:, np.newaxis, :], 3, axis=1)
X_seq = np.concatenate([X_seq, finbert_feats], axis=-1)

# 5. 정규화
n_samples, time_steps, n_features = X_seq.shape
X_reshaped = X_seq.reshape(-1, n_features)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_reshaped)
X_seq = X_scaled.reshape(n_samples, time_steps, n_features)

# 6. Tensor로 변환
X_tensor = torch.tensor(X_seq, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# 7. Train/Test 분리
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, shuffle=False)
train_dl = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
test_dl = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

# 8. LSTM 모델 정의
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, output_dim=3):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.fc(hn[-1])

# 9. 학습 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(input_dim=n_features).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 10. 학습 루프
for epoch in range(50):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} | Loss: {total_loss:.4f}")

Epoch 1 | Loss: 164.0788
Epoch 2 | Loss: 141.5729
Epoch 3 | Loss: 125.2798
Epoch 4 | Loss: 111.2254
Epoch 5 | Loss: 99.4632
Epoch 6 | Loss: 90.4368
Epoch 7 | Loss: 81.4495
Epoch 8 | Loss: 72.8773
Epoch 9 | Loss: 67.3597
Epoch 10 | Loss: 60.8399
Epoch 11 | Loss: 55.6103
Epoch 12 | Loss: 51.2171
Epoch 13 | Loss: 46.9381
Epoch 14 | Loss: 42.8304
Epoch 15 | Loss: 39.2854
Epoch 16 | Loss: 36.0406
Epoch 17 | Loss: 32.5115
Epoch 18 | Loss: 29.6463
Epoch 19 | Loss: 27.1152
Epoch 20 | Loss: 24.6855
Epoch 21 | Loss: 23.1054
Epoch 22 | Loss: 20.5943
Epoch 23 | Loss: 18.8145
Epoch 24 | Loss: 17.1873
Epoch 25 | Loss: 15.4179
Epoch 26 | Loss: 14.2074
Epoch 27 | Loss: 12.4770
Epoch 28 | Loss: 11.9293
Epoch 29 | Loss: 10.3284
Epoch 30 | Loss: 9.1492
Epoch 31 | Loss: 8.0573
Epoch 32 | Loss: 7.6288
Epoch 33 | Loss: 6.4673
Epoch 34 | Loss: 5.7015
Epoch 35 | Loss: 5.3819
Epoch 36 | Loss: 5.4640
Epoch 37 | Loss: 4.4068
Epoch 38 | Loss: 5.6787
Epoch 39 | Loss: 3.5228
Epoch 40 | Loss: 2.8861
Epoch 41 | Loss:

In [3]:

# 11. 평가
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        xb = xb.to(device)
        preds = model(xb).argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(yb.numpy())

print("\n📈 Accuracy:", accuracy_score(all_labels, all_preds))
print("\n📊 Classification Report:\n", classification_report(all_labels, all_preds))
print("\n🧱 Confusion Matrix:\n", confusion_matrix(all_labels, all_preds))


📈 Accuracy: 0.5701805474665114

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.10      0.13      0.12       246
           1       0.75      0.75      0.75      1224
           2       0.19      0.14      0.16       247

    accuracy                           0.57      1717
   macro avg       0.35      0.34      0.34      1717
weighted avg       0.58      0.57      0.57      1717


🧱 Confusion Matrix:
 [[ 33 136  77]
 [244 912  68]
 [ 46 167  34]]


# 2

In [8]:
import pandas as pd
import os

def load_stock_and_news(stock_path, news_path):
    stock = pd.read_csv(stock_path, parse_dates=["Datetime"])
    news = pd.read_csv(news_path, parse_dates=["pubDate"])

    stock["Datetime"] = stock["Datetime"].dt.tz_localize(None)
    news["pubDate"] = news["pubDate"].dt.tz_localize(None)

    # 정렬 및 제외 열 제거
    stock = stock.sort_values("Datetime").reset_index(drop=True)
    stock = stock.drop(columns=[col for col in stock.columns if col.startswith("Is_")])

    return stock, news


def make_binary_merged_df(stock_df, news_df, company):
    rows = []

    for _, news_row in news_df.iterrows():
        news_time = news_row["pubDate"]

        # 뉴스 이후 가장 가까운 주가
        future_row = stock_df[stock_df["Datetime"] > news_time].head(1)
        if future_row.empty:
            continue

        target_row = future_row.iloc[0]
        target_return = target_row.get("Returns", None)
        if pd.isna(target_return):
            continue

        # 과거 5개
        past_rows = stock_df[stock_df["Datetime"] < target_row["Datetime"]].tail(5)
        if len(past_rows) < 5:
            continue

        if target_return >= 0.01:
            label = 1
        elif target_return <= -0.01:
            label = 0
        else:
            continue  # 기준 미달인 경우는 무시

        row = {
            "company": company,
            "news_time": news_time,
            "target_return": target_return,
            "target": label,
            "finbert_positive": news_row["finbert_positive"],
            "finbert_neutral": news_row["finbert_neutral"],
            "finbert_negative": news_row["finbert_negative"]
        }

        for i, (_, p_row) in enumerate(past_rows.iterrows(), 1):
            for col in stock_df.columns:
                if col == "Datetime":
                    continue
                row[f"x{i}_{col}"] = p_row[col]

        rows.append(row)

    return pd.DataFrame(rows)

In [9]:
base_dir = "./"  # 압축 풀린 폴더 기준
companies = {
    "AAPL": ("AAPL_1hour_data_365days.csv", "apple_finbert_finnhub.csv"),
    "AMZN": ("AMZN_1hour_data_365days.csv", "amazon_finbert_finnhub.csv"),
    "GOOGL": ("GOOGL_1hour_data_365days.csv", "google_finbert_finnhub.csv"),
    "MSFT": ("MSFT_1hour_data_365days.csv", "microsoft_finbert_finnhub.csv"),
    "TSLA": ("TSLA_1hour_data_365days.csv", "tesla_finbert_finnhub.csv"),
}

dfs = []
for company, (stock_file, news_file) in companies.items():
    stock_path = os.path.join(base_dir, stock_file)
    news_path = os.path.join(base_dir, news_file)
    if not os.path.exists(stock_path) or not os.path.exists(news_path):
        continue

    stock_df, news_df = load_stock_and_news(stock_path, news_path)
    merged_df = make_binary_merged_df(stock_df, news_df, company)
    dfs.append(merged_df)

# 최종 병합
final_df = pd.concat(dfs, ignore_index=True)
final_df.to_csv("news_stock_binary_classification.csv", index=False)
print("news_stock_binary_classification.csv 저장 완료")


news_stock_binary_classification.csv 저장 완료


In [14]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1. Load data
df = pd.read_csv("news_stock_binary_classification.csv", parse_dates=["news_time"])
df = df.dropna()

# 2. Features & Labels
feature_cols = [col for col in df.columns if col.startswith("x") or col.startswith("finbert_")]
X = df[feature_cols].fillna(0)
y = df["target"]

# 3. Reshape to time-series: x1_ ~ x5_
X_seq = []
for i in range(len(X)):
    timestep_data = []
    for t in range(1, 6):
        timestep_data.append(X.iloc[i][[col for col in X.columns if col.startswith(f"x{t}_")]].values)
    X_seq.append(timestep_data)
X_seq = np.array(X_seq)

# 4. FinBERT features broadcast
finbert_feats = X[[c for c in X.columns if c.startswith("finbert_")]].values
finbert_feats = np.repeat(finbert_feats[:, np.newaxis, :], 5, axis=1)
X_seq = np.concatenate([X_seq, finbert_feats], axis=-1)

# 5. Normalize
n_samples, time_steps, n_features = X_seq.shape
X_reshaped = X_seq.reshape(-1, n_features)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_reshaped)
X_seq = X_scaled.reshape(n_samples, time_steps, n_features)

# 6. Tensor conversion
X_tensor = torch.tensor(X_seq, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# 7. Time-based split
split_idx = int(len(df) * 0.8)
X_train, X_test = X_tensor[:split_idx], X_tensor[split_idx:]
y_train, y_test = y_tensor[:split_idx], y_tensor[split_idx:]

train_dl = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
test_dl  = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

# 8. Model - Deep LSTM + MLP
class DeepLSTMClassifier(nn.Module):
    def __init__(self, input_dim, lstm_hidden=512, mlp_hidden=[512, 256, 128, 64], output_dim=2):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, lstm_hidden, num_layers=2, batch_first=True, dropout=0.3)
        self.mlp = nn.Sequential(
            nn.Linear(lstm_hidden, mlp_hidden[0]),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(mlp_hidden[0], mlp_hidden[1]),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(mlp_hidden[1], mlp_hidden[2]),
            nn.ReLU(),
            nn.Linear(mlp_hidden[2], mlp_hidden[3]),
            nn.ReLU(),
            nn.Linear(mlp_hidden[3], output_dim)
        )

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.mlp(hn[-1])

# 9. Focal Loss 정의
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=0.8):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha

    def forward(self, inputs, targets):
        ce_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-ce_loss)
        focal = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal.mean()

# 10. Training Setup + EarlyStopping
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = DeepLSTMClassifier(input_dim=n_features).to(device)

loss_fn = FocalLoss(gamma=2.0, alpha=2.0)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
patience = 12
best_loss = float("inf")
epochs_no_improve = 0

for epoch in range(100):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation loss
    val_loss = 0
    model.eval()
    with torch.no_grad():
        for xb, yb in test_dl:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = loss_fn(pred, yb)
            val_loss += loss.item()
    val_loss /= len(test_dl)

    print(f"Epoch {epoch+1:02d} | Train Loss: {total_loss:.4f} | Val Loss: {val_loss:.4f}")

    if val_loss < best_loss:
        best_loss = val_loss
        epochs_no_improve = 0
        best_model_state = model.state_dict()
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("⏹️ Early stopping triggered.")
            break

model.load_state_dict(best_model_state)

# 11. 평가
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        xb = xb.to(device)
        preds = model(xb).argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(yb.numpy())

print("\n📈 Accuracy:", accuracy_score(all_labels, all_preds))
print("\n📊 Classification Report:\n", classification_report(all_labels, all_preds))
print("\n🧱 Confusion Matrix:\n", confusion_matrix(all_labels, all_preds))


Epoch 01 | Train Loss: 41.3100 | Val Loss: 0.3419
Epoch 02 | Train Loss: 37.3130 | Val Loss: 0.4158
Epoch 03 | Train Loss: 32.4232 | Val Loss: 0.7833
Epoch 04 | Train Loss: 28.6629 | Val Loss: 0.7505
Epoch 05 | Train Loss: 22.6333 | Val Loss: 1.1208
Epoch 06 | Train Loss: 22.0089 | Val Loss: 0.9983
Epoch 07 | Train Loss: 17.0868 | Val Loss: 1.4601
Epoch 08 | Train Loss: 14.0339 | Val Loss: 2.3823
Epoch 09 | Train Loss: 14.4976 | Val Loss: 1.4230
Epoch 10 | Train Loss: 12.8186 | Val Loss: 1.4774
Epoch 11 | Train Loss: 9.4039 | Val Loss: 2.5632
Epoch 12 | Train Loss: 7.7556 | Val Loss: 2.0023
Epoch 13 | Train Loss: 6.9513 | Val Loss: 3.5774
⏹️ Early stopping triggered.

📈 Accuracy: 0.5678137651821862

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.47      0.50       449
           1       0.59      0.65      0.62       539

    accuracy                           0.57       988
   macro avg       0.56      0.56      0.56    