In [24]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [25]:
# 1. Load dataset
df = pd.read_csv("news_stock_binary_classification.csv")
df = df.dropna()

# 2. Drop unused columns
X = df.drop(columns=["company", "news_time", "target_return", "target"])
y = df["target"]

# 3. 시간순 정렬 및 Split
df_sorted = df.sort_values("news_time").reset_index(drop=True)
split_idx = int(len(df_sorted) * 0.8)

X_train = df_sorted.iloc[:split_idx].drop(columns=["company", "news_time", "target_return", "target"])
y_train = df_sorted.iloc[:split_idx]["target"]
X_test  = df_sorted.iloc[split_idx:].drop(columns=["company", "news_time", "target_return", "target"])
y_test  = df_sorted.iloc[split_idx:]["target"]


In [27]:


# 4. RandomForest 학습
clf = RandomForestClassifier(n_estimators=300, max_depth=12, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)

# 5. 평가
y_pred = clf.predict(X_test)

print("📈 Accuracy:", accuracy_score(y_test, y_pred))
print("📊 Classification Report:\n", classification_report(y_test, y_pred))
print("🧱 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


📈 Accuracy: 0.4415322580645161
📊 Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.70      0.45       323
           1       0.69      0.32      0.43       669

    accuracy                           0.44       992
   macro avg       0.51      0.51      0.44       992
weighted avg       0.57      0.44      0.44       992

🧱 Confusion Matrix:
 [[227  96]
 [458 211]]


In [44]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1. 데이터 로드
df = pd.read_csv("news_stock_binary_classification.csv", parse_dates=["news_time"])
df = df.dropna()

# 2. feature / label 분리
feature_cols = [col for col in df.columns if col.startswith("x") or col.startswith("finbert_")]
X = df[feature_cols].fillna(0)
y = df["target"]

# 3. 시계열 재구성: x1_, x2_, ..., x5_
X_seq = []
for i in range(len(X)):
    timestep_data = []
    for t in range(1, 6):
        timestep_data.append(X.iloc[i][[col for col in X.columns if col.startswith(f"x{t}_")]].values)
    X_seq.append(timestep_data)
X_seq = np.array(X_seq)

# 4. FinBERT 피처 broadcast
finbert_feats = X[[c for c in X.columns if c.startswith("finbert_")]].values
finbert_feats = np.repeat(finbert_feats[:, np.newaxis, :], 5, axis=1)
X_seq = np.concatenate([X_seq, finbert_feats], axis=-1)

# 5. 정규화
n_samples, time_steps, n_features = X_seq.shape
X_reshaped = X_seq.reshape(-1, n_features)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_reshaped)
X_seq = X_scaled.reshape(n_samples, time_steps, n_features)

# 6. Tensor 변환
X_tensor = torch.tensor(X_seq, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# 7. Train/Test Split (시간순 정렬 기반)
df_sorted = df.sort_values("news_time").reset_index(drop=True)
split_idx = int(len(df_sorted) * 0.8)

X_train = X_tensor[:split_idx]
y_train = y_tensor[:split_idx]
X_test  = X_tensor[split_idx:]
y_test  = y_tensor[split_idx:]

train_dl = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
test_dl  = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

# 8. LSTM 모델 정의
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=256, output_dim=2):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.fc(hn[-1])

# 9. 학습
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(input_dim=n_features).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(50):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} | Loss: {total_loss:.4f}")


Epoch 1 | Loss: 78.4516
Epoch 2 | Loss: 58.7146
Epoch 3 | Loss: 42.1756
Epoch 4 | Loss: 32.3142
Epoch 5 | Loss: 25.7207
Epoch 6 | Loss: 19.7869
Epoch 7 | Loss: 17.6610
Epoch 8 | Loss: 12.8664
Epoch 9 | Loss: 9.5621
Epoch 10 | Loss: 9.0111
Epoch 11 | Loss: 6.4109
Epoch 12 | Loss: 9.0704
Epoch 13 | Loss: 6.9062
Epoch 14 | Loss: 3.5278
Epoch 15 | Loss: 4.9738
Epoch 16 | Loss: 3.8271
Epoch 17 | Loss: 2.8743
Epoch 18 | Loss: 6.3047
Epoch 19 | Loss: 2.6984
Epoch 20 | Loss: 1.1845
Epoch 21 | Loss: 0.5914
Epoch 22 | Loss: 0.5823
Epoch 23 | Loss: 0.3787
Epoch 24 | Loss: 0.2400
Epoch 25 | Loss: 0.5390
Epoch 26 | Loss: 2.5048
Epoch 27 | Loss: 8.3455
Epoch 28 | Loss: 1.6420
Epoch 29 | Loss: 0.5892
Epoch 30 | Loss: 0.5261
Epoch 31 | Loss: 0.3242
Epoch 32 | Loss: 1.8154
Epoch 33 | Loss: 0.4293
Epoch 34 | Loss: 0.2156
Epoch 35 | Loss: 2.0572
Epoch 36 | Loss: 1.5774
Epoch 37 | Loss: 2.6315
Epoch 38 | Loss: 1.0033
Epoch 39 | Loss: 1.3288
Epoch 40 | Loss: 0.3430
Epoch 41 | Loss: 1.0451
Epoch 42 | Loss: 

In [58]:

# 10. 평가
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        preds = model(xb.to(device)).argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(yb.numpy())

print("\n📈 Accuracy:", accuracy_score(all_labels, all_preds))
print("\n📊 Classification Report:\n", classification_report(all_labels, all_preds))
print("\n🧱 Confusion Matrix:\n", confusion_matrix(all_labels, all_preds))


📈 Accuracy: 0.5495951417004049

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.47      0.49       449
           1       0.58      0.62      0.60       539

    accuracy                           0.55       988
   macro avg       0.54      0.54      0.54       988
weighted avg       0.55      0.55      0.55       988


🧱 Confusion Matrix:
 [[210 239]
 [206 333]]


In [57]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1. 데이터 로드
df = pd.read_csv("news_stock_binary_classification.csv", parse_dates=["news_time"])
df = df.dropna()

# 2. feature / label 분리
feature_cols = [col for col in df.columns if col.startswith("x") or col.startswith("finbert_")]
X = df[feature_cols].fillna(0)
y = df["target"]

# 3. 시계열 재구성: x1_, x2_, ..., x5_
X_seq = []
for i in range(len(X)):
    timestep_data = []
    for t in range(1, 6):
        timestep_data.append(X.iloc[i][[col for col in X.columns if col.startswith(f"x{t}_")]].values)
    X_seq.append(timestep_data)
X_seq = np.array(X_seq)

# 4. FinBERT 피처 broadcast
finbert_feats = X[[c for c in X.columns if c.startswith("finbert_")]].values
finbert_feats = np.repeat(finbert_feats[:, np.newaxis, :], 5, axis=1)
X_seq = np.concatenate([X_seq, finbert_feats], axis=-1)

# 5. 정규화
n_samples, time_steps, n_features = X_seq.shape
X_reshaped = X_seq.reshape(-1, n_features)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_reshaped)
X_seq = X_scaled.reshape(n_samples, time_steps, n_features)

# 6. Tensor 변환
X_tensor = torch.tensor(X_seq, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# 7. Train/Test Split (시간순 정렬 기반)
df_sorted = df.sort_values("news_time").reset_index(drop=True)
split_idx = int(len(df_sorted) * 0.8)

X_train = X_tensor[:split_idx]
y_train = y_tensor[:split_idx]
X_test  = X_tensor[split_idx:]
y_test  = y_tensor[split_idx:]

train_dl = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
test_dl  = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

# 8. LSTM 모델 정의
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=256, output_dim=2):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.fc(hn[-1])

# 9. 학습 및 EarlyStopping
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(input_dim=n_features).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

patience = 5
best_loss = float("inf")
epochs_no_improve = 0

for epoch in range(50):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    val_loss = 0
    model.eval()
    with torch.no_grad():
        for xb, yb in test_dl:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = loss_fn(pred, yb)
            val_loss += loss.item()
    val_loss /= len(test_dl)

    print(f"Epoch {epoch+1} | Train Loss: {total_loss:.4f} | Val Loss: {val_loss:.4f}")

    if val_loss < best_loss:
        best_loss = val_loss
        epochs_no_improve = 0
        best_model_state = model.state_dict()
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("⏹️ Early stopping triggered.")
            break

model.load_state_dict(best_model_state)



Epoch 1 | Train Loss: 78.4608 | Val Loss: 0.7263
Epoch 2 | Train Loss: 60.5373 | Val Loss: 1.2160
Epoch 3 | Train Loss: 43.0424 | Val Loss: 1.6089
Epoch 4 | Train Loss: 31.4305 | Val Loss: 2.1749
Epoch 5 | Train Loss: 24.8526 | Val Loss: 2.6882
Epoch 6 | Train Loss: 20.4344 | Val Loss: 2.5977
⏹️ Early stopping triggered.


<All keys matched successfully>

In [63]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1. Load data
df = pd.read_csv("news_stock_binary_classification.csv", parse_dates=["news_time"])
df = df.dropna()

# 2. Features & Labels
feature_cols = [col for col in df.columns if col.startswith("x") or col.startswith("finbert_")]
X = df[feature_cols].fillna(0)
y = df["target"]

# 3. Reshape to time-series: x1_ ~ x5_
X_seq = []
for i in range(len(X)):
    timestep_data = []
    for t in range(1, 6):
        timestep_data.append(X.iloc[i][[col for col in X.columns if col.startswith(f"x{t}_")]].values)
    X_seq.append(timestep_data)
X_seq = np.array(X_seq)

# 4. FinBERT features broadcast
finbert_feats = X[[c for c in X.columns if c.startswith("finbert_")]].values
finbert_feats = np.repeat(finbert_feats[:, np.newaxis, :], 5, axis=1)
X_seq = np.concatenate([X_seq, finbert_feats], axis=-1)

# 5. Normalize
n_samples, time_steps, n_features = X_seq.shape
X_reshaped = X_seq.reshape(-1, n_features)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_reshaped)
X_seq = X_scaled.reshape(n_samples, time_steps, n_features)

# 6. Tensor conversion
X_tensor = torch.tensor(X_seq, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# 7. Time-based split
split_idx = int(len(df) * 0.8)
X_train, X_test = X_tensor[:split_idx], X_tensor[split_idx:]
y_train, y_test = y_tensor[:split_idx], y_tensor[split_idx:]

train_dl = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
test_dl  = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

# 8. Model - Deep LSTM + MLP
class DeepLSTMClassifier(nn.Module):
    def __init__(self, input_dim, lstm_hidden=512, mlp_hidden=[512, 256, 128, 64], output_dim=2):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, lstm_hidden, num_layers=2, batch_first=True, dropout=0.3)
        self.mlp = nn.Sequential(
            nn.Linear(lstm_hidden, mlp_hidden[0]),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(mlp_hidden[0], mlp_hidden[1]),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(mlp_hidden[1], mlp_hidden[2]),
            nn.ReLU(),
            nn.Linear(mlp_hidden[2], mlp_hidden[3]),
            nn.ReLU(),
            nn.Linear(mlp_hidden[3], output_dim)
        )

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.mlp(hn[-1])

# 9. Training Setup + EarlyStopping
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = DeepLSTMClassifier(input_dim=n_features).to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
patience = 12
best_loss = float("inf")
epochs_no_improve = 0

for epoch in range(100):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation loss
    val_loss = 0
    model.eval()
    with torch.no_grad():
        for xb, yb in test_dl:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = loss_fn(pred, yb)
            val_loss += loss.item()
    val_loss /= len(test_dl)

    print(f"Epoch {epoch+1:02d} | Train Loss: {total_loss:.4f} | Val Loss: {val_loss:.4f}")

    if val_loss < best_loss:
        best_loss = val_loss
        epochs_no_improve = 0
        best_model_state = model.state_dict()
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("⏹️ Early stopping triggered.")
            break

model.load_state_dict(best_model_state)

# 10. 평가
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        xb = xb.to(device)
        preds = model(xb).argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(yb.numpy())

print("\n📈 Accuracy:", accuracy_score(all_labels, all_preds))
print("\n📊 Classification Report:\n", classification_report(all_labels, all_preds))
print("\n🧱 Confusion Matrix:\n", confusion_matrix(all_labels, all_preds))


Epoch 01 | Train Loss: 82.2169 | Val Loss: 0.8210
Epoch 02 | Train Loss: 77.6868 | Val Loss: 0.8058
Epoch 03 | Train Loss: 66.2132 | Val Loss: 1.2726
Epoch 04 | Train Loss: 57.0719 | Val Loss: 1.5777
Epoch 05 | Train Loss: 46.6540 | Val Loss: 2.8110
Epoch 06 | Train Loss: 40.9093 | Val Loss: 2.0123
Epoch 07 | Train Loss: 32.2423 | Val Loss: 2.4338
Epoch 08 | Train Loss: 29.7118 | Val Loss: 3.0915
Epoch 09 | Train Loss: 24.8373 | Val Loss: 3.6224
Epoch 10 | Train Loss: 20.8776 | Val Loss: 3.4953
Epoch 11 | Train Loss: 15.8673 | Val Loss: 5.2062
Epoch 12 | Train Loss: 14.9615 | Val Loss: 4.9665
Epoch 13 | Train Loss: 11.6793 | Val Loss: 4.8425
Epoch 14 | Train Loss: 12.0278 | Val Loss: 5.1860
⏹️ Early stopping triggered.

📈 Accuracy: 0.4483805668016194

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.41      0.50      0.45       449
           1       0.49      0.41      0.45       539

    accuracy                           0.45     

In [66]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1. Load data
df = pd.read_csv("news_stock_binary_classification.csv", parse_dates=["news_time"])
df = df.dropna()

# 2. Features & Labels
feature_cols = [col for col in df.columns if col.startswith("x") or col.startswith("finbert_")]
X = df[feature_cols].fillna(0)
y = df["target"]

# 3. 시계열 재구성: x1_ ~ x5_
X_seq = []
for i in range(len(X)):
    timestep_data = []
    for t in range(1, 6):
        timestep_data.append(X.iloc[i][[col for col in X.columns if col.startswith(f"x{t}_")]].values)
    X_seq.append(timestep_data)
X_seq = np.array(X_seq)

# 4. FinBERT broadcast
finbert_feats = X[[c for c in X.columns if c.startswith("finbert_")]].values
finbert_feats = np.repeat(finbert_feats[:, np.newaxis, :], 5, axis=1)
X_seq = np.concatenate([X_seq, finbert_feats], axis=-1)

# 5. Normalize
n_samples, time_steps, n_features = X_seq.shape
X_reshaped = X_seq.reshape(-1, n_features)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_reshaped)
X_seq = X_scaled.reshape(n_samples, time_steps, n_features)

# 6. Tensor 변환
X_tensor = torch.tensor(X_seq, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# 7. 시간순 정렬 기반 분할
df_sorted = df.sort_values("news_time").reset_index(drop=True)
split_idx = int(len(df_sorted) * 0.8)
X_train = X_tensor[:split_idx]
y_train = y_tensor[:split_idx]
X_test  = X_tensor[split_idx:]
y_test  = y_tensor[split_idx:]

# 8. Augmentation (train only)
def jitter(X, sigma=0.02):
    noise = np.random.normal(loc=0, scale=sigma, size=X.shape)
    return X + noise

aug_X = [x.numpy() for x in X_train]
aug_y = [y.item() for y in y_train]

for i in range(len(X_train)):
    for _ in range(9):  # 9배 증식
        aug_X.append(jitter(X_train[i].numpy()))
        aug_y.append(y_train[i].item())

X_train_aug = torch.tensor(np.array(aug_X), dtype=torch.float32)
y_train_aug = torch.tensor(np.array(aug_y), dtype=torch.long)

train_dl = DataLoader(TensorDataset(X_train_aug, y_train_aug), batch_size=32, shuffle=True)
test_dl  = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

# 9. Deep LSTM 모델
class DeepLSTMClassifier(nn.Module):
    def __init__(self, input_dim, lstm_hidden=512, mlp_hidden=[512, 256, 128, 64], output_dim=2):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, lstm_hidden, num_layers=2, batch_first=True, dropout=0.3)
        self.mlp = nn.Sequential(
            nn.Linear(lstm_hidden, mlp_hidden[0]),
            nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(mlp_hidden[0], mlp_hidden[1]),
            nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(mlp_hidden[1], mlp_hidden[2]),
            nn.ReLU(),
            nn.Linear(mlp_hidden[2], mlp_hidden[3]),
            nn.ReLU(),
            nn.Linear(mlp_hidden[3], output_dim)
        )

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.mlp(hn[-1])

# 10. 학습 설정
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = DeepLSTMClassifier(input_dim=n_features).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# 11. EarlyStopping
patience = 3
best_loss = float("inf")
epochs_no_improve = 0

for epoch in range(100):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation
    val_loss = 0
    model.eval()
    with torch.no_grad():
        for xb, yb in test_dl:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = loss_fn(pred, yb)
            val_loss += loss.item()
    val_loss /= len(test_dl)

    print(f"Epoch {epoch+1:02d} | Train Loss: {total_loss:.4f} | Val Loss: {val_loss:.4f}")

    if val_loss < best_loss:
        best_loss = val_loss
        epochs_no_improve = 0
        best_model_state = model.state_dict()
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("⏹️ Early stopping triggered.")
            break

model.load_state_dict(best_model_state)

# 12. 평가
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        xb = xb.to(device)
        preds = model(xb).argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(yb.numpy())

print("\n📈 Accuracy:", accuracy_score(all_labels, all_preds))
print("\n📊 Classification Report:\n", classification_report(all_labels, all_preds))
print("\n🧱 Confusion Matrix:\n", confusion_matrix(all_labels, all_preds))


Epoch 01 | Train Loss: 433.8221 | Val Loss: 3.3136
Epoch 02 | Train Loss: 87.6928 | Val Loss: 3.3266
Epoch 03 | Train Loss: 44.5543 | Val Loss: 5.9145
Epoch 04 | Train Loss: 31.9904 | Val Loss: 3.6171
⏹️ Early stopping triggered.

📈 Accuracy: 0.4888663967611336

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.40      0.26      0.31       449
           1       0.52      0.68      0.59       539

    accuracy                           0.49       988
   macro avg       0.46      0.47      0.45       988
weighted avg       0.47      0.49      0.47       988


🧱 Confusion Matrix:
 [[115 334]
 [171 368]]


In [74]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1. Load data
df = pd.read_csv("news_stock_binary_classification.csv", parse_dates=["news_time"])
df = df.dropna()

# 2. Features & Labels
feature_cols = [col for col in df.columns if col.startswith("x") or col.startswith("finbert_")]
X = df[feature_cols].fillna(0)
y = df["target"]

# 3. Reshape to time-series: x1_ ~ x5_
X_seq = []
for i in range(len(X)):
    timestep_data = []
    for t in range(1, 6):
        timestep_data.append(X.iloc[i][[col for col in X.columns if col.startswith(f"x{t}_")]].values)
    X_seq.append(timestep_data)
X_seq = np.array(X_seq)

# 4. FinBERT features broadcast
finbert_feats = X[[c for c in X.columns if c.startswith("finbert_")]].values
finbert_feats = np.repeat(finbert_feats[:, np.newaxis, :], 5, axis=1)
X_seq = np.concatenate([X_seq, finbert_feats], axis=-1)

# 5. Normalize
n_samples, time_steps, n_features = X_seq.shape
X_reshaped = X_seq.reshape(-1, n_features)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_reshaped)
X_seq = X_scaled.reshape(n_samples, time_steps, n_features)

# 6. Tensor conversion
X_tensor = torch.tensor(X_seq, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# 7. Time-based split
split_idx = int(len(df) * 0.8)
X_train, X_test = X_tensor[:split_idx], X_tensor[split_idx:]
y_train, y_test = y_tensor[:split_idx], y_tensor[split_idx:]

train_dl = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
test_dl  = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

# 8. Model - Deep LSTM + MLP
class DeepLSTMClassifier(nn.Module):
    def __init__(self, input_dim, lstm_hidden=512, mlp_hidden=[512, 256, 128, 64], output_dim=2):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, lstm_hidden, num_layers=2, batch_first=True, dropout=0.3)
        self.mlp = nn.Sequential(
            nn.Linear(lstm_hidden, mlp_hidden[0]),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(mlp_hidden[0], mlp_hidden[1]),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(mlp_hidden[1], mlp_hidden[2]),
            nn.ReLU(),
            nn.Linear(mlp_hidden[2], mlp_hidden[3]),
            nn.ReLU(),
            nn.Linear(mlp_hidden[3], output_dim)
        )

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.mlp(hn[-1])

# 9. Focal Loss 정의
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=0.8):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha

    def forward(self, inputs, targets):
        ce_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-ce_loss)
        focal = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal.mean()

# 10. Training Setup + EarlyStopping
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = DeepLSTMClassifier(input_dim=n_features).to(device)

loss_fn = FocalLoss(gamma=2.0, alpha=2.0)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
patience = 12
best_loss = float("inf")
epochs_no_improve = 0

for epoch in range(100):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation loss
    val_loss = 0
    model.eval()
    with torch.no_grad():
        for xb, yb in test_dl:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = loss_fn(pred, yb)
            val_loss += loss.item()
    val_loss /= len(test_dl)

    print(f"Epoch {epoch+1:02d} | Train Loss: {total_loss:.4f} | Val Loss: {val_loss:.4f}")

    if val_loss < best_loss:
        best_loss = val_loss
        epochs_no_improve = 0
        best_model_state = model.state_dict()
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("⏹️ Early stopping triggered.")
            break

model.load_state_dict(best_model_state)

# 11. 평가
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        xb = xb.to(device)
        preds = model(xb).argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(yb.numpy())

print("\n📈 Accuracy:", accuracy_score(all_labels, all_preds))
print("\n📊 Classification Report:\n", classification_report(all_labels, all_preds))
print("\n🧱 Confusion Matrix:\n", confusion_matrix(all_labels, all_preds))


Epoch 01 | Train Loss: 41.6870 | Val Loss: 0.3657
Epoch 02 | Train Loss: 37.1740 | Val Loss: 0.4378
Epoch 03 | Train Loss: 30.8142 | Val Loss: 0.7631
Epoch 04 | Train Loss: 26.6454 | Val Loss: 1.4703
Epoch 05 | Train Loss: 24.4892 | Val Loss: 0.9435
Epoch 06 | Train Loss: 20.7112 | Val Loss: 1.4488
Epoch 07 | Train Loss: 17.3832 | Val Loss: 1.6344
Epoch 08 | Train Loss: 14.4508 | Val Loss: 1.5729
Epoch 09 | Train Loss: 15.5578 | Val Loss: 0.9707
Epoch 10 | Train Loss: 15.7847 | Val Loss: 1.3458
Epoch 11 | Train Loss: 12.5984 | Val Loss: 1.3393
Epoch 12 | Train Loss: 8.8785 | Val Loss: 0.8765
Epoch 13 | Train Loss: 9.3716 | Val Loss: 1.0733
⏹️ Early stopping triggered.

📈 Accuracy: 0.5738866396761133

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.53      0.59      0.56       449
           1       0.62      0.56      0.59       539

    accuracy                           0.57       988
   macro avg       0.57      0.58      0.57   

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate

# 1. 데이터 로드
df = pd.read_csv("news_stock_binary_classification.csv", parse_dates=["news_time"])
df["context"] = df["context"].astype(str)
df["company"] = df["company"].astype(str)

# 2. 입력 분리
feature_cols_seq = [col for col in df.columns if col.startswith("x")]
feature_cols_finbert = ["finbert_positive", "finbert_neutral", "finbert_negative"]
company_dummies = pd.get_dummies(df["company"], prefix="company")
context_dummies = pd.get_dummies(df["context"], prefix="context")
X_static = pd.concat([df[feature_cols_finbert], company_dummies, context_dummies], axis=1).astype(np.float32).values
X_seq = df[feature_cols_seq].values
y = df["target"].values

# 3. LSTM 시퀀스 리쉐이핑
X_seq = X_seq.reshape((-1, 5, len(feature_cols_seq)//5))

# 4. 시계열 스케일링
scaler = StandardScaler()
X_seq_scaled = scaler.fit_transform(X_seq.reshape(-1, X_seq.shape[-1])).reshape(X_seq.shape)

# 5. TimeSeriesSplit 적용
tscv = TimeSeriesSplit(n_splits=5)

for fold, (train_idx, test_idx) in enumerate(tscv.split(X_seq_scaled), 1):
    X_train_seq = X_seq_scaled[train_idx]
    X_test_seq = X_seq_scaled[test_idx]
    X_train_static = X_static[train_idx]
    X_test_static = X_static[test_idx]
    y_train = y[train_idx]
    y_test = y[test_idx]

    # 모델 정의
    tf.keras.backend.clear_session()
    np.random.seed(42)
    tf.random.set_seed(42)

    seq_input = Input(shape=(5, X_seq.shape[2]), name="seq_input")
    lstm_out = LSTM(64, activation="tanh")(seq_input)

    static_input = Input(shape=(X_static.shape[1],), name="static_input")
    x = Concatenate()([lstm_out, static_input])
    x = Dense(64, activation="relu")(x)
    x = Dense(32, activation="relu")(x)
    output = Dense(1, activation="sigmoid")(x)

    model = Model(inputs=[seq_input, static_input], outputs=output)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    # 학습
    model.fit([X_train_seq, X_train_static], y_train, epochs=50, batch_size=32, verbose=1)

    # 예측
    y_prob = model.predict([X_test_seq, X_test_static])
    y_pred = (y_prob.flatten() > 0.5).astype(int)

    # 출력
    print(f"\n📦 Fold {fold}")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

📦 Fold 1
Accuracy: 0.5097087378640777
Confusion Matrix:
 [[181 258]
 [146 239]]
Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.41      0.47       439
           1       0.48      0.62      0.54       385

    accuracy                           0.51       824
   macro avg       0.52      0.52      0.51       824
weighted avg       0.52      0.5

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

📦 Fold 4
Accuracy: 0.3580097087378641
Confusion Matrix:
 [[295   0]
 [529   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.36      1.00      0.53       295
           1       0.00      0.00      0.00       529

    accuracy                           0.36       824
   macro avg       0.18      0.50      0.26       824
weighted avg       0.13      0.36      0.19

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

📦 Fold 5
Accuracy: 0.49029126213592233
Confusion Matrix:
 [[404   0]
 [420   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.49      1.00      0.66       404
           1       0.00      0.00      0.00       420

    accuracy                           0.49       824
   macro avg       0.25      0.50      0.33       824
weighted avg       0.24      0.49      0.3

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [3]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1. Load data
df = pd.read_csv("news_stock_binary_classification.csv", parse_dates=["news_time"])
df = df.dropna()

# 2. Features & Labels
feature_cols = [col for col in df.columns if col.startswith("x") or col.startswith("finbert_")]
X = df[feature_cols].fillna(0)
y = df["target"]

# 3. Reshape to time-series: x1_ ~ x5_
X_seq = []
for i in range(len(X)):
    timestep_data = []
    for t in range(1, 6):
        timestep_data.append(X.iloc[i][[col for col in X.columns if col.startswith(f"x{t}_")]].values)
    X_seq.append(timestep_data)
X_seq = np.array(X_seq)

# 4. FinBERT features broadcast
finbert_feats = X[[c for c in X.columns if c.startswith("finbert_")]].values
finbert_feats = np.repeat(finbert_feats[:, np.newaxis, :], 5, axis=1)
X_seq = np.concatenate([X_seq, finbert_feats], axis=-1)

# 5. Normalize
n_samples, time_steps, n_features = X_seq.shape
X_reshaped = X_seq.reshape(-1, n_features)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_reshaped)
X_seq = X_scaled.reshape(n_samples, time_steps, n_features)

# 6. Tensor conversion
X_tensor = torch.tensor(X_seq, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# 7. Time-based split
split_idx = int(len(df) * 0.8)
X_train, X_test = X_tensor[:split_idx], X_tensor[split_idx:]
y_train, y_test = y_tensor[:split_idx], y_tensor[split_idx:]

train_dl = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
test_dl  = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

# 8. Model - Deep LSTM + MLP (강화 버전)
class DeepLSTMClassifier(nn.Module):
    def __init__(self, input_dim, lstm_hidden=768, mlp_hidden=[768, 512, 256, 128, 64], output_dim=2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=lstm_hidden,
            num_layers=3,
            batch_first=True,
            dropout=0.4
        )
        self.mlp = nn.Sequential(
            nn.Linear(lstm_hidden, mlp_hidden[0]),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(mlp_hidden[0], mlp_hidden[1]),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(mlp_hidden[1], mlp_hidden[2]),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(mlp_hidden[2], mlp_hidden[3]),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(mlp_hidden[3], mlp_hidden[4]),
            nn.ReLU(),
            nn.Linear(mlp_hidden[4], output_dim)
        )

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.mlp(hn[-1])

# 9. Focal Loss 정의
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=0.8):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha

    def forward(self, inputs, targets):
        ce_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-ce_loss)
        focal = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal.mean()

# 10. Training Setup + EarlyStopping
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = DeepLSTMClassifier(input_dim=n_features).to(device)

loss_fn = FocalLoss(gamma=2.0, alpha=2.0)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
patience = 3
best_loss = float("inf")
epochs_no_improve = 0

for epoch in range(100):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation loss
    val_loss = 0
    model.eval()
    with torch.no_grad():
        for xb, yb in test_dl:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = loss_fn(pred, yb)
            val_loss += loss.item()
    val_loss /= len(test_dl)

    print(f"Epoch {epoch+1:02d} | Train Loss: {total_loss:.4f} | Val Loss: {val_loss:.4f}")

    if val_loss < best_loss:
        best_loss = val_loss
        epochs_no_improve = 0
        best_model_state = model.state_dict()
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("⏹️ Early stopping triggered.")
            break

model.load_state_dict(best_model_state)

# 11. 평가
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        xb = xb.to(device)
        preds = model(xb).argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(yb.numpy())

print("\n📈 Accuracy:", accuracy_score(all_labels, all_preds))
print("\n📊 Classification Report:\n", classification_report(all_labels, all_preds))
print("\n🧱 Confusion Matrix:\n", confusion_matrix(all_labels, all_preds))


Epoch 01 | Train Loss: 42.6343 | Val Loss: 0.3594
Epoch 02 | Train Loss: 41.9532 | Val Loss: 0.3381
Epoch 03 | Train Loss: 41.3566 | Val Loss: 0.3681
Epoch 04 | Train Loss: 40.1353 | Val Loss: 0.3685
Epoch 05 | Train Loss: 39.3987 | Val Loss: 0.4410
⏹️ Early stopping triggered.

📈 Accuracy: 0.5384615384615384

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.26      0.34       449
           1       0.56      0.77      0.65       539

    accuracy                           0.54       988
   macro avg       0.52      0.51      0.49       988
weighted avg       0.52      0.54      0.51       988


🧱 Confusion Matrix:
 [[115 334]
 [122 417]]


In [4]:
import pandas as pd
import os

def load_stock_and_news(stock_path, news_path):
    stock = pd.read_csv(stock_path, parse_dates=["Datetime"])
    news = pd.read_csv(news_path, parse_dates=["pubDate"])

    stock["Datetime"] = stock["Datetime"].dt.tz_localize(None)
    news["pubDate"] = news["pubDate"].dt.tz_localize(None)

    # 정렬 및 제외 열 제거
    stock = stock.sort_values("Datetime").reset_index(drop=True)
    stock = stock.drop(columns=[col for col in stock.columns if col.startswith("Is_")])

    return stock, news


def make_binary_merged_df(stock_df, news_df, company):
    rows = []

    for _, news_row in news_df.iterrows():
        news_time = news_row["pubDate"]

        # 뉴스 이후 가장 가까운 주가
        future_row = stock_df[stock_df["Datetime"] > news_time].head(1)
        if future_row.empty:
            continue

        target_row = future_row.iloc[0]
        target_return = target_row.get("Returns", None)
        if pd.isna(target_return):
            continue

        # 과거 5개
        past_rows = stock_df[stock_df["Datetime"] < target_row["Datetime"]].tail(5)
        if len(past_rows) < 5:
            continue

        if target_return >= 0.04:
            label = 1
        elif target_return <= -0.04:
            label = 0
        else:
            continue  # 기준 미달인 경우는 무시

        row = {
            "company": company,
            "news_time": news_time,
            "target_return": target_return,
            "target": label,
            "finbert_positive": news_row["finbert_positive"],
            "finbert_neutral": news_row["finbert_neutral"],
            "finbert_negative": news_row["finbert_negative"]
        }

        for i, (_, p_row) in enumerate(past_rows.iterrows(), 1):
            for col in stock_df.columns:
                if col == "Datetime":
                    continue
                row[f"x{i}_{col}"] = p_row[col]

        rows.append(row)

    return pd.DataFrame(rows)

base_dir = "./"  # 압축 풀린 폴더 기준
companies = {
    "AAPL": ("AAPL_1hour_data_365days.csv", "apple_finbert_finnhub.csv"),
    "AMZN": ("AMZN_1hour_data_365days.csv", "amazon_finbert_finnhub.csv"),
    "GOOGL": ("GOOGL_1hour_data_365days.csv", "google_finbert_finnhub.csv"),
    "MSFT": ("MSFT_1hour_data_365days.csv", "microsoft_finbert_finnhub.csv"),
    "TSLA": ("TSLA_1hour_data_365days.csv", "tesla_finbert_finnhub.csv"),
}

dfs = []
for company, (stock_file, news_file) in companies.items():
    stock_path = os.path.join(base_dir, stock_file)
    news_path = os.path.join(base_dir, news_file)
    if not os.path.exists(stock_path) or not os.path.exists(news_path):
        continue

    stock_df, news_df = load_stock_and_news(stock_path, news_path)
    merged_df = make_binary_merged_df(stock_df, news_df, company)
    dfs.append(merged_df)

# 최종 병합
final_df = pd.concat(dfs, ignore_index=True)
final_df.to_csv("news_stock_binary_classification.csv", index=False)
print("news_stock_binary_classification.csv 저장 완료")

import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1. Load data
df = pd.read_csv("news_stock_binary_classification.csv", parse_dates=["news_time"])
df = df.dropna()

# 2. Features & Labels
feature_cols = [col for col in df.columns if col.startswith("x") or col.startswith("finbert_")]
X = df[feature_cols].fillna(0)
y = df["target"]

# 3. Reshape to time-series: x1_ ~ x5_
X_seq = []
for i in range(len(X)):
    timestep_data = []
    for t in range(1, 6):
        timestep_data.append(X.iloc[i][[col for col in X.columns if col.startswith(f"x{t}_")]].values)
    X_seq.append(timestep_data)
X_seq = np.array(X_seq)

# 4. FinBERT features broadcast
finbert_feats = X[[c for c in X.columns if c.startswith("finbert_")]].values
finbert_feats = np.repeat(finbert_feats[:, np.newaxis, :], 5, axis=1)
X_seq = np.concatenate([X_seq, finbert_feats], axis=-1)

# 5. Normalize
n_samples, time_steps, n_features = X_seq.shape
X_reshaped = X_seq.reshape(-1, n_features)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_reshaped)
X_seq = X_scaled.reshape(n_samples, time_steps, n_features)

# 6. Tensor conversion
X_tensor = torch.tensor(X_seq, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# 7. Time-based split
split_idx = int(len(df) * 0.8)
X_train, X_test = X_tensor[:split_idx], X_tensor[split_idx:]
y_train, y_test = y_tensor[:split_idx], y_tensor[split_idx:]

train_dl = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
test_dl  = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

# 8. Model - Deep LSTM + MLP
class DeepLSTMClassifier(nn.Module):
    def __init__(self, input_dim, lstm_hidden=512, mlp_hidden=[512, 256, 128, 64], output_dim=2):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, lstm_hidden, num_layers=2, batch_first=True, dropout=0.3)
        self.mlp = nn.Sequential(
            nn.Linear(lstm_hidden, mlp_hidden[0]),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(mlp_hidden[0], mlp_hidden[1]),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(mlp_hidden[1], mlp_hidden[2]),
            nn.ReLU(),
            nn.Linear(mlp_hidden[2], mlp_hidden[3]),
            nn.ReLU(),
            nn.Linear(mlp_hidden[3], output_dim)
        )

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.mlp(hn[-1])

# 9. Focal Loss 정의
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=0.8):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha

    def forward(self, inputs, targets):
        ce_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-ce_loss)
        focal = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal.mean()

# 10. Training Setup + EarlyStopping
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = DeepLSTMClassifier(input_dim=n_features).to(device)

loss_fn = FocalLoss(gamma=2.0, alpha=2.0)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
patience = 12
best_loss = float("inf")
epochs_no_improve = 0

for epoch in range(100):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation loss
    val_loss = 0
    model.eval()
    with torch.no_grad():
        for xb, yb in test_dl:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = loss_fn(pred, yb)
            val_loss += loss.item()
    val_loss /= len(test_dl)

    print(f"Epoch {epoch+1:02d} | Train Loss: {total_loss:.4f} | Val Loss: {val_loss:.4f}")

    if val_loss < best_loss:
        best_loss = val_loss
        epochs_no_improve = 0
        best_model_state = model.state_dict()
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print("⏹️ Early stopping triggered.")
            break

model.load_state_dict(best_model_state)

# 11. 평가
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        xb = xb.to(device)
        preds = model(xb).argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(yb.numpy())

print("\n📈 Accuracy:", accuracy_score(all_labels, all_preds))
print("\n📊 Classification Report:\n", classification_report(all_labels, all_preds))
print("\n🧱 Confusion Matrix:\n", confusion_matrix(all_labels, all_preds))


news_stock_binary_classification.csv 저장 완료
Epoch 01 | Train Loss: 3.9931 | Val Loss: 0.2884
Epoch 02 | Train Loss: 1.8268 | Val Loss: 0.3032
Epoch 03 | Train Loss: 1.3751 | Val Loss: 0.3633
Epoch 04 | Train Loss: 1.0632 | Val Loss: 0.8318
Epoch 05 | Train Loss: 0.9257 | Val Loss: 0.3851
Epoch 06 | Train Loss: 0.5660 | Val Loss: 0.4725
Epoch 07 | Train Loss: 0.4099 | Val Loss: 0.5354
Epoch 08 | Train Loss: 0.4545 | Val Loss: 0.5144
Epoch 09 | Train Loss: 0.4470 | Val Loss: 0.6689
Epoch 10 | Train Loss: 0.3212 | Val Loss: 0.5067
Epoch 11 | Train Loss: 0.3273 | Val Loss: 0.6843
Epoch 12 | Train Loss: 1.6586 | Val Loss: 0.5380
Epoch 13 | Train Loss: 0.9601 | Val Loss: 0.5921
⏹️ Early stopping triggered.

📈 Accuracy: 0.7289719626168224

📊 Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.96      0.79        57
           1       0.92      0.46      0.61        50

    accuracy                           0.73       107
   macro avg  

In [6]:
import pandas as pd
df = pd.read_csv("news_stock_binary_classification.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 532 entries, 0 to 531
Columns: 162 entries, company to x5_Quarter
dtypes: float64(134), int64(26), object(2)
memory usage: 673.4+ KB


In [12]:
import pandas as pd

# 파일 경로
stock_path = "./AAPL_1hour_data_365days.csv"
news_path = "./apple_finbert_finnhub.csv"

# 데이터 불러오기
stock_df = pd.read_csv(stock_path, parse_dates=["Datetime"])
news_df = pd.read_csv(news_path, parse_dates=["pubDate"])

# 타임존 제거
stock_df["Datetime"] = stock_df["Datetime"].dt.tz_localize(None)
news_df["pubDate"] = news_df["pubDate"].dt.tz_localize(None)

# 정렬
stock_df = stock_df.sort_values("Datetime").reset_index(drop=True)

# 제외할 열
exclude_cols = ['Is_Trading_Hours', 'Is_Market_Open', 'Is_Premarket', 'Is_Aftermarket', 'Is_Extended_Hours']
stock_df = stock_df.drop(columns=[col for col in exclude_cols if col in stock_df.columns])

# 병합 결과
rows = []

for _, news_row in news_df.iterrows():
    news_time = news_row['pubDate']

    # 뉴스 이후 가장 가까운 주가
    future_stock = stock_df[stock_df['Datetime'] > news_time].head(1)
    if future_stock.empty:
        continue

    target_row = future_stock.iloc[0]
    target_time = target_row['Datetime']
    target_close = target_row['Close']

    # 과거 3개 주가
    past_rows = stock_df[stock_df['Datetime'] < target_time].tail(3)
    if len(past_rows) < 3:
        continue

    past_last_close = past_rows.iloc[-1]['Close']

    # 상승률
    return_pct = (target_close - past_last_close) / past_last_close * 100
    label = 1 if return_pct >= 0.4 else (-1 if return_pct <= -0.4 else 0)

    # 병합 row 생성
    row = {
        "news_id": news_row['id'],
        "news_time": news_time,
        "target_close": target_close,
        "target_return_pct": return_pct,
        "target_multi_raw": label,
        "finbert_positive": news_row['finbert_positive'],
        "finbert_neutral": news_row['finbert_neutral'],
        "finbert_negative": news_row['finbert_negative'],
    }

    # 과거 3개 flatten
    for i, (_, stock_row) in enumerate(past_rows.iterrows(), 1):
        for col in stock_df.columns:
            if col == "Datetime":
                continue
            row[f"x{i}_{col}"] = stock_row[col]

    rows.append(row)

# 최종 DataFrame
merged_df = pd.DataFrame(rows)

# 클래스 0/1/2로 매핑 (XGBoost용)
label_map = {-1: 0, 0: 1, 1: 2}
merged_df["target_multi"] = merged_df["target_multi_raw"].map(label_map)

# 저장
merged_df.to_csv("news_stock_classification.csv", index=False)
print("병합 완료: news_stock_classification.csv 저장됨")


병합 완료: news_stock_classification.csv 저장됨


In [9]:

import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 1. 데이터 불러오기
df = pd.read_csv("news_stock_classification.csv", parse_dates=["news_time"])

# 2. Feature 및 Label 준비
feature_cols = [col for col in df.columns if col.startswith("x") or col.startswith("finbert_")]
X = df[feature_cols].fillna(0)
y = df["target_multi"]

# 3. 시계열 데이터 3-step 생성 (x1_, x2_, x3_)
X_seq = []
for i in range(len(X)):
    X_seq.append([
        X.iloc[i][[col for col in X.columns if col.startswith("x1_")]].values,
        X.iloc[i][[col for col in X.columns if col.startswith("x2_")]].values,
        X.iloc[i][[col for col in X.columns if col.startswith("x3_")]].values
    ])
X_seq = np.array(X_seq)

# 4. FinBERT 피처 추가 (Broadcast across time steps)
finbert_feats = X[[c for c in X.columns if c.startswith("finbert_")]].values
finbert_feats = np.repeat(finbert_feats[:, np.newaxis, :], 3, axis=1)
X_seq = np.concatenate([X_seq, finbert_feats], axis=-1)

# 5. 정규화
n_samples, time_steps, n_features = X_seq.shape
X_reshaped = X_seq.reshape(-1, n_features)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_reshaped)
X_seq = X_scaled.reshape(n_samples, time_steps, n_features)

# 6. Tensor로 변환
X_tensor = torch.tensor(X_seq, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.long)

# 7. Train/Test 분리
X_train, X_test, y_train, y_test = train_test_split(X_tensor, y_tensor, test_size=0.2, shuffle=False)
train_dl = DataLoader(TensorDataset(X_train, y_train), batch_size=32, shuffle=True)
test_dl = DataLoader(TensorDataset(X_test, y_test), batch_size=32)

# 8. LSTM 모델 정의
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, output_dim=3):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.fc(hn[-1])

# 9. 학습 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(input_dim=n_features).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 10. 학습 루프
for epoch in range(50):
    model.train()
    total_loss = 0
    for xb, yb in train_dl:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} | Loss: {total_loss:.4f}")

Epoch 1 | Loss: 165.6778
Epoch 2 | Loss: 139.9572
Epoch 3 | Loss: 124.0224
Epoch 4 | Loss: 111.9599
Epoch 5 | Loss: 102.0316
Epoch 6 | Loss: 91.2283
Epoch 7 | Loss: 82.4791
Epoch 8 | Loss: 74.9245
Epoch 9 | Loss: 68.3347
Epoch 10 | Loss: 62.1140
Epoch 11 | Loss: 56.4879
Epoch 12 | Loss: 51.9471
Epoch 13 | Loss: 47.4376
Epoch 14 | Loss: 43.8949
Epoch 15 | Loss: 39.6787
Epoch 16 | Loss: 36.4289
Epoch 17 | Loss: 33.2387
Epoch 18 | Loss: 30.9987
Epoch 19 | Loss: 28.0263
Epoch 20 | Loss: 25.6803
Epoch 21 | Loss: 23.1786
Epoch 22 | Loss: 21.3123
Epoch 23 | Loss: 19.6293
Epoch 24 | Loss: 17.5688
Epoch 25 | Loss: 15.6692
Epoch 26 | Loss: 14.3345
Epoch 27 | Loss: 12.6775
Epoch 28 | Loss: 11.6631
Epoch 29 | Loss: 10.7014
Epoch 30 | Loss: 9.4993
Epoch 31 | Loss: 8.4388
Epoch 32 | Loss: 7.4168
Epoch 33 | Loss: 7.0659
Epoch 34 | Loss: 5.9982
Epoch 35 | Loss: 5.2894
Epoch 36 | Loss: 4.8408
Epoch 37 | Loss: 4.3750
Epoch 38 | Loss: 3.9858
Epoch 39 | Loss: 4.1333
Epoch 40 | Loss: 3.2084
Epoch 41 | Loss

In [None]:

# 11. 평가
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for xb, yb in test_dl:
        xb = xb.to(device)
        preds = model(xb).argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(yb.numpy())

print("\n Accuracy:", accuracy_score(all_labels, all_preds))
print("\n Classification Report:\n", classification_report(all_labels, all_preds))


 Accuracy: 0.5917297612114153

 Classification Report:
               precision    recall  f1-score   support

           0       0.09      0.13      0.11       246
           1       0.77      0.78      0.77      1224
           2       0.25      0.11      0.15       247

    accuracy                           0.59      1717
   macro avg       0.37      0.34      0.34      1717
weighted avg       0.59      0.59      0.59      1717


 Confusion Matrix:
 [[ 32 161  53]
 [237 957  30]
 [ 90 130  27]]
