# 2진분류 (0.01, -0.01 기준 분류)

In [5]:
import pandas as pd
import os

def load_stock_and_news(stock_path, news_path):
    """Read the stock-price CSV and the news CSV and normalise both.

    Args:
        stock_path: Path (or file-like object) of the price CSV; it must
            contain a ``Datetime`` column.
        news_path: Path (or file-like object) of the news CSV; it must
            contain a ``pubDate`` column.

    Returns:
        A ``(stock, news)`` tuple of DataFrames. ``stock`` is sorted by
        ``Datetime`` with a fresh 0..n-1 index and without any column whose
        name starts with ``"Is_"``; ``news`` keeps its original row order.
        Both timestamp columns have their timezone information removed.
    """
    prices = pd.read_csv(stock_path, parse_dates=["Datetime"])
    articles = pd.read_csv(news_path, parse_dates=["pubDate"])

    # Drop timezone info so price and news timestamps are both naive.
    for frame, time_col in ((prices, "Datetime"), (articles, "pubDate")):
        frame[time_col] = frame[time_col].dt.tz_localize(None)

    # Order the bars chronologically and discard the "Is_*" columns.
    flag_cols = [name for name in prices.columns if name.startswith("Is_")]
    prices = (
        prices.sort_values("Datetime")
        .reset_index(drop=True)
        .drop(columns=flag_cols)
    )

    return prices, articles


def make_binary_merged_df(stock_df, news_df, company, threshold=0.01, window=5):
    """Join each news item to the next stock bar and label it up (1) or down (0).

    For every row of ``news_df`` the first row of ``stock_df`` (in frame
    order) whose ``Datetime`` is strictly later than the item's ``pubDate``
    is used as the target bar. The item is kept only when the target's
    ``Returns`` value is present, its magnitude reaches ``threshold``, and
    ``window`` rows with a ``Datetime`` earlier than the target exist.

    Args:
        stock_df: Price frame with a ``Datetime`` column and a ``Returns``
            column. It should be sorted ascending by ``Datetime`` so that the
            "first later row" is the nearest bar and the trailing rows are
            the most recent ones; this function does not sort it.
        news_df: News frame with ``pubDate``, ``finbert_positive``,
            ``finbert_neutral`` and ``finbert_negative`` columns.
        company: Value copied into the ``company`` column of every output row.
        threshold: Non-negative cut-off. ``Returns >= threshold`` is labelled
            1, ``Returns <= -threshold`` is labelled 0, anything in between
            is dropped. Defaults to 0.01.
        window: Number of bars before the target bar to flatten into
            features. Defaults to 5.

    Returns:
        A DataFrame with one row per kept news item containing ``company``,
        ``news_time``, ``target_return``, ``target``, the three FinBERT
        scores, and ``x{i}_{col}`` for ``i`` in ``1..window`` (oldest bar
        first) for every stock column except ``Datetime``. When no news item
        qualifies the frame is empty and has no columns.

    Raises:
        ValueError: If ``threshold`` or ``window`` is negative.
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")
    if window < 0:
        raise ValueError("window must be non-negative")

    # Loop invariants: the timestamp column and the feature column names.
    stock_times = stock_df["Datetime"]
    feature_cols = [col for col in stock_df.columns if col != "Datetime"]

    rows = []
    for _, news_row in news_df.iterrows():
        news_time = news_row["pubDate"]

        # First bar strictly after the news timestamp.
        future_rows = stock_df[stock_times > news_time]
        if future_rows.empty:
            continue

        target_row = future_rows.iloc[0]
        target_return = target_row.get("Returns", None)
        if pd.isna(target_return):
            continue

        # Decide the label before the history lookup: moves smaller than the
        # threshold are discarded anyway, so the extra filtering is skipped.
        if target_return >= threshold:
            label = 1
        elif target_return <= -threshold:
            label = 0
        else:
            continue

        # The `window` bars immediately preceding the target bar.
        past_rows = stock_df[stock_times < target_row["Datetime"]].tail(window)
        if len(past_rows) < window:
            continue

        row = {
            "company": company,
            "news_time": news_time,
            "target_return": target_return,
            "target": label,
            "finbert_positive": news_row["finbert_positive"],
            "finbert_neutral": news_row["finbert_neutral"],
            "finbert_negative": news_row["finbert_negative"],
        }

        # Flatten the history: x1_* is the oldest bar, x{window}_* the newest.
        for i, (_, p_row) in enumerate(past_rows.iterrows(), 1):
            for col in feature_cols:
                row[f"x{i}_{col}"] = p_row[col]

        rows.append(row)

    return pd.DataFrame(rows)

In [6]:
# Build the combined dataset: for each ticker, load its price and news CSVs,
# turn them into labelled rows, then write everything to a single CSV.
base_dir = "./"  # folder where the archive was extracted
companies = {
    "AAPL": ("AAPL_1hour_data_365days.csv", "apple_finbert_finnhub.csv"),
    "AMZN": ("AMZN_1hour_data_365days.csv", "amazon_finbert_finnhub.csv"),
    "GOOGL": ("GOOGL_1hour_data_365days.csv", "google_finbert_finnhub.csv"),
    "MSFT": ("MSFT_1hour_data_365days.csv", "microsoft_finbert_finnhub.csv"),
    "TSLA": ("TSLA_1hour_data_365days.csv", "tesla_finbert_finnhub.csv"),
}

dfs = []
for company, (stock_file, news_file) in companies.items():
    stock_path = os.path.join(base_dir, stock_file)
    news_path = os.path.join(base_dir, news_file)

    # A ticker with a missing input is still skipped, but no longer silently:
    # otherwise the output can lack whole companies without any hint.
    missing = [path for path in (stock_path, news_path) if not os.path.exists(path)]
    if missing:
        print(f"[skip] {company}: missing input file(s): {', '.join(missing)}")
        continue

    stock_df, news_df = load_stock_and_news(stock_path, news_path)
    merged_df = make_binary_merged_df(stock_df, news_df, company)
    dfs.append(merged_df)

# pd.concat raises a bare "No objects to concatenate" ValueError on an empty
# list; raise the same exception type with an actionable message instead.
if not dfs:
    raise ValueError(
        f"No input files were found under {base_dir!r}; nothing to merge."
    )

# Final merge of all per-company frames
final_df = pd.concat(dfs, ignore_index=True)
final_df.to_csv("news_stock_binary_classification.csv", index=False)
print("news_stock_binary_classification.csv 저장 완료")


news_stock_binary_classification.csv 저장 완료


In [25]:
import pandas as pd

# Load the generated dataset
df = pd.read_csv("news_stock_binary_classification.csv")
df.head(5)  # result is discarded here; kept only as a quick sanity call

# Class balance of the binary target: absolute counts and percentages
target_values = df["target"]
label_counts = target_values.value_counts().sort_index()
label_ratio = target_values.value_counts(normalize=True).sort_index() * 100

# Report: header line followed by the corresponding Series
print("📊 클래스 분포 (샘플 수):", label_counts, sep="\n")
print("\n📊 클래스 비율 (%):", label_ratio.round(2).astype(str) + " %", sep="\n")


📊 클래스 분포 (샘플 수):
target
0    2428
1    2519
Name: count, dtype: int64

📊 클래스 비율 (%):
target
0    49.08 %
1    50.92 %
Name: proportion, dtype: object


In [7]:
import pandas as pd
import numpy as np
import os

def load_stock_and_news(stock_path, news_path):
    """Read the stock-price CSV and the news CSV and normalise both.

    Args:
        stock_path: Path (or file-like object) of the price CSV; it must
            contain a ``Datetime`` column.
        news_path: Path (or file-like object) of the news CSV; it must
            contain a ``pubDate`` column.

    Returns:
        A ``(stock, news)`` tuple of DataFrames. ``stock`` is sorted by
        ``Datetime`` with a fresh 0..n-1 index and without any column whose
        name starts with ``"Is_"``; ``news`` keeps its original row order.
        Both timestamp columns have their timezone information removed.
    """
    prices = pd.read_csv(stock_path, parse_dates=["Datetime"])
    articles = pd.read_csv(news_path, parse_dates=["pubDate"])

    # Drop timezone info so price and news timestamps are both naive.
    for frame, time_col in ((prices, "Datetime"), (articles, "pubDate")):
        frame[time_col] = frame[time_col].dt.tz_localize(None)

    # Order the bars chronologically and discard the "Is_*" columns.
    flag_cols = [name for name in prices.columns if name.startswith("Is_")]
    prices = (
        prices.sort_values("Datetime")
        .reset_index(drop=True)
        .drop(columns=flag_cols)
    )

    return prices, articles

def classify_context(timestamp):
    """Bucket a timestamp by its clock time.

    Args:
        timestamp: Any object exposing ``hour`` and ``minute`` attributes.

    Returns:
        ``"premarket"`` before 09:30, ``"intraday"`` from 09:30 up to (but
        not including) 16:00, and ``"aftermarket"`` otherwise. Only the
        hour and minute are inspected; the date is not taken into account.
    """
    clock = (timestamp.hour, timestamp.minute)

    # Guard clause: anything earlier than 09:30.
    if clock < (9, 30):
        return "premarket"

    return "intraday" if 9 <= clock[0] < 16 else "aftermarket"

def make_binary_merged_df(stock_df, news_df, company, threshold=0.01, window=5):
    """Join each news item to the next stock bar and label it up (1) or down (0).

    For every row of ``news_df`` the first row of ``stock_df`` (in frame
    order) whose ``Datetime`` is strictly later than the item's ``pubDate``
    is used as the target bar. The item is kept only when the target's
    ``Returns`` value is present, its magnitude reaches ``threshold``, and
    ``window`` rows with a ``Datetime`` earlier than the target exist.

    Args:
        stock_df: Price frame with a ``Datetime`` column and a ``Returns``
            column. It should be sorted ascending by ``Datetime`` so that the
            "first later row" is the nearest bar and the trailing rows are
            the most recent ones; this function does not sort it.
        news_df: News frame with ``pubDate``, ``finbert_positive``,
            ``finbert_neutral`` and ``finbert_negative`` columns.
        company: Value copied into the ``company`` column of every output row.
        threshold: Non-negative cut-off. ``Returns >= threshold`` is labelled
            1, ``Returns <= -threshold`` is labelled 0, anything in between
            is dropped. Defaults to 0.01.
        window: Number of bars before the target bar to flatten into
            features. Defaults to 5.

    Returns:
        A DataFrame with one row per kept news item containing ``company``,
        ``news_time``, ``context`` (the ``classify_context`` bucket of the
        news timestamp), ``target_return``, ``target``, the three FinBERT
        scores, and ``x{i}_{col}`` for ``i`` in ``1..window`` (oldest bar
        first) for every stock column except ``Datetime``. When no news item
        qualifies the frame is empty and has no columns.

    Raises:
        ValueError: If ``threshold`` or ``window`` is negative.
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")
    if window < 0:
        raise ValueError("window must be non-negative")

    # Loop invariants: the timestamp column and the feature column names.
    stock_times = stock_df["Datetime"]
    feature_cols = [col for col in stock_df.columns if col != "Datetime"]

    rows = []
    for _, news_row in news_df.iterrows():
        news_time = news_row["pubDate"]

        # First bar strictly after the news timestamp.
        future_rows = stock_df[stock_times > news_time]
        if future_rows.empty:
            continue

        target_row = future_rows.iloc[0]
        target_return = target_row.get("Returns", None)
        if pd.isna(target_return):
            continue

        # Decide the label before the history lookup: moves smaller than the
        # threshold are discarded anyway, so the extra filtering is skipped.
        if target_return >= threshold:
            label = 1
        elif target_return <= -threshold:
            label = 0
        else:
            continue

        # The `window` bars immediately preceding the target bar.
        past_rows = stock_df[stock_times < target_row["Datetime"]].tail(window)
        if len(past_rows) < window:
            continue

        row = {
            "company": company,
            "news_time": news_time,
            # Clock-time bucket of the news item; only computed for kept rows.
            "context": classify_context(news_time),
            "target_return": target_return,
            "target": label,
            "finbert_positive": news_row["finbert_positive"],
            "finbert_neutral": news_row["finbert_neutral"],
            "finbert_negative": news_row["finbert_negative"],
        }

        # Flatten the history: x1_* is the oldest bar, x{window}_* the newest.
        for i, (_, p_row) in enumerate(past_rows.iterrows(), 1):
            for col in feature_cols:
                row[f"x{i}_{col}"] = p_row[col]

        rows.append(row)

    return pd.DataFrame(rows)


In [8]:
# Build the combined dataset: for each ticker, load its price and news CSVs,
# turn them into labelled rows, then write everything to a single CSV.
base_dir = "./"
companies = {
    "AAPL": ("AAPL_1hour_data_365days.csv", "apple_finbert_finnhub.csv"),
    "AMZN": ("AMZN_1hour_data_365days.csv", "amazon_finbert_finnhub.csv"),
    "GOOGL": ("GOOGL_1hour_data_365days.csv", "google_finbert_finnhub.csv"),
    "MSFT": ("MSFT_1hour_data_365days.csv", "microsoft_finbert_finnhub.csv"),
    "TSLA": ("TSLA_1hour_data_365days.csv", "tesla_finbert_finnhub.csv"),
}

dfs = []
for company, (stock_file, news_file) in companies.items():
    stock_path = os.path.join(base_dir, stock_file)
    news_path = os.path.join(base_dir, news_file)

    # A ticker with a missing input is still skipped, but no longer silently:
    # otherwise the output can lack whole companies without any hint.
    missing = [path for path in (stock_path, news_path) if not os.path.exists(path)]
    if missing:
        print(f"[skip] {company}: missing input file(s): {', '.join(missing)}")
        continue

    stock_df, news_df = load_stock_and_news(stock_path, news_path)
    merged_df = make_binary_merged_df(stock_df, news_df, company)
    dfs.append(merged_df)

# pd.concat raises a bare "No objects to concatenate" ValueError on an empty
# list; raise the same exception type with an actionable message instead.
if not dfs:
    raise ValueError(
        f"No input files were found under {base_dir!r}; nothing to merge."
    )

final_df = pd.concat(dfs, ignore_index=True)
final_df.to_csv("news_stock_binary_classification.csv", index=False)
