In [2]:
from pathlib import Path
import os
import sys
import pandas as pd
from IPython.display import display

# Resolve the parent of the current working directory and make sure it is on
# sys.path, so the `core` and `model` imports further down can be found.
PROJECT_ROOT = Path(os.pardir).resolve()
if os.fspath(PROJECT_ROOT) not in sys.path:
    # Append (rather than prepend) so already-listed entries keep precedence.
    sys.path.append(os.fspath(PROJECT_ROOT))

from core.config import load_training_config
from core.data import available_datasets, build_datasets
from core.train_eval import train_and_evaluate
from core.utils import setup_logging
from model import MODEL_REGISTRY

# Initialise logging before anything else in this cell runs.
setup_logging()

# Accumulates one record per training run appended by later cells.
RUN_HISTORY = list()

# Load the baseline training configuration from <PROJECT_ROOT>/configs/default.yaml.
CONFIG_PATH = PROJECT_ROOT.joinpath('configs', 'default.yaml')
BASE_CONFIG = load_training_config(CONFIG_PATH)


In [None]:
# Discover the datasets reported by `available_datasets()` and pick one by position.
dataset_names = available_datasets()
if not dataset_names:
    raise RuntimeError('No datasets found in dataset/.')

DATASET_INDEX = 1  # change if you want a different dataset
try:
    DATASET_NAME = dataset_names[DATASET_INDEX]
except IndexError as exc:
    # The emptiness check above does not guarantee that this index exists
    # (e.g. only one dataset present), so fail with an actionable message.
    raise RuntimeError(
        f'DATASET_INDEX={DATASET_INDEX} is out of range: '
        f'{len(dataset_names)} dataset(s) available: {dataset_names}'
    ) from exc

print('Available datasets:', dataset_names)
print('Selected dataset:', DATASET_NAME)

Available datasets: ['fake-news-classification', 'fake-news-detection-datasets', 'llm-fake-news']
Selected dataset: fake-news-detection-datasets


In [None]:
# Dataset 1 ("fake-news-classification").
# Built from PROJECT_ROOT instead of a machine-specific absolute path so the
# cell runs on any checkout. Kept as `str` so existing string-based uses of
# `first_data_path` keep working.
first_data_path = str(PROJECT_ROOT / 'dataset' / 'fake-news-classification')

# These CSVs are ';'-separated and their first column is used as the index.
data1train = pd.read_csv(os.path.join(first_data_path, 'train.csv'), sep=';', index_col=0)
data1val = pd.read_csv(os.path.join(first_data_path, 'val.csv'), sep=';', index_col=0)
data1test = pd.read_csv(os.path.join(first_data_path, 'test.csv'), sep=';', index_col=0)

# Merge the train and validation splits into one frame with a fresh 0..n-1 index.
dataset_1 = pd.concat([data1train, data1val], ignore_index=True)

# Optional: save the merged frame as a CSV file (uncomment to run).
# output_path = os.path.join(first_data_path, 'dataset1.csv')
# dataset_1.to_csv(output_path, sep=';', index=False)
# print("dataset1.csv created!")
# print(f"total rows: {len(dataset_1)}")
# print(f"train: {len(data1train)}, val: {len(data1val)}, total: {len(dataset_1)}")

# Last expression of the cell: preview the first rows.
dataset_1.head(5)

Unnamed: 0,title,text,label
0,Palestinians switch off Christmas lights in Be...,"RAMALLAH, West Bank (Reuters) - Palestinians s...",1
1,China says Trump call with Taiwan president wo...,BEIJING (Reuters) - U.S. President-elect Donal...,1
2,FAIL! The Trump Organization’s Credit Score W...,While the controversy over Trump s personal ta...,0
3,Zimbabwe military chief's China trip was norma...,BEIJING (Reuters) - A trip to Beijing last wee...,1
4,THE MOST UNCOURAGEOUS PRESIDENT EVER Receives ...,There has never been a more UNCOURAGEOUS perso...,0


In [None]:
# Dataset 3 ("fake-news-detection-datasets").
# Built from PROJECT_ROOT instead of a machine-specific absolute path; kept as
# `str` so existing string-based uses of `third_data_path` keep working.
third_data_path = str(PROJECT_ROOT / 'dataset' / 'fake-news-detection-datasets')
data3train = pd.read_csv(os.path.join(third_data_path, 'train.csv'))
data3test = pd.read_csv(os.path.join(third_data_path, 'test.csv'))

# Reproducible 10% random sample of the training split (fixed seed).
dataset_3 = data3train.sample(frac=0.1, random_state=42)

# Report the size of the sample that was just created. The original printed
# `data3train_sample`, a name this notebook never defines, which raises
# NameError on a fresh kernel.
print(len(dataset_3))

2694


In [None]:
# Dataset 4 ("llm-fake-news").
# Built from PROJECT_ROOT instead of a machine-specific absolute path; kept as
# `str` so existing string-based uses of `fourth_data_path` keep working.
fourth_data_path = str(PROJECT_ROOT / 'dataset' / 'llm-fake-news')
data4train = pd.read_csv(os.path.join(fourth_data_path, 'train.csv'))
data4test = pd.read_csv(os.path.join(fourth_data_path, 'test.csv'))

# Reproducible 1% random sample of the training split (fixed seed).
dataset_4 = data4train.sample(frac=0.01, random_state=42)

# Report the size of the sample that was just created. The original printed
# `data4train_sample`, a name this notebook never defines, which raises
# NameError on a fresh kernel.
print(len(dataset_4))

2320


In [None]:
# Number of training epochs for this experiment.
EPOCHS = 2  # feel free to increase once things run

# Registry keys in the order the registry yields them; selection is positional.
MODEL_OPTIONS = [*MODEL_REGISTRY.keys()]
MODEL_NAME = MODEL_OPTIONS[2]  # change to try other models

print('Available models:', MODEL_OPTIONS)
print('Selected model:', MODEL_NAME)


Available models: ['bow_mlp', 'cnn_text', 'bilstm', 'tiny_transformer']
Selected model: bilstm


In [None]:
# Train and evaluate the selected model on the selected dataset, then record the run.

# Reload the config from CONFIG_PATH, overriding only the epoch count with EPOCHS.
config = load_training_config(CONFIG_PATH, overrides={'epochs': EPOCHS})
# Build the data pipeline for DATASET_NAME; batch size, sequence length and
# worker count come from the config, while the vocabulary cap is fixed here.
loaders, vocab, tokenizer, info = build_datasets(
    name=DATASET_NAME,
    batch_size=config.batch_size,
    max_len=config.max_len,
    num_workers=config.num_workers,
    max_vocab_size=20000,
)
# Look up the model class by name and size it to the vocabulary just built.
model_cls = MODEL_REGISTRY[MODEL_NAME]
# NOTE(review): num_classes=2 presumably reflects a binary label — confirm per dataset.
model = model_cls(vocab_size=len(vocab), num_classes=2)
# Run training/evaluation; run artifacts are rooted at <PROJECT_ROOT>/runs.
results, run_dir = train_and_evaluate(
    model,
    loaders,
    config,
    dataset_name=DATASET_NAME,
    model_name=MODEL_NAME,
    run_root=PROJECT_ROOT / 'runs',
)
# Keep a record of this run; run_dir is stored as a string.
RUN_HISTORY.append({'dataset': DATASET_NAME, 'model': MODEL_NAME, 'results': results, 'run_dir': str(run_dir)})
# Last expression of the cell: display the results.
results


[2025-10-10 16:40:22] INFO fake_news: Starting training | dataset=fake-news-detection-datasets model=bilstm epochs=2 batch_size=64
[2025-10-10 16:40:36] INFO fake_news: Epoch 1 | train_loss=0.1104 val_loss=0.0024 val_f1=0.9993
[2025-10-10 16:40:51] INFO fake_news: Epoch 2 | train_loss=0.0052 val_loss=0.0012 val_f1=0.9998
[2025-10-10 16:41:01] INFO fake_news: Training complete | best_val_f1=0.9998 run_dir=/home/gamejoongsa/hackathon/runs/fake-news-detection-datasets/bilstm/20251010_074022


{'train': {'accuracy': 0.9996658994728637,
  'precision': 0.9996737984511883,
  'recall': 0.9996566440627783,
  'f1': 0.9996651865962032,
  'auroc': 0.9999577806649476,
  'loss': 0.001968211633749226},
 'val': {'accuracy': 0.999777258046553,
  'precision': 0.99977678580307,
  'recall': 0.99977678580307,
  'f1': 0.99977678580307,
  'auroc': 0.999999801123335,
  'loss': 0.001180805862987403},
 'test': {'accuracy': 0.999109230597929,
  'precision': 0.9991176683124665,
  'recall': 0.9990970804615935,
  'f1': 0.9991073245210828,
  'auroc': 0.9998260605782303,
  'loss': 0.004000036923826411}}