In [2]:
import numpy as np
import requests
import json
import bibtexparser
from tqdm import tqdm

import torch

def parse_bibtex(bibtex_file):
    with open(bibtex_file, "r") as file:
        bib_database = bibtexparser.load(file)

    papers = []
    for entry in bib_database.entries:
        if "title" in entry:
            papers.append(entry["title"])

    with open("/workspace/dongwoo/chatbot_project/data/titles.json", "w") as f:
        json.dump(papers, f, indent=4)

    return papers


bib_file_path = "/workspace/dongwoo/chatbot_project/references.bib"

paper_titles = parse_bibtex(bib_file_path)



In [3]:
paper_titles

['Tsmixer: Lightweight mlp-mixer model for multivariate time series forecasting',
 'Are transformers effective for time series forecasting?',
 'Time series analysis: forecasting and control',
 'Crossformer: Transformer utilizing cross-dimension dependency for multivariate time series forecasting',
 'Stock prediction based on technical indicators using deep learning model.',
 'Enhancing the locality and breaking the memory bottleneck of transformer on time series forecasting',
 'Informer: Beyond efficient transformer for long sequence time-series forecasting',
 'Autoformer: Decomposition transformers with auto-correlation for long-term series forecasting',
 'Pyraformer: Low-complexity pyramidal attention for long-range time series modeling and forecasting',
 'Swin transformer: Hierarchical vision transformer using shifted windows',
 'Fedformer: Frequency enhanced decomposed transformer for long-term series forecasting',
 'Preformer: predictive transformer with multi-scale segment-wise c

In [4]:
import os
import json
import arxiv
import requests
from tqdm import tqdm

PDF_DIR = "data/pdf"
os.makedirs(PDF_DIR, exist_ok=True)


with open("data/titles.json", "r") as f:
    titles = json.load(f)
    
    
def download_paper_from_arxiv(title):
    """Arxiv에서 논문을 검색하여 PDF를 다운로드"""
    search = arxiv.Search(
        query=title,
        max_results=1,
        sort_by=arxiv.SortCriterion.Relevance
    )
    
    for result in search.results():
        pdf_url = result.pdf_url
        pdf_path = os.path.join(PDF_DIR, f"{result.entry_id.split('/')[-1]}.pdf")

        # PDF 다운로드
        response = requests.get(pdf_url)
        if response.status_code == 200:
            with open(pdf_path, "wb") as f:
                f.write(response.content)
            return pdf_path
        else:
            return None

def download_all_papers():
    """titles.json의 제목을 기반으로 논문을 다운로드"""
    
    # ✅ titles를 함수 내부에서 로드하도록 변경
    with open("data/titles.json", "r") as f:
        titles = json.load(f)

    downloaded_papers = []
    missing_titles = []

    for title in tqdm(titles, desc="Downloading Papers"):
        pdf_path = download_paper_from_arxiv(title)
        if pdf_path:
            downloaded_papers.append({"title": title, "pdf_path": pdf_path})
        else:
            missing_titles.append(title)

    # 다운로드된 논문 목록 저장
    with open("data/downloaded_papers.json", "w") as f:
        json.dump(downloaded_papers, f, indent=4)

    # Arxiv에서 찾을 수 없는 논문 저장
    with open("data/missing_titles.json", "w") as f:
        json.dump(missing_titles, f, indent=4)

    print("📄 모든 논문 다운로드 완료.")
    if missing_titles:
        print(f"⚠️ {len(missing_titles)}개의 논문을 찾을 수 없습니다. 'missing_titles.json'에서 확인하세요.")



In [5]:
download_all_papers()

  for result in search.results():
Downloading Papers: 100%|██████████| 51/51 [03:18<00:00,  3.89s/it]

📄 모든 논문 다운로드 완료.



