In [1]:
from tools import fetch_arxiv_papers

# Fetch paper records for the query "Language Models".
# NOTE(review): the second argument (10) is presumably the maximum number of
# results; the title listing in the next cell shows 10 entries. Confirm in tools.py.
papers = fetch_arxiv_papers("Language Models", 10)

In [2]:
# Quick sanity check: show only the titles of the fetched papers.
[paper["title"] for paper in papers]

['Long-VITA: Scaling Large Multi-modal Models to 1 Million Tokens with Leading Short-Context Accuray',
 'Scaling up Test-Time Compute with Latent Reasoning: A Recurrent Depth Approach',
 'NoLiMa: Long-Context Evaluation Beyond Literal Matching',
 'Multitwine: Multi-Object Compositing with Text and Layout Control',
 'DuoGuard: A Two-Player RL-Driven Framework for Multilingual LLM Guardrails',
 'A Lightweight Method to Disrupt Memorized Sequences in LLM',
 'Transforming Science with Large Language Models: A Survey on AI-assisted Scientific Discovery, Experimentation, Content Generation, and Evaluation',
 'CodeSCM: Causal Analysis for Multi-Modal Code Generation',
 "An Annotated Reading of 'The Singer of Tales' in the LLM Era",
 'Refining Integration-by-Parts Reduction of Feynman Integrals with Machine Learning']

In [3]:
from llama_index.core import Document

def create_documents_from_papers(papers):
    """Convert paper records into LlamaIndex ``Document`` objects.

    Each paper is rendered as a block of ``Label: value`` lines (title,
    authors, summary, ...) and that text becomes the body of one ``Document``.

    Args:
        papers: Iterable of dict-like paper records. Every record must provide
            the keys ``title``, ``authors`` (iterable of str), ``summary``,
            ``published``, ``primary_category``, ``categories`` (iterable of
            str), ``pdf_url`` and ``arxiv_url``. The keys ``journal_ref`` and
            ``doi`` are optional; a missing or empty value is rendered as
            ``N/A``.

    Returns:
        list: One ``Document`` per paper, in input order (empty list for
        empty input).

    Raises:
        KeyError: If a required key is missing from a paper record.
    """
    documents = []
    for paper in papers:
        # `dict.get(key, default)` only falls back when the key is absent.
        # A record carrying the key with a None value would otherwise be
        # rendered as the literal text "None" and end up in the index, so
        # fall back on any empty value instead.
        journal_ref = paper.get('journal_ref') or 'N/A'
        doi = paper.get('doi') or 'N/A'

        content = (
            f"Title: {paper['title']}\n"
            f"Authors: {', '.join(paper['authors'])}\n"
            f"Summary: {paper['summary']}\n"
            f"Published: {paper['published']}\n"
            f"Journal Reference: {journal_ref}\n"
            f"DOI: {doi}\n"
            f"Primary Category: {paper['primary_category']}\n"
            f"Categories: {', '.join(paper['categories'])}\n"
            f"PDF URL: {paper['pdf_url']}\n"
            f"arxiv URLs: {paper['arxiv_url']}\n"
        )

        documents.append(Document(text=content))

    return documents



In [4]:
# Turn each fetched paper record into one text Document for indexing.
documents = create_documents_from_papers(papers)

In [5]:
# Inspect the generated documents (the repr is long; the output below is truncated).
documents

[Document(id_='f7f708bb-9aa6-4c81-b7d2-7e1464e8d279', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Title: Long-VITA: Scaling Large Multi-modal Models to 1 Million Tokens with Leading Short-Context Accuray\nAuthors: Yunhang Shen, Chaoyou Fu, Shaoqi Dong, Xiong Wang, Peixian Chen, Mengdan Zhang, Haoyu Cao, Ke Li, Xiawu Zheng, Yan Zhang, Yiyi Zhou, Rongrong Ji, Xing Sun\nSummary: Establishing the long-context capability of large vision-language models is\ncrucial for video understanding, high-resolution image understanding,\nmulti-modal agents and reasoning. We introduce Long-VITA, a simple yet\neffective large multi-modal model for long-context visual-language\nunderstanding tasks. It is adept at concurrently processing and analyzing\nmodalities of image, video, and text over 4K frames or 1M tokens whi

In [6]:
from llama_index.core import Settings, VectorStoreIndex
from constants import embed_model

# Global LlamaIndex settings; assigned before the index is built below.
# NOTE(review): presumably these control how document text is split into
# chunks (chunk length and overlap between consecutive chunks) — confirm the
# units and exact semantics against the LlamaIndex Settings documentation.
Settings.chunk_size = 1024
Settings.chunk_overlap = 50

# Build a vector index over the paper documents, using the embedding model
# imported from the project's `constants` module.
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

In [7]:
# Persist the index's storage context under the relative path "index/"
# (resolved against the notebook's working directory), presumably so a later
# session can reload it instead of rebuilding — TODO confirm where it is loaded.
index.storage_context.persist("index/")