In [1]:
from tools import fetch_arxiv_papers

# Fetch papers for the "Language Models" query through the project helper.
# NOTE(review): the second argument (10) is presumably the maximum number of
# results -- the title listing below shows ten entries; confirm in tools.py.
papers = fetch_arxiv_papers("Language Models", 10)

In [2]:
# Sanity check: show only the title of each fetched record.
[entry["title"] for entry in papers]

['ChatGarment: Garment Estimation, Generation and Editing via Large Language Models',
 'Examining Imbalance Effects on Performance and Demographic Fairness of Clinical Language Models',
 'Comprehensive Multi-Modal Prototypes are Simple and Effective Classifiers for Vast-Vocabulary Object Detection',
 'Memory makes computation universal, remember?',
 'Cross-Lingual Text-Rich Visual Comprehension: An Information Theory Perspective',
 'PepTune: De Novo Generation of Therapeutic Peptides with Multi-Objective-Guided Discrete Diffusion',
 'ResearchTown: Simulator of Human Research Community',
 'Survey of Large Multimodal Model Datasets, Application Categories and Taxonomy',
 'ADC: Enhancing Function Calling Via Adversarial Datasets and Code Line-Level Feedback',
 'Deliberation in Latent Space via Differentiable Cache Augmentation']

In [3]:
from llama_index.core import Document


def create_documents_from_papers(papers):
    """Turn paper records into llama_index ``Document`` objects.

    Every record is rendered as a block of ``Label: value`` lines, each
    terminated by a newline, and that text becomes the document body. No
    document metadata is set; all fields live in the text itself.

    Args:
        papers: Iterable of records indexable by the keys ``title``,
            ``authors``, ``summary``, ``published``, ``journal_ref``,
            ``doi``, ``primary_category``, ``categories``, ``pdf_url`` and
            ``arxiv_url``. ``authors`` and ``categories`` must be iterables
            of strings, since they are comma-joined.

    Returns:
        A list holding one ``Document`` per record, in input order.

    Raises:
        KeyError: If a dict record is missing one of the keys listed above.
    """

    def render(entry):
        # One element per output line, in the order they appear in the text.
        fields = [
            f"Title: {entry['title']}",
            f"Authors: {', '.join(entry['authors'])}",
            f"Summary: {entry['summary']}",
            f"Published: {entry['published']}",
            f"Journal Reference: {entry['journal_ref']}",
            f"DOI: {entry['doi']}",
            f"Primary Category: {entry['primary_category']}",
            f"Categories: {', '.join(entry['categories'])}",
            f"PDF URL: {entry['pdf_url']}",
            f"arXiv URL: {entry['arxiv_url']}",
        ]
        # Each field, including the last, ends with a newline.
        return "".join(f"{field}\n" for field in fields)

    return [Document(text=render(entry)) for entry in papers]

In [4]:
# Convert every fetched paper record into a llama_index Document.
documents = create_documents_from_papers(papers=papers)

In [5]:
# Preview a single document instead of echoing the repr of the whole list,
# which floods the cell output with the full text of every paper.
documents[:1]

[Document(id_='c7c54d4f-f645-445e-af2b-8b6c5d53de15', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text="Title: ChatGarment: Garment Estimation, Generation and Editing via Large Language Models\nAuthors: Siyuan Bian, Chenghao Xu, Yuliang Xiu, Artur Grigorev, Zhen Liu, Cewu Lu, Michael J. Black, Yao Feng\nSummary: We introduce ChatGarment, a novel approach that leverages large\nvision-language models (VLMs) to automate the estimation, generation, and\nediting of 3D garments from images or text descriptions. Unlike previous\nmethods that struggle in real-world scenarios or lack interactive editing\ncapabilities, ChatGarment can estimate sewing patterns from in-the-wild images\nor sketches, generate them from text descriptions, and edit garments based on\nuser instructions, all within an interactive dialogue.

In [6]:
from llama_index.core import Settings, VectorStoreIndex

from constants import embed_model

# Global llama_index settings; they must be assigned before the index is built
# so that they are in effect while the documents are processed.
# NOTE(review): per the llama_index docs these control how documents are split
# into nodes (chunk length and overlap between chunks) -- confirm the units.
Settings.chunk_size = 1024
Settings.chunk_overlap = 50

# Build a vector index over the documents using the project's embed_model
# (imported from constants) rather than a library default.
index = VectorStoreIndex.from_documents(documents, embed_model=embed_model)

In [7]:
# Save the index's storage context under the relative "index/" directory.
index.storage_context.persist(persist_dir="index/")