In [1]:
import requests
from pathlib import Path

def download_file(url: str, filename: str, timeout: float = 30.0) -> None:
    """
    Download a file from a URL and save it under the ``../data`` directory.

    Parameters:
    url (str): The URL of the file to download.
    filename (str): The name to save the file as (relative to ``../data``).
    timeout (float): Seconds to wait for the server before giving up.
        Without a timeout ``requests`` can block forever on a stalled
        connection, which would hang the notebook kernel.

    Returns:
    None

    Raises:
    requests.HTTPError: If the server answers with an error status code.
    requests.RequestException: For connection problems, invalid URLs or
        when the timeout expires.
    """
    # Construct the destination path
    destination = Path('../data') / filename
    destination.parent.mkdir(parents=True, exist_ok=True)  # Ensure the directory exists

    # Bound the wait so a dead server fails fast instead of hanging the kernel
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise an exception if the request was unsuccessful

    # Only touch the file once the download has fully succeeded
    destination.write_bytes(response.content)

    print(f"File downloaded successfully to {destination}")

In [None]:
# NOTE(review): "copyright" is not a fetchable URL (no scheme/host), so this
# cell cannot succeed as written; the original link appears to have been
# removed. Replace it with the real source URL before running.
url = "copyright"

# Saves the file as ../data/almanack_of_naval_ravikant.pdf
download_file(url, "almanack_of_naval_ravikant.pdf")

File downloaded successfully to ..\data\almanack_of_naval_ravikant.pdf


In [4]:
import os

from getpass import getpass
import nest_asyncio

from dotenv import load_dotenv

# Per the nest_asyncio docs, this patches asyncio so the event loop can be
# re-entered; presumably needed because Jupyter already runs its own loop.
nest_asyncio.apply()

# Load variables from a .env file into the process environment; the return
# value is shown as the cell output.
load_dotenv()

True

In [5]:
# Take the Cohere API key from the environment when it is set, otherwise ask
# for it interactively. `.get()` yields None for a missing variable so the
# prompt is actually reached; indexing `os.environ[...]` would raise KeyError
# before the `or` fallback could run.
CO_API_KEY = os.environ.get('CO_API_KEY') or getpass("Enter your Cohere API key: ")

In [12]:
from llama_index.llms.cohere import Cohere

# LlamaIndex wrapper around Cohere's command-r-plus model, sampled at a low
# temperature.
llm = Cohere(
    api_key=CO_API_KEY,
    model='command-r-plus',
    temperature=0.2,
)

# Plain text completion; the bare name on the last line renders the result.
response = llm.complete("alexander was a great")
response

In [11]:
from cohere import Client

# Same prompt as the LlamaIndex cell, this time sent through Cohere's own SDK.
api_key = CO_API_KEY
cohere_client = Client(api_key)

# Cap the generation at 50 tokens.
response = cohere_client.generate(
    prompt="alexander was a great",
    model='command-r-plus',
    max_tokens=50,
)

print(response)




In [13]:
from llama_index.core import Document

# Build a Document by hand from a raw string, with no metadata attached.
manual_doc = Document(text="I am batman")

# Show the instance's attribute dictionary to see every field of the Document.
vars(manual_doc)

{'id_': '49e2c801-978c-4ef5-b861-63488015d899',
 'embedding': None,
 'metadata': {},
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'relationships': {},
 'metadata_template': '{key}: {value}',
 'metadata_separator': '\n',
 'text_resource': MediaResource(embeddings=None, data=None, text='I am batman', path=None, url=None, mimetype=None),
 'image_resource': None,
 'audio_resource': None,
 'video_resource': None,
 'text_template': '{metadata_str}\n\n{content}'}

In [14]:
# Same idea as the bare Document, but with metadata supplied at construction.
manual_doc_withmeta = Document(
    text="i ama batman",
    metadata={"file_name": "myfile", "category": "superhero"},
)
vars(manual_doc_withmeta)

{'id_': 'cdb91110-74ca-4095-bbb4-71b361fc48f5',
 'embedding': None,
 'metadata': {'file_name': 'myfile', 'category': 'superhero'},
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'relationships': {},
 'metadata_template': '{key}: {value}',
 'metadata_separator': '\n',
 'text_resource': MediaResource(embeddings=None, data=None, text='i ama batman', path=None, url=None, mimetype=None),
 'image_resource': None,
 'audio_resource': None,
 'video_resource': None,
 'text_template': '{metadata_str}\n\n{content}'}

In [15]:
# Metadata can also be attached after construction. This replaces the whole
# metadata dict on the existing `manual_doc` object.
manual_doc.metadata = {"category": "superhero"}
vars(manual_doc)

{'id_': '49e2c801-978c-4ef5-b861-63488015d899',
 'embedding': None,
 'metadata': {'category': 'superhero'},
 'excluded_embed_metadata_keys': [],
 'excluded_llm_metadata_keys': [],
 'relationships': {},
 'metadata_template': '{key}: {value}',
 'metadata_separator': '\n',
 'text_resource': MediaResource(embeddings=None, data=None, text='I am batman', path=None, url=None, mimetype=None),
 'image_resource': None,
 'audio_resource': None,
 'video_resource': None,
 'text_template': '{metadata_str}\n\n{content}'}

# Chunking 

In [None]:
from llama_index.core import SimpleDirectoryReader

# Load the PDF that the download_file(...) call earlier in the notebook saved
# to ../data. The previous path ("../data/data.pdf") is never created by this
# notebook, while the node metadata in the outputs below names this file.
# load_data() returns a list of Document objects.
document = SimpleDirectoryReader(
    input_files=["../data/almanack_of_naval_ravikant.pdf"]
).load_data()

In [20]:
from llama_index.core.node_parser import SentenceSplitter

# Sentence-aware splitter producing small, slightly overlapping chunks.
# (chunk_size / chunk_overlap units are defined by SentenceSplitter;
# presumably tokens, confirm against the LlamaIndex docs.)
parser = SentenceSplitter(
    paragraph_separator="\n\n",
    chunk_overlap=16,
    chunk_size=128,
)

# Split every loaded document into nodes, with a progress bar.
nodes = parser.get_nodes_from_documents(show_progress=True, documents=document)

Parsing nodes:   0%|          | 0/242 [00:00<?, ?it/s]

In [21]:
# Check the container type returned by the splitter (a plain list).
type(nodes)

list

In [22]:
# Inspect the first chunk; its repr shows the TextNode with inherited metadata.
nodes[0]

TextNode(id_='662b60b6-b888-4435-bc4c-dceb68b48822', embedding=None, metadata={'page_label': '1', 'file_name': 'almanack_of_naval_ravikant.pdf', 'file_path': '..\\data\\almanack_of_naval_ravikant.pdf', 'file_type': 'application/pdf', 'file_size': 1884309, 'creation_date': '2025-02-27', 'last_modified_date': '2025-02-27'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='e9e20c13-d10d-45cb-81d2-5a9b40db0ad9', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'almanack_of_naval_ravikant.pdf', 'file_path': '..\\data\\almanack_of_naval_ravikant.pdf', 'file_type': 'application/pdf', 'file_size': 1884309, 'creation_date': '2025-02-27', 'last_modified_date': '2025-02-27'}, hash='7d35ad

In [23]:
# Same first node, shown as its attribute dictionary for easier reading.
vars(nodes[0])

{'id_': '662b60b6-b888-4435-bc4c-dceb68b48822',
 'embedding': None,
 'metadata': {'page_label': '1',
  'file_name': 'almanack_of_naval_ravikant.pdf',
  'file_path': '..\\data\\almanack_of_naval_ravikant.pdf',
  'file_type': 'application/pdf',
  'file_size': 1884309,
  'creation_date': '2025-02-27',
  'last_modified_date': '2025-02-27'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='e9e20c13-d10d-45cb-81d2-5a9b40db0ad9', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'almanack_of_naval_ravikant.pdf', 'file_path': '..\\data\\almanack_of_naval_ravikant.pdf', 'file_type': 'application/pdf', 'file_size': 1884309, 'creation_date': '2025-02-27', 'l