In [None]:
%%capture
!pip install llama-index==0.10.37 html2text

In [None]:
import os

from getpass import getpass
import nest_asyncio

from dotenv import load_dotenv

# Patch asyncio so an event loop can be re-entered; presumably needed because
# the notebook kernel already runs its own loop -- confirm against later cells
# that actually use async APIs.
nest_asyncio.apply()

# Load environment variables via python-dotenv (typically from a local `.env`
# file, so secrets such as API keys stay out of the notebook).
load_dotenv()

In [None]:
import requests
from pathlib import Path

# Base URL for Project Gutenberg texts
base_url = "https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt"

# Directory to save the downloaded files
directory = Path("../data")

# Create the directory if it doesn't exist
directory.mkdir(parents=True, exist_ok=True)

# Book IDs to download: 1 through 10 inclusive (range's upper bound is exclusive)
book_ids = range(1, 11)

# Generate URLs for each book ID
urls = [base_url.format(book_id=book_id) for book_id in book_ids]

# Download each file and save it in the specified directory.
# Iterating IDs and URLs together avoids re-parsing the ID out of the URL.
for book_id, url in zip(book_ids, urls):
    try:
        # A timeout keeps one unresponsive request from hanging the notebook.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Connection errors / timeouts: report and move on to the next book.
        print(f"Failed to download {url}. Error: {exc}")
        continue

    if response.status_code == 200:
        filename = f"pg{book_id}.txt"
        file_path = directory / filename
        # Write the raw bytes exactly as served: this does not depend on the
        # charset requests guesses for `response.text`, and it avoids text-mode
        # newline translation (on Windows, CRLF would become "\r\r\n").
        file_path.write_bytes(response.content)
        print(f"Downloaded {filename} to {file_path}")
    else:
        print(f"Failed to download {url}. HTTP status code: {response.status_code}")

In [None]:
from llama_index.core import SimpleDirectoryReader

# Read from the same relative `directory` the download cell wrote to, instead
# of a machine-specific absolute path, so the notebook runs on any checkout.
documents = SimpleDirectoryReader(str(directory)).load_data()

In [None]:
# How many items the reader returned (presumably one per file in the data
# directory -- verify against the displayed count).
len(documents)

In [None]:
# Inspect which class the reader produces for a loaded item.
type(documents[0])

In [None]:
# Show the stored fields of one loaded document. Long string values (likely the
# full book text) are clipped so the cell output stays readable.
{
    field: (value[:500] + "..." if isinstance(value, str) and len(value) > 500 else value)
    for field, value in documents[3].__dict__.items()
}

In [None]:
from llama_index.core.node_parser import SentenceSplitter

# Splitter settings gathered in one place; both sizes are measured in tokens.
splitter_settings = {
    "chunk_size": 128,
    "chunk_overlap": 16,
    "paragraph_separator": "\n\n",
}

parser = SentenceSplitter(**splitter_settings)

# Break every loaded document into nodes, showing a progress bar while parsing.
nodes = parser.get_nodes_from_documents(
    documents,
    show_progress=True,
)

In [None]:
# Inspect which class the splitter produces (index 42 is an arbitrary sample).
type(nodes[42])

In [None]:
# Dump the raw attribute dict of one node to see which fields it stores.
nodes[41].__dict__

In [None]:
from llama_index.core.schema import TextNode, NodeRelationship, RelatedNodeInfo

# Link nodes[2] to nodes[3] as its NEXT node.
next_link = RelatedNodeInfo(node_id=nodes[3].node_id)
nodes[2].relationships[NodeRelationship.NEXT] = next_link

# Make nodes[5] the PARENT of nodes[4]; the link carries free-form metadata
# (the key/value pairs here are arbitrary demo content).
parent_metadata = {"Romie": "Mom", "Harpreet": "Dad", "Jind":"Daughter", "Jugaad":"Son"}
parent_link = RelatedNodeInfo(
    node_id=nodes[5].node_id,
    metadata=parent_metadata,
)
nodes[4].relationships[NodeRelationship.PARENT] = parent_link

In [None]:
# Print the relationships mapping of nodes[4]; after the assignment above it
# should contain the PARENT entry pointing at nodes[5].
print(nodes[4].relationships)