# Install the packages and download the .txt data

In [1]:
!python --version
!pip install -q llama-index-embeddings-huggingface
!pip install -q transformers einops accelerate langchain bitsandbytes
!pip install -q llama-index llama-index-llms-huggingface


!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'


Python 3.10.12
--2024-02-28 10:06:12--  https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75042 (73K) [text/plain]
Saving to: ‘data/paul_graham/paul_graham_essay.txt’


2024-02-28 10:06:12 (9.20 MB/s) - ‘data/paul_graham/paul_graham_essay.txt’ saved [75042/75042]



# Import the libraries required in the rest of the notebook

In [2]:
import logging
import sys

# Send INFO-level log records to stdout through exactly one root handler.
# force=True replaces any handlers already installed on the root logger, so
# the configuration always takes effect and re-running this cell does not
# stack up extra handlers. The earlier basicConfig + addHandler pair could
# attach two stdout handlers and print every log line twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import Settings

import torch

# setup prompts - specific to the Camel-5B model configured below
from llama_index.core import PromptTemplate

# Set the Prompt Template


In [3]:
# Instruction-style wrapper applied around llama-index's internal default prompts.
# Template format taken from https://huggingface.co/Writer/camel-5b-hf
_camel_instruction_template = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{query_str}\n\n### Response:"
)
query_wrapper_prompt = PromptTemplate(_camel_instruction_template)

# Download and configure the Hugging Face LLM using the LlamaIndex Hugging Face integration

In [4]:


# Load Writer/camel-5b-hf through llama-index's Hugging Face wrapper.
llm = HuggingFaceLLM(
    context_window=2048,
    max_new_tokens=256,
    # do_sample=False selects deterministic (greedy) decoding. `temperature`
    # only applies when sampling, so it is left out: passing it together with
    # do_sample=False does not change the output and makes transformers warn
    # about an inconsistent generation config.
    generate_kwargs={"do_sample": False},
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="Writer/camel-5b-hf",
    model_name="Writer/camel-5b-hf",
    system_prompt="You are Act like Q/A Assistant",
    device_map="auto",
    tokenizer_kwargs={"max_length": 2048},
    # Half precision plus 8-bit weights to reduce memory usage.
    # NOTE(review): 8-bit loading presumably needs a CUDA GPU with bitsandbytes
    # available - drop this argument when running on CPU; confirm on target host.
    model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True},
)

# Register the chunk size and the LLM as llama-index global settings.
Settings.chunk_size = 512
Settings.llm = llm

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

# Load the text from the downloaded document file (.txt)

In [5]:
# Read the files under the essay folder into llama-index Document objects.
essay_reader = SimpleDirectoryReader("./data/paul_graham/")
documents = essay_reader.load_data()
documents

[Document(id_='2875c02d-7024-427a-8691-fb02a5c5cad3', embedding=None, metadata={'file_path': '/content/data/paul_graham/paul_graham_essay.txt', 'file_name': '/content/data/paul_graham/paul_graham_essay.txt', 'file_type': 'text/plain', 'file_size': 75042, 'creation_date': '2024-02-28', 'last_modified_date': '2024-02-28', 'last_accessed_date': None}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='\n\nWhat I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn\'t write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined 

# Use an open-source embedding model (by default, LlamaIndex uses the OpenAI embedding model)


In [6]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings

# Use a local Hugging Face embedding model as the global llama-index embedder.
bge_small_embedder = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.embed_model = bge_small_embedder

# Create Indexes of the document

In [32]:
# Build a vector store index from the essay documents loaded earlier.
index = VectorStoreIndex.from_documents(documents)


# Library versions known to work for this deployment

In [8]:
!pip list


Package                                  Version
---------------------------------------- ---------------------
absl-py                                  1.4.0
accelerate                               0.27.2
aiohttp                                  3.9.3
aiosignal                                1.3.1
alabaster                                0.7.16
albumentations                           1.3.1
altair                                   4.2.2
annotated-types                          0.6.0
anyio                                    3.7.1
appdirs                                  1.4.4
argon2-cffi                              23.1.0
argon2-cffi-bindings                     21.2.0
array-record                             0.5.0
arviz                                    0.15.1
asgiref                                  3.7.2
astropy                                  5.3.4
astunparse                               1.6.3
async-timeout                            4.0.3
atpublic                             

# Confirm that the index was created as a VectorStoreIndex object

In [9]:
# Display the index object's repr to confirm it exists.
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7adbf9f876a0>

# Create a query engine that streams its response

In [13]:
# Create a query engine over the index with streaming enabled.
query_engine = index.as_query_engine(streaming=True)


# Ask Query & Print Response

In [14]:
# Send one question through the streaming query engine.
# (Set logging to DEBUG for more detailed outputs.)
question = "What did the author do growing up?"
response_stream = query_engine.query(question)

# Streaming may be slow to start because llama-index often makes many LLM calls.
response_stream.print_response_stream()


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


The author grew up in a small town in New England.<|endoftext|>

# Upload any new document and load its text

In [71]:
# Load a single PDF, given by explicit path, into Document objects.
pdf_reader = SimpleDirectoryReader(input_files=["./data/hadith_data.pdf"])
document = pdf_reader.load_data()
document

[Document(id_='d38b7387-8f35-4f46-b177-a3cc2a0eded0', embedding=None, metadata={'page_label': '1', 'file_name': '/content/data/hadith_data.pdf', 'file_path': 'data/hadith_data.pdf', 'file_type': 'application/pdf', 'file_size': 276257, 'creation_date': '2024-02-28', 'last_modified_date': '2024-02-28', 'last_accessed_date': None}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Hadith 10\nBook:  \'bukhari\'\nChapter:  1\nSection:  1\nDescription:  \'\\n\n\\n\\n Narrated \\\'Umar bin Al-Khattab:\\n\n\\n\\n I heard Allah\\\'s Apostle saying\nHadith 20\nBook:  \'bukhari\'\nChapter:  1\nSection:  1\nDescription:  \'\n\\r\\n\\r\\n Narrated \\\'Aisha:\\r\\n\n\\r\\n\\r\\n (the mother of the faithful believers) Al-Harith bin Hisham asked Allah\\\'s Apostle \\"O Al

# Insert the new document's entries into the existing index

In [33]:
# Add each Document loaded from the PDF to the existing index, one at a time.
for pdf_doc in document:
    index.insert(document=pdf_doc)

# Reference information for the indexed documents

In [19]:
# Inspect the reference documents (and their node IDs) tracked by the index.
index.ref_doc_info

{'2875c02d-7024-427a-8691-fb02a5c5cad3': RefDocInfo(node_ids=['9bcd6364-dae9-4bcc-90c1-e4721d64e1db', '33c22606-8a8d-4805-8277-1b3b68c8b91f', 'd97f0277-a26c-47f0-a3c1-c0502df216b5', 'f525d1fd-29c9-4d96-bd78-7d0e891f118a', '1e72032a-6edd-4a49-b7aa-67cc89b86939', '19882fc2-16d1-4102-84a4-bf92b12b72b3', 'f8966b77-d80d-4eec-87df-e196b7301ccb', 'b6b5f78c-a34d-47d0-a780-9c3b4bce722a', '8d096b70-7009-4010-aee9-f693d26aaa9d', 'e8643e44-ede4-4963-bd60-ba84d8c31749', '611e6076-efa5-4b10-9161-3776b559b07f', '4221e3ea-c8ba-434a-ad5e-bb241f5a1db9', 'ee31ef16-0191-4128-aad3-8934e1cf7cf9', 'a0f95b8a-8bc4-4ccb-b39b-e7b548867172', 'd3a71a66-d96f-42a8-8638-38a6bf670950', '62dd848c-5b1c-41fb-8069-295e664a9b02', '36455cb4-f05b-4692-9fb9-40eea7cfe5ee', '5494969b-5230-40a4-8c52-7a17f8e22c73', '9cf25752-aaf6-49b7-839e-54838b8ff57e', '781eefb6-27cc-48be-a7b1-eeb12b68abb9', '560de4ed-d121-48bf-97c5-1196757151f7', '574eca60-c99a-4554-a127-f7e36f2ed42f', '6560c186-b4b5-4ee7-831c-b5bf2c617da6', '556b980e-1768-4ac

# ID of the first entry in the first document set (`documents`, the essay)

In [25]:
# `documents` was loaded from data/paul_graham/; show the ID of its first Document.
documents[0].doc_id

'2875c02d-7024-427a-8691-fb02a5c5cad3'

# ID of the first entry in the second document set (`document`, the PDF)

In [81]:
# `document` was loaded from hadith_data.pdf; show the ID of its first Document.
document[0].doc_id

'd38b7387-8f35-4f46-b177-a3cc2a0eded0'

# another way to access id

In [82]:
# Read the same Document's identifier through the `id_` attribute instead of `doc_id`.
document[0].id_

'd38b7387-8f35-4f46-b177-a3cc2a0eded0'

# Inspect the loaded `document` list (a list of Document objects)

In [77]:
# Display the full list of Documents loaded from the PDF.
document

[Document(id_='d38b7387-8f35-4f46-b177-a3cc2a0eded0', embedding=None, metadata={'page_label': '1', 'file_name': '/content/data/hadith_data.pdf', 'file_path': 'data/hadith_data.pdf', 'file_type': 'application/pdf', 'file_size': 276257, 'creation_date': '2024-02-28', 'last_modified_date': '2024-02-28', 'last_accessed_date': None}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Hadith 10\nBook:  \'bukhari\'\nChapter:  1\nSection:  1\nDescription:  \'\\n\n\\n\\n Narrated \\\'Umar bin Al-Khattab:\\n\n\\n\\n I heard Allah\\\'s Apostle saying\nHadith 20\nBook:  \'bukhari\'\nChapter:  1\nSection:  1\nDescription:  \'\n\\r\\n\\r\\n Narrated \\\'Aisha:\\r\\n\n\\r\\n\\r\\n (the mother of the faithful believers) Al-Harith bin Hisham asked Allah\\\'s Apostle \\"O Al

# Convert the Document objects into a list of JSON strings

In [86]:
import json

# Serialize every Document: first to a plain dict, then to a JSON string.
json_data_list = list(map(json.dumps, (doc.dict() for doc in document)))

# Parse the JSON strings back into Python dictionaries

In [94]:
# Decode each JSON string back into a dictionary, keeping the original order.
documents_json = list(map(json.loads, json_data_list))

# The result is now a list of JSON-style dictionaries

In [97]:
# Display the list of dictionaries decoded from the JSON strings.
documents_json

[{'id_': 'd38b7387-8f35-4f46-b177-a3cc2a0eded0',
  'embedding': None,
  'metadata': {'page_label': '1',
   'file_name': '/content/data/hadith_data.pdf',
   'file_path': 'data/hadith_data.pdf',
   'file_type': 'application/pdf',
   'file_size': 276257,
   'creation_date': '2024-02-28',
   'last_modified_date': '2024-02-28',
   'last_accessed_date': None},
  'excluded_embed_metadata_keys': ['file_name',
   'file_type',
   'file_size',
   'creation_date',
   'last_modified_date',
   'last_accessed_date'],
  'excluded_llm_metadata_keys': ['file_name',
   'file_type',
   'file_size',
   'creation_date',
   'last_modified_date',
   'last_accessed_date'],
  'relationships': {},
  'text': 'Hadith 10\nBook:  \'bukhari\'\nChapter:  1\nSection:  1\nDescription:  \'\\n\n\\n\\n Narrated \\\'Umar bin Al-Khattab:\\n\n\\n\\n I heard Allah\\\'s Apostle saying\nHadith 20\nBook:  \'bukhari\'\nChapter:  1\nSection:  1\nDescription:  \'\n\\r\\n\\r\\n Narrated \\\'Aisha:\\r\\n\n\\r\\n\\r\\n (the mother of t

# Extract only the IDs

In [98]:
# Collect the 'id_' field of each dictionary, preserving order.
ids = list(map(lambda entry: entry['id_'], documents_json))

# IDs of all entries loaded from the PDF document

In [99]:
# Display the extracted list of document IDs.
ids

['d38b7387-8f35-4f46-b177-a3cc2a0eded0',
 '123561ee-4851-4f27-accb-f7f9933b192e',
 '95e1f423-0361-4f1a-912a-a68dbfa8a336',
 '490e6a65-83f7-48f3-96fe-24f8c0fdddab',
 'e8e7108c-e3da-442e-988a-4636f025d60b',
 '87fd1597-7ce7-40f1-8d65-9b66a0efc0c3',
 '44844c79-1eb3-409a-8b73-78bf42573481',
 '4003a1db-b35d-4420-9435-64a419cd1827',
 '24149414-7bab-4c5e-a46c-b108d801f372',
 '14e3b4e2-57b5-407e-b148-6f20837d2fba',
 '692c47ac-13e8-489b-9f64-57ecb3b63064',
 'cadd5162-5f93-43f2-ac40-2a4a015a9334',
 'a47a0b3d-092b-4294-a462-e26fff59b931',
 '8d806a60-0f50-42f5-b09d-17bf1cb48f92',
 '78e84e6d-3304-44c3-b587-3c48a4339879',
 '3c21b4c0-a094-4182-9b13-c26353c0289a',
 '10204f87-2051-4cc5-aa09-3900079368cb',
 '93ac0aa0-f48e-4eaa-8f99-38cbff36f852',
 '736080ac-c084-4e73-bfb1-ee07a1f54965',
 'be64a86b-041d-456e-add7-eaf11658dd7b',
 '91a6f65e-f9ff-4709-9b0d-e72b9d048a43',
 '3cbb5789-67a2-4aff-a5fa-0fd9f88d8dbe',
 '36bfce7a-e857-44fc-82e7-201ba8f9505f',
 '9320634f-f501-47ed-b0a9-24baa530211c',
 'ee87d557-169d-

# Remove a specific document from the index by its ID

In [63]:
# Delete one reference document from the index and also from its docstore.
# NOTE(review): the ID is hard-coded, while the document IDs shown elsewhere in
# this notebook look like per-load UUIDs - presumably this value must be
# replaced with an ID taken from `ids` / `index.ref_doc_info` on each fresh run.
index.delete_ref_doc('4ab78ab7-0b47-4f6a-a04b-d8f4618aa8de', delete_from_docstore=True)

# Verify the index contents again after the removal

In [59]:
# Re-inspect the tracked reference documents to check the effect of the deletion.
index.ref_doc_info

{'f160cdcf-d69b-4059-902b-179e3d987306': RefDocInfo(node_ids=['30520c72-c7ca-4542-8eaf-3cef7b938118'], metadata={'page_label': '13', 'file_name': '/content/data/hadith_data.pdf', 'file_path': 'data/hadith_data.pdf', 'file_type': 'application/pdf', 'file_size': 276257, 'creation_date': '2024-02-28', 'last_modified_date': '2024-02-28', 'last_accessed_date': None}),
 '602808b9-f30a-4181-acda-44c730b7df8a': RefDocInfo(node_ids=['1c3b29e7-fe39-4bec-8139-b187e70f4bff'], metadata={'page_label': '14', 'file_name': '/content/data/hadith_data.pdf', 'file_path': 'data/hadith_data.pdf', 'file_type': 'application/pdf', 'file_size': 276257, 'creation_date': '2024-02-28', 'last_modified_date': '2024-02-28', 'last_accessed_date': None}),
 'e0a86186-5d2c-4309-b3e4-94a4175f3050': RefDocInfo(node_ids=['317cde5f-97f2-4671-8f20-64ca5ebc4631'], metadata={'page_label': '15', 'file_name': '/content/data/hadith_data.pdf', 'file_path': 'data/hadith_data.pdf', 'file_type': 'application/pdf', 'file_size': 276257,

# Persist the index (including its embeddings) to the `storage` folder so it can be reused later

In [34]:
# Write the index's storage context to disk under ./storage.
index.storage_context.persist(persist_dir="./storage")

# Reload the index from the persisted storage

In [35]:
from llama_index.core.indices.loading import load_index_from_storage
from llama_index.core.storage.storage_context import StorageContext
# Point a storage context at the persisted folder, then rebuild the index from it.
storage_context = StorageContext.from_defaults(persist_dir='./storage')
index123 = load_index_from_storage(storage_context=storage_context)

# The reference document IDs of the reloaded index should match those of the original index

In [36]:
# Inspect the reference documents tracked by the index reloaded from ./storage.
index123.ref_doc_info

{'2875c02d-7024-427a-8691-fb02a5c5cad3': RefDocInfo(node_ids=['4c41fc24-f06d-4c55-8971-a5030f2a2f5f', 'b3a66d56-6f5c-4c81-a0bb-efd36181d84a', '45ee6e55-20f6-405b-bbd6-680c882081a4', '1545e74f-8cbd-4e31-9995-376e251de7ae', '02163370-aa60-49f6-8454-9d3066203f46', 'e6270a28-af22-4207-acce-813bdb3135e6', 'aa077af2-0d5f-4b6f-8a46-e024011d4d30', '8447d5c9-de31-4c15-9ab1-41c84b282847', '57d511e8-bea6-4655-92db-c24368679b8f', 'f266dcc7-5b6e-474d-8f33-8e9e3c247772', '94ad21ae-a790-4a58-9917-45f181ecef77', 'cc6a1b1c-63f1-426e-86a1-cbcde9d00d02', 'b965fcef-824f-4133-8c15-823c86a6e033', '1e1456a3-c24a-4f89-a76b-ee7738660038', '59a10910-ca42-41bb-8fe1-b60a68051594', '626105ad-6b66-45f6-84be-015f3003197f', '870fb59b-e7c9-40bb-a129-c3e31bbefed7', '13bb77d4-683a-4f37-8010-c8840d31d017', '2944d492-4320-4442-8126-7612a6ced6e5', 'c9c7330e-6165-4383-9c3c-220aa399a699', 'badc285d-925f-4d66-8034-4b3239146fd9', '43e57946-7336-4dda-8a13-9fd39dd0eb1b', '57cd3bb7-e863-46e0-9c69-229d73dfd480', 'f4d2c1f1-38f4-4ab