In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming each archive download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries.
# The download loop splits each entry on ':' and URL-decodes the second part.
# The URL carries signing parameters (X-Goog-Date=20240330T174612Z,
# X-Goog-Expires=259200); presumably the link stops working once that window
# has elapsed — regenerate the cell from Kaggle if the download fails.
DATA_SOURCE_MAPPING = 'pdf-data:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F4698617%2F7984077%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240330%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240330T174612Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D4636a4071b308ae8b39c9fd6cca60ef70b2aedce112d0753ec7d4f221c5f2d997652082079bd1e80ab74b1fd405e3fe28cb949d6ce4800654acbde8c718962cfc9755f434f529c8daaa51bae17d50bdeff0d6555de7dc7f9fd0d5e9fcb277ae625391f631726a262be2257ea8073daee4800edafb84d165ff6af92ba2a49641a104fec690322cd06a75acc476fd17b78bbb4f901f2988dd000d0c4fbc991aad616cbb30e7a907b0d38ef140fafa6d1c19f08d782263f696553cd198d95172b1dfe5ccdf190eb432602d171e1f389c983e4e563b3a44dcd94428615785d70cf2b839a36f64cbf97dfd25f89b5085c5b892c40883a9b7cf0fe66efc9a54e3b835c'

# Local directories that stand in for Kaggle's input and working locations.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible notebook — confirm it is still needed.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every archive listed in DATA_SOURCE_MAPPING and unpack it into
# KAGGLE_INPUT_PATH/<directory>. Failures for one source are reported and the
# loop moves on to the next source.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only so a raw ':' inside the URL part cannot
    # break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header can be missing; in that case the
            # progress bar stays empty instead of crashing on int(None).
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / total_bytes) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch reopens the file by
            # name and would otherwise see a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here
                # would overwrite the imported module for the rest of the run.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is checked first because it is a subclass of OSError.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
!pip install pypdf
!pip install -q transformers einops accelerate langchain bitsandbytes
!pip install install sentence_transformers
!pip install llama_index
!pip install llama-index-llms-huggingface



In [None]:
import torch
from llama_index.core import VectorStoreIndex,SimpleDirectoryReader,ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts.prompts import SimpleInputPrompt
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.core import ServiceContext
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

In [None]:
# Load the files found in the downloaded dataset directory into Document objects.
documents = SimpleDirectoryReader("/kaggle/input/pdf-data").load_data()

# Show a count and a small sample instead of the repr of every Document,
# which would flood the cell output with the full text of the source files.
print(f"Loaded {len(documents)} documents")
documents[:2]

 Document(id_='9c723efa-ee71-46a5-92d2-6c235c270733', embedding=None, metadata={'page_label': '2', 'file_name': 'A Christmas Carol.pdf', 'file_path': '/kaggle/input/pdf-data/A Christmas Carol.pdf', 'file_type': 'application/pdf', 'file_size': 437963, 'creation_date': '2024-03-30', 'last_modified_date': '2024-03-30'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='time—of\nall\nthe\ngood\ndays\nin\nthe\nyear,\non\nChristmas\nEve—old\nScrooge\nsat\nbusy\nin\nhis\ncounting-house.\nIt\nwas\ncold,\nbleak,\nbiting\nweather:\nfoggy\nwithal:\nand\nhe\ncould\nhear\nthe\npeople\nin\nthe\ncourt\noutside,\ngo\nwheezing\nup\nand\ndown,\nbeating\ntheir\nhands\nupon\ntheir\nbreasts,\nand\nstamping\ntheir\nfeet\nupon\nthe\npavement\nstones\nto\nwarm\nthem.\nThe\ncity\n

In [None]:
# System-level instruction passed to the LLM alongside every query.
system_prompt = (
    "\n"
    "You are a Q&A assistant. Your goal is to answer questions as\n"
    "accurately as possible based on the instructions and context provided.\n"
)

# Template that wraps each user query before it is sent to the model.
# NOTE(review): the <|USER|>/<|ASSISTANT|> tags do not look like the
# [INST] ... [/INST] markers Llama 2 chat checkpoints are normally prompted
# with — confirm this wrapper suits the model loaded later in the notebook.
query_wrapper_prompt = SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

In [None]:
from kaggle_secrets import UserSecretsClient
from huggingface_hub import notebook_login, login

# Name of the Kaggle secret whose value is used as the Hugging Face token.
secret_label = "ir"

# Read the token from Kaggle's secret store so it never appears in the notebook.
secrets_client = UserSecretsClient()
secret_value = secrets_client.get_secret(secret_label)

# Authenticate this session with the Hugging Face Hub.
login(token=secret_value)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Build the LlamaIndex wrapper around the Llama 2 7B chat checkpoint.
llm = HuggingFaceLLM(
    # Model and tokenizer come from the same Hugging Face repository.
    model_name="meta-llama/Llama-2-7b-chat-hf",
    tokenizer_name="meta-llama/Llama-2-7b-chat-hf",
    # Prompt scaffolding defined earlier in the notebook.
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    # Token budget: total context window and per-answer generation limit.
    context_window=4096,
    max_new_tokens=256,
    # Sampling is switched off for generation.
    generate_kwargs=dict(temperature=0.0, do_sample=False),
    device_map="auto",
    # Half-precision dtype plus 8-bit loading to reduce memory usage.
    # NOTE(review): transformers reports `load_in_8bit` as deprecated in favour
    # of `quantization_config=BitsAndBytesConfig(...)` — migrate when convenient.
    model_kwargs=dict(torch_dtype=torch.float16, load_in_8bit=True),
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Embedding model, created through LangChain's Hugging Face wrapper, that is
# later handed to the ServiceContext as `embed_model`.
embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

In [None]:
!pip install llama-index-embeddings-langchain
# The ServiceContext is a bundle of commonly used resources used during the indexing and querying stage in a LlamaIndex pipeline/application
service_context=ServiceContext.from_defaults(
    chunk_size=1024,
    llm=llm,
    embed_model=embed_model
)



  service_context=ServiceContext.from_defaults(


In [None]:
# Display the assembled ServiceContext to inspect its resolved components.
service_context

ServiceContext(llm_predictor=LLMPredictor(system_prompt=None, query_wrapper_prompt=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>), prompt_helper=PromptHelper(context_window=4096, num_output=256, chunk_overlap_ratio=0.1, chunk_size_limit=None, separator=' '), embed_model=LangchainEmbedding(model_name='sentence-transformers/all-mpnet-base-v2', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7dd1de47cbe0>), transformations=[SentenceSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7dd1de47cbe0>, id_func=<function default_id_func at 0x7dd202268b80>, chunk_size=1024, chunk_overlap=200, separator=' ', paragraph_separator='\n\n\n', secondary_chunking_regex='[^,.;。？！]+[,.;。？！]?')], llama_logger=<llama_index.core.service_context_elements.llama_logger.LlamaLogger object at 0x7dd1de985900>, callback_manager=<llama_index.core.callbacks.ba

In [None]:
# Build a vector store index over the loaded documents, using the LLM and
# embedding settings bundled in `service_context`.
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

In [None]:
# Display the index object to confirm it was created.
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x7dd1ded61150>