In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read() call.
CHUNK_SIZE = 40960
# Comma-separated "<dataset directory>:<URL-encoded download URL>" entries;
# the download loop splits each entry on ':' and URL-decodes the second part.
# NOTE(review): the URL is a signed link carrying X-Goog-Date=20240615T145927Z
# and X-Goog-Expires=259200, so it has presumably expired - regenerate it
# before relying on this cell.
DATA_SOURCE_MAPPING = 'paper-nurul:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5179709%2F8647788%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240615%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240615T145927Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Db201d4e03b7a0d726ff774e09289e65c16030d3de30f44dee3826b00b70b846712b757d8b3f0487f014cb29568ea7572404e3f6be172e367d69fb1f91605eb1959d69d4095c839c6d93a87af8eb267c97e601483a6e85bc585f58e5aa90e1a57b9062ee4b2e045949e023612489bb92f79df4551cae855926834ae887f6bae743524a1d9f791f0ea58055533306a15636b724fe6e43581859c1040c0c91d31f3363c36ffa33a2a96ba69a478216ac8cde7e11ce19c6e42254082f3b86edc101aa238521c841dde6423ab812965db34e3e7569b5667de51d2377e2305dc30d6a965fbf7b8140f8be1eaf98bfe926b2051cce227b20f235f16432f8510b4d03b6c'

# Directories that mimic Kaggle's layout; both are created by this cell.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere else in the visible notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING
# into a temporary file and unpack it into KAGGLE_INPUT_PATH/<directory>.
# Archives whose URL path ends in '.zip' are unzipped; anything else is
# treated as a tar archive. A failed download is reported and skipped so the
# remaining entries are still processed.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray ':' later in the entry cannot
    # break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The Content-Length header may be missing (e.g. chunked
            # responses) or malformed; fall back to a plain byte counter
            # instead of crashing in int()/division.
            try:
                expected_bytes = int(total_length)
            except (TypeError, ValueError):
                expected_bytes = 0
            dl = 0
            # read() returns b'' at end of stream, which terminates the loop.
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                if expected_bytes > 0:
                    # Clamp so the bar never overflows its 50 columns.
                    done = min(50, int(50 * dl / expected_bytes))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk and rewind so the archive readers
            # see the complete download from its first byte.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Read from the open handle (no reopen by name) and bind the
                # archive to its own name so the `tarfile` module is not
                # shadowed for later iterations.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # Must precede OSError: HTTPError is a subclass of it.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


# RAG System Using Llama2 With Hugging Face
## Install the required libraries
`pypdf` is a library for manipulating PDF files in Python. Key functions include:

* Reading PDFs: Open and read PDF content.
* Merging PDFs: Combine multiple PDFs into one.
* Splitting PDFs: Extract specific pages from a PDF.
* Rotating Pages: Rotate pages within a PDF.
* Adding Watermarks: Insert watermarks into PDF pages.
* Filling Forms: Fill out and extract data from PDF forms.
* Extracting Text: Extract text from PDF pages.

In [None]:
!pip install pypdf

* Transformers: Provides pre-trained models for NLP tasks.
* Einops: Facilitates flexible tensor operations.
* Accelerate: Streamlines training and inference on GPUs and TPUs.
* Langchain: Builds applications with language models and external data.
* Bitsandbytes: Optimizes memory usage for large model training.

In [None]:
!pip install -q transformers einops accelerate langchain bitsandbytes

Install `sentence_transformers`, which generates sentence embeddings. Sentence embeddings are numerical representations of sentences that capture their semantic meaning, so sentences can be compared by content rather than just by lexical overlap.

In [None]:
## Embedding
!pip install install sentence_transformers

This cell installs the Python package `llama_index`, a data framework for LLM applications: it loads documents, splits them into chunks, embeds the chunks, and stores them in indexes that can be searched by vector similarity. Such indexes are essential for efficiently searching through high-dimensional data, such as embeddings from natural language processing models.

Key features of llama_index typically include:

Building Indexes: Allows you to construct vector indexes from your dataset.

Querying Indexes: Supports fast retrieval of nearest neighbors based on cosine similarity or other distance metrics.

Efficiency: Designed to handle large datasets efficiently, making it suitable for production systems and applications where speed and scalability are critical.

In [None]:
pip install llama_index

In [None]:
pip install llama-index-llms-huggingface

This is the most important library for the RAG system! `llama_index` gives us extra features such as storing PDFs in a vector index, building pipelines, etc.

In [None]:
from llama_index.core import VectorStoreIndex,SimpleDirectoryReader,ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.prompts.prompts import SimpleInputPrompt

## Load your data files (you can add more than one)
With `llama_index`, we can read a directory or individual files (PDFs) into text chunks.

In [None]:
# Read everything under the attached dataset folder with llama_index's
# SimpleDirectoryReader; load_data() returns the loaded documents.
documents=SimpleDirectoryReader("/kaggle/input/paper-nurul/").load_data()
# Bare expression: the notebook renders the loaded documents as cell output.
documents

System Prompt: This is a structured text snippet used to integrate with systems like Llama 2. It gives the model instructions about its role or task. In this case, the prompt defines the model as a teacher's assistant whose responsibility is to answer questions and fulfill the teacher's requests as accurately as possible, based on the given context and instructions. It also specifies that if the model doesn't have the answer, it should acknowledge the limits of its knowledge.

Query Wrapper Prompt: This represents a standard format for requesting input from users or systems regarding questions or requests. Using SimpleInputPrompt, the system can receive input in the form of questions or specific instructions that it needs to address.

By using these prompts, Llama 2 can effectively function as a teacher's assistant, responding accurately to queries and fulfilling tasks based on the provided context and instructions.

In [None]:
# System prompt, written in Indonesian. English gloss: "You are a teacher's
# assistant. Your task is to give every answer to the questions asked, and to
# the requests made by the teacher, as accurately as possible based on the
# instructions and context provided. If you do not know the answer, say that
# you do not know it because of limited knowledge."
system_prompt="""
Kamu adalah seorang asisten guru. tugas kamu adalah memberikan semua jawaban dari pertanyaan yang ditanyakan, dan permintaan yang diminta oleh guru seakurat mungkin berdasarkan instruksi dan konteks yang diberikan. Jika kamu tidak tahu jawabannya, bilang kamu tidak mengetahui hal tersebut karena keterbatasan pengetahuan.
"""
# Template wrapped around every user query; {query_str} is the placeholder
# for the question text.
# NOTE(review): the <|USER|>/<|ASSISTANT|> markers differ from the
# [INST] ... [/INST] chat format published for Llama 2 - confirm which
# template the fine-tuned model was actually trained with.
query_wrapper_prompt=SimpleInputPrompt("<|USER|>{query_str}<|ASSISTANT|>")

If the model you stored on Hugging Face requires permission, log in to your account. In our case the model is free to use, so we can skip this step!

In [None]:
# Interactive Hugging Face Hub login. Only needed for models that require
# permission; the model used in this notebook is described as free to use,
# so this cell can be skipped.
from huggingface_hub import notebook_login
notebook_login()

## Instance set up
**Context Parameters:**
context_window=4096: Specifies the maximum context length (in tokens) the model considers, which helps in maintaining relevant conversational context.

**Generation Parameters:**
max_new_tokens=512: Limits the number of tokens the model can generate in response to a query, ensuring concise and relevant answers.
generate_kwargs={"temperature": 0.5, "do_sample": False}: Controls the generation process. do_sample=False selects deterministic (greedy) decoding; temperature only influences the randomness of responses when sampling is enabled (do_sample=True), so it has no effect with this setting.

**Prompt Settings:**
system_prompt=system_prompt: Provides the system prompt text, defining the assistant's role and behavior.
query_wrapper_prompt=query_wrapper_prompt: Defines the format for user queries or instructions.

**Model and Tokenizer:**
tokenizer_name="Equinox391/Athena-Llama-2-7b-chat-finetune": Specifies the tokenizer used with the model.
model_name="Equinox391/Athena-Llama-2-7b-chat-finetune": Indicates the fine-tuned model to be loaded for chat interactions.

**Device and Model Optimization:**
device_map="auto": Automatically selects the appropriate device (CPU or GPU).
model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True}: Optimizes model memory usage by loading in 8-bit format (load_in_8bit=True) and using torch.float16 for reduced precision calculations, which can be beneficial for performance on CUDA-enabled GPUs.
This setup allows the HuggingFaceLLM instance to efficiently handle chat-based interactions using a fine-tuned large language model, ensuring both performance and resource efficiency, especially when leveraging CUDA-enabled GPUs.

### Equinox391 is my username on huggingface, you can find out more [here](https://huggingface.co/Equinox391)

In [None]:
import torch

# Wrap the fine-tuned Llama 2 chat checkpoint in llama_index's HuggingFaceLLM.
llm = HuggingFaceLLM(
    # The same Hub repository supplies both the weights and the tokenizer.
    model_name="Equinox391/Athena-Llama-2-7b-chat-finetune",
    tokenizer_name="Equinox391/Athena-Llama-2-7b-chat-finetune",
    # Prompt scaffolding defined in the earlier prompt cell.
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    # Token budgets: size of the context window and cap on generated tokens.
    context_window=4096,
    max_new_tokens=512,
    # NOTE(review): with do_sample=False decoding should be greedy, in which
    # case `temperature` is presumably ignored - confirm against the
    # transformers generation documentation.
    generate_kwargs={"do_sample": False, "temperature": 0.5},
    device_map="auto",
    # Load the weights in 8-bit with float16 as the torch dtype to reduce
    # memory usage on CUDA GPUs; drop these options when running without CUDA.
    model_kwargs={"load_in_8bit": True, "torch_dtype": torch.float16},
)

In [None]:
pip install -U langchain-community

In [None]:
pip install llama-index-embeddings-langchain

## Setting up embedding algorithm
**HuggingFaceEmbeddings:**
This is a class or module (HuggingFaceEmbeddings) from langchain.embeddings.huggingface. It represents embeddings generated by Hugging Face's models, which are popular for natural language processing tasks.

**LangchainEmbedding:**
This is a class (LangchainEmbedding) from llama_index.embeddings.langchain. It is an adapter that wraps a LangChain embeddings object (here the Hugging Face one) so that llama_index can use it as its embedding model.

**Initialization:**
HuggingFaceEmbeddings(model_name="Hvare/Athena-indobert-finetuned-indonli-SentenceTransformer"): Initializes Hugging Face embeddings using the specified model name (Hvare/Athena-indobert-finetuned-indonli-SentenceTransformer). This model name likely refers to a fine-tuned Indonesian language model for sentence embeddings. **Hvare is the Hugging Face username of Gilang Kurnia Mandari, one of our ML team members.**

**Embedding Integration:**
LangchainEmbedding(...): Incorporates the Hugging Face embeddings into the LLama framework or indexing system, allowing for seamless integration of these embeddings into larger applications or pipelines within llama_index.
This setup enables the use of fine-tuned embeddings from Hugging Face within the LLama ecosystem, potentially for tasks such as semantic search, clustering, or other applications that require efficient handling of text embeddings.

In [None]:
# HuggingFaceEmbeddings is provided by the langchain-community package that
# this notebook installs; import it from there rather than through the
# deprecated `langchain.embeddings.huggingface` path.
from langchain_community.embeddings import HuggingFaceEmbeddings
from llama_index.embeddings.langchain import LangchainEmbedding

# Wrap the LangChain embedding object so llama_index can use it as its
# embedding model.
embed_model = LangchainEmbedding(
    HuggingFaceEmbeddings(
        model_name="Hvare/Athena-indobert-finetuned-indonli-SentenceTransformer"
    )
)

The code snippet below initializes a ServiceContext object using the from_defaults method. Here's how I see each part:

**ServiceContext:**
ServiceContext is a class or object within the LLama framework that manages the context and configuration for a service or application. It handles tasks such as data processing, model interactions, and response management.

**from_defaults:**
from_defaults is a method or function that I use to create a ServiceContext object with default settings. It simplifies the initialization process by providing sensible default values for parameters.

**Parameters:**
chunk_size=1024: I set the size of the text chunks that the documents are split into before embedding to 1024. Chunking helps manage memory and computational resources efficiently, especially when dealing with large datasets.

**llm:** This refers to the large language model instance (llm) that was set up in an earlier cell. This model is capable of generating responses or performing various language-related tasks.

**embed_model:** This refers to an embedding model (embed_model) initialized using LangchainEmbedding with Hugging Face embeddings. This model helps encode text into numerical representations (embeddings).

In summary, I initialize service_context to ensure smooth operation within the LLama framework, optimizing resource usage and leveraging models like llm and embed_model for effective text processing and response generation.

In [None]:
# Bundle the LLM, the embedding model and the chunk size into one
# ServiceContext that is handed to the index in a later cell.
# NOTE(review): ServiceContext is reported as deprecated in newer llama_index
# releases in favour of `Settings` - confirm against the installed version.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    chunk_size=1024,
)

In [None]:
# Bare expression: the notebook displays the ServiceContext as cell output.
service_context

The next step is to embed the documents and store them in a VectorStoreIndex, which serves as new, temporary knowledge for our model.

In [None]:
# Build the vector index from the loaded documents, using the LLM, embedding
# model and chunk size carried by service_context.
index=VectorStoreIndex.from_documents(documents,service_context=service_context)

In [None]:
# Bare expression: the notebook displays the index object as cell output.
index

Create a query engine from the index

In [None]:
# Obtain a query engine from the index; no arguments are passed, so the
# library defaults apply.
query_engine=index.as_query_engine()

# Testing
## Ask a question about the PDF files as input to test the model

In [None]:
# Test question in Indonesian ("What is LSTM?") sent through the query engine.
response=query_engine.query("Apa itu lstm?")

## model output

In [None]:
# Print the string form of the response object returned by the query engine.
print(response)