In [2]:
!pip install --quiet \
    langchain \
    langchain_community \
    langchain-huggingface \
    pypdf pymupdf \
    transformers \
    bitsandbytes \
    sentence-transformers \
    faiss-gpu \
    huggingface_hub \
    accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.6/974.6 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m82.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━

In [3]:
import subprocess

# List of packages to check
packages = [
    "langchain",
    "langchain_community",
    "langchain-huggingface",
    "pypdf",
    "pymupdf",
    "transformers",
    "bitsandbytes",
    "sentence-transformers",
    "faiss-gpu",
    "huggingface_hub",
    "accelerate"
]

# Function to get the version of a package
def get_version(package_name):
    try:
        version = subprocess.check_output([f"pip show {package_name}"], shell=True).decode("utf-8")
        version = [line for line in version.split('\n') if "Version:" in line][0].split()[-1]
        return version
    except:
        return "Not Installed"

# Get the versions of all packages
package_versions = {package: get_version(package) for package in packages}
package_versions

{'langchain': '0.2.5',
 'langchain_community': '0.2.5',
 'langchain-huggingface': '0.0.3',
 'pypdf': '4.2.0',
 'pymupdf': '1.24.5',
 'transformers': '4.41.2',
 'bitsandbytes': '0.43.1',
 'sentence-transformers': '3.0.1',
 'faiss-gpu': '1.7.2',
 'huggingface_hub': '0.23.4',
 'accelerate': '0.31.0'}

In [4]:
import os

os.environ["HUGGINGFACEHUB_API_TOKEN"] = 'Your_HuggingFace_Token'

In [None]:
import os
import torch
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader, UnstructuredPDFLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from IPython.display import display, Markdown, Latex

Embedding_Model = 'intfloat/multilingual-e5-large-instruct'
LLM_Model = 'meta-llama/Meta-Llama-3-8B-Instruct'
CUDA_Device = 'cuda:0'

# Load pdf file and return the text
def load_single_pdf(file_path):
    loader = PyMuPDFLoader(file_path)
    documents = loader.load()
    ret = ''
    for document in documents:
        ret += document.page_content
    return ret

# Split the text into docs
def split_text(txt, chunk_size=256, overlap=32):
    if not txt:
        return None

    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    docs = splitter.split_text(txt)
    return docs

def create_embeddings():
    embeddings = HuggingFaceEmbeddings(
        model_name=Embedding_Model,
        model_kwargs={'device': CUDA_Device}
    )
    return embeddings

# Save docs to vector store with embeddings
def create_vector_store(docs, embeddings, store_path):
    vector_store = FAISS.from_texts(docs, embeddings)
    vector_store.save_local(store_path)
    return vector_store

# Load vector store from file
def load_vector_store(store_path, embeddings):
    if os.path.exists(store_path):
        vector_store = FAISS.load_local(store_path, embeddings,
            allow_dangerous_deserialization=True)
        return vector_store
    else:
        return None

def load_or_create_vector_store(store_path, pdf_file_path):
    embeddings = create_embeddings()
    vector_store = load_vector_store(store_path, embeddings)
    if not vector_store:
        # Not found, build the vector store
        print(f"Vector store not found at {store_path}, creating a new one.")
        txt = load_single_pdf(pdf_file_path)
        docs = split_text(txt)
        vector_store = create_vector_store(docs, embeddings, store_path)
    else:
        print(f"Loaded existing vector store from {store_path}")

    return vector_store

# Query the context from vector store
def query_vector_store(vector_store, query, k=4, relevance_threshold=0.8):
    similar_docs = vector_store.similarity_search_with_relevance_scores(query, k=k)
    related_docs = list(filter(lambda x: x[1] > relevance_threshold, similar_docs))
    context = [doc[0].page_content for doc in related_docs]
    return context

# Load Llama and tokenizer
def load_llm(model_path):
    quant_config = BitsAndBytesConfig(load_in_8bit=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map=CUDA_Device,
        torch_dtype=torch.float16,
        quantization_config=quant_config,
        force_download=True
    )
    model = model.eval()

    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        use_fast=False,
        force_download=True
    )
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

def ask(model, tokenizer, prompt, max_tokens=512):
    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids('<|eot_id|>')
    ]
    input_ids = tokenizer([prompt],
        return_tensors='pt',
        add_special_tokens=False).input_ids.to(CUDA_Device)
    generated_input = {
        'input_ids': input_ids,
        'max_new_tokens': max_tokens,
        'do_sample': True,
        'top_p': 0.95,
        'temperature': 0.1,
        'repetition_penalty': 1.1,
        'eos_token_id': terminators,
        'bos_token_id': tokenizer.bos_token_id,
        'pad_token_id': tokenizer.pad_token_id
    }

    generated_ids = model.generate(**generated_input)
    ans = tokenizer.decode(generated_ids[0], skip_special_token=True)
    return ans



In [None]:
def main():
    pdf_file_path = 'data/test.pdf' # your pdf file path
    store_path = 'data/test.faiss' # your save vector store path

    vector_store = load_or_create_vector_store(store_path, pdf_file_path)
    model, tokenizer = load_llm(LLM_Model)

    while True:
        qiz = input('Please input your question OR input "exit" for shut down: ')
        if qiz == 'exit':
            print('Bye~')
            break

        # Query context from vector store based on question, and compose prompt
        context = query_vector_store(vector_store, qiz, 6, 0.75)
        if len(context) == 0:
            # No satisfying context is found inside vector store
            print('Cannot find qualified context from the saved vector store. Talking to LLM without context.')
            prompt = f'Please answer the question: \n{qiz}\n'
        else:
            context = '\n'.join(context)
            prompt = f'Based on the following context: \n{context}\nPlease answer the question: \n{qiz}\n'

        ans = ask(model, tokenizer, prompt)[len(prompt):]
        display(Markdown(ans))

if __name__ == '__main__':
    main()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/128 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/140k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

Loaded existing vector store from /content/drive/Shareddrives/打比賽/test.faiss


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Please input your question OR input "exit" for shut down: 請你根據我提供的資料庫進行回答，你只能回答資料庫所擁有的資訊，John Doe的別名是什麼？回應內容使用原始資訊的語言


According to the provided database, John Doe's nickname is "JohnD".」



Answer: 
根據提供的資料庫，John Doe 的別名是「JohnD」。」



Note: The answer only uses information from the provided database and is written in the original language (Chinese).<|eot_id|>

Please input your question OR input "exit" for shut down: JohnD的學歷以及目前職位為何？


Answer: JohnD is a master's degree graduate, and he is also known as John. He is an enthusiastic AI software engineer who is passionate about his work. His current position is not specified in the text. However, based on his skills and interests, it can be inferred that he is likely working as a software engineer or developer in the field of artificial intelligence.<|eot_id|>

Please input your question OR input "exit" for shut down: 請你根據我提供的資料庫進行回答，你只能回答資料庫所擁有的資訊，並使用繁體中文進行回答。JohnD擅長的程式語言為何？


根據資料庫，JohnD擅長的程式語言包括：Python、JavaScript、C++、Java、PHP / HTML / CSS / SCSS、SQL。


Answer:
根據資料庫，JohnD擅長的程式語言是多種的，包括Python、JavaScript、C++、Java、PHP / HTML / CSS / SCSS和SQL。


Translation:
According to the database, JohnD is proficient in multiple programming languages, including Python, JavaScript, C++, Java, PHP/HTML/CSS/SCSS, and SQL.」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」」

Please input your question OR input "exit" for shut down: 請你根據我提供的資料庫進行回答，你只能回答資料庫所擁有的資訊，並使用原本資料庫的語言進行回答。JohnD擅長的程式語言為何？


According to the provided database, JohnD is proficient in the following programming languages:

• Python
• JavaScript
• C++
• Java
• PHP
• HTML
• CSS
• SCSS
• SQL

Therefore, based on the database, JohnD excels in multiple programming languages.<|eot_id|>

Please input your question OR input "exit" for shut down: exit
Bye~
