### Importing packages

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
import accelerate
import torch
import time
from pprint import pprint

### Declaring text generation model, tokenizer, computational device and optional streamer

In [2]:
# setting device
gpu=0
device = torch.device(f"cuda:{gpu}" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    torch.cuda.set_device(device)
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 2070 SUPER'

In [3]:
# Define model name and hf token
name = "TheBloke/Llama-2-7b-Chat-GPTQ"
# name = "TheBloke/Mistral-7B-Instruct-v0.2-GPTQ"

# hugginf face auth token
# file_path = "../../huggingface_credentials.txt"
# with open(file_path, "r") as file:
#     auth_token = file.read().strip()

In [4]:
# Create tokenizer
tokenizer = AutoTokenizer.from_pretrained(name
    # ,cache_dir='./model/'
    # ,use_auth_token=auth_token
    ,device_map='cuda'                 
    )

In [6]:
# from huggingface_hub import notebook_login
# notebook_login()

In [5]:
# Define model
model = AutoModelForCausalLM.from_pretrained(name
    ,cache_dir=r"C:\Users\user2\.cache\huggingface\hub"
    # ,cache_dir='./model/'
    # ,use_auth_token=auth_token
    ,device_map='cuda'  
    # , torch_dtype=torch.float16
    # ,low_cpu_mem_usage=True
    # ,rope_scaling={"type": "dynamic", "factor": 2}
    # ,load_in_8bit=True,
    ).to(device)

bin D:\NLP 1\venv\Lib\site-packages\bitsandbytes\libbitsandbytes_cuda121.dll


In [6]:
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

### Declare inference function

In [7]:
def llm_inference(plain_text, model, tokenizer, device, streamer=None, max_length=4000, ):
    input_ids = tokenizer(
        plain_text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
        )['input_ids'].to(device)
    
    output_ids = model.generate(input_ids
                        ,streamer=streamer
                        ,use_cache=True
                        ,max_new_tokens=float('inf')
                       )
    answer = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
    return answer

### Generating texts using a trained model

In [None]:
text = "what are the steps to train a machine learning model? explain in less than 100 words"
res = llm_inference(text, model, tokenizer, device, streamer=streamer,)
res



KeyboardInterrupt: 

### Setup Vector database

In [8]:
import chromadb
from llama_index import VectorStoreIndex, SimpleDirectoryReader, get_response_synthesizer
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from llama_index.prompts.prompts import SimpleInputPrompt
from llama_index.llms import HuggingFaceLLM
from llama_index.embeddings import LangchainEmbedding
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import set_global_service_context
from llama_index import ServiceContext
from llama_index import VectorStoreIndex, download_loader
from llama_index import SimpleDirectoryReader
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.postprocessor import SimilarityPostprocessor
from llama_index.vector_stores import MilvusVectorStore
from pathlib import Path

### Chroma db

In [9]:
# path = r"test_vdb/vdb_test1"
path = r"D:\NLP 1\RAG-webapp\collections\collection_C1"
db = chromadb.PersistentClient(path=path)

# get collection
chroma_collection = db.get_or_create_collection("default")

# assign chroma as the vector_store to the context
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [10]:
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [12]:
# Create a system prompt
system_prompt = """<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as
helpfully as possible, while being safe.
If a question does not make any sense, or is not factually coherent, explain
why instead of answering something not correct. If you don't know the answer
to a question, please express that you do not have informaion or knowledge in
that context and please don't share false information.
Try to be exact in information and numbers you tell.
Your goal is to provide answers based on the information provided and your other
knowledge.<</SYS>>
"""

query_wrapper_prompt = SimpleInputPrompt("{query_str} [/INST]")

In [13]:
llm = HuggingFaceLLM(context_window=4096,
                     max_new_tokens=512,
                     system_prompt=system_prompt,
                     query_wrapper_prompt=query_wrapper_prompt,
                     model=model,
                     tokenizer=tokenizer)

embeddings = LangchainEmbedding(
    HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
)

In [14]:
# Create new service context instance
service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    chunk_overlap=20,
    llm=llm,
    embed_model=embeddings
)

# And set the service context
set_global_service_context(service_context)

### Load Vector DB

In [15]:
index = VectorStoreIndex.from_vector_store(
    vector_store, storage_context=storage_context
)

### Insert a single document into the vector db

In [17]:
PyMuPDFReader = download_loader("PyMuPDFReader")
loader = PyMuPDFReader()

# Load documents
# doc_dir = r"D:\NLP 1\RAG-webapp\documents_db\Sattelite imagery article scripts.pdf"
doc_dir = r"C:\Users\user2\Desktop\RAG_Docs\Climate_change_20232.pdf"
document = loader.load(file_path=Path(doc_dir), metadata=False)

# Create indexes
for doc in document:
    print(doc)
    # index.insert(doc, )

Doc ID: 1566b89d-6d46-48c0-a101-a33841c9bb42
Text: 2023  APR  Eye of the Storm: The  Impact of Climate  Shocks on
Inflation  and Growth  Serhan Cevik and Jo√£o Tovar Jalles  WP/23/87
Doc ID: 6dd18d1c-3600-469e-b182-7258c6d085e6
Text: ¬© 2023 International Monetary Fund  WP/23/87 IMF Working Paper
European Department  Eye of the Storm: The Impact of Climate Shocks on
Inflation and Growth  Prepared by Serhan Cevik and Jo√£o Tovar Jalles1
Authorized for distribution by Bernardin Akitoby  April 2023  IMF
Working Papers describe research in progress by the author(s) and are
publi...
Doc ID: b600f50d-41bc-4079-8ce1-09ea1310e96f
Text: I.   INTRODUCTION  Climate change is a multifaceted and evolving
phenomenon and a major source of uncertainty  for the global economy
and financial markets.2 The global surface temperature has already
jumped more than 1.1 degrees Celsius (¬∞C) compared with the
preindustrial average, escalating  the frequency and severity of
weather-related natu...
Doc ID: bd345

In [18]:
len(document)

33

### Insert directory of documents into the vector db

In [16]:
# load some documents
documents = SimpleDirectoryReader(r"C:\Users\user2\Desktop\RAG_Docs").load_data()

# create your index
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)

In [38]:
query_engine = index.as_query_engine()
########## Or ###########
#Customizing query engine

### Customizing query engine

In [16]:
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=3,
)

# configure response synthesizer
response_synthesizer = get_response_synthesizer(streaming=True)

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.0)],
)

### Inferencing

In [17]:
# create a query engine and query
# response = query_engine.query("who studied Master of Science in Management with a background in Civil Engineering?")
# response = query_engine.query("how many gold medals Iranian youth won in 2023 chess competitions?")
# response = query_engine.query("describe key points of 2023 climate change?")
# response = query_engine.query("how much headline inflation increased after storm shock?")
# response = query_engine.query("how many ship detection methods are there? just name and use no more than 70 words")
response = query_engine.query("explain Iran's relationship with UAE?")
# response = query_engine.query("say something")
# response = query_engine.query("how many gold medals Iranian youth won in 2023 chess competitions?")
pprint(response.source_nodes)
response.print_response_stream()
# ans = []
# for txt in response.response_gen:
#     ans.append(txt)
#     print(txt, sep="")
# response

[NodeWithScore(node=TextNode(id_='c8ed8ecb-3135-4c5e-a3ba-6bdee40ce423', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='218_3', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='f7b86ebc0736c6d0519f87124fd6b20d3434e57304341e9edee468ade70c3a23')}, hash='44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a', text='The UAE is Iran‚Äôs second -largest trade partner. The trade between Iran and the UAE has \nrecovered from a pandemic slump of $11 billion in 2020/2021 to $24 billion in the 12 \nmonths ending in March, according to Iranian data.  \nThe trade between the two sides has surpassed the $22 billion recorded in 2012 before \nU.S. -led sanctions were imposed on the Iranian economy.  \nIranian officials have said they are now targeting a further increase in bilateral trade \ntowards $30 billion in the next two years.  \n‚ÄúPressure from the UAE centra



 Iran and the United Arab Emirates (UAE) have a complex and multifaceted relationship, with both countries sharing a long history of cultural, economic, and political ties. In recent years, the relationship between the two countries has been marked by increased cooperation and mutual interest, particularly in the fields of trade, investment, and regional security.
Trade between Iran and the UAE has recovered from a pandemic slump in 2020/2021, with bilateral trade reaching $24 billion in the 12 months ending in March 2022, according to Iranian data. The UAE is Iran's second-largest trade partner, and the two countries have been working to expand their economic ties and increase mutual investment.
In diplomatic exchanges, Iranian officials have been asking their Emirati counterparts to find new mechanisms for financing trade and investment, and the UAE has been supportive of Iran's efforts to re-establish its economic relations with the region. The UAE has also been actively promoting t

In [19]:
response.source_nodes

[NodeWithScore(node=TextNode(id_='c8ed8ecb-3135-4c5e-a3ba-6bdee40ce423', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='218_3', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='f7b86ebc0736c6d0519f87124fd6b20d3434e57304341e9edee468ade70c3a23')}, hash='44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a', text='The UAE is Iran‚Äôs second -largest trade partner. The trade between Iran and the UAE has \nrecovered from a pandemic slump of $11 billion in 2020/2021 to $24 billion in the 12 \nmonths ending in March, according to Iranian data.  \nThe trade between the two sides has surpassed the $22 billion recorded in 2012 before \nU.S. -led sanctions were imposed on the Iranian economy.  \nIranian officials have said they are now targeting a further increase in bilateral trade \ntowards $30 billion in the next two years.  \n‚ÄúPressure from the UAE centra

In [18]:
response.get_formatted_sources

<bound method StreamingResponse.get_formatted_sources of StreamingResponse(response_gen=<generator object stream_completion_response_to_tokens.<locals>.gen at 0x000001C3A6457E00>, source_nodes=[NodeWithScore(node=TextNode(id_='c8ed8ecb-3135-4c5e-a3ba-6bdee40ce423', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='218_3', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='f7b86ebc0736c6d0519f87124fd6b20d3434e57304341e9edee468ade70c3a23')}, hash='44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a', text='The UAE is Iran‚Äôs second -largest trade partner. The trade between Iran and the UAE has \nrecovered from a pandemic slump of $11 billion in 2020/2021 to $24 billion in the 12 \nmonths ending in March, according to Iranian data.  \nThe trade between the two sides has surpassed the $22 billion recorded in 2012 before \nU.S. -led sanctions were imposed on

In [83]:
metadata = response.source_nodes[1].node.relationships
key = list(metadata.keys())[0]
metadata[key].node_id

'190_31'

In [65]:
ddd = {"a": 1, "b": 2}
list(ddd.items())[0][1]

1

In [24]:
response.get_formatted_sources

<bound method StreamingResponse.get_formatted_sources of StreamingResponse(response_gen=<generator object stream_completion_response_to_tokens.<locals>.gen at 0x00000227EACDB5E0>, source_nodes=[NodeWithScore(node=TextNode(id_='901d0b5c-0aec-43ae-9424-595ccd14585f', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='b223a09c-d050-4e73-8603-644c02632f83', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='b61bb83fa485af0012e8b4018b648acb73bb7032c71d9fc5ed05b97d67fa6f9d')}, hash='44136fa355b3678a1146ad16f7e8649e94fb4fc21fe77e8310c060f61caaff8a', text='as digital photographs), or by pretraining a neural network on the satellite image domain. The latter can be \ndone through an unsupervised pipeline using self-supervised learning (SSL) [6], a contrastive learning \nparadigm that extracts useful patterns, learns invariances and disentangles causal factors in the training da

In [19]:
pprint(response.source_nodes[0].node.text)

('as digital photographs), or by pretraining a neural network on the satellite '
 'image domain. The latter can be \n'
 'done through an unsupervised pipeline using self-supervised learning (SSL) '
 '[6], a contrastive learning \n'
 'paradigm that extracts useful patterns, learns invariances and disentangles '
 'causal factors in the training data. \n'
 'Features learned this way are better adapted for transfer learning of '
 'few-shot object detectors. We propose \n'
 'to use this paradigm to create a ship detector with few data. \n'
 ' \n'
 'For VHR images, a large amount of literature exists, with the number of '
 'works follow- ing the increasing \n'
 'number of sensors and the quantity of publicly available data [7,8]. Many of '
 'these approaches focused on \n'
 'detecting ships with classical image processing pipelines: image processing '
 'using spectral indices or histograms \n'
 '(e.g., sea-land segmentation, cloud removal), ship candidate extraction '
 '(e.g., threshold, ano

In [39]:
for txt in response.response_gen:
    print(txt)

In [1]:
for node in response.source_nodes:
    print(node.score)
response.source_nodes[0]

NameError: name 'response' is not defined

## Other

In [16]:
import sys
import hashlib

def hash_file(file_path):
    # BUF_SIZE is totally arbitrary, change for your app!
    BUF_SIZE = 65536  # lets read stuff in 64kb chunks!

    hashes = dict()
    md5 = hashlib.md5()
    sha256 = hashlib.sha256()

    with open(file_path, 'rb') as f:
        while True:
            data = f.read(BUF_SIZE)
            if not data:
                break
            md5.update(data)
            sha256.update(data)
            
    hashes["md5"] = md5.hexdigest()
    hashes["sha1"] = sha256.hexdigest()
    
    return hashes

In [19]:
file_path = r"C:\Users\user2\Desktop\RAG_Docs\Climate_change_20232.pdf"
hash_file(file_path)

{'md5': '9380c870db48323c8168b2ceb62bfd23',
 'sha1': '534a586bd90d36a2153fe945eb8f1e211e71d1c4c156dcb6d53d092bea475853'}

In [18]:
file_path = r"C:\Users\user2\Desktop\RAG_Docs\Iran medals in Asia competitions 2023.pdf"
hash_file(file_path)

{'md5': 'f3a9d00e7b1c3f4b40ae44f82a5fcb64',
 'sha1': 'cb3ea5988bdea06d2d1ee0b4bd5f0664b518039b0261d7e87b4414d249e27b6a'}

In [21]:
import time
file_path = r"C:\Users\user2\Desktop\RAG_Docs\Climate_change_20232 - Copy.pdf"
t1 = time.time()
hash_file(file_path)
print(time.time() - t1)

0.002991914749145508
