In [None]:
from sentence_transformers import SentenceTransformer
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import numpy as np
from textwrap import dedent
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from vllm import LLM, SamplingParams
import torch
import re

In [None]:
# Opt in to TF32 for CUDA matrix multiplications by setting PyTorch's backend flag.
torch.backends.cuda.matmul.allow_tf32 = True

In [None]:
# Sentence-embedding model; later cells call its .encode() on both the
# document chunks and the search query.
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Character-based splitter configured for 1000-character chunks with 100 of overlap.
# NOTE(review): no other cell visible in this notebook references `splitter`;
# the chunking that feeds the index is done by chunk_text(). Confirm whether
# this object is still needed.
splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 100
)

In [None]:
# Load the source table from a CSV path relative to the working directory.
# Only its 'Text' column is read by the following cells.
df = pd.read_csv('full_housing_eda.csv')

In [None]:
# Pull the 'Text' column out as a plain Python list, one entry per row.
texts = df['Text'].to_list()

In [None]:
def chunk_text(text,chunk_size=500,chunk_overlap=100):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: Value to split. Non-string values are converted with ``str``.
            ``None`` and float NaN (the value pandas uses for an empty CSV
            cell) are treated as missing and produce no chunks.
        chunk_size: Maximum number of words per chunk; must be positive.
        chunk_overlap: Number of words shared by consecutive chunks; must
            satisfy ``0 <= chunk_overlap < chunk_size``.

    Returns:
        A list of chunk strings, each the chunk's words joined by a single
        space. Empty or missing input yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``chunk_overlap`` is
            outside ``[0, chunk_size)``. Such values would otherwise give a
            zero/negative stride or silently skip words.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError("chunk_overlap must satisfy 0 <= chunk_overlap < chunk_size")

    # NaN is the only float that is not equal to itself. Without this guard a
    # missing cell would be stringified and indexed as the chunk "nan".
    if text is None or (isinstance(text, float) and text != text):
        return []

    words = str(text).split()
    step = chunk_size - chunk_overlap
    chunks = []

    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would only
        # repeat words already contained in this chunk, so stop here.
        if start + chunk_size >= len(words):
            break

    return chunks

In [None]:
# One list of chunks per input text, using chunk_text's default sizes.
chunked_texts = [chunk_text(text) for text in texts]

In [None]:
# chunked_texts

In [None]:
# Alias only: `docs` refers to the same list of per-text chunk lists.
docs = chunked_texts

In [None]:
# Flatten the per-text chunk lists into one list of chunks, preserving order.
flat_docs = []
for doc_chunks in docs:
    flat_docs.extend(doc_chunks)

In [None]:
# Embed every chunk; convert_to_numpy=True asks for a NumPy array back.
# The index cell below reads the embedding width from shape[1].
doc_embeddings = embed_model.encode(flat_docs,convert_to_numpy=True)

In [None]:
# L2-normalise the embeddings in place (the call's return value is not used).
# retreive() applies the same normalisation to the query vector.
faiss.normalize_L2(doc_embeddings)

In [None]:
# Flat L2 index sized to the embedding width, then filled with all chunk vectors.
# Both stored vectors and queries are L2-normalised, so ordering by L2
# distance gives the same ranking as cosine similarity.
index = faiss.IndexFlatL2(doc_embeddings.shape[1])
index.add(doc_embeddings)

In [None]:
# Map each chunk's position in flat_docs (its row in the index) to its text.
id2doc = dict(enumerate(flat_docs))

In [None]:
def retreive(query, top_k=3):
    """Return up to ``top_k`` indexed chunks closest to ``query``.

    The query is embedded with ``embed_model``, L2-normalised in place, and
    searched against the module-level FAISS ``index``. Hits are mapped back to
    chunk text through ``id2doc``.

    Args:
        query: Query string to embed and search for.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of chunk strings in the order FAISS returned them. It holds
        fewer than ``top_k`` items when the index has fewer vectors, and is
        empty when ``top_k`` is not positive.
    """
    if top_k <= 0:
        return []

    q_emb = embed_model.encode([query],convert_to_numpy=True)
    faiss.normalize_L2(q_emb)

    # Only the neighbour ids are needed; the distances are discarded.
    _, neighbour_ids = index.search(q_emb, top_k)

    # FAISS pads the id row with -1 when it finds fewer than top_k neighbours.
    # Looking those up in id2doc would raise KeyError, so skip them.
    return [id2doc[int(doc_id)] for doc_id in neighbour_ids[0] if doc_id >= 0]


In [None]:
# Ask PyTorch to release cached GPU memory it is not using before the model is loaded.
torch.cuda.empty_cache()

In [None]:
# Load model and tokenizer
# The weights are loaded with device_map=None and the whole model is then
# moved to the 'cuda' device explicitly via .to('cuda').
# NOTE(review): the dtype keyword is spelled `dtype` here; some transformers
# releases expect `torch_dtype` instead — confirm against the installed version.
model_id = "curiousily/Llama-3-8B-Instruct-Finance-RAG"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=None,
    dtype="bfloat16",
#    attn_implementation="flash_attention_2" <- uncomment on compatible GPU
).to('cuda')
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
def gen(prompt):
    """Generate the model's reply to ``prompt`` and return it as plain text.

    ``prompt`` is sent as the content of a single user message, rendered with
    the tokenizer's chat template. Sampling settings are fixed in the
    ``generate`` call below.

    Args:
        prompt: Text to use as the user message content.

    Returns:
        The generated reply as a stripped string with special tokens removed.
        Only the tokens produced after the prompt are decoded, so the result
        does not depend on which chat-template markers the tokenizer uses.
    """
    # return_dict=True also yields the attention mask, so it can be forwarded
    # to generate() together with the input ids.
    inputs = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True,
        return_dict=True,
    ).to(model.device)

    output = model.generate(
        **inputs,
        do_sample=True,
        temperature=0.3,
        min_p=0.15,
        repetition_penalty=1.05,
        max_new_tokens=1024,
    )

    # generate() returns prompt + continuation for a causal LM; slice the
    # prompt off rather than pattern-matching template markers in the text.
    prompt_len = inputs["input_ids"].shape[-1]
    reply_ids = output[0][prompt_len:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True).strip()

In [None]:
# Ask PyTorch to release cached GPU memory it is not using.
# NOTE(review): this repeats the call made before the model load — confirm it is still needed.
torch.cuda.empty_cache()

In [None]:
def augment(data_row):
    """Render a question and its retrieved context as a chat-templated prompt.

    Args:
        data_row: Mapping with a ``'question'`` entry and a ``'context'``
            entry (the retrieved text to ground the answer in).

    Returns:
        The string produced by ``tokenizer.apply_chat_template`` for a system
        message plus the user message, with ``tokenize=False`` and the
        generation prompt appended.
    """
    # Dedent the template BEFORE substituting values. Dedenting after
    # interpolation fails for a multi-line context: its column-0 lines make the
    # common indentation empty, so the template's own indentation stays in the
    # prompt. It could also strip whitespace from the context itself.
    template = dedent("""
    {question}

    Information:

    ```
    {context}
    ```
    """)
    prompt = template.format(
        question=data_row['question'],
        context=data_row['context'],
    )
    messages = [
        {"role": "system", "content": "Use only the information to answer the question"},
        {"role": "user", "content": prompt},
    ]

    return tokenizer.apply_chat_template(messages,tokenize=False,add_generation_prompt=True)
    

In [None]:
# Example question and its 10 nearest chunks.
# NOTE(review): `results` is not read by any later cell visible here; the
# context cell calls retreive(query) again with the default top_k instead.
query = 'if i am to buy a house, which loan should i get and what are the details?'
results = retreive(query,top_k=10)

In [None]:
# Input for augment(): the question plus the retrieved chunks joined by newlines.
# NOTE(review): this runs a second retrieval with the default top_k (3) rather
# than reusing the top_k=10 `results` computed earlier — confirm which is intended.
data_row = {
    'question':query,
    'context':'\n'.join(retreive(query))
}

In [None]:
# Render the question and context into a chat-templated prompt string.
prompt = augment(data_row)

In [None]:
# Run generation on the rendered prompt.
# NOTE(review): `prompt` is already the output of apply_chat_template (see
# augment), and gen() wraps its argument in a user message and applies the chat
# template again, so the model receives a template nested inside a user turn.
# Confirm whether that is intended.
result = gen(prompt)

In [None]:
# Show the value returned by gen().
print(result)