In [1]:
from sentence_transformers import SentenceTransformer
import faiss
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import numpy as np
from textwrap import dedent
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from vllm import LLM, SamplingParams
import torch
import re
from rank_bm25 import BM25Okapi


INFO 09-28 15:10:51 [__init__.py:216] Automatically detected platform cuda.


In [2]:
torch.backends.cuda.matmul.allow_tf32 = True

In [3]:
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

In [4]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 100,
    separators=['\n\n','\n',' ','']
)

In [5]:
df = pd.read_csv('../housing_bank_data/full_housing_eda.csv')

In [6]:
texts = df['Text'].to_list()

In [7]:
def chunk_text(text,chunk_size=500,chunk_overlap=100):
    words = str(text).split()
    chunks = []

    for i in range(0,len(words),chunk_size - chunk_overlap):
        chunk = ' '.join(words[i: i+ chunk_size])
        chunks.append(chunk)

    return chunks

In [8]:
chunked_texts = list(map(chunk_text,texts))

In [9]:
texts = [str(text) for text in texts]

In [10]:
big_text = '\n\n'.join(texts)

In [11]:
docs = splitter.split_text(big_text)

In [12]:
tokenized_docs = [d.split() for d in docs] 
bm25 = BM25Okapi(docs)

In [13]:
docs = chunked_texts

In [14]:
flat_docs = [chunk for doc in docs for chunk in doc]

In [35]:
doc_embeddings = embed_model.encode(docs,convert_to_numpy=True)

OutOfMemoryError: CUDA out of memory. Tried to allocate 48.00 MiB. GPU 0 has a total capacity of 5.63 GiB of which 24.31 MiB is free. Including non-PyTorch memory, this process has 5.10 GiB memory in use. Of the allocated memory 4.95 GiB is allocated by PyTorch, and 51.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [16]:
faiss.normalize_L2(doc_embeddings)

In [17]:
index = faiss.IndexFlatL2(doc_embeddings.shape[1])
index.add(doc_embeddings)

In [33]:
def search(query, top_k=5, alpha=0.5):
        # BM25 score
        bm25_scores = bm25.get_scores(query.split())

        # Embedding score
        q_emb = embed_model.encode([query], convert_to_numpy=True)
        D, I = index.search(q_emb, len(docs))
        faiss_scores = np.zeros(len(docs))
        for idx, score in zip(I[0], D[0]):
            faiss_scores[idx] = -score  # FAISS gives distance, turn to similarity

        # Hybrid score
        scores = alpha * faiss_scores + (1 - alpha) * bm25_scores
        ranked = np.argsort(scores)[::-1]

        return [docs[i] for i in ranked[:top_k]]

In [19]:
id2doc= {i: flat_docs[i] for i in range(len(flat_docs))} 

In [20]:
def retreive(query, top_k=3):
    q_emb = embed_model.encode([query],convert_to_numpy=True)
    faiss.normalize_L2(q_emb)

    D, I =index.search(q_emb,top_k)

    retreived_docs = [id2doc[i] for i in I[0]]
    return retreived_docs


In [21]:
torch.cuda.empty_cache()

In [22]:
# Load model and tokenizer
model_id = "LiquidAI/LFM2-2.6B"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=None,
    dtype="bfloat16",
#    attn_implementation="flash_attention_2" <- uncomment on compatible GPU
).to('cuda')
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [23]:
def gen(prompt):
    input_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        add_generation_prompt=True,
        return_tensors="pt",
        tokenize=True,
    ).to(model.device)
    
    output = model.generate(input_ids,do_sample=True,temperature=0.3,min_p=0.15,repetition_penalty=1.05,max_new_tokens=1024)


    raw_output = tokenizer.decode(output[0], skip_special_tokens=False)
    print(raw_output)
    matches = re.findall(r"<\|im_start\|>assistant\s*(.*?)(?=<\|im_end\|>)", raw_output, re.S)
    if matches:
        return matches[-1].strip()

In [24]:
torch.cuda.empty_cache()

In [25]:
def augment(data_row):
    prompt = dedent(f"""
    {data_row['question']}

    Information:

    ```
    {data_row['context']}
    ```
    """)
    messages = [
        {"role": "system", "content": "Use only the information to answer the question"},
        {"role": "user", "content": prompt},
    ]

    return tokenizer.apply_chat_template(messages,tokenize=False,add_generation_prompt=True)
    

In [26]:
query = 'if i am to buy a house, which loan should i get and what are the details?'
results = retreive(query,top_k=10)

In [34]:
data_row = {
    'question':query,
    'context':'\n'.join(search(query))
}

IndexError: index 656 is out of bounds for axis 0 with size 600

In [None]:
prompt = augment(data_row)

In [None]:
result = gen(prompt)

In [None]:
print(result)