In [1]:
from huggingface_hub import login

# Authenticate with the Hugging Face Hub without embedding the secret in the notebook:
# the token is read from the HF_TOKEN environment variable, falling back to a hidden prompt.
# NOTE(review): a literal access token used to be committed on this line; it should be
# treated as leaked and revoked in the Hugging Face account settings.
import os
from getpass import getpass

login(token=os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))


In [3]:
import json

def convert_pss_to_json(file_path):
    """Parse a ``Question:`` / ``Answer:`` text file into a list of Q&A records.

    Blank lines and lines starting with ``#`` are skipped. A ``Question:`` line
    sets the current question; every ``Answer:`` line emits one record pairing
    its text with the most recent question.

    Args:
        file_path: Path of the text file to read. The file is decoded as UTF-8
            (a leading byte-order mark is tolerated).

    Returns:
        A list of ``{"question": str, "answer": str}`` dicts in file order.
        An ``Answer:`` line that appears before any ``Question:`` line is
        skipped because there is nothing to pair it with.
    """
    question_prefix = "Question:"
    answer_prefix = "Answer:"

    json_data = []
    question = None  # most recent question text; None until the first one is seen

    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for raw_line in file:
            line = raw_line.strip()

            if not line or line.startswith("#"):
                continue

            if line.startswith(question_prefix):
                # Slice the prefix off instead of splitting on it, so a question that
                # itself contains "Question:" is kept whole.
                question = line[len(question_prefix):].strip()
            elif line.startswith(answer_prefix):
                if question is None:
                    # Orphan answer: previously this raised UnboundLocalError.
                    continue
                json_data.append({
                    "question": question,
                    "answer": line[len(answer_prefix):].strip()
                })
            # NOTE(review): lines with neither prefix are ignored. If answers can wrap
            # onto following lines in the source file, that continuation text is
            # dropped here -- confirm against the real file format.

    return json_data

# Parse the sample Q&A file into a list of {"question", "answer"} records and display them.
structured_data = convert_pss_to_json(file_path=r'/content/sample_data/medical_data.pss')
print(structured_data)



[{'question': 'How can diabetes be managed effectively?', 'answer': 'Diabetes can be managed through a combination of lifestyle changes, medication, and regular monitoring of blood glucose levels.'}, {'question': 'What causes hypertension?', 'answer': 'Hypertension can be caused by a variety of factors, including a high-salt diet, lack of exercise, obesity, smoking, stress,'}, {'question': 'What are the risks of untreated hypertension?', 'answer': 'Untreated hypertension can lead to severe complications, including heart attack, stroke, kidney damage, and vision loss.'}, {'question': 'What are the common risk factors for heart disease?', 'answer': 'Common risk factors for heart disease include high blood pressure, high cholesterol, obesity, smoking, diabetes,'}, {'question': 'What lifestyle changes can reduce the risk of heart disease?', 'answer': 'Lifestyle changes that reduce heart disease risk include adopting a heart-healthy diet, engaging in regular physical activity,'}, {'question

In [4]:
from gensim.models import Word2Vec
import numpy as np
import json
import nltk

# Download the NLTK 'punkt' and 'punkt_tab' packages, in that order.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

def train_word2vec(corpus, vector_size=512, window=5, min_count=1, workers=4):
    """Build and return a gensim ``Word2Vec`` model from a tokenized corpus.

    Args:
        corpus: Passed to ``Word2Vec`` as ``sentences``.
        vector_size, window, min_count, workers: Forwarded unchanged to the
            ``Word2Vec`` constructor under the same keyword names.

    Returns:
        The constructed ``Word2Vec`` model.
    """
    hyperparameters = {
        "vector_size": vector_size,
        "window": window,
        "min_count": min_count,
        "workers": workers,
    }
    return Word2Vec(sentences=corpus, **hyperparameters)

def json_to_word2vec_embeddings(json_data, word2vec_model):
    """Attach one Word2Vec-based embedding to every question/answer record.

    Each text is lowercased, tokenized with ``nltk.word_tokenize`` and reduced to
    the mean of the vectors of the tokens found in ``word2vec_model.wv`` (a zero
    vector of ``vector_size`` entries when none are found). The record's embedding
    is the element-wise mean of its question and answer vectors.

    Args:
        json_data: Iterable of dicts with ``'question'`` and ``'answer'`` keys.
        word2vec_model: Model exposing ``wv`` and ``vector_size``.

    Returns:
        A list of dicts with ``"question"``, ``"answer"`` and ``"embedding"``
        (the combined vector converted to a plain list).
    """

    def sentence_to_embedding(sentence, model):
        # Collect the vector of every in-vocabulary token, preserving token order.
        known_vectors = []
        for token in nltk.word_tokenize(sentence.lower()):
            if token in model.wv:
                known_vectors.append(model.wv[token])
        if not known_vectors:
            return np.zeros(model.vector_size)
        return np.mean(known_vectors, axis=0)

    def embed_record(record):
        question_text = record['question']
        answer_text = record['answer']
        question_vector = sentence_to_embedding(question_text, word2vec_model)
        answer_vector = sentence_to_embedding(answer_text, word2vec_model)
        # Question and answer contribute equally to the record's vector.
        combined = np.mean([question_vector, answer_vector], axis=0)
        return {
            "question": question_text,
            "answer": answer_text,
            "embedding": combined.tolist()
        }

    return [embed_record(record) for record in json_data]


# One training sentence per record: the question and answer joined by a single space,
# then lowercased and tokenized with NLTK.
all_text = [" ".join((item["question"], item["answer"])) for item in structured_data]
tokenized_corpus = [nltk.word_tokenize(sentence.lower()) for sentence in all_text]

# Train Word2Vec on that corpus and embed every record with the trained model.
word2vec_model = train_word2vec(corpus=tokenized_corpus)
embedding_results = json_to_word2vec_embeddings(
    json_data=structured_data,
    word2vec_model=word2vec_model,
)

# Emit the records, including their embeddings, as a single JSON string.
print(json.dumps(embedding_results))



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


[{"question": "How can diabetes be managed effectively?", "answer": "Diabetes can be managed through a combination of lifestyle changes, medication, and regular monitoring of blood glucose levels.", "embedding": [0.00028957438189536333, -0.000561531342100352, -0.00024706331896595657, 0.00014975455997046083, -8.461077959509566e-05, -9.68772656051442e-05, -3.772979835048318e-05, 0.0002147926134057343, 0.00024830771144479513, 0.00042911700438708067, -0.00011009896115865558, -0.00023342783970292658, -7.433462451444939e-05, -0.00039431307232007384, -0.00017761328490450978, 5.340711504686624e-05, -0.00023714170674793422, -0.00019800319569185376, -0.00030462152790278196, 0.00024102207680698484, 0.0002352228038944304, -4.560943489195779e-05, -8.116802200675011e-05, 1.235288436873816e-05, 0.0005162713932804763, 7.954665488796309e-05, -0.00018186426314059645, 7.030920096440241e-05, 4.482681833906099e-05, 4.323836765252054e-05, 0.0004551674937829375, 0.00020095884974580258, 0.0002830750891007483,

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [5]:
import numpy as np


def get_sentence_embedding(sentence, model, tokenizer=None):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    The sentence is lowercased and tokenized the same way the training corpus in
    this notebook is built (``nltk.word_tokenize(text.lower())``). The previous
    ``sentence.split()`` kept capitalisation and attached punctuation, so tokens
    such as ``"What"`` or ``"disease?"`` never matched the model vocabulary.

    Args:
        sentence: Text to embed.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` by token) and
            ``vector_size``.
        tokenizer: Optional callable mapping the lowercased sentence to a list of
            tokens. Defaults to ``nltk.word_tokenize``.

    Returns:
        The mean of the matched word vectors, or a zero vector of length
        ``model.vector_size`` when no token is in the vocabulary.
    """
    if tokenizer is None:
        # Imported lazily so callers that supply their own tokenizer do not need NLTK.
        import nltk
        tokenizer = nltk.word_tokenize

    word_embeddings = [
        model.wv[token]
        for token in tokenizer(sentence.lower())
        if token in model.wv
    ]
    if word_embeddings:
        return np.mean(word_embeddings, axis=0)
    return np.zeros(model.vector_size)


In [41]:
import hnswlib
import numpy as np
from typing import List, Dict

class HNSWRetriever:
    """Nearest-neighbour lookup of Q&A documents backed by an ``hnswlib`` cosine index.

    ``self.data[i]`` is the document stored under index label ``i``; labels are
    assigned explicitly in ``add_documents`` so that this mapping always holds.
    """

    def __init__(self, embedding_dim: int, max_elements: int = 1000):
        """Create an empty cosine-space index.

        Args:
            embedding_dim: Dimensionality of the vectors that will be indexed.
            max_elements: Initial capacity of the index; it is grown on demand
                by ``add_documents``.
        """
        self.embedding_dim = embedding_dim
        self.index = hnswlib.Index(space='cosine', dim=embedding_dim)
        self.index.init_index(max_elements=max_elements, ef_construction=200, M=16)
        self.index.set_ef(50)
        self.data = []

    def add_documents(self, embeddings: np.ndarray, documents: List[Dict]):
        """Index ``embeddings`` and remember the matching ``documents``.

        Args:
            embeddings: One vector per document (a single 1-D vector is treated
                as one row).
            documents: Dicts with ``"question"`` and ``"answer"`` keys, aligned
                with ``embeddings``.

        Raises:
            ValueError: If the number of vectors and documents differ.
        """
        documents = list(documents)
        vectors = np.asarray(embeddings)
        row_count = 0 if vectors.size == 0 else np.atleast_2d(vectors).shape[0]
        if row_count != len(documents):
            raise ValueError(
                f"Got {row_count} embeddings for {len(documents)} documents; counts must match."
            )
        if row_count == 0:
            return
        vectors = np.atleast_2d(vectors)

        first_label = len(self.data)
        required = first_label + row_count
        capacity = self.index.get_max_elements()
        if required > capacity:
            # Grow geometrically so repeated small additions do not resize every time.
            self.index.resize_index(max(required, 2 * capacity))

        # Explicit labels keep index ids aligned with positions in self.data.
        self.index.add_items(vectors, np.arange(first_label, required))
        self.data.extend(documents)

    def retrieve(self, query_embedding: np.ndarray, k: int = 5) -> List[Dict]:
        """Return up to ``k`` stored documents closest to ``query_embedding``.

        ``k`` is capped at the number of stored documents, because asking the
        index for more neighbours than it holds makes the query fail; an empty
        retriever (or ``k <= 0``) yields an empty list.

        Returns:
            Dicts with ``"question"``, ``"answer"`` and ``"similarity"``
            (``1 - distance`` as reported by the index), in the order returned
            by the index.
        """
        k = min(k, len(self.data))
        if k <= 0:
            return []

        labels, distances = self.index.knn_query(query_embedding, k=k)

        results = []
        # Row 0 holds the neighbours of the single query vector.
        for label, distance in zip(labels[0], distances[0]):
            doc = self.data[int(label)]
            results.append({
                "question": doc["question"],
                "answer": doc["answer"],
                "similarity": 1 - distance
            })

        return results


# Split the embedded records into the vectors to index and the documents to return.
dataset = embedding_results
documents = [
    {field: entry[field] for field in ("question", "answer")}
    for entry in dataset
]
embeddings = np.array([entry["embedding"] for entry in dataset])

# Index every record, then look up the two closest matches for a sample query.
retriever = HNSWRetriever(embedding_dim=512)
retriever.add_documents(embeddings, documents)

query = "risk of heart disease"
query_embedding = get_sentence_embedding(query, word2vec_model)
top_results = retriever.retrieve(query_embedding, k=2)

for result in top_results:
    print(
        f"Question: {result['question']}",
        f"Answer: {result['answer']}",
        f"Similarity: {result['similarity']:.4f}",
        "-" * 50,
        sep="\n",
    )


Question: What lifestyle changes can reduce the risk of heart disease?
Answer: Lifestyle changes that reduce heart disease risk include adopting a heart-healthy diet, engaging in regular physical activity,
Similarity: 0.6141
--------------------------------------------------
Question: What are the common risk factors for heart disease?
Answer: Common risk factors for heart disease include high blood pressure, high cholesterol, obesity, smoking, diabetes,
Similarity: 0.4813
--------------------------------------------------


In [20]:
!pip install langchain



In [7]:
!pip install hnswlib

Collecting hnswlib
  Downloading hnswlib-0.8.0.tar.gz (36 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: hnswlib
  Building wheel for hnswlib (pyproject.toml) ... [?25l[?25hdone
  Created wheel for hnswlib: filename=hnswlib-0.8.0-cp310-cp310-linux_x86_64.whl size=2364558 sha256=98f1e64bbc695d98c87a87cc1400279173f0d8bed9686664728213b35a2ffa29
  Stored in directory: /root/.cache/pip/wheels/af/a9/3e/3e5d59ee41664eb31a4e6de67d1846f86d16d93c45f277c4e7
Successfully built hnswlib
Installing collected packages: hnswlib
Successfully installed hnswlib-0.8.0


In [24]:
from langchain.llms.base import LLM
from typing import Optional, List
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.llms.base import LLM

class NvidiaLlamaLangChain(LLM):
    """LangChain ``LLM`` wrapper around a Hugging Face chat-template causal LM.

    The prompt is sent as a single user message through the tokenizer's chat
    template, and only the newly generated tokens are decoded and returned.
    """

    def __init__(self, model_name: str, device: str = "cuda", max_new_tokens: int = 4096):
        """Load the tokenizer and model.

        Args:
            model_name: Identifier passed to ``from_pretrained``.
            device: Device the model is placed on and inputs are moved to.
            max_new_tokens: Upper bound on generated tokens per call.
        """
        super().__init__()
        # Initialize tokenizer and model
        self._tokenizer = AutoTokenizer.from_pretrained(model_name)
        self._model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            # Honour the requested device; this was hard-coded to "cuda" before.
            device_map=device,
        )
        self._device = device
        self._max_new_tokens = max_new_tokens

    @property
    def _llm_type(self) -> str:
        """Identifier string reported to LangChain for this LLM type."""
        return "gemma"

    def _call(self, prompt: str, stop: Optional[List[str]] = None, run_manager=None, **kwargs) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Text sent as the content of a single user message.
            stop: Optional strings; the output is cut at the first occurrence
                of each one, in order.
            run_manager: Accepted for compatibility with LangChain's ``_call``
                signature; not used.
            **kwargs: Accepted for the same reason and ignored.

        Returns:
            The decoded text of the newly generated tokens.
        """
        # Prepare input
        messages = [{"role": "user", "content": prompt}]
        tokenized_message = self._tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
        )
        input_ids = tokenized_message["input_ids"].to(self._device)
        attention_mask = tokenized_message["attention_mask"].to(self._device)

        # Generate response
        response_token_ids = self._model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=self._max_new_tokens,
            pad_token_id=self._tokenizer.eos_token_id,
        )

        # The output sequence starts with the prompt tokens; keep only what follows them.
        generated_tokens = response_token_ids[:, input_ids.shape[-1]:]
        generated_text = self._tokenizer.batch_decode(
            generated_tokens, skip_special_tokens=True
        )[0]

        # Handle stop tokens if provided
        for token in stop or ():
            if token:  # str.split("") raises ValueError, so skip empty stop strings
                generated_text = generated_text.split(token)[0]

        return generated_text


In [47]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.chains import ConversationalRetrievalChain


# Load the chat model used to answer questions.
llm = NvidiaLlamaLangChain(model_name="google/gemma-2-2b-it")

# Rebuild the retriever from the embedded Q&A records.
documents = [
    {field: entry[field] for field in ("question", "answer")}
    for entry in embedding_results
]
embeddings = np.array([entry["embedding"] for entry in embedding_results])

retriever = HNSWRetriever(embedding_dim=512)
retriever.add_documents(embeddings, documents)

def get_relevant_documents(query: str, retriever, embedding_model, k: int = 3) -> List[Dict]:
    """Embed ``query`` and return the retriever's closest documents.

    Args:
        query: Text to search for.
        retriever: Object whose ``retrieve(embedding, k=...)`` returns the matches.
        embedding_model: Model handed to ``get_sentence_embedding`` to embed the query.
        k: Number of results to request (defaults to 3, the value that was
            previously hard-coded).

    Returns:
        Whatever ``retriever.retrieve`` returns for the query embedding.
    """
    query_embedding = get_sentence_embedding(query, embedding_model)
    return retriever.retrieve(query_embedding, k=k)

def conversational_qa(query: str, chat_history: List[tuple], retriever, llm, embedding_model, verbose: bool = True):
    """Answer ``query`` with the LLM, using retrieved Q&A pairs as context.

    Args:
        query: The user's question.
        chat_history: List that receives a ``(query, response)`` tuple for this
            turn. Earlier turns are not included in the prompt.
        retriever: Retriever passed to ``get_relevant_documents``.
        llm: Language model; called through ``invoke`` when available, otherwise
            called directly.
        embedding_model: Model used to embed the query for retrieval.
        verbose: When True (the default, matching previous behaviour) the full
            prompt is printed before generation.

    Returns:
        The LLM's response.
    """
    top_results = get_relevant_documents(query, retriever, embedding_model)
    context = "\n".join(
        f"Q: {result['question']}\nA: {result['answer']}" for result in top_results
    )
    full_prompt = f"Context:\n{context}\n\n User: {query}\nAssistant:"
    if verbose:
        print(full_prompt)

    # Prefer `invoke`: calling a LangChain LLM object directly is deprecated.
    # Plain callables without `invoke` keep working through the fallback.
    invoke = getattr(llm, "invoke", None)
    response = invoke(full_prompt) if callable(invoke) else llm(full_prompt)

    chat_history.append((query, response))
    return response


# Ask one question with an empty history and show the model's reply.
chat_history = []
query = "What are the risk factors for heart disease?"
response = conversational_qa(
    query=query,
    chat_history=chat_history,
    retriever=retriever,
    llm=llm,
    embedding_model=word2vec_model,
)

print("Assistant:", response)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Context:
Q: What are the common risk factors for heart disease?
A: Common risk factors for heart disease include high blood pressure, high cholesterol, obesity, smoking, diabetes,
Q: What lifestyle changes can reduce the risk of heart disease?
A: Lifestyle changes that reduce heart disease risk include adopting a heart-healthy diet, engaging in regular physical activity,
Q: What are the risks of untreated hypertension?
A: Untreated hypertension can lead to severe complications, including heart attack, stroke, kidney damage, and vision loss.

 User: What are the risk factors for heart disease?
Assistant:
Assistant: Here are the risk factors for heart disease, based on the information you provided:

**Common Risk Factors:**

* **High blood pressure (hypertension):**  This puts extra strain on your heart and blood vessels.
* **High cholesterol:**  High levels of LDL ("bad") cholesterol can build up in your arteries, increasing the risk of plaque formation.
* **Obesity:** Excess weight put