In [1]:
from sentence_transformers import SentenceTransformer

# Initialize the model 
model = SentenceTransformer('all-MiniLM-L6-v2')

# Sample data
tech_sentences = [
    "Artificial intelligence is transforming industries worldwide.",
    "The new smartphone model features a 108-megapixel camera.",
    "Blockchain technology underpins cryptocurrencies.",
    "Machine learning algorithms can process vast amounts of data.",
    "5G networks enable faster mobile internet connections."
]

# Generate embeddings
embeddings = model.encode(tech_sentences)


  from tqdm.autonotebook import tqdm, trange


In [3]:
import chromadb

# Initialize the Chroma client
client = chromadb.Client()

# Create or connect to a collection
collection = client.get_or_create_collection(name="example_collection")


In [4]:
# Define IDs for the documents
doc_ids = ["AI", "Smartphone", "Blockchain", "Machine Learning", "5G"]

# Insert data into Chroma DB
collection.add(
    ids=doc_ids,
    documents=tech_sentences,
    embeddings=embeddings
)


In [5]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer

# Load the dataset
dataset = load_dataset("ruslanmv/ai-medical-chatbot")
model = SentenceTransformer('all-MiniLM-L6-v2')

# Prepare data for embedding 
subset = dataset['train'].select(range(20))  # Select the first 20 items
documents = [
    f"Description: {item['Description']} | Patient: {item['Patient']} | Doctor: {item['Doctor']}"
    for item in subset
]

# Generate embeddings
embeddings = model.encode(documents)

print("Sample Documents:", documents)
print("Embeddings:", embeddings)


Sample Documents: ['Description: Q. What does abutment of the nerve root mean? | Patient: Hi doctor,I am just wondering what is abutting and abutment of the nerve root means in a back issue. Please explain. What treatment is required for\xa0annular bulging and tear? | Doctor: Hi. I have gone through your query with diligence and would like you to know that I am here to help you. For further information consult a neurologist online -->', 'Description: Q. What should I do to reduce my weight gained due to genetic hypothyroidism? | Patient: Hi doctor, I am a 22-year-old female who was diagnosed with hypothyroidism (genetic) when I was 12. Over the past five years, I have become around 50 pounds overweight and all of my attempts to lose have seemed to fail so I have given up, but my weight has stayed the same. There is so much information put there about losing weight with hypothyroidism but it all seems to conflict. I am so unsure as to what type of exercise and diet I should follow as a 

In [6]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Generate an embedding for the query
query = "How do I know if I COVID-19?"  # Example query
query_embedding = model.encode([query])

# Compute cosine similarities
similarities = cosine_similarity(query_embedding, embeddings)[0]

# Rank documents by similarity score
top_indices = np.argsort(similarities)[::-1][:10]  # Top 10 most similar
top_documents = [documents[i] for i in top_indices]
top_similarities = [similarities[i] for i in top_indices]

print("Top matching documents:", top_documents)
print("Similarity scores:", top_similarities)


Top matching documents: ['Description: Q. My symptoms after intercourse threatns me even after having negative HIV result. Kindly help. | Patient: Hello doctor,Before two years had sex with a call girl in a dark location for about 10 to15 min. I did not eject, and I am sure that she did not eject as well (not 100 % sure). Do not know if the condom broke (as it was dark) but had some liquid on my penis (maybe fluid in condom, not sure) After few days from then I had cold followed by itchy skin. I took medicine from the doctor and got it resolved.After a few weeks from then, I had fever 100 F, mild night sweats, mild sore throat, slight weight loss (around four to five kg). Also I feel tired most of times. After 50 days, I got an HIV blood test, which came out negative. Do not know the name of the test, they took blood from my fingertip, tested it using some chemical for 40 minutes, and gave a report. Doctor suggested me get the same test after three months. Now I took medicines for flu 

In [7]:
# Set a similarity threshold 
threshold = 0.4
filtered_documents = [documents[i] for i, score in enumerate(similarities) if score > threshold]
print("Documents above threshold:", filtered_documents)


Documents above threshold: ['Description: Q. My symptoms after intercourse threatns me even after having negative HIV result. Kindly help. | Patient: Hello doctor,Before two years had sex with a call girl in a dark location for about 10 to15 min. I did not eject, and I am sure that she did not eject as well (not 100 % sure). Do not know if the condom broke (as it was dark) but had some liquid on my penis (maybe fluid in condom, not sure) After few days from then I had cold followed by itchy skin. I took medicine from the doctor and got it resolved.After a few weeks from then, I had fever 100 F, mild night sweats, mild sore throat, slight weight loss (around four to five kg). Also I feel tired most of times. After 50 days, I got an HIV blood test, which came out negative. Do not know the name of the test, they took blood from my fingertip, tested it using some chemical for 40 minutes, and gave a report. Doctor suggested me get the same test after three months. Now I took medicines for f

In [8]:
# Augmenting the query with a specific term
query = "HIV symptoms"
keywords = "healthcare diagnosis"
augmented_query = f"{query} {keywords}"

query_embedding_augmented = model.encode([augmented_query])
similarities_augmented = cosine_similarity(query_embedding_augmented, embeddings)[0]

top_indices_augmented = np.argsort(similarities_augmented)[::-1][:5]
top_documents_augmented = [documents[i] for i in top_indices_augmented]

print("Top augmented matching documents:", top_documents_augmented)


Top augmented matching documents: ['Description: Q. My symptoms after intercourse threatns me even after having negative HIV result. Kindly help. | Patient: Hello doctor,Before two years had sex with a call girl in a dark location for about 10 to15 min. I did not eject, and I am sure that she did not eject as well (not 100 % sure). Do not know if the condom broke (as it was dark) but had some liquid on my penis (maybe fluid in condom, not sure) After few days from then I had cold followed by itchy skin. I took medicine from the doctor and got it resolved.After a few weeks from then, I had fever 100 F, mild night sweats, mild sore throat, slight weight loss (around four to five kg). Also I feel tired most of times. After 50 days, I got an HIV blood test, which came out negative. Do not know the name of the test, they took blood from my fingertip, tested it using some chemical for 40 minutes, and gave a report. Doctor suggested me get the same test after three months. Now I took medicine

In [36]:
from chromadb import Client
from sentence_transformers import SentenceTransformer
from datasets import load_dataset

# Step 1: Load the Hugging Face dataset
# Step 2: Initialize the sentence-transformers model
# Step 3: Initialize ChromaDB client and create a collection
# Step 4: Process each document, generate embeddings, and add them to ChromaDB

dataset = load_dataset("ruslanmv/ai-medical-chatbot", split="train")

# The model 'all-MiniLM-L6-v2' is for embedding generation
model = SentenceTransformer('all-MiniLM-L6-v2')

chroma_client = Client()
collection = chroma_client.get_or_create_collection("medical_chatbot")

for i, item in enumerate(dataset):
    # Combine the fields into a single text for embedding
    document_text = f"Description: {item['Description']} | Patient: {item['Patient']} | Doctor: {item['Doctor']}"
    
    # Generate embedding for the document
    embedding = model.encode(document_text).tolist()  # Convert to list to make it JSON serializable

    # Add document to ChromaDB with a unique ID
    collection.add(
        documents=[document_text],         # Text of the document
        embeddings=[embedding],            # Embedding vector
        ids=[f"doc_{i}"],                  # Unique ID for each document
        metadatas=[{"source": "medical_chatbot"}] 
    )

print("Documents added to ChromaDB successfully!")


Add of existing embedding ID: doc_0
Insert of existing embedding ID: doc_0
Add of existing embedding ID: doc_1
Insert of existing embedding ID: doc_1
Add of existing embedding ID: doc_2
Insert of existing embedding ID: doc_2
Add of existing embedding ID: doc_3
Insert of existing embedding ID: doc_3
Add of existing embedding ID: doc_4
Insert of existing embedding ID: doc_4
Add of existing embedding ID: doc_5
Insert of existing embedding ID: doc_5
Add of existing embedding ID: doc_6
Insert of existing embedding ID: doc_6
Add of existing embedding ID: doc_7
Insert of existing embedding ID: doc_7
Add of existing embedding ID: doc_8
Insert of existing embedding ID: doc_8
Add of existing embedding ID: doc_9
Insert of existing embedding ID: doc_9
Add of existing embedding ID: doc_10
Insert of existing embedding ID: doc_10
Add of existing embedding ID: doc_11
Insert of existing embedding ID: doc_11
Add of existing embedding ID: doc_12
Insert of existing embedding ID: doc_12
Add of existing emb

Documents added to ChromaDB successfully!


In [None]:
# List all collections
collections = chroma_client.list_collections()
for coll in collections:
    print("Collection Name:", coll.name)
    print("Metadata:", coll.metadata)
