In [4]:
import json

# Apni file ka path yahan daalo
file_path = r"diabetes_knowledge_base (1).json"

chunks = []
with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)
    for item in data:
        if 'text' in item and item['text']:
            # Har 'text' entry ko ek chunk maan rahe hain
            chunks.append(item['text'])

print(f"Total chunks loaded: {len(chunks)}")
print("Pehle chunk ka example:")
print(chunks[0][:100] + "...")

Total chunks loaded: 638
Pehle chunk ka example:
VOLUME | SUPPLEMENT | PAGES S1–S322 THE JOURNAL OF CLINICAL AND APPLIED RESEARCH AND EDUCATION JANUA...


In [5]:
from sentence_transformers import SentenceTransformer

# Yeh model text ko embeddings mein badlega
model = SentenceTransformer('all-MiniLM-L6-v2')

# Saare chunks ke embeddings ek saath banao
chunk_embeddings = model.encode(chunks)

print("Embeddings ban gaye!")
print(f"Shape of embeddings: {chunk_embeddings.shape}")

Embeddings ban gaye!
Shape of embeddings: (638, 384)


In [6]:
import faiss
import numpy as np

# Embeddings ko FAISS ke liye taiyar karo
chunk_embeddings_np = np.array(chunk_embeddings).astype('float32')

# FAISS index banao aur embeddings usmein add karo
index = faiss.IndexFlatL2(chunk_embeddings_np.shape[1])
index.add(chunk_embeddings_np)

print("Vector database ban gaya aur embeddings add ho gaye.")

# Ab isko save kar lo taaki agli baar dobara na banana pade
faiss.write_index(index, "diabetes_faiss_index.bin")
print("FAISS index saved successfully.")

Vector database ban gaya aur embeddings add ho gaye.
FAISS index saved successfully.


In [7]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from google.colab import userdata # Corrected: Added this import

# Step 1: GPU availability check
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Step 2: Hugging Face token ko Colab secrets se access karo
# Corrected: Fetching the token securely
huggingface_token = userdata.get('HUGGING_FACE_TOKEN')

# Step 3: Model aur Tokenizer ko load karo
model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(model_name, token=huggingface_token) # Corrected: Added token
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True, # Recommended for memory efficiency
    load_in_8bit=True, # Recommended for memory efficiency
    token=huggingface_token # Corrected: Added token
)

print("Mistral-7B model and tokenizer loaded successfully.")

ModuleNotFoundError: No module named 'google.colab'