In [None]:
import os
import json
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from langchain_groq import ChatGroq



In [2]:
# Initialize Pinecone
def initialize_pinecone(api_key, environment):
    """
    Build a Pinecone client from the supplied credentials.

    :param api_key: Pinecone API key, forwarded as the ``api_key`` keyword.
    :param environment: Value forwarded as the ``environment`` keyword.
    :return: The constructed ``Pinecone`` client object.
    """
    client_options = {"api_key": api_key, "environment": environment}
    client = Pinecone(**client_options)
    return client


In [3]:
# Create or connect to a Pinecone index
def create_or_connect_index(pc, index_name, dimension, metric, cloud, region):
    """
    Return a handle to ``index_name``, creating the index first if it is missing.

    :param pc: Pinecone client used to list, create and open indexes.
    :param index_name: Name of the index to create or open.
    :param dimension: Vector dimension used when the index is created.
    :param metric: Similarity metric used when the index is created.
    :param cloud: Cloud provider passed to ``ServerlessSpec``.
    :param region: Region passed to ``ServerlessSpec``.
    :return: The object returned by ``pc.Index(index_name)``.
    """
    known_names = pc.list_indexes().names()
    if index_name in known_names:
        print(f"Index '{index_name}' already exists.")
    else:
        serverless = ServerlessSpec(cloud=cloud, region=region)
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric=metric,
            spec=serverless,
        )
        print(f"Index '{index_name}' created.")
    handle = pc.Index(index_name)
    return handle


In [4]:
# Generate embeddings using Sentence Transformers
def generate_embeddings(model, inputs):
    """
    Encode a batch of texts and wrap each vector in a ``{"values": [...]}`` dict.

    :param model: Embedding model exposing ``encode(inputs, convert_to_tensor=...)``.
    :param inputs: List of input texts.
    :return: One dict per encoded vector, in the order ``encode`` returned them,
             whose ``"values"`` entry is that vector converted with ``tolist()``.
    """
    vectors = model.encode(inputs, convert_to_tensor=False)
    wrapped = []
    for vector in vectors:
        wrapped.append({"values": vector.tolist()})
    return wrapped

In [5]:
# Upload a single file to Pinecone
def upload_file_to_pinecone(file_path, index, pc, model, namespace, batch_size=100):
    """
    Embed the questions stored in one JSON file and upsert them into Pinecone.

    The file is expected to hold a JSON array of question objects. Each object
    must provide ``topic``, ``question_no``, ``question_text``, ``options`` and
    ``correct_option``; ``question_img_link`` is optional.

    :param file_path: Path of the JSON file to upload.
    :param index: Pinecone index handle; only ``index.upsert`` is called on it.
    :param pc: Pinecone client. Not used here; kept so existing callers that
               pass it positionally keep working.
    :param model: Embedding model forwarded to ``generate_embeddings``.
    :param namespace: Namespace the records are written to.
    :param batch_size: Maximum number of records sent per upsert request.
    :raises ValueError: If ``batch_size`` is smaller than 1.
    :raises KeyError: If a question object lacks one of the required fields.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # Generate embeddings for question_text; skip the encoder entirely when the
    # file holds no questions.
    inputs = [question["question_text"] for question in data]
    embeddings = generate_embeddings(model, inputs) if inputs else []

    # Prepare records for Pinecone
    records = []
    for question, embedding in zip(data, embeddings):
        # A missing or null image link is stored as an empty string so the
        # metadata never carries a null value.
        question_img_link = question.get("question_img_link")
        if question_img_link is None:
            question_img_link = ""
        records.append(
            {
                "id": f"{question['topic']}-{question['question_no']}",
                "values": embedding["values"],
                "metadata": {
                    "topic": question["topic"],
                    "question_no": question["question_no"],
                    "question_text": question["question_text"],
                    "question_img_link": question_img_link,
                    "options": question["options"],
                    "correct_option": question["correct_option"],
                },
            }
        )

    # Upsert in bounded chunks: sending a whole file in one request can exceed
    # the service's per-request size limits. Nothing is sent for an empty file.
    for start in range(0, len(records), batch_size):
        index.upsert(vectors=records[start:start + batch_size], namespace=namespace)
    print(f"Uploaded {len(records)} records from {file_path} to namespace '{namespace}'.")

# Upload all files in a directory to Pinecone
def bulk_upload_to_pinecone(directory, namespace, index, pc, model):
    """
    Upload every ``.json`` file found directly inside ``directory``.

    :param directory: Folder whose entries are scanned (not recursive).
    :param namespace: Namespace forwarded to ``upload_file_to_pinecone``.
    :param index: Pinecone index handle forwarded to the per-file upload.
    :param pc: Pinecone client forwarded to the per-file upload.
    :param model: Embedding model forwarded to the per-file upload.
    """
    json_entries = [entry for entry in os.listdir(directory) if entry.endswith(".json")]
    for entry in json_entries:
        upload_file_to_pinecone(os.path.join(directory, entry), index, pc, model, namespace)


In [None]:

# Main function
def main():
    """
    Create (or connect to) the MCQ index and upload every subject's question files.

    The Pinecone API key is read from the ``PINECONE_API_KEY`` environment
    variable so that no credential is stored in the notebook.

    :raises RuntimeError: If ``PINECONE_API_KEY`` is unset or empty.
    """
    # Pinecone credentials: never hardcode the key; take it from the environment.
    # NOTE(review): the key that used to be committed here should be rotated.
    pinecone_api_key = os.environ.get("PINECONE_API_KEY")
    if not pinecone_api_key:
        raise RuntimeError(
            "PINECONE_API_KEY is not set; export it before running main()."
        )
    pinecone_environment = "us-west1-gcp"
    index_name = "mcq-index"
    embedding_model_name = "multi-qa-mpnet-base-dot-v1"
    embedding_dimension = 768  # Dimension for the selected model

    # Subject directories and namespaces. Paths are joined with os.path.join so
    # they contain no backslash escape sequences and work on any OS.
    data_root = os.path.join("QuizMentor", "data_for_vectordb")
    subject_dirs = {
        "computer_organization": os.path.join(data_root, "Final_Coal_mcqs_data"),  # Replace with actual path
        "operating_system": os.path.join(data_root, "Final_OS_mcqs_data"),  # Replace with actual path
    }

    # Initialize Pinecone
    pc = initialize_pinecone(api_key=pinecone_api_key, environment=pinecone_environment)

    # Create or connect to the Pinecone index
    # NOTE(review): the model name suggests dot-product scoring while the index
    # is created with "cosine" -- confirm this is the intended metric.
    index = create_or_connect_index(
        pc=pc,
        index_name=index_name,
        dimension=embedding_dimension,  # Keep in sync with the embedding model
        metric="cosine",
        cloud="aws",
        region="us-east-1"
    )
    embedding_model = SentenceTransformer(embedding_model_name)

    # Bulk upload for each subject
    for namespace, directory in subject_dirs.items():
        print(f"Uploading files from {directory} to namespace '{namespace}'...")
        bulk_upload_to_pinecone(directory, namespace, index, pc, embedding_model)

    # Show which indexes exist after the upload
    all_indexes = pc.list_indexes().names()
    print(all_indexes)

# Entry point: call main() only when this code runs as the top-level module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


In [None]:
# `pc` only exists inside main(), so build a client here; otherwise this cell
# raises NameError on a fresh kernel. The API key is read from the environment.
pc = initialize_pinecone(api_key=os.environ["PINECONE_API_KEY"], environment="us-west1-gcp")
all_indexes = pc.list_indexes().names()
all_indexes