<a href="https://colab.research.google.com/github/ARJUN108-verma/Vector-Database/blob/main/8_Creating_a_Chroma_Vectorstore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Indexing: Creating a Chroma Vectorstore

In [1]:
%load_ext dotenv
%dotenv

cannot find .env file


In [2]:
!pip install langchain_community



In [3]:
!pip install langchain_openai



In [4]:
from langchain.document_loaders import Docx2txtLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter, CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.document_loaders.csv_loader import CSVLoader
from langchain_community.vectorstores import FAISS

In [5]:
import pandas as pd
import torch

In [6]:
files = pd.read_csv("/content/course_section_descriptions.csv", encoding='latin1')

Create text blobs and unite courses and sections

In [7]:
# Create a dictionary to map course IDs to course names
id_to_name = pd.Series(files.course_name.values,index=files.course_id).to_dict()

Created weighted versions of the parameters

In [8]:
from sentence_transformers import SentenceTransformer
# Initialize the SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
import numpy as np

In [10]:
# Aggregate course data
course_agg = files.groupby('course_id').agg({
    'course_name': 'first',
    'course_slug': 'first',
    'course_description': 'first',
    'course_description_short': 'first',
    'course_technology': 'first',
    'course_topic': 'first',
    'course_instructor_quote': 'first',
    'section_name': lambda x: list(x),  # Assuming section_name is not already a string
    'section_description': ' '.join  # Combine section descriptions into one string
}).reset_index()

In [11]:
def create_course_embedding(row):
    # Weights
    weight_course_name = 5
    weight_section_name = 4
    weight_other = 1

    # Helper function to safely encode text or return a zero vector if text is None
    def safe_encode(text, weight):
        if text is None:
            # Return a zero vector if the text is None
            return np.zeros(model.get_sentence_embedding_dimension())
        else:
            # Otherwise, return the encoded text vector multiplied by its weight
            return model.encode(text) * weight

    # Generate embeddings for individual components with weights
    embedding_course_name = safe_encode(row['course_name'], weight_course_name)
    embedding_course_slug = safe_encode(row['course_slug'], weight_other)
    embedding_course_description = safe_encode(row['course_description'], weight_other)
    embedding_course_description_short = safe_encode(row['course_description_short'], weight_other)
    embedding_course_instructor_quote = safe_encode(row['course_instructor_quote'], weight_other)

    # Extract section names for the course from the dataframe (assuming 'df' is your original dataframe)
    section_names = files[files['course_id'] == row['course_id']]['section_name'].unique().tolist()

    # Generate embeddings for section names with weights
    embeddings_section_names = [safe_encode(name, weight_section_name) for name in section_names]

    # If there are no section names, create a zero vector
    if not embeddings_section_names:
        embeddings_section_names = [np.zeros(model.get_sentence_embedding_dimension())]

    # Average the section name embeddings
    embeddings_section_names = np.mean(embeddings_section_names, axis=0)

    # Combine the weighted embeddings into a single composite embedding
    composite_embedding = np.mean([
        embedding_course_name,
        embedding_course_slug,
        embedding_course_description,
        embedding_course_description_short,
        embedding_course_instructor_quote,
        embeddings_section_names
    ], axis=0)

    return composite_embedding

# Apply the function to create embeddings for each course
course_agg['embedding'] = course_agg.apply(create_course_embedding, axis=1)

In [12]:
# Now each course has an embedding, you can upsert these into Pinecone
# (Assuming Pinecone is initialized and 'index' is your Pinecone index object)
vectors_to_upsert = [(str(row['course_id']), row['embedding'].tolist()) for index, row in course_agg.iterrows()]


Connect to Pinecone Index

In [13]:
!pip install pinecone



In [14]:
import os
from pinecone import Pinecone, ServerlessSpec

In [15]:
from dotenv import load_dotenv, find_dotenv

In [16]:
load_dotenv(find_dotenv(), override = True)

False

In [18]:
import pinecone
pc = Pinecone(api_key = os.environ.get("PINECONE_API_KEY"), environment = os.environ.get("PINECONE_ENV"))

PineconeConfigurationError: You haven't specified an API key. Please either set the PINECONE_API_KEY environment variable or pass the 'api_key' keyword argument to the Pinecone client constructor.

In [None]:
index = pc.Index("my-index")

In [None]:
index.upsert(vectors=vectors_to_upsert)

print("Data upserted to Pinecone index.")

Query data

In [None]:
# Ensure you've already initialized and configured Pinecone and the model
# If not, you need to run the initialization code provided earlier

# Create the query embedding
query = "clustering"
query_embedding = model.encode(query, show_progress_bar=False).tolist()

In [None]:
query_results = index.query(
   # namespace="my-index",
    vector=[query_embedding],
    top_k=12,
    include_values=True
)

In [None]:
score_threshold = 0.2


# Print the results that meet the score threshold
for match in query_results['matches']:
    if match['score'] >= score_threshold:
           course_name = id_to_name.get(int(match['id']), "Unknown Course")
           print(f"Matched course name: {course_name}, score: {match['score']}")