# Semantic Search

Implementation of semantic search using OpenAI API embeddings, with ChromaDB as the vector database.

**Last date modified:** 02/11/2024

**Author:** BrenoAV

# Importing Libraries

In [1]:
from ast import literal_eval
import os
from pprint import pprint
import tempfile
import zipfile

import chromadb
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
from datasets import load_dataset
from dotenv import dotenv_values
import pandas as pd
import openai
from openai import OpenAI
import requests

# Load the secrets

In [2]:
# Load the key/value pairs defined in ../.env; later cells read the "API_KEY" entry
secrets = dotenv_values("../.env")

# OpenAI Client

In [3]:
# Embedding model name, shared by the direct embeddings request and by the
# embedding function given to the Chroma collection
MODEL_NAME = "text-embedding-3-small"

In [4]:
# Client used for the embeddings request; the key comes from the loaded .env values
# (a missing "API_KEY" entry raises KeyError here)
openai_client = OpenAI(api_key=secrets["API_KEY"])

# Download the dataset

In [5]:
# Download the dataset archive into a temporary file (deleted when the `with`
# block exits) and load the metadata spreadsheet it contains into `df`.
with tempfile.NamedTemporaryFile(suffix=".zip", delete=True) as raw_file:
    print(raw_file.name)
    with requests.Session() as session:
        r = session.get(url="https://data.mendeley.com/public-files/datasets/9rw3vkcfy4/files/c9ea673d-5542-44c0-ab7b-f1311f7d61df/file_downloaded", 
                        timeout=5)
        # Fail here on an HTTP error rather than later with a confusing bad-zip error
        r.raise_for_status()
        raw_file.write(r.content)
    # Read the archive through the handle that is already open instead of reopening
    # the file by name: a second open can miss bytes still sitting in the write
    # buffer and is not possible on every platform while the first handle is open.
    raw_file.seek(0)
    with zipfile.ZipFile(file=raw_file, mode="r") as zip_file:
        with zip_file.open("Meta-data/Data.xlsx") as data_file:
            df = pd.read_excel(data_file)

/tmp/tmp353amysn.zip


In [6]:
# Preview the first rows to check the columns read from Data.xlsx
df.head()

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract
0,0,12,12,CS,Symbolic computation,(2+1)-dimensional non-linear optical waves; e...,(2 + 1)-dimensional non-linear optical waves t...
1,5,2,74,Medical,Alzheimer's Disease,Aging; Tau; Amyloid; PET; Alzheimer's disease...,(beta-amyloid (A beta) and tau pathology becom...
2,4,7,68,Civil,Green Building,LED lighting system; PV system; Distributed l...,(D)ecreasing of energy consumption and environ...
3,1,10,26,ECE,Electric motor,NdFeB magnets; Electric motor; Electric vehic...,(Hybrid) electric vehicles are assumed to play...
4,5,43,115,Medical,Parkinson's Disease,Parkinson's disease; dyskinesia; adenosine A(...,"(L)-3,4-Dihydroxyphenylalanine ((L)-DOPA) rema..."


In [7]:
# Summarise row count, column dtypes, and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46985 entries, 0 to 46984
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Y1        46985 non-null  int64 
 1   Y2        46985 non-null  int64 
 2   Y         46985 non-null  int64 
 3   Domain    46985 non-null  object
 4   area      46985 non-null  object
 5   keywords  46985 non-null  object
 6   Abstract  46985 non-null  object
dtypes: int64(3), object(4)
memory usage: 2.5+ MB


In [8]:
# Keep a small sample for the demo: drop every row from position 20 onward
short_df = df.drop(
    index=df.index[20:],
)
short_df.head()

Unnamed: 0,Y1,Y2,Y,Domain,area,keywords,Abstract
0,0,12,12,CS,Symbolic computation,(2+1)-dimensional non-linear optical waves; e...,(2 + 1)-dimensional non-linear optical waves t...
1,5,2,74,Medical,Alzheimer's Disease,Aging; Tau; Amyloid; PET; Alzheimer's disease...,(beta-amyloid (A beta) and tau pathology becom...
2,4,7,68,Civil,Green Building,LED lighting system; PV system; Distributed l...,(D)ecreasing of energy consumption and environ...
3,1,10,26,ECE,Electric motor,NdFeB magnets; Electric motor; Electric vehic...,(Hybrid) electric vehicles are assumed to play...
4,5,43,115,Medical,Parkinson's Disease,Parkinson's disease; dyskinesia; adenosine A(...,"(L)-3,4-Dihydroxyphenylalanine ((L)-DOPA) rema..."


# Embeddings

In [9]:
# Request one embedding per abstract. The column is converted to a plain list of
# strings so the request payload does not depend on how a pandas Series happens
# to be iterated/serialised by the client.
text_vectors = openai_client.embeddings.create(input=short_df["Abstract"].tolist(),
                                               model=MODEL_NAME)

In [10]:
# Rebuild short_df with only the abstract text, its embedding, and a sequential id,
# then cast all three columns to the pandas "string" dtype in one step.
short_df = pd.DataFrame(
    data={
        "Abstract": short_df["Abstract"].tolist(),
        "abstract_vector": [item.embedding for item in text_vectors.data],
        "vector_id": list(range(len(short_df))),
    }
).astype(
    {
        "Abstract": "string",
        "abstract_vector": "string",
        "vector_id": "string",
    }
)

In [11]:
# Parse the stringified embeddings back into Python lists. Values that are not
# strings (i.e. already parsed) are passed through unchanged, so re-running this
# cell does not raise.
short_df["abstract_vector"] = short_df["abstract_vector"].apply(
    lambda vector: literal_eval(vector) if isinstance(vector, str) else vector
)

In [12]:
# Confirm the resulting dtypes of the three columns
short_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Abstract         20 non-null     string
 1   abstract_vector  20 non-null     object
 2   vector_id        20 non-null     string
dtypes: object(1), string(2)
memory usage: 608.0+ bytes


# Vector Database (ChromaDB)

In [13]:
# Create the ChromaDB client that will own the collection.
# NOTE(review): an ephemeral client presumably keeps data only for the current
# session, so the collection has to be rebuilt on every run — confirm.
chroma_client = chromadb.EphemeralClient()

In [14]:
# OpenAI-backed embedding function that is handed to the collection.
# It is configured with the same MODEL_NAME as the precomputed abstract vectors;
# query text is presumably embedded through it, so the two models must match.
embedding_fn = OpenAIEmbeddingFunction(api_key=secrets["API_KEY"], model_name=MODEL_NAME)

In [15]:
# Get (or create on first run) the collection that stores the abstract vectors,
# using the custom embedding function and cosine distance.
# This is where we can store documents, embeddings, or any additional metadata.
# get_or_create_collection is used so that re-running this cell in the same
# kernel reuses the existing collection instead of raising because the name is taken.
collection = chroma_client.get_or_create_collection(name="abstract_collection",
                                                    embedding_function=embedding_fn,
                                                    metadata={"hnsw:space": "cosine"})

In [16]:
# Store each precomputed embedding under its vector id
collection.add(
    ids=short_df["vector_id"].tolist(),
    embeddings=short_df["abstract_vector"].tolist(),
)

In [17]:
def query_collection(collection, query, max_results):
    """Search `collection` for the entries closest to `query`.

    Args:
        collection: Object exposing a ``query(query_texts=..., n_results=...,
            include=...)`` method (here, a Chroma collection).
        query: Text to search for; forwarded as ``query_texts``.
        max_results: Number of results requested; forwarded as ``n_results``.

    Returns:
        A dict with two keys: ``"id"`` holding the first entry of the response's
        ``"ids"`` field and ``"score"`` holding the first entry of its
        ``"distances"`` field. Note that ``"score"`` carries distances, so
        smaller values mean closer matches.
    """
    response = collection.query(
        query_texts=query,
        n_results=max_results,
        include=["distances"],
    )

    # Only index 0 of each field is kept (presumably the results for the first
    # query text — verify if several texts are ever passed at once).
    matched_ids = response["ids"][0]
    matched_distances = response["distances"][0]
    return {"id": matched_ids, "score": matched_distances}

In [18]:
# Free-text search phrase used for the demo query
query = "assert injured athletes"

In [19]:
# Retrieve the three stored abstracts closest to the query text
query_result = query_collection(collection=collection, query=query, max_results=3)

In [20]:
# Pretty-print the matched ids together with their distances
pprint(query_result)

{'id': ['19', '10', '18'],
 'score': [0.5819226503372192, 0.7295423746109009, 0.7723767161369324]}


In [21]:
# The first result has the lowest distance, so it is the most relevant match.
# Look the abstract up by its vector_id rather than using the id as a row
# position, so the lookup stays correct if short_df is ever reordered or filtered.
best_id = query_result["id"][0]
short_df.loc[short_df["vector_id"] == best_id, "Abstract"].iloc[0]

'[Purpose] The aim of this study was to analyze stabilometry in athletes during an indoor season in order to determine whether injured athletes show different stabilometric values before injury than non-injured athletes in two different training periods (volume and pre-competition periods). [Subjects] The subjects were 51 athletes from Unicaja athletic club who trained regularly. [Methods] At the end of the preseason and volume periods, athletes were subjected to bipodal and monopodal stabilometry. In addition, all injuries happening in the periods after performing stabilometry (volume and pre-competition periods) were tracked. [Results] Variance analysis of bipodal stabilometric measurements taken at the end of the preseason period showed that athletes with higher values for the center-of-pressure spread variables suffered injuries during the volume period. The right-leg monopodal stabilometric measurements taken at the end of the volume period showed that athletes with higher values 

---

The query was "assert injured athletes", and the top result is an abstract about athletes and injuries. Its cosine distance (≈0.58) is clearly lower than the distances of the other two results (≈0.73 and ≈0.77), so it is by far the closest match.