# Similarity Function for IBL

The objective of this notebook is to write a similarity function for claims in Experiment 1, using LLM embeddings and Vector Search.
This function can then be used within an IBL model that uses the speedyibl library.

In [None]:
#install packages for openai and mongodb
#!pip install "pymongo[srv]"
#!pip install openai

In [1]:
import os
import random
import time # to calculate time
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from openai import OpenAI
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
#import evaluate
# Read the MongoDB connection string from the environment so that credentials
# are never stored in (or committed with) the notebook.
# NOTE(review): the connection string that used to be hardcoded here contained a
# password; that password should be rotated.
uri = os.environ.get("MONGODB_URI")
if not uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string before running this notebook."
    )
# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    # Best effort: report the connection problem without stopping the notebook.
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [None]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens obtained by encoding `string` with the named tiktoken encoding.

    Args:
    string (str): The text to tokenize.
    encoding_name (str): Name of the encoding passed to `tiktoken.get_encoding`.

    Returns:
    int: The length of the encoded token sequence.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

In [20]:
# Client object that get_embedding uses for every embedding request.
# NOTE(review): OpenAI() is constructed without arguments, so the API key is
# presumably taken from the environment — confirm before running.
openai = OpenAI()
# Name of the embedding model passed to the embeddings endpoint.
EMBEDDING_MODEL = "text-embedding-3-small"
def get_embedding(text):
    """Return the embedding vector for `text`, or None when none can be produced.

    Args:
    text (str): The text to embed. Empty values and non-string values are rejected.

    Returns:
    The embedding taken from the first item of the API response, or None if the
    input is invalid or the API call raises.
    """
    # Guard clause: only non-empty strings are sent to the API.
    if not (text and isinstance(text, str)):
        return None

    try:
        response = openai.embeddings.create(input=text, model=EMBEDDING_MODEL)
        return response.data[0].embedding
    except Exception as e:
        # Best effort: report the failure and let the caller handle the None.
        print(f"Error in get_embedding: {e}")
        return None

In [2]:
# Handles to the database and the collection that hold the Experiment 1 claims.
database = client["expt_claims_database"]
collection = database["expt1_claims_collection"]
# Print the collection names in the database as a quick sanity check.
print(database.list_collection_names())

['expt1_claims_collection']


In [22]:
# Load the Experiment 1 claims sheet from the de-identified workbook.
claims_info_df = pd.read_excel("DeidentifiedRawData_Exp1.xlsx", sheet_name="Claims_Data")

# Token count of all claim texts, measured once before generating embeddings:
#   sum([num_tokens_from_string(text, "cl100k_base") for text in claims_info_df['text']])
#   -> 1901

# One embedding per claim text; get_embedding returns None for rows it cannot embed.
claims_info_df["text_embedding_optimised"] = claims_info_df["text"].apply(get_embedding)

# Keep only the fields that are uploaded, as one dict per claim.
upload_columns = [
    "img_name",
    "length",
    "acc_status",
    "user_name",
    "Category",
    "text",
    "feedback",
    "text_embedding_optimised",
]
claims_subset_df = claims_info_df[upload_columns]
claims_list = claims_subset_df.to_dict(orient="records")

In [24]:
# Upload all claim records (including their embeddings) to the collection in one batch.
# NOTE(review): re-running this cell would presumably try to insert the same
# claims a second time — confirm before re-executing.
result = collection.insert_many(claims_list)
# Show whether the write was acknowledged.
print(result.acknowledged)

True


## Steps Further

All the claims and their embeddings were generated and uploaded to MongoDB in the previous steps, so those cells do not need to be re-run.
For the following steps, a vector search index was created in MongoDB; the similarity values and functions below are built from the similarity scores that this index supplies.

In [3]:
# Load the Experiment 1 claims sheet again so the cells below can run without
# re-executing the embedding and upload cells above.
claims_info_df = pd.read_excel("DeidentifiedRawData_Exp1.xlsx", sheet_name="Claims_Data")

In [4]:
def vector_search(img_name, collection, limit=54, num_candidates=None, index_name="llm_cosine_vector_index"):
    """
    Find the claims most similar to a reference claim using the MongoDB vector index.

    The embedding stored for `img_name` is used as the query vector of a
    `$vectorSearch` stage run against the same collection.

    Args:
    img_name (str): The image name to reference against the claims database.
    collection (MongoCollection): The MongoDB collection to search.
    limit (int): Maximum number of matches to return. Defaults to 54.
        NOTE(review): 54 presumably equals the number of claims in the
        collection, so every claim is returned — confirm.
    num_candidates (int | None): Number of candidate matches the index considers.
        Defaults to `limit`; should not be smaller than `limit`.
    index_name (str): Name of the vector search index to query.

    Returns:
    list: One dict per match with the keys 'img_name' and 'score'.
    str: The message "Invalid query or embedding generation failed." when no
        document with this `img_name` exists or it has no stored embedding
        (kept as a string for backward compatibility with existing callers).
    """

    # Fetch only the stored embedding of the reference claim.
    reference_doc = collection.find_one(
        {"img_name": img_name},
        {"_id": 0, "text_embedding_optimised": 1},
    )
    # find_one returns None for an unknown img_name; treat that like a missing embedding.
    query_embedding = reference_doc.get("text_embedding_optimised") if reference_doc else None

    if query_embedding is None:
        return "Invalid query or embedding generation failed."

    if num_candidates is None:
        num_candidates = limit

    # Define the vector search pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": index_name,
                "queryVector": query_embedding,
                "path": "text_embedding_optimised",
                "numCandidates": num_candidates,  # Number of candidate matches to consider
                "limit": limit,  # Maximum number of matches to return
            }
        },
        {
            "$project": {
                "_id": 0,  # Exclude the _id field
                "img_name": 1,  # Keep only the image name of each match
                "score": {
                    "$meta": "vectorSearchScore"  # Include the search score
                }
            }
        }
    ]

    # Execute the search
    return list(collection.aggregate(pipeline))


In [5]:
# Image names of all claims, in sheet order.
img_list = claims_info_df['img_name'].to_list()
# For each claim, the vector-search matches returned for it (same order as img_list).
img_dict_list = [vector_search(name, collection) for name in img_list]

In [8]:
# Store, on every claim document, a mapping from matched image name to its search score.
for idx, ref_name in enumerate(img_list):
    matches = img_dict_list[idx]
    sim_dict = {match['img_name']: match['score'] for match in matches}
    collection.update_one({"img_name" : ref_name}, {"$set": {'similarity_dict' : sim_dict}})

In [4]:
# Image names of all claims, rebuilt here so the cells below can run without
# re-executing the vector-search cells above.
img_list = claims_info_df['img_name'].to_list()

In [11]:
# Inspect the stored similarity dictionary of the first claim.
sample_doc = collection.find_one(
    { "img_name" : img_list[0] },
    {'length': 0, 'acc_status': 0, 'user_name': 0, 'Category': 0, 'text': 0, 'feedback': 0, 'text_embedding_optimised': 0},
)
sample_sim_dict = sample_doc['similarity_dict']
# Scores below 1 only, i.e. leaving out entries that score 1 or more.
similarities = [score for score in sample_sim_dict.values() if score < 1]
similarities

[0.6514527797698975,
 0.6307398080825806,
 0.6276657581329346,
 0.5906263589859009,
 0.5894308090209961,
 0.570685625076294,
 0.5700032711029053,
 0.5665214657783508,
 0.5628194808959961,
 0.5601605772972107,
 0.5583720207214355,
 0.5578873753547668,
 0.5536611080169678,
 0.5521847009658813,
 0.5459815859794617,
 0.5455030798912048,
 0.544426441192627,
 0.5440582036972046,
 0.5439628958702087,
 0.5438030958175659,
 0.5436297059059143,
 0.5430953502655029,
 0.542999267578125,
 0.5426023602485657,
 0.5422442555427551,
 0.5419376492500305,
 0.5393193364143372,
 0.5382000803947449,
 0.5372725129127502,
 0.5361812710762024,
 0.535228967666626,
 0.5338835716247559,
 0.531522274017334,
 0.5312227606773376,
 0.5301764011383057,
 0.5279795527458191,
 0.526705265045166,
 0.5266759395599365,
 0.5262069702148438,
 0.5246825814247131,
 0.5245420932769775,
 0.5233020782470703,
 0.5216143131256104,
 0.5199585556983948,
 0.5189975500106812,
 0.5163828730583191,
 0.5159635543823242,
 0.5151010155677795

In [17]:
# Min-max rescale the sample's sub-1 scores onto [0, 0.9]; entries scoring 1 or more are set to 1.
min_sample_sim = min(similarities)
rnge = max(similarities) - min_sample_sim
sample_scaled_sim = {
    name: (score - min_sample_sim)*0.9/rnge if score < 1 else 1
    for name, score in sample_sim_dict.items()
}
sample_scaled_sim

{'pretest_tweet_1.jpg': 1,
 'pretest_tweet_12.jpg': 0.9,
 'posttest_tweet_5.jpg': 0.7849990200725184,
 'posttest_tweet_3.jpg': 0.7679315146516624,
 'training_tweet_11.jpg': 0.5622841907361738,
 'training_tweet_15.jpg': 0.5556463495035463,
 'training_tweet_10.jpg': 0.4515707704362682,
 'training_tweet_19.jpg': 0.44778225680424505,
 'pretest_tweet_7.jpg': 0.428450843123722,
 'training_tweet_4.jpg': 0.40789696531350883,
 'posttest_tweet_13.jpg': 0.3931344037646865,
 'pretest_tweet_9.jpg': 0.38320411650436886,
 'posttest_tweet_2.jpg': 0.3805133055031477,
 'pretest_tweet_8.jpg': 0.3570485466737805,
 'training_tweet_22.jpg': 0.3488513521712288,
 'posttest_tweet_8.jpg': 0.31441089076327344,
 'pretest_tweet_13.jpg': 0.31175416579490506,
 'posttest_tweet_14.jpg': 0.3057765346160762,
 'pretest_tweet_14.jpg': 0.30373203450962627,
 'training_tweet_8.jpg': 0.30320287366951404,
 'training_tweet_1.jpg': 0.30231564401826894,
 'training_tweet_20.jpg': 0.30135296178944687,
 'training_tweet_13.jpg': 0.29

In [20]:
# Highest scaled value given to a score below 1; scores of 1 or more are kept unchanged.
SCALED_SIMILARITY_MAX = 0.9


def scale_similarities(sim_dict, upper=SCALED_SIMILARITY_MAX):
    """Min-max rescale the scores below 1 in `sim_dict` onto the interval [0, upper].

    Args:
    sim_dict (dict): Mapping from image name to similarity score.
    upper (float): Value assigned to the largest score below 1.

    Returns:
    dict: A new mapping with the same keys. Scores of 1 or more are copied
    unchanged. If no score is below 1 the input is returned as a copy, and if
    all scores below 1 are equal they are mapped to 0.0 (there is no spread to
    scale by), instead of raising ValueError / ZeroDivisionError.
    """
    below_one = [score for score in sim_dict.values() if score < 1]
    if not below_one:
        return dict(sim_dict)

    lowest = min(below_one)
    spread = max(below_one) - lowest

    scaled = {}
    for name, score in sim_dict.items():
        if score < 1:
            scaled[name] = (score - lowest) * upper / spread if spread else 0.0
        else:
            scaled[name] = score
    return scaled


# Compute and store the rescaled similarity dictionary on every claim document.
for ref_name in img_list:
    stored_doc = collection.find_one({ "img_name" : ref_name }, {'length': 0, 'acc_status': 0, 'user_name': 0, 'Category': 0, 'text': 0, 'feedback': 0, 'text_embedding_optimised': 0})
    scaled_sim_dict = scale_similarities(stored_doc['similarity_dict'])
    collection.update_one({"img_name" : ref_name}, {"$set": {'scaled_similarity_dict' : scaled_sim_dict}})

Now the collection `expt1_claims_collection` in MongoDB contains a similarity dictionary (`similarity_dict`, together with its rescaled version `scaled_similarity_dict`) for every claim in Experiment 1, which can be used in the similarity function of the IBL cognitive model.