In [1]:
from compare_models import (
    compare_target_sentence_rankings,
    cosine_distance,
    euclidean_distance,
)
import pandas as pd
from sentence_transformers import SentenceTransformer
from embedding_providers import (
    OpenAIEmbeddingProvider,
    SentenceTransformerProvider,
    GeminiEmbeddingProvider,
)
import openai
from dotenv import load_dotenv
import os

# Location of the dotenv file that holds the API credentials. Set the
# EMBEDDING_ENV_PATH environment variable to point at a different file; when it
# is unset or empty the original machine-specific location is used, so existing
# runs behave exactly as before.
env_path = (
    os.environ.get("EMBEDDING_ENV_PATH")
    or "/Users/ford/Documents/coding/confidential/.env"
)
load_dotenv(env_path)

# Fail fast, before any provider is constructed, if the OpenAI key did not
# make it into the environment.
api_key = os.getenv("OPENAI_API_KEY")
assert api_key, "API key is missing"
openai.api_key = api_key

# Embedding providers under comparison, keyed by a short display label.
# NOTE(review): the original comment said these providers are initialised
# "with caching" -- that is not visible from this notebook; confirm in
# embedding_providers.
# NOTE(review): GeminiEmbeddingProvider presumably picks up its own credentials
# from the environment loaded above -- confirm.
embedding_models = {
    "all-MiniLM-L6-v2": SentenceTransformerProvider("all-MiniLM-L6-v2"),
    "all-mpnet-base-v2": SentenceTransformerProvider("all-mpnet-base-v2"),
    "openai-small": OpenAIEmbeddingProvider(model_name="text-embedding-3-small"),
    "openai-large": OpenAIEmbeddingProvider(model_name="text-embedding-3-large"),
    "gemini-exp-03-07": GeminiEmbeddingProvider(
        model_name="gemini-embedding-exp-03-07"
    ),
}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _candidate_set(candidates, target_positions):
    """Assemble one candidate-set entry.

    Each target sentence is taken from ``candidates`` by position, so a target
    is always spelled exactly like the candidate it refers to.
    """
    return {
        "candidate_sentences": candidates,
        "target_sentences": [candidates[pos] for pos in target_positions],
    }


# Sentences to be compared against every candidate set below.
input_sentences = [
    "Market risk affects investment returns",
    "Cybersecurity poses significant threats",
    "Credit risk in lending operations",
]

# NOTE(review): every set lists three targets, matching the three input
# sentences -- presumably they are paired by position (confirm in
# compare_models). If so, some pairings look topically mismatched, e.g. set 1
# pairs "Cybersecurity poses significant threats" with
# "Market fluctuations impact returns"; verify this is intended.
candidate_sets = {
    "candidate_set_1": _candidate_set(
        [
            "Market fluctuations impact returns",
            "Weather is nice today",
            "Investment returns affected by market",
            "Cybersecurity threats are increasing",
        ],
        target_positions=(0, 0, 2),
    ),
    "candidate_set_2": _candidate_set(
        [
            "Credit risk assessment in banking",
            "Market volatility affects investments",
            "Cyber attacks on organizations",
            "Risk management in finance",
        ],
        target_positions=(1, 1, 2),
    ),
    "candidate_set_3": _candidate_set(
        [
            "Financial market volatility",
            "Credit risk evaluation methods",
            "Information security threats",
            "Enterprise risk management",
        ],
        target_positions=(0, 0, 2),
    ),
}

# Distance metrics to evaluate, keyed by the label used to identify them.
distance_functions = {
    "cosine": cosine_distance,
    "euclidean": euclidean_distance,
}

# Run the full comparison across inputs, candidate sets, embedding models and
# distance metrics. This is the slow cell of the notebook (the recorded run
# shows 90 comparisons taking roughly 52 s).
comparison_results = compare_target_sentence_rankings(
    input_sentences,
    candidate_sets,
    embedding_models,
    distance_functions,
)

Processing comparisons: 100%|██████████| 90/90 [00:52<00:00,  1.72it/s]


In [3]:
# Tabulate the raw comparison output so it can be inspected with pandas.
comparison_result_df = pd.DataFrame(data=comparison_results)

In [4]:
# Sanity check: (rows, columns) of the results table.
comparison_result_df.shape

(90, 8)

In [5]:
# Preview the first rows to check the column layout of the results.
comparison_result_df.head()

Unnamed: 0,input_sentence,target_sentence,candidate_sentences,embedded_model,distance_method,sorted_similar_sentences,sorted_similar_sentences_indices,target_order_in_sorted_similar_sentences
0,Market risk affects investment returns,Market fluctuations impact returns,"[Market fluctuations impact returns, Weather i...",all-MiniLM-L6-v2,cosine,"[Investment returns affected by market, Market...","[2, 0, 3, 1]",2
1,Market risk affects investment returns,Market fluctuations impact returns,"[Market fluctuations impact returns, Weather i...",all-MiniLM-L6-v2,euclidean,"[Investment returns affected by market, Market...","[2, 0, 3, 1]",2
2,Market risk affects investment returns,Market fluctuations impact returns,"[Market fluctuations impact returns, Weather i...",all-mpnet-base-v2,cosine,"[Investment returns affected by market, Market...","[2, 0, 3, 1]",2
3,Market risk affects investment returns,Market fluctuations impact returns,"[Market fluctuations impact returns, Weather i...",all-mpnet-base-v2,euclidean,"[Investment returns affected by market, Market...","[2, 0, 3, 1]",2
4,Market risk affects investment returns,Market fluctuations impact returns,"[Market fluctuations impact returns, Weather i...",openai-small,cosine,"[Investment returns affected by market, Market...","[2, 0, 3, 1]",2


In [6]:
# Show which embedding model produced each result row.
comparison_result_df["embedded_model"]

0      all-MiniLM-L6-v2
1      all-MiniLM-L6-v2
2     all-mpnet-base-v2
3     all-mpnet-base-v2
4          openai-small
            ...        
85         openai-small
86         openai-large
87         openai-large
88     gemini-exp-03-07
89     gemini-exp-03-07
Name: embedded_model, Length: 90, dtype: object