In [1]:
%pip install generative-ai-hub-sdk panda hana-ml aioboto3 xlsxwriter rapidfuzz
import os
import json


Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd

# Load the customer list once. dtype=str asks pandas to read every column as text,
# then only the four columns used by the rest of the notebook are kept and rows
# without an "Index" value are dropped.
df = (
    pd.read_excel('vectors.xlsx', dtype=str)
    .loc[:, ["Index", "Customername", "Searchterm", "Embeddingtext"]]
    .dropna(subset=["Index"])
)

# Last expression of the cell: preview the first rows.
df.head(5)

Unnamed: 0,Index,Customername,Searchterm,Embeddingtext
0,0,LLQ Management SA,"Lallique, LLQ",LLQ Management SA / LLQ Management / Lalique ...
1,1,CSS Versicherung AG,CSS,CSS Versicherung AG / CSS Versicherung / CSS
2,2,fenaco Genossenschaft,,fenaco Genossenschaft / fenaco Genossenschaft
3,3,cc energie sa,,cc energie sa / cc energie sa
4,4,apsolut GmbH,,apsolut GmbH / apsolut


In [2]:
import os
from hana_ml import ConnectionContext

# Connection settings are taken from the process environment so that no
# credentials are stored in the notebook itself.
read_env = os.environ.get

# `cc` is the shared HANA connection context used by the cells that follow.
cc = ConnectionContext(
    address=read_env("DB_ADDRESS"),
    port=read_env("DB_PORT"),
    user=read_env("DB_USER"),
    password=read_env("DB_PASSWORD"),
    encrypt=True,
)

# Smoke test: print the server version, then the current schema.
for report in (cc.hana_version, cc.get_current_schema):
    print(report())


4.00.000.00.1744104146 (fa/CE2025.2)
DBADMIN


In [3]:
from hana_ml.dataframe import create_dataframe_from_pandas

# Upload the pandas frame `df` through the shared connection `cc`.
# NOTE(review): append=True presumably adds rows to an existing table, so
# re-running this cell may duplicate rows — confirm.
# NOTE(review): the schema is embedded in table_name here; verify that hana_ml
# resolves "DBADMIN.VECTORS" to table VECTORS in schema DBADMIN rather than to a
# table literally named "DBADMIN.VECTORS" (there is a separate `schema` argument).
upload_options = {"allow_bigint": True, "append": True}

v_hdf = create_dataframe_from_pandas(
    connection_context=cc,
    pandas_df=df,
    table_name="DBADMIN.VECTORS",
    **upload_options,
)

100%|██████████| 1/1 [00:00<00:00,  5.08it/s]


In [4]:
from gen_ai_hub.proxy.native.openai import embeddings
import time

# SAP AI Core settings this notebook expects to find in the process environment.
# NOTE(review): presumably consumed by gen_ai_hub when it authenticates — confirm.
AICORE_ENV_KEYS = (
    'AICORE_AUTH_URL',
    'AICORE_CLIENT_ID',
    'AICORE_CLIENT_SECRET',
    'AICORE_BASE_URL',
    'AICORE_RESOURCE_GROUP',
)

# Snapshot of the settings, keyed by variable name (None when a variable is unset).
env_vars = {key: os.environ.get(key) for key in AICORE_ENV_KEYS}

# Fail fast with a readable message. Writing the values back into os.environ
# (as the cell did before) changes nothing when they are set and raises an
# opaque "str expected, not NoneType" TypeError when one is missing.
missing_vars = [key for key, value in env_vars.items() if value is None]
if missing_vars:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(missing_vars)
    )

In [5]:
import json
import time
from gen_ai_hub.proxy.native.amazon.clients import Session as AmazonSession

def get_embeddings_for_models(input_text, models):
    """Embed ``input_text`` with every model listed in ``models``.

    Args:
        input_text: Text to embed.
        models: Iterable of model names. ``'amazon--titan-embed-text'`` is sent
            through the Amazon session client; every other name goes through
            ``embeddings.create``.

    Returns:
        dict mapping each model name to its embedding, or to ``None`` when the
        request for that model raised an exception (the error is printed, not
        re-raised).
    """

    def _embed(model_name):
        # Every model except Titan uses the OpenAI-style embeddings proxy.
        if model_name != 'amazon--titan-embed-text':
            reply = embeddings.create(model_name=model_name, input=input_text)
            return reply.data[0].embedding

        # Titan: invoke the model with a JSON body and parse the JSON reply.
        client = AmazonSession().client(model_name=model_name)
        request_body = json.dumps({"inputText": input_text})
        raw_reply = client.invoke_model(body=request_body)
        parsed = json.loads(raw_reply.get("body").read())
        return parsed["embedding"]

    vectors_by_model = {}
    for model in models:
        try:
            vectors_by_model[model] = _embed(model)
            # Pause after each successful call (presumably to stay under rate limits).
            time.sleep(0.5)
        except Exception as e:
            # Best effort: report the failure and carry on with the next model.
            print(f"Error with model {model}: {e}")
            vectors_by_model[model] = None

    return vectors_by_model


# Define all supported embedding models
models_to_test = [
    "text-embedding-3-small",
    "text-embedding-3-large",
    "amazon--titan-embed-text",
]

# One (initially empty) result column per model, named "Embedding_<model>".
for model in models_to_test:
    df[f'Embedding_{model}'] = None

# Embed each row's "Embeddingtext" with every model and store the vectors
# (or None for a model that failed) in the matching column.
for index, row in df.iterrows():
    model_embeddings = get_embeddings_for_models(row['Embeddingtext'], models_to_test)
    for model in models_to_test:
        df.at[index, f'Embedding_{model}'] = model_embeddings.get(model)

print("Embeddings added for all working models.")
df.head(3)

KeyboardInterrupt: 

In [10]:
import json

# Upsert one row per (Index, MODEL) pair into DBADMIN.VECTORS.
# The statement is constant, so it is built once instead of on every iteration.
merge_sql = """
    MERGE INTO DBADMIN.VECTORS AS target
    USING (SELECT :index AS INDEX, :model AS MODEL FROM DUMMY) AS source
    ON target.INDEX = source.INDEX AND target.MODEL = source.MODEL
    WHEN MATCHED THEN
        UPDATE SET 
            CUSTOMERNAME = :customername,
            SEARCHTERM = :searchterm,
            VECTOR = TO_REAL_VECTOR(CAST(:vector AS NVARCHAR)),
            VECTOR_STR = :vector_str
    WHEN NOT MATCHED THEN
        INSERT (INDEX, CUSTOMERNAME, SEARCHTERM, MODEL, VECTOR, VECTOR_STR)
        VALUES (:index, :customername, :searchterm, :model, TO_REAL_VECTOR(CAST(:vector AS NVARCHAR)), :vector_str)
"""

# `conn` is the connection that belongs to `cc`. It is deliberately NOT closed
# here: the vector-search cell calls cc.sql(...) and needs it to stay open.
conn = cc.connection
cursor = conn.cursor()

try:
    for index, row in df.iterrows():
        # Missing text cells are NaN in pandas; send them as SQL NULL instead.
        customername = row['Customername'] if pd.notna(row['Customername']) else None
        searchterm = row['Searchterm'] if pd.notna(row['Searchterm']) else None

        for model in models_to_test:
            embedding = row.get(f'Embedding_{model}')
            if not embedding:
                # No vector for this model (the request failed or was never made).
                continue

            # The same JSON text feeds both the REAL_VECTOR and the string column.
            embedding_str = json.dumps(embedding)

            cursor.execute(merge_sql, {
                'index': row["Index"],
                'customername': customername,
                'searchterm': searchterm,
                'model': model,
                'vector': embedding_str,
                'vector_str': embedding_str
            })

    conn.commit()
except Exception:
    # Do not leave a half-written batch pending on the shared connection.
    conn.rollback()
    raise
finally:
    # Release the cursor even when a statement fails.
    cursor.close()


In [7]:
import pandas as pd
from rapidfuzz import fuzz, process

# --- Vector Search ---
def run_vector_search(query: str, model: str, metric="COSINE_SIMILARITY", k=4):
    """Return the ``k`` rows of DBADMIN.VECTORS closest to ``query`` for one model.

    Args:
        query: Free text to embed and search with.
        model: Embedding model name; also used to filter rows on the MODEL column.
        metric: ``"COSINE_SIMILARITY"`` (highest first) or ``"L2DISTANCE"``
            (lowest first). Matching is case-insensitive.
        k: Number of rows to return; must convert to a positive integer.

    Returns:
        The collected query result as a pandas DataFrame. On any failure
        (unsupported metric, invalid ``k``, no embedding, SQL error) a message is
        printed and an empty DataFrame with the columns Index, Customername,
        Searchterm and Similarity is returned; the function does not raise.
    """
    def _empty_result():
        return pd.DataFrame(columns=["Index", "Customername", "Searchterm", "Similarity"])

    # The metric is spliced into the SQL text as a function name, so it is
    # checked against an allow-list. The table also fixes the sort direction, so
    # a lower-case "l2distance" can no longer be sorted in the wrong order.
    sort_orders = {"COSINE_SIMILARITY": "DESC", "L2DISTANCE": "ASC"}
    metric_name = str(metric).strip().upper()
    if metric_name not in sort_orders:
        print(f"Unsupported metric {metric!r}; expected one of {sorted(sort_orders)}")
        return _empty_result()
    sort_order = sort_orders[metric_name]

    # TOP needs a plain positive integer.
    try:
        top_k = int(k)
    except (TypeError, ValueError):
        top_k = 0
    if top_k <= 0:
        print(f"Invalid k {k!r}; expected a positive integer")
        return _empty_result()

    # Arguments are validated before the (slow, remote) embedding request.
    query_vector = get_embeddings_for_models(query, [model])[model]
    if not query_vector:
        print(f"❌ No embedding returned for model: {model}")
        return _empty_result()

    vector_str = f"[{','.join(map(str, query_vector))}]"
    # Double any single quote so the model name stays inside its SQL literal.
    model_literal = str(model).replace("'", "''")

    sql = f'''
    SELECT TOP {top_k} 
        Index, Customername, Searchterm, VECTOR,
        {metric_name}(VECTOR, TO_REAL_VECTOR('{vector_str}')) AS Similarity
    FROM "DBADMIN"."VECTORS"
    WHERE MODEL = '{model_literal}'
    ORDER BY Similarity {sort_order}
    '''

    try:
        return cc.sql(sql).collect()
    except Exception as e:
        print(f"Error running vector search for model {model}: {e}")
        return _empty_result()

# --- Fuzzy Search ---
def run_fuzzy_search(query, df, text_columns=['Customername', 'Searchterm'], top_k=5, threshold=15):
    """Fuzzy-match ``query`` against the text columns of ``df``.

    Args:
        query: Text to look for.
        df: Frame with the columns Index, Customername and Searchterm plus
            everything named in ``text_columns``. It is not modified.
        text_columns: Columns joined with a space into one string per row;
            missing values count as empty strings.
        top_k: Maximum number of candidates requested from the matcher.
        threshold: Minimum score a candidate needs to be kept.

    Returns:
        DataFrame with the columns Index, Customername, Searchterm and
        Similarity, one row per kept candidate in matcher order. The columns
        are present even when nothing matches.
    """
    result_columns = ['Index', 'Customername', 'Searchterm']
    if df.empty:
        return pd.DataFrame(columns=result_columns + ['Similarity'])

    # Build the search strings locally instead of adding a helper column to
    # the caller's frame.
    choices = df[list(text_columns)].fillna('').agg(' '.join, axis=1).tolist()

    matches = process.extract(query, choices, scorer=fuzz.token_sort_ratio, limit=top_k)

    # Each match is (text, score, position). Using the returned position keeps
    # rows with identical text apart; looking the text up again would always
    # find the first of them.
    kept = [(position, score) for _text, score, position in matches if score >= threshold]

    result = df.iloc[[position for position, _score in kept]][result_columns].copy()
    result['Similarity'] = [score for _position, score in kept]
    return result

# --- Run Search for All Models + Fuzzy ---
query = "Ich habe heute 4 Stunden für BMW die Rollenverteilung der BTP erledigt"

# Result frames keyed by the name of the search that produced them.
search_results = {}

# One vector search per embedding model.
for model in models_to_test:
    print(f"🔍 Running vector search for model: {model}")
    search_results[f"Vector_{model}"] = run_vector_search(query, model=model)

# Plus one fuzzy text search over the in-memory frame.
print("🔍 Running fuzzy search...")
search_results["Fuzzy_RapidFuzz"] = run_fuzzy_search(query, df)

# Write every result frame to its own sheet of a single workbook.
excel_filename = "search_results.xlsx"
with pd.ExcelWriter(excel_filename, engine='xlsxwriter') as writer:
    for name, result_df in search_results.items():
        # Excel sheet names are limited to 31 characters.
        result_df.to_excel(writer, sheet_name=name[:31], index=False)

print(f"✅ All search results saved to: {excel_filename}")


🔍 Running vector search for model: text-embedding-3-small
🔍 Running vector search for model: text-embedding-3-large
🔍 Running vector search for model: amazon--titan-embed-text
🔍 Running fuzzy search...
✅ All search results saved to: search_results.xlsx
