In [None]:
import pandas as pd
from openai import OpenAI
import os
import json
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sentence_transformers import SentenceTransformer
import torch

# OpenAI-compatible client pointed at a server on localhost:8000.
# The api_key value is the literal placeholder string "EMPTY".
client = OpenAI(
    api_key="EMPTY",
    base_url="http://localhost:8000/v1",
)

Data Preparation

In [None]:
# Raw PLM-DB export. NaN detection is switched off (na_filter=False) and the
# 'PartNumber' column is forced to str.
df_plmdb = pd.read_excel(
    'PLM-DB.xlsx',
    dtype={'PartNumber': str},
    na_filter=False,
)

# Raw QC export, likewise read without NaN detection.
df_qc = pd.read_excel(
    'QC.xlsx',
    na_filter=False,
)

In [None]:
# Pre-filters for QC
#...
# Pre-filters for PLM-DB

# Filter and rename columns
#...

# Define exclusion lists
#...

# Filter entries
#...

In [None]:
# Ground-truth tables: one for direct qualifications, one for qualifications
# by similarity. Later cells read their 'PN' and 'QN' columns.
gt_direct = pd.read_csv(
    'Direct Qualifications_Ground Truth.csv'
)
gt_similarity = pd.read_csv(
    'Qualifications by Similarity_Ground Truth.csv'
)

Generate a VectorDB from QC

In [None]:
def row_to_json(row, is_plmdb=False):
    """Serialize one table row to a JSON object string, without its ID column.

    Args:
        row: A ``pd.Series`` holding one row (as produced by
            ``DataFrame.apply(..., axis=1)`` or ``DataFrame.iterrows()``).
        is_plmdb: When True the identifier column is "Part Number (PN)";
            otherwise it is "Number".

    Returns:
        A JSON string mapping every remaining column label to ``str(value)``.

    Note:
        The previous implementation called ``row.drop(columns=[...])``. On a
        Series the ``columns`` argument is ignored, so the identifier was never
        removed. Embeddings cached from JSON produced by that version still
        contain the identifier and should be regenerated to stay comparable.
    """
    id_column = "Part Number (PN)" if is_plmdb else "Number"

    # Filter by key instead of Series.drop(columns=...), which is a no-op on a
    # Series. A missing identifier column is simply skipped.
    row_dict = {
        key: str(value)  # ensure all values are strings
        for key, value in row.to_dict().items()
        if key != id_column
    }

    return json.dumps(row_dict)

# Sentence-embedding model used by get_embedding(); placed on the GPU when
# CUDA is available, otherwise on the CPU.
_e5 = SentenceTransformer(
    "intfloat/multilingual-e5-large-instruct",
    device="cuda" if torch.cuda.is_available() else "cpu",
)


def get_embedding(text: str):
    """Embed ``text`` with the module-level ``_e5`` model.

    The text is prefixed with "passage: " before encoding, the model is asked
    for a normalized embedding, and the result is returned via ``.tolist()``.

    NOTE(review): the same "passage: " prefix is applied to every input,
    including the query rows embedded at search time - confirm this matches
    the prompt format the model expects.
    """
    prefixed_text = "passage: " + text
    vector = _e5.encode(prefixed_text, normalize_embeddings=True)
    return vector.tolist()

# HDF5 file that caches the QC table together with its 'json' and
# 'embeddings' columns.
file_path = 'QC_with_embeddings.h5'

if not os.path.exists(file_path):
    # No cache yet: serialize every QC row to JSON, embed it, and store the
    # enriched frame in the HDF5 file under key 'qc'.
    df_qc_filtered['json'] = df_qc_filtered.apply(row_to_json, axis=1)
    df_qc_filtered['embeddings'] = df_qc_filtered['json'].apply(get_embedding)
    df_qc_filtered.to_hdf(file_path, key='qc', mode='w')
else:
    # Cache present: the stored frame replaces df_qc_filtered entirely.
    df_qc_filtered = pd.read_hdf(file_path, key='qc')

Search a Component in PLM-DB by Part Number (PN)

In [None]:
# Part number to investigate.
part_number = '...'

# All PLM-DB rows whose 'Part Number (PN)' equals the selected part number.
plmdb_by_selected_pn = df_plmdb_filtered.loc[
    df_plmdb_filtered['Part Number (PN)'] == part_number
]
display(plmdb_by_selected_pn)

In [None]:
# 'QN' values that the ground-truth tables list for the selected 'PN'.
direct_qualifications = gt_direct.loc[gt_direct['PN'] == part_number, 'QN'].tolist()
sim_qualifications = gt_similarity.loc[gt_similarity['PN'] == part_number, 'QN'].tolist()

Direct Qualifications (Knowledge Graph)

In [None]:
# Show the QC rows whose 'Number' is among the direct qualifications, or a
# notice when the list is empty.
if direct_qualifications:
    df_qc_filtered_by_direct = df_qc_filtered.loc[
        df_qc_filtered['Number'].isin(direct_qualifications)
    ]
    display(df_qc_filtered_by_direct)
else:
    print(f'No direct qualifications found for {part_number}')

Qualifications by Similarity (Knowledge Graph)

In [None]:
# Show the QC rows whose 'Number' is among the qualifications by similarity,
# or a notice when the list is empty.
if sim_qualifications:
    df_qc_filtered_by_ext = df_qc_filtered.loc[
        df_qc_filtered['Number'].isin(sim_qualifications)
    ]
    display(df_qc_filtered_by_ext)
else:
    print(f'No qualifications by similarity found for {part_number}')

Alternative Qualifications (LLM Vector Search)

In [None]:
def find_most_similar_rows(row, df2, top_n=3):
    """Return the ``top_n`` rows of ``df2`` most similar to ``row``.

    Args:
        row: Text to search with; it is passed to ``get_embedding``.
        df2: Candidate DataFrame with an 'embeddings' column holding one
            embedding per row. It is not modified.
        top_n: Maximum number of rows to return.

    Returns:
        A new DataFrame with the columns of ``df2`` plus a 'similarity'
        column (cosine similarity to ``row``), sorted by descending
        similarity and truncated to ``top_n`` rows. An empty ``df2`` yields
        an empty frame that still carries the 'similarity' column.
    """
    # cosine_similarity cannot work on zero candidates; return early instead.
    if df2.empty:
        return df2.assign(similarity=np.array([], dtype=float))

    query_embedding = get_embedding(row)
    candidate_embeddings = np.array(df2['embeddings'].tolist())

    similarities = cosine_similarity([query_embedding], candidate_embeddings)[0]

    # assign() builds a new frame, so the caller's DataFrame (often a filtered
    # slice) is neither mutated nor hit by SettingWithCopyWarning.
    scored = df2.assign(similarity=similarities)

    # Sort by similarity, best match first
    return scored.sort_values(by=['similarity'], ascending=False).head(top_n)

# Stays None when the selected part number has no PLM-DB row to search with.
similar_qual = None

# QC entries already reported as direct or similarity qualifications are
# excluded from the vector search. The copy makes the candidate frame
# independent of df_qc_filtered, so columns written to it never touch a slice.
known_qualifications = list(direct_qualifications) + list(sim_qualifications)
df_qc_without_other_qualifications = df_qc_filtered[
    ~df_qc_filtered['Number'].isin(known_qualifications)
].copy()

# One search per PLM-DB row of the selected part (50 nearest QC entries each),
# collected first and concatenated once instead of concatenating in the loop.
per_row_matches = [
    find_most_similar_rows(
        row_to_json(plm_row, is_plmdb=True),
        df_qc_without_other_qualifications,
        50,
    )
    for _, plm_row in plmdb_by_selected_pn.iterrows()
]

print("Components for which we are searching for qualifications:")
display(plmdb_by_selected_pn)

if per_row_matches:
    # A QC entry can be returned for several PLM-DB rows; keep its first hit.
    similar_qual = pd.concat(per_row_matches).drop_duplicates(subset="Number")
    print("Qualifications (alternative) found:")
    pd.set_option('display.max_rows', None)
    display(similar_qual)
else:
    # Previously this path crashed with AttributeError on None.drop_duplicates.
    print(f'No PLM-DB entry found for {part_number}; no alternative qualifications to search for')