In [None]:
import os
import json
from openai import OpenAI
import hashlib
import redis
# Embedding model name handed to get_embedding() by the cells in this notebook.
# The trailing comment keeps the alternative model name at hand for quick switching.
EMBEDDING_MODEL = "text-embedding-ada-002" #"text-embedding-3-large"
# Name of a second embedding model; no visible cell passes it to get_embedding().
LARGE_EMBEDDING_MODEL = "text-embedding-3-large"

In [None]:
# OpenAI client; the API key is read from the OPENAI_API_KEY environment variable
# so no credential is stored in the notebook.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
# Local Redis instance used by get_embedding() as its embedding cache.
# decode_responses=True asks redis-py to return str values instead of bytes.
redis_client = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)

# Connectivity / credential check. The previous call, client.get_api_list(), is
# not part of the OpenAI SDK client and raised AttributeError, which stopped a
# Restart-and-Run-All at this cell. Listing the available models is the
# supported equivalent.
client.models.list()

In [None]:

def get_embedding(client, text: str, model: str, cache=None):
    """Return the embedding of ``text`` for ``model``, memoised in a key-value cache.

    The cache key is ``"<model>_<sha256 hex digest of text>"``, so the same text
    embedded with a different model is cached separately. Embeddings are stored
    as JSON strings.

    Args:
        client: Object exposing ``embeddings.create(input=..., model=...)`` whose
            response carries the vector at ``response.data[0].embedding``
            (the OpenAI client created earlier in this notebook).
        text: The string to embed.
        model: Name of the embedding model to request.
        cache: Optional object with ``get(key)`` and ``set(key, value)``.
            When omitted, the notebook-level ``redis_client`` is used, which is
            the original behaviour. Passing a cache explicitly removes the
            dependency on global notebook state (e.g. for tests or a different
            Redis instance).

    Returns:
        The embedding: decoded from the cached JSON on a hit, otherwise the
        value taken from the API response.
    """
    # Compare against None rather than truthiness: an empty dict-like cache is
    # falsy but is still the cache the caller asked for.
    store = redis_client if cache is None else cache

    text_hash = hashlib.sha256(text.encode()).hexdigest()
    cache_key = f"{model}_{text_hash}"
    cached_response = store.get(cache_key)

    if cached_response:
        print("found response in cache")
        return json.loads(cached_response)

    print("no response in cache, obtaining embedding from LLM")
    response = client.embeddings.create(
        input=text,
        model=model,
    )

    embedding = response.data[0].embedding
    # Persist the vector so the next request for the same (model, text) is a hit.
    store.set(cache_key, json.dumps(embedding))
    return embedding

In [None]:
# First request for this sentence: get_embedding() looks in Redis before it
# calls the embeddings API, and stores the result after a miss.
resp1 = get_embedding(client, text="This is a text string to embed", model=EMBEDDING_MODEL)

In [None]:
# Same text and model as the resp1 call, hence the same cache key: this request
# is expected to print "found response in cache".
resp2 = get_embedding(client, text="This is a text string to embed", model=EMBEDDING_MODEL)

In [None]:
# Display how many values the returned embedding contains.
len(resp1)

In [None]:
import neo4j
from neo4j import GraphDatabase, RoutingControl
import networkx as nx
import requests

# Address of the local Neo4j server, written with the routing (neo4j://) scheme.
NEO4J_URI = "neo4j://localhost:7687"

# Two sample descriptions, each embedded with the default embedding model.
TEXT_1 = "This simulation uses ADCIRC to model storm surges in the gulf coast"
text1_embedding = get_embedding(client, text=TEXT_1, model=EMBEDDING_MODEL)

TEXT_2 = "This experiment involved the user of 3D printing to investigate wind power in Texas."
text2_embedding = get_embedding(client, text=TEXT_2, model=EMBEDDING_MODEL)


In [None]:
# Semantic search: embed the question, fetch the 5 nearest nodes from the
# 'designsafeEmbeddings' vector index, then print every distinct node that
# reaches one of them through one or more outgoing relationships.
QUERY_TEXT = "Find all wind hazard datasets with testing conducted in 2021"
query_embedding = get_embedding(client, QUERY_TEXT, EMBEDDING_MODEL)
with GraphDatabase.driver(NEO4J_URI) as driver:
    # `score` is yielded but not returned, so the printed parents carry no
    # similarity ranking.
    query = """
    CALL db.index.vector.queryNodes('designsafeEmbeddings', 5, $embedding)
    YIELD node, score
    MATCH (node)<-[r*1..]-(parent)
    RETURN DISTINCT parent
    """
    # The statement only reads, so run it in READ mode instead of the default
    # WRITE mode: with the routing URI a cluster can then serve it from a
    # reader, and the transaction cannot write by accident.
    res = driver.execute_query(
        query,
        embedding=query_embedding,
        routing_=RoutingControl.READ,
    )
    for record in res.records:
        print(dict(record))

In [None]:
# Semantic search: embed the question, fetch the 5 nearest nodes from the
# 'designsafeEmbeddings' vector index, then print every distinct node that
# reaches one of them through one or more outgoing relationships.
# NOTE(review): the question asks for a count, but the query only lists
# parents — nothing here counts them.
QUERY_TEXT = "List and count how many projects involve 'Barbara Simpson'"
query_embedding = get_embedding(client, QUERY_TEXT, EMBEDDING_MODEL)
with GraphDatabase.driver(NEO4J_URI) as driver:
    # `score` is yielded but not returned, so the printed parents carry no
    # similarity ranking.
    query = """
    CALL db.index.vector.queryNodes('designsafeEmbeddings', 5, $embedding)
    YIELD node, score
    MATCH (node)<-[r*1..]-(parent)
    RETURN DISTINCT parent
    """
    # The statement only reads, so run it in READ mode instead of the default
    # WRITE mode: with the routing URI a cluster can then serve it from a
    # reader, and the transaction cannot write by accident.
    res = driver.execute_query(
        query,
        embedding=query_embedding,
        routing_=RoutingControl.READ,
    )
    for record in res.records:
        print(dict(record))

In [None]:
# Semantic search: embed the question, fetch the 5 nearest nodes from the
# 'designsafeEmbeddings' vector index, then print every distinct node that
# reaches one of them through one or more outgoing relationships.
QUERY_TEXT = "Identify complementary datasets for validating ML-based seismic response prediction across different structural typologies and scales"
query_embedding = get_embedding(client, QUERY_TEXT, EMBEDDING_MODEL)
with GraphDatabase.driver(NEO4J_URI) as driver:
    # `score` is yielded but not returned, so the printed parents carry no
    # similarity ranking.
    query = """
    CALL db.index.vector.queryNodes('designsafeEmbeddings', 5, $embedding)
    YIELD node, score
    MATCH (node)<-[r*1..]-(parent)
    RETURN DISTINCT parent
    """
    # The statement only reads, so run it in READ mode instead of the default
    # WRITE mode: with the routing URI a cluster can then serve it from a
    # reader, and the transaction cannot write by accident.
    res = driver.execute_query(
        query,
        embedding=query_embedding,
        routing_=RoutingControl.READ,
    )
    for record in res.records:
        print(dict(record))

In [None]:
# Semantic search: embed the question, fetch the 5 nearest nodes from the
# 'designsafeEmbeddings' vector index, then print every distinct node that
# reaches one of them through one or more outgoing relationships.
# NOTE(review): QUERY_TEXT and the Cypher are identical to the preceding cell;
# the embedding is expected to come from the Redis cache on this run.
QUERY_TEXT = "Identify complementary datasets for validating ML-based seismic response prediction across different structural typologies and scales"
query_embedding = get_embedding(client, QUERY_TEXT, EMBEDDING_MODEL)
with GraphDatabase.driver(NEO4J_URI) as driver:
    # `score` is yielded but not returned, so the printed parents carry no
    # similarity ranking.
    query = """
    CALL db.index.vector.queryNodes('designsafeEmbeddings', 5, $embedding)
    YIELD node, score
    MATCH (node)<-[r*1..]-(parent)
    RETURN DISTINCT parent
    """
    # The statement only reads, so run it in READ mode instead of the default
    # WRITE mode: with the routing URI a cluster can then serve it from a
    # reader, and the transaction cannot write by accident.
    res = driver.execute_query(
        query,
        embedding=query_embedding,
        routing_=RoutingControl.READ,
    )
    for record in res.records:
        print(dict(record))

In [None]:
# Semantic search: embed the question, fetch the 5 nearest nodes from the
# 'designsafeEmbeddings' vector index, then print every distinct node that
# reaches one of them through one or more outgoing relationships.
# NOTE(review): QUERY_TEXT and the Cypher are identical to the preceding cell;
# the embedding is expected to come from the Redis cache on this run.
QUERY_TEXT = "Identify complementary datasets for validating ML-based seismic response prediction across different structural typologies and scales"
query_embedding = get_embedding(client, QUERY_TEXT, EMBEDDING_MODEL)
with GraphDatabase.driver(NEO4J_URI) as driver:
    # `score` is yielded but not returned, so the printed parents carry no
    # similarity ranking.
    query = """
    CALL db.index.vector.queryNodes('designsafeEmbeddings', 5, $embedding)
    YIELD node, score
    MATCH (node)<-[r*1..]-(parent)
    RETURN DISTINCT parent
    """
    # The statement only reads, so run it in READ mode instead of the default
    # WRITE mode: with the routing URI a cluster can then serve it from a
    # reader, and the transaction cannot write by accident.
    res = driver.execute_query(
        query,
        embedding=query_embedding,
        routing_=RoutingControl.READ,
    )
    for record in res.records:
        print(dict(record))

In [None]:
# Semantic search: embed the question, fetch the 5 nearest nodes from the
# 'designsafeEmbeddings' vector index, then print every distinct node that
# reaches one of them through one or more outgoing relationships.
# NOTE(review): QUERY_TEXT and the Cypher are identical to the preceding cell;
# the embedding is expected to come from the Redis cache on this run.
QUERY_TEXT = "Identify complementary datasets for validating ML-based seismic response prediction across different structural typologies and scales"
query_embedding = get_embedding(client, QUERY_TEXT, EMBEDDING_MODEL)
with GraphDatabase.driver(NEO4J_URI) as driver:
    # `score` is yielded but not returned, so the printed parents carry no
    # similarity ranking.
    query = """
    CALL db.index.vector.queryNodes('designsafeEmbeddings', 5, $embedding)
    YIELD node, score
    MATCH (node)<-[r*1..]-(parent)
    RETURN DISTINCT parent
    """
    # The statement only reads, so run it in READ mode instead of the default
    # WRITE mode: with the routing URI a cluster can then serve it from a
    # reader, and the transaction cannot write by accident.
    res = driver.execute_query(
        query,
        embedding=query_embedding,
        routing_=RoutingControl.READ,
    )
    for record in res.records:
        print(dict(record))