In [1]:
from openai import OpenAI
import numpy as np
import json
import os
from pathlib import Path

In [2]:
envFile = Path(".local.env")
with open(envFile) as f:
    for line in f:
        key, value = line.strip().split("=", 1)
        os.environ[key] = value

configFile = Path("config.json")
with open(configFile, "r") as file:
    CONFIG = json.load(file)

In [3]:
SYSTEM = "system"
ASSISTANT = "assistant"
USER = "user"

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", None)

if OPENAI_API_KEY is not None:
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", None))
else:
    ValueError("No OpenAI API key found.")

In [4]:
templateFile = Path("template.json")

with open(templateFile, "r") as file:
    template = json.load(file)

In [5]:
userInput = "The wet lab of Synthetic biology."

In [6]:
corpusTemp = template.get("CorpusTemp")

getCorpusMess = [
        {"role": SYSTEM, "content": corpusTemp.get(SYSTEM).format(coverage=CONFIG.get('coverage'))},
        {"role": USER, "content": corpusTemp.get(USER).format(input=userInput)}    
]
corpusComp = client.chat.completions.create(
    messages=getCorpusMess,
    **CONFIG.get("completion", None)
)

In [7]:
corpus = json.loads(corpusComp.choices[0].message.content.replace("\n", ""))["corpus"]

In [8]:
print(corpus)

['pipettes', 'centrifuge', 'microscope', 'petri dishes', 'agar', 'incubator', 'DNA sequencer', 'autoclave', 'biohazard', 'gloves', 'spectrophotometer', 'electrophoresis', 'PCR machine', 'nucleotides', 'plasmids', 'restriction enzymes', 'ligase', 'transformation', 'colonies', 'sterile technique', 'culture media', 'bioreactor', 'gene synthesis', 'proteins', 'amino acids', 'cell culture', 'genetic engineering', 'lab notebook', 'bioinformatics', 'chassis organism', 'gene editing', 'CRISPR-Cas9', 'gel documentation system', 'fluorescence microscope', 'biosafety cabinet', 'laminar flow hood', 'growth curve', 'microtiter plates', 'assay', 'biomolecules', 'synthetic gene circuits', 'gene expression', 'transcription', 'translation', 'recombinant DNA', 'molecular cloning', 'genotype', 'phenotype', 'biochemical pathways', 'metabolic engineering', 'protein folding', 'enzyme kinetics', 'cell lysis', 'chromatography', 'mass spectrometry', 'nanodrop', 'DNA library', 'RNA extraction', 'cDNA synthesis'

In [9]:
print(len(corpus))

139


In [10]:
corEmbTemp = template.get("CorEmbTemp")

corpus_input = corEmbTemp.format(corpus=corpus, scenario=userInput)
corpus_embed = client.embeddings.create(
    input=corpus_input, **CONFIG.get("embedding", None)
)

In [11]:
testTemp = template.get("TestTemp")

getTestMess = [
    {
        "role": SYSTEM,
        "content": testTemp.get(SYSTEM).format(
            pos_test=CONFIG.get("pos_test", None), neg_test=CONFIG.get("neg_test", None)
        ),
    },
    {"role": ASSISTANT, "content": testTemp.get(ASSISTANT).format(scenario=userInput)},
]
testComp = client.chat.completions.create(
    messages=getTestMess, **CONFIG.get("completion", None)
)

In [12]:
test = json.loads(testComp.choices[0].message.content)

In [13]:
test

{'related paragraphs': ['The wet lab of synthetic biology is a hive of activity, with researchers in lab coats moving between benches cluttered with petri dishes, microscopes, and bioreactors. The air is filled with a sense of purpose, as each experiment brings them closer to groundbreaking discoveries, from new forms of biofuel to synthetic organisms designed to tackle environmental challenges.',
  'In the corner of the synthetic biology wet lab, a graduate student carefully pipettes a solution containing engineered DNA into a series of vials. The precision of her movements reflects the meticulous nature of genetic manipulation, where even a single misplaced nucleotide can alter the outcome of an experiment, potentially leading to revolutionary advancements in medicine or agriculture.',
  'The hum of centrifuges blends with the occasional hiss of the autoclave in the synthetic biology wet lab. Scientists are engrossed in their work, analyzing data and adjusting parameters to optimize 

In [14]:
testEmbTemp = template.get("TestEmbTemp")

related_input = [
    testEmbTemp.format(paragraph=paragraph) for paragraph in test["related paragraphs"]
]
unrelated_input = [
    testEmbTemp.format(paragraph=paragraph)
    for paragraph in test["un-related paragraphs"]
]

related_embed = client.embeddings.create(
    input=related_input, **CONFIG.get("embedding", None)
)
unrelated_embed = client.embeddings.create(
    input=unrelated_input, **CONFIG.get("embedding", None)
)

In [15]:
clsCore = np.matrix(corpus_embed.data[0].embedding)

In [16]:
relatedEmbedMat = np.matrix([embed.embedding for embed in related_embed.data])
unrelatedEmbedMat = np.matrix([embed.embedding for embed in unrelated_embed.data])

In [17]:
clsCore * relatedEmbedMat.T / (
    np.linalg.norm(clsCore) * np.linalg.norm(relatedEmbedMat)
)

matrix([[0.49998493, 0.48861335, 0.49762327]])

In [18]:
clsCore * unrelatedEmbedMat.T / (
    np.linalg.norm(clsCore) * np.linalg.norm(unrelatedEmbedMat)
)

matrix([[0.42306624, 0.40011304, 0.41826684]])