In [1]:
from openai import OpenAI
import numpy as np
import json
import os
from pathlib import Path

In [2]:
envFile = Path('.local.env')
with open(envFile) as f:
    for line in f:
        key, value = line.strip().split('=', 1)
        os.environ[key] = value

configFile = Path('config.json')    
with open(configFile, 'r') as file:  
    CONFIG = json.load(file)

In [3]:
SYSTEM = 'system'
ASSISTANT = 'assistant'
USER = 'user'

OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', None)

if OPENAI_API_KEY is not None:
    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY', None))
else:
    ValueError("No OpenAI API key found.")

In [4]:
templateFile = Path('template.json')

with open(templateFile, 'r') as file:
    template = json.load(file)

In [5]:
userInput = "The wet lab of Synthetic biology."

In [21]:
corpusTemp = template.get("CorpusTemp")

getCorpusMess = [
        {"role": SYSTEM, "content": corpusTemp.get(SYSTEM).format(coverage=CONFIG.get('coverage'))},
        {"role": USER, "content": corpusTemp.get(USER).format(input=userInput)}    
]
corpusComp = client.chat.completions.create(
    messages=getCorpusMess,
    **CONFIG.get("completion", None)
)

In [22]:
corpus = json.loads(corpusComp.choices[0].message.content.replace("\n", ""))['corpus']

In [23]:
print(corpus)

['pipettes', 'centrifuge', 'microscope', 'petri dishes', 'incubator', 'agar', 'DNA sequencing', 'gel electrophoresis', 'autoclave', 'bioreactor', 'sterile', 'plasmids', 'genetic engineering', 'PCR machine', 'nucleotides', 'restriction enzymes', 'cloning', 'transformation', 'cell culture', 'bioinformatics', 'chromatography', 'spectrophotometer', 'fluorescence', 'microtiter plates', 'biohazard', 'laminar flow hood', 'gene synthesis', 'proteomics', 'metabolomics', 'antibodies', 'assay', 'biomolecules', 'genomics', 'transfection', 'RNA extraction', 'protein folding', 'biochemical reactions', 'molecular cloning', 'gene expression', 'synthetic organisms', 'bioengineering', 'genotype', 'phenotype', 'bioethics', 'biosafety', 'genetic circuits', 'cell-free systems', 'enzymatic assays', 'microbial fermentation', 'gene editing', 'CRISPR-Cas9', 'recombinant DNA', 'oligonucleotides', 'amplification', 'mutagenesis', 'vector design', 'sequencing library', 'transcriptomics', 'gene library', 'synthetic

In [24]:
print(len(corpus))

125


In [25]:
corEmbTemp = template.get("CorEmbTemp")

corpus_input = corEmbTemp.format(corpus=corpus, scenario=userInput)
corpus_embed = client.embeddings.create(input=corpus_input, **CONFIG.get("embedding", None))

In [26]:
print(corpus_embed.data[0])

Embedding(embedding=[-0.013485885, 0.0038990746, -0.00462907, 0.0062689646, -0.012593284, 0.027732903, 0.008545443, 0.01817031, -0.025989218, -0.039440505, 0.0029234411, 0.008386297, -0.008483169, -0.006009488, 0.020910388, 0.023678143, 0.046027765, -0.003409528, 0.0010655856, -0.019291252, -0.0017990407, 0.012102008, -0.009078236, 0.0034302862, -0.024785245, 0.010019273, 0.018267183, -0.039468184, -0.032991637, -0.005193, -0.010268371, 0.010261451, -0.010330645, 0.006428111, -0.0075213737, 0.0031206436, -0.0024442736, 0.015859235, 0.026791867, -0.004674046, 0.024190178, 0.040852062, 0.0048608696, -0.0016373, -0.019097509, -0.0039336714, -0.011596893, 0.011811394, -0.01129244, 0.010448274, 0.022626396, 0.010655857, -0.0034026087, -0.008254829, -0.015969945, 0.0012480845, 0.0026622342, 0.008047247, -0.0062378277, -0.004539118, -0.012330348, 0.0045564165, -0.015278007, 0.012316509, -0.007486777, -0.018460926, 0.0028646262, 0.010289129, -0.018571636, -0.030500658, 0.024646858, -0.00272623

In [27]:
testTemp = template.get("TestTemp")

getTestMess = [
    {"role": SYSTEM, "content": testTemp.get(SYSTEM).format(pos_test=CONFIG.get("pos_test", None), neg_test=CONFIG.get("neg_test", None))},
    {"role": ASSISTANT, "content": testTemp.get(ASSISTANT).format(scenario=userInput)}
]
testComp = client.chat.completions.create(
    messages=getTestMess,
    **CONFIG.get("completion", None)
)

In [28]:
test = json.loads(testComp.choices[0].message.content)

In [29]:
test

{'related paragraphs': ["The wet lab of synthetic biology is a hive of activity, with researchers in lab coats moving between benches cluttered with petri dishes, microscopes, and pipettes. The air is filled with the hum of centrifuges and the beeping of timers, signaling the end of another incubation period. Amidst this orchestrated chaos, the promise of new life forms designed to address some of humanity's most pressing challenges takes shape.",
  'In one corner of the synthetic biology wet lab, a graduate student carefully adds a drop of engineered plasmid into a tube containing E. coli bacteria. This transformation process is crucial, as it could result in a new strain capable of producing a novel antibiotic. The meticulous nature of this work is not lost on the student, who understands that even a single contaminated sample could compromise weeks of research.',
  "The lab's latest project involves the creation of synthetic algae that can efficiently convert carbon dioxide into bio

In [30]:
testEmbTemp = template.get("TestEmbTemp")

related_input = [testEmbTemp.format(paragraph=paragraph) for paragraph in test['related paragraphs']]
unrelated_input = [testEmbTemp.format(paragraph=paragraph) for paragraph in test['un-related paragraphs']]

related_embed = client.embeddings.create(input=related_input, **CONFIG.get("embedding", None))
unrelated_embed = client.embeddings.create(input=unrelated_input, **CONFIG.get("embedding", None))

In [31]:
for embed in related_embed.data:
    print(embed)

Embedding(embedding=[-0.010157813, -0.0092220465, -0.022973746, -0.00071708194, -0.024221433, 0.012090375, 0.0083066225, 0.013914441, -0.0289952, -0.013460119, 0.0059299115, 0.0044177673, -0.003692209, -0.00026085338, 0.030595496, 0.005631551, 0.03981754, 0.0022834737, 0.0071877716, -0.006570708, -0.024913087, 0.024845278, -0.0021715886, -0.011513997, -0.017074348, -0.00078150065, 0.016328447, -0.026540508, -0.010618916, -0.008109977, -0.007872644, 0.0029463083, -0.0130464835, -0.007553941, -0.016965853, -0.010835905, -0.0045466046, 0.00075225794, 0.023638275, -0.007330171, 0.019881647, 0.025319943, 0.0046042423, 0.011710643, -0.016030086, -0.00650968, -0.017521888, -0.0013841547, -0.012971894, 0.015623232, 0.020071514, 0.0033396014, -0.018240666, -0.0023055116, -0.03113797, -0.010815562, 0.0052653817, -0.0059299115, -0.0001475909, -0.00458729, -0.018444093, -0.024126502, -0.026649002, 0.020939471, -0.0044652335, -0.017562574, 0.0074386657, 0.030270012, 0.0057095317, -0.01813217, 0.022

In [32]:
clsCore = np.matrix(corpus_embed.data[0].embedding)

In [33]:
relatedEmbedMat = np.matrix([embed.embedding for embed in related_embed.data])
unrelatedEmbedMat = np.matrix([embed.embedding for embed in unrelated_embed.data])

In [34]:
clsCore * relatedEmbedMat.T / (np.linalg.norm(clsCore) * np.linalg.norm(relatedEmbedMat))

matrix([[0.50800256, 0.48346676, 0.47265129]])

In [35]:
clsCore * unrelatedEmbedMat.T  / (np.linalg.norm(clsCore) * np.linalg.norm(unrelatedEmbedMat))

matrix([[0.41562626, 0.3994691 , 0.40752689]])