In [1]:
from openai import OpenAI
import numpy as np
import json
import os
from pathlib import Path

In [2]:
# Export KEY=VALUE pairs from the local env file into this process's environment.
envFile = Path('.local.env')
with open(envFile) as f:
    for lineno, line in enumerate(f, start=1):
        entry = line.strip()
        # Blank lines and '#' comments hold no assignment; unpacking them used to
        # crash with an opaque "not enough values to unpack" error.
        if not entry or entry.startswith('#'):
            continue
        # Split on the first '=' only, so values may themselves contain '='.
        key, sep, value = entry.partition('=')
        if not sep:
            # Report the position only; the line content may be a secret.
            raise ValueError(f"{envFile}: line {lineno} is not in KEY=VALUE form.")
        os.environ[key] = value

# Notebook-wide settings; later cells read keys from this dict via CONFIG.get(...).
configFile = Path('config.json')
with open(configFile, 'r') as file:
    CONFIG = json.load(file)

In [3]:
# Chat message role names used when building prompt message lists.
SYSTEM = 'system'
ASSISTANT = 'assistant'
USER = 'user'

OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', None)

# Fail fast when the key is absent or empty. The exception was previously
# constructed but never raised, so `client` stayed undefined and later cells
# failed with a NameError instead of this message.
if not OPENAI_API_KEY:
    raise ValueError("No OpenAI API key found.")

# Reuse the value read above rather than querying the environment a second time.
client = OpenAI(api_key=OPENAI_API_KEY)

In [4]:
# Prompt templates are kept in a JSON file next to the notebook.
templateFile = Path('template.json')

with templateFile.open('r') as file:
    template = json.loads(file.read())

In [5]:
# Topic/scenario string substituted into the prompt templates in later cells.
userInput = "Biology"

In [6]:
corpusTemp = template.get("CorpusTemp")

# Fill the two prompt templates first, then assemble the message list.
corpusSystemContent = corpusTemp.get(SYSTEM).format(coverage=CONFIG.get('coverage'))
corpusUserContent = corpusTemp.get(USER).format(input=userInput)

getCorpusMess = [
    dict(role=SYSTEM, content=corpusSystemContent),
    dict(role=USER, content=corpusUserContent),
]

# Remaining request parameters are taken from the "completion" section of the config.
corpusCompletionArgs = CONFIG.get("completion", None)
corpusComp = client.chat.completions.create(messages=getCorpusMess, **corpusCompletionArgs)

In [7]:
# The reply text is parsed as a JSON object; raw newlines are removed before
# parsing, and the list stored under 'corpus' is kept.
corpusRaw = corpusComp.choices[0].message.content
corpusPayload = json.loads(corpusRaw.replace("\n", ""))
corpus = corpusPayload['corpus']

In [8]:
# Show the parsed corpus entries.
print(corpus)

['cell', 'organism', 'evolution', 'species', 'habitat', 'gene', 'ecosystem', 'photosynthesis', 'chromosome', 'protein', 'enzyme', 'mutation', 'biodiversity', 'DNA', 'RNA', 'nucleus', 'membrane', 'mitochondria', 'reproduction', 'heredity', 'metabolism', 'organ', 'tissue', 'adaptation', 'allele', 'amino acid', 'antibody', 'bacteria', 'biome', 'biosphere', 'carnivore', 'cellular', 'chlorophyll', 'classification', 'cytoplasm', 'development', 'ecological', 'embryo', 'endangered', 'extinction', 'fertilization', 'fungi', 'genetics', 'genome', 'herbivore', 'hormone', 'host', 'inheritance', 'invertebrate', 'larva', 'mammal', 'microorganism', 'molecule', 'natural selection', 'nervous system', 'omnivore', 'parasite', 'pathogen', 'phylogeny', 'pollination', 'population', 'predator', 'prey', 'primate', 'prokaryote', 'receptor', 'symbiosis', 'taxonomy', 'vertebrate', 'virus', 'zoology', 'anatomy', 'biochemistry', 'botany', 'cell division', 'conservation', 'ecology', 'embryology', 'endocrine', 'epide

In [9]:
# Number of entries in the parsed corpus.
print(len(corpus))

125


In [10]:
corEmbTemp = template.get("CorEmbTemp")

# The whole corpus and the scenario are rendered into one string, which is
# embedded in a single request.
corpus_input = corEmbTemp.format(corpus=corpus, scenario=userInput)

# Remaining request parameters are taken from the "embedding" section of the config.
corpusEmbeddingArgs = CONFIG.get("embedding", None)
corpus_embed = client.embeddings.create(input=corpus_input, **corpusEmbeddingArgs)

In [11]:
# Inspect the first embedding record of the response (presumably the only one,
# since a single string was submitted - confirm against the API response).
print(corpus_embed.data[0])

Embedding(embedding=[-0.0049266773, 0.009426743, 0.013080466, 0.0058693513, 0.0017021464, 0.03333076, -0.013555243, -0.0054496205, -0.021894814, -0.04142262, 0.0044243764, 0.0053154444, -0.0048303455, -0.0050023664, 0.010617127, 0.03085366, 0.055432, -0.01443599, -0.005728294, -0.021206731, 0.008484068, 0.015385545, -0.0029381162, 0.012977253, -0.01064465, 0.0021072554, 0.010926764, -0.040486827, -0.020890212, -0.012323574, 0.00536361, -0.0032081888, -0.0137616685, -0.008394618, 0.00031694843, 0.0045654336, -0.0030808935, 0.0045069465, 0.014724985, -0.008635446, 0.020725071, 0.031982116, -0.006254678, -0.019610377, -0.025885697, -0.006326927, -0.019018626, 0.010541438, -0.006309725, 0.011621729, 0.006756979, 0.027757285, -0.010390059, -0.01988561, -0.0030568105, -0.0020676907, -0.006137704, 0.013644694, -0.01410571, -0.006502388, -0.0041353814, 0.004149143, -0.025748082, 0.009818951, -0.012020817, -0.015151597, -0.0019180325, 0.0014484156, -0.0028314632, -0.01896358, 0.03580786, -0.002

In [12]:
testTemp = template.get("TestTemp")

# Fill the templates: the system prompt takes the configured pos_test/neg_test
# values, the assistant prompt takes the scenario.
testSystemContent = testTemp.get(SYSTEM).format(
    pos_test=CONFIG.get("pos_test", None),
    neg_test=CONFIG.get("neg_test", None),
)
testAssistantContent = testTemp.get(ASSISTANT).format(scenario=userInput)

getTestMess = [
    dict(role=SYSTEM, content=testSystemContent),
    dict(role=ASSISTANT, content=testAssistantContent),
]

# Remaining request parameters are taken from the "completion" section of the config.
testCompletionArgs = CONFIG.get("completion", None)
testComp = client.chat.completions.create(messages=getTestMess, **testCompletionArgs)

In [13]:
# Parse the reply text of the first choice as JSON.
testRaw = testComp.choices[0].message.content
test = json.loads(testRaw)

In [14]:
# Display the parsed test paragraphs (bare expression -> rich cell output).
test

{'related paragraphs': ["The intricate dance of life begins at the cellular level, where complex biological processes dictate the growth, development, and functioning of living organisms. Within each cell, the nucleus acts as the command center, housing the DNA which carries the genetic blueprint for the organism's traits and behaviors.",
  "Photosynthesis is a remarkable biological process carried out by green plants, algae, and certain bacteria. These organisms convert light energy, usually from the sun, into chemical energy that can be used to fuel the organism's activities. This process not only sustains the life of the organism performing it but also supports the life of consumers up the food chain.",
  'Evolution, the gradual development of species over time, is a fundamental concept in biology that explains the diversity of life on Earth. Through mechanisms such as natural selection, genetic drift, and gene flow, species adapt to their environments, leading to the incredible var

In [15]:
testEmbTemp = template.get("TestEmbTemp")


def _render_paragraphs(paragraphs):
    """Return each paragraph substituted into the TestEmbTemp template."""
    return [testEmbTemp.format(paragraph=text) for text in paragraphs]


related_input = _render_paragraphs(test['related paragraphs'])
unrelated_input = _render_paragraphs(test['un-related paragraphs'])

# Both requests use the same "embedding" section of the config.
testEmbeddingArgs = CONFIG.get("embedding", None)
related_embed = client.embeddings.create(input=related_input, **testEmbeddingArgs)
unrelated_embed = client.embeddings.create(input=unrelated_input, **testEmbeddingArgs)

In [16]:
# Print every embedding record returned for the related paragraphs, one per line.
for embed in related_embed.data:
    print(embed)

Embedding(embedding=[-0.014064275, 0.001644147, 0.0025696815, -0.021981845, -0.02207167, 0.014000112, -0.0042090164, 0.012299824, -0.0031198692, -0.011106414, -0.010702194, 0.008905663, 0.011722367, -0.0016224924, 0.012684795, 0.0024060688, 0.033056177, -0.030438375, -0.01577098, 0.005729652, -0.027743578, 0.008219131, 0.021712365, -0.018748088, 0.0011926081, 0.029668434, 0.015706817, -0.023919532, 0.010400633, -0.029719763, -0.0147572225, -0.010638032, -0.025510745, -0.024240341, 0.018722424, 0.009175142, 0.0008284897, -0.014564737, -0.0032417767, 0.002864826, 0.015078031, 0.0068717324, -0.007622426, 0.0056558657, -0.019684851, -0.0018061557, -0.03079768, -0.007879074, -0.0041063577, 0.012068842, 0.009143062, 0.040267967, -0.020583117, -0.008770923, -0.022020342, -0.015424505, -0.012011096, -0.022405313, 0.010022079, -0.007725085, -0.04103791, -0.0057553165, -0.018504273, 0.012851615, -0.01615595, 0.00069615594, 0.0044432073, 0.03133664, -0.012813118, 0.014423581, 0.021288898, 0.00214

In [17]:
# Wrap the corpus embedding vector in an np.matrix so that `*` in the scoring
# cells below performs a matrix product rather than an element-wise one.
clsCore = np.matrix(corpus_embed.data[0].embedding)

In [18]:
# Collect the raw vectors first, then stack them so each row is one paragraph's embedding.
relatedVectors = [item.embedding for item in related_embed.data]
unrelatedVectors = [item.embedding for item in unrelated_embed.data]

relatedEmbedMat = np.matrix(relatedVectors)
unrelatedEmbedMat = np.matrix(unrelatedVectors)

In [19]:
# Dot product of the corpus embedding with each related-paragraph embedding
# (matrix product against the transposed paragraph matrix).
clsCore @ relatedEmbedMat.T

matrix([[0.77927262, 0.76327933, 0.77897558]])

In [20]:
# Dot product of the corpus embedding with each unrelated-paragraph embedding
# (matrix product against the transposed paragraph matrix).
clsCore @ unrelatedEmbedMat.T

matrix([[0.69067203, 0.68917145, 0.71581701]])