In [417]:
import dspy
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShot, BootstrapFewShotWithRandomSearch, BootstrapFinetune
import os
from dotenv import load_dotenv
import pathlib
import re
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score
import dsp
import numpy as np
from scipy import sparse
import os
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_distances
import faiss
import json
from dsp.utils.utils import deduplicate
from dspy.datasets import Dataset
from sklearn.model_selection import train_test_split
from typing import Optional, Union
import ast
from bert_score import score
import contractions

In [35]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [50]:
llm = dspy.HFClientTGI(model="meta-llama/Meta-Llama-3-8B ", port=8090, url="http://127.0.0.1")
dspy.settings.configure(lm=llm)

In [232]:
path_env = pathlib.Path(os.getcwd()).parent.parent / '.env'
load_dotenv(path_env)
api_key = os.getenv("OPENAI_API_KEY")

os.environ["OPENAI_API_KEY"] = api_key

llm = dspy.OpenAI(
    model="gpt-4o", #"gpt-3.5-turbo" , #"gpt-4-0125-preview",  #gpt-4o-2024-05-13, #"gpt-4-1106-preview", # TODO: try turbo-instruct,
    max_tokens=1000)

# Assumes the Weaviate collection has a text key `content`
dspy.settings.configure(lm=llm, trace=[])

## Index

In [4]:
def create_faiss_index(df, text_column, id_column, model_name="all-mpnet-base-v2", index_file="faiss_index.index"):
    """
    Create a FAISS index from a DataFrame containing text data.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    text_column (str): The name of the column containing text data.
    id_column (str): The name of the column containing unique identifiers for the texts.
    model_name (str): The name of the SentenceTransformer model to use for embeddings.
    index_file (str): The file path to save the FAISS index.

    Returns:
    index: The FAISS index object.
    model: The SentenceTransformer model used for embeddings.
    ids: List of document IDs.
    texts: List of document texts.
    """
    texts = df[text_column].tolist()
    ids = df[id_column].tolist()

    model = SentenceTransformer(model_name, device="cuda")

    # Calculate embeddings for the texts
    embeddings = model.encode(texts, show_progress_bar=False)

    # Create a FAISS index
    d = embeddings.shape[1]
    index = faiss.IndexFlatIP(d)  

    # Normalize embeddings to unit length and add to index
    faiss.normalize_L2(embeddings)
    index.add(embeddings)

    # Save the index to a file
    faiss.write_index(index, index_file)

    return index, model, ids, texts

def retrieve_similar_documents(query_text, model, index, ids, texts, k=5):
    """
    Retrieve the k most similar documents to the query text.

    Parameters:
    query_text (str): The query text.
    model: The SentenceTransformer model used for embeddings.
    index: The FAISS index object.
    ids (list): List of document IDs.
    texts (list): List of document texts.
    k (int): The number of nearest neighbors to retrieve.

    Returns:
    list: A list of dictionaries containing document IDs, distances, and texts of the k most similar documents.
    """
    # Encode the query text
    query_embedding = model.encode([query_text], show_progress_bar=False)
    faiss.normalize_L2(query_embedding)
    
    # Search the index for the k nearest neighbors
    distances, indices = index.search(query_embedding, k)
    
    # Retrieve the corresponding texts and ids
    results = []
    for i in range(k):
        result = {
            "document_id": ids[indices[0][i]],
            "distance": distances[0][i],
            "text": texts[indices[0][i]]
        }
        results.append(result)
    
    return results


############
# DATA #####
############
path_orig_en = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/corpus_pass_en_tr.parquet")
path_orig_es = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/corpus_pass_es_tr.parquet")
path_source = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/passages/translated_stops_filtered_by_al/df_1.parquet")

path_model = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/models/LDA_FILTERED_AL/rosie_1_20")
path_corpus_en = path_model / "train_data" / "corpus_EN.txt"
path_corpus_es = path_model / "train_data" / "corpus_ES.txt"

persist_directory = (path_model / 'db_contr_mono').as_posix()

raw = pd.read_parquet(path_source)
with path_corpus_en.open("r", encoding="utf-8") as f:
    lines = [line for line in f.readlines()]
corpus_en = [line.rsplit(" 0 ")[1].strip().split() for line in lines]

ids = [line.split(" 0 ")[0] for line in lines]
df_en = pd.DataFrame({"lemmas": [" ".join(doc) for doc in corpus_en]})
df_en["doc_id"] = ids
df_en["len"] = df_en['lemmas'].apply(lambda x: len(x.split()))
df_en["id_top"] = range(len(df_en))
df_en_raw = df_en.merge(raw, how="inner", on="doc_id")[["doc_id", "id_top", "id_preproc", "lemmas_x", "text", "len"]]

# Read thetas 
thetas = sparse.load_npz(path_model.joinpath(f"mallet_output/{'EN'}/thetas.npz")).toarray()
betas = np.load((path_model.joinpath(f"mallet_output/{'EN'}/betas.npy")))
def get_thetas_str(row,thetas):
    return " ".join([f"{id_}|{round(el, 4)}" for id_,el in enumerate(thetas[row]) if el!=0.0])

def get_most_repr_tpc(row,thetas):
    return np.argmax(thetas[row])

# Save thetas in dataframe and "assigned topic"
df_en_raw["thetas"] = df_en_raw.apply(lambda row: get_thetas_str(row['id_top'], thetas), axis=1)
df_en_raw["id_tpc"] = df_en_raw.apply(lambda row: get_most_repr_tpc(row['id_top'], thetas), axis=1)
tpc = 1
df_tpc = df_en_raw[df_en_raw.id_tpc == tpc]

## Create samples with ChatGPT

In [253]:
class GenerateFacts(dspy.Signature):
    """Extract self-contained and fully contextualized facts from the given passage. Each fact should be a complete sentence that makes sense on its own, without relying on pronouns, vague terms, or references to previously mentioned concepts. All key information should be included in each fact."""
    passage = dspy.InputField(desc="The passage may contain one or several facts")
    facts = dspy.OutputField(desc="List of self-contained and fully contextualized claims in the form 'subject + verb + object' without using pronouns or vague references")

class FactsGenerator(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_facts = dspy.ChainOfThought(GenerateFacts)

    def forward(self, passage):
        facts = self.generate_facts(passage=passage).facts
    
        return dspy.Prediction(facts=facts)

In [255]:
fact_generator = FactsGenerator()

facts = []
for doc in df_tpc.text.values.tolist()[0:20]:
    facts.append([doc, fact_generator(doc)])

In [256]:
df = pd.DataFrame(facts, columns= ["passage", "facts"])
df["facts"] = df["facts"].apply(lambda x: str([re.sub(r'^\d+\.\s*', '', fact).replace('"',"'") for fact in x.facts.split('\n')]))
df.iloc[4].facts

"['Women who are carriers of the faulty X chromosome have a 1 in 2 chance of passing the faulty X chromosome to a child.', 'Women who are carriers of the faulty X chromosome have a 1 in 2 chance of passing the faulty X chromosome to a child in every pregnancy.', 'If a daughter gets the faulty X chromosome gene, the daughter will likely be a healthy carrier like her mother.', 'If a son gets the faulty X chromosome gene, the son will have X-linked agammaglobulinemia.']"

In [257]:
df

Unnamed: 0,passage,facts
0,"Chorioamnionitis: A condition during pregnancy that can cause unexplained fever with uterine tenderness, a high white blood cell count, rapid heart rate in the fetus, rapid heart rate in the woman, and/or foul-smelling vaginal discharge.","['Chorioamnionitis is a condition that occurs during pregnancy.', 'Chorioamnionitis can cause unexplained fever with uterine tenderness.', 'Chorioamnionitis can cause a high white blood cell count.', 'Chorioamnionitis can cause a rapid heart rate in the fetus.', 'Chorioamnionitis can cause a rapid heart rate in the woman.', 'Chorioamnionitis can cause foul-smelling vaginal discharge.']"
1,"Q: I heard that the vitamin K shot might cause cancer. Is this true: A: No. In the early 1990s, a small study in England found an “association” between the vitamin K shot and childhood cancer. An association means that two things are happening at the same time in the same person, but doesn’t tell us whether one causes the other. Figuring out whether vitamin K might cause childhood cancer was very important because every newborn is expected to get a vitamin K shot. If vitamin K was causing cancer, we would expect to see the same association in other groups of children. Scientists looked to see if they could find the same association in other children, but this association between vitamin K and childhood cancer was never found again in any other study.","['In the early 1990s, a small study in England found an association between the vitamin K shot and childhood cancer.', 'An association means that two things are happening at the same time in the same person but does not tell us whether one causes the other.', 'Figuring out whether vitamin K might cause childhood cancer was very important because every newborn is expected to get a vitamin K shot.', 'If vitamin K was causing cancer, scientists would expect to see the same association in other groups of children.', 'Scientists looked to see if they could find the same association between vitamin K and childhood cancer in other children.', 'The association between vitamin K and childhood cancer was never found again in any other study.']"
2,"But what if you know for sure that you never want to be pregnant again, or even pregnant ever? You might be thinking about having your “tubes tied,” or going through a permanent sterilization procedure. But talking to your doctor about sterilization can be tricky! Especially if you’re young, you may worry that your doctor won’t take you seriously or will try to talk you out of it.","['Some people may know for sure that they never want to be pregnant again or ever.', 'Some people might be thinking about having their “tubes tied” or going through a permanent sterilization procedure.', 'Talking to a doctor about sterilization can be tricky.', 'Young people may worry that their doctor won’t take them seriously or will try to talk them out of sterilization.']"
3,Urine test. This is done to look for a substance that may show a certain type of microcephaly.,['A urine test is done to look for a substance that may show a certain type of microcephaly.']
4,"Which children are at risk for X-linked agammaglobulinemia: Women who are carriers have a 1 in 2 chance of passing the faulty X chromosome to a child. This is true for every pregnancy. If a daughter gets the gene, she will likely be a healthy carrier like her mother. If a son gets the gene, he will have X-linked agammaglobulinemia.","['Women who are carriers of the faulty X chromosome have a 1 in 2 chance of passing the faulty X chromosome to a child.', 'Women who are carriers of the faulty X chromosome have a 1 in 2 chance of passing the faulty X chromosome to a child in every pregnancy.', 'If a daughter gets the faulty X chromosome gene, the daughter will likely be a healthy carrier like her mother.', 'If a son gets the faulty X chromosome gene, the son will have X-linked agammaglobulinemia.']"
5,"Your baby can go through periods of increased hunger and fussiness. This increase in hunger means your baby is going through a period of fast growth (a growth spurt). If you breastfeed, you might find your baby wants to eat more often (sometimes every hour!) during certain times of the day. This is called ""cluster feeding."" Formula-fed babies may want to eat more often or will drink more formula than usual during feedings.","['Babies can go through periods of increased hunger and fussiness.', 'An increase in hunger means a baby is going through a period of fast growth, also known as a growth spurt.', 'Breastfed babies might want to eat more often, sometimes every hour, during certain times of the day.', ""The behavior of wanting to eat more often during certain times of the day is called 'cluster feeding.'"", 'Formula-fed babies may want to eat more often during feedings.', 'Formula-fed babies may drink more formula than usual during feedings.']"
6,"Jacqueline Maher, M.D., is a reproductive endocrinology and infertility specialist with a focus on pediatric and adolescent gynecology. She leads the female fertility preservation program at Children's National Hospital to counsel patients in egg freezing or ovarian tissue freezing, and post treatment premature ovarian insufficiency, and hormone replacement/ puberty induction. Her other clinical interests include reproductive endocrine disorders such as premature ovarian insufficiency, polycystic ovary syndrome, precocious puberty, endometriosis and complex contraception; as well as surgery for congenital anomalies of the reproductive tract such as vaginal and uterine septum.","['Jacqueline Maher, M.D., is a reproductive endocrinology and infertility specialist.', 'Jacqueline Maher, M.D., focuses on pediatric and adolescent gynecology.', ""Jacqueline Maher, M.D., leads the female fertility preservation program at Children's National Hospital."", ""The female fertility preservation program at Children's National Hospital counsels patients in egg freezing or ovarian tissue freezing."", ""The female fertility preservation program at Children's National Hospital addresses post-treatment premature ovarian insufficiency."", ""The female fertility preservation program at Children's National Hospital provides hormone replacement and puberty induction."", 'Jacqueline Maher, M.D., has clinical interests in reproductive endocrine disorders such as premature ovarian insufficiency.', 'Jacqueline Maher, M.D., has clinical interests in polycystic ovary syndrome.', 'Jacqueline Maher, M.D., has clinical interests in precocious puberty.', 'Jacqueline Maher, M.D., has clinical interests in endometriosis.', 'Jacqueline Maher, M.D., has clinical interests in complex contraception.', 'Jacqueline Maher, M.D., performs surgery for congenital anomalies of the reproductive tract.', 'Jacqueline Maher, M.D., performs surgery for vaginal septum.', 'Jacqueline Maher, M.D., performs surgery for uterine septum.']"
7,Note: Positive CRP results also occur during the last half of pregnancy or with the use of birth control pills (oral contraceptives).,"['Positive CRP results occur during the last half of pregnancy.', 'Positive CRP results occur with the use of birth control pills (oral contraceptives).']"
8,Read more: Reflux gives preemie baby Ari an extra hurdle to overcome in her first weeks and months of life.,['Reflux gives preemie baby Ari an extra hurdle to overcome in her first weeks and months of life.']
9,"I often have a burning sensation in my vagina after my partner has ejaculated. My health care provider suggested I may have a semen allergy. Does a semen allergy affect the ability to get pregnant: Semen allergy, also called seminal plasma hypersensitivity, happens when you have a harmful immune system reaction to proteins in semen. This condition is not common. Semen allergy isn't a direct cause of infertility.","['A burning sensation in the vagina after a partner has ejaculated may be a symptom of a semen allergy.', 'A health care provider may suggest that a burning sensation in the vagina after ejaculation is due to a semen allergy.', 'A semen allergy is also called seminal plasma hypersensitivity.', 'A semen allergy occurs when the immune system has a harmful reaction to proteins in semen.', 'A semen allergy is not a common condition.', 'A semen allergy is not a direct cause of infertility.']"


In [258]:
df.to_csv("facts_gpt4.csv")

## Training Llama

In [396]:
class GenerateFacts(dspy.Signature):
    """
    Extract self-contained and fully contextualized facts from the given passage.    
    """
    
    passage = dspy.InputField(desc="The passage may contain one or several claims")
    facts = dspy.OutputField(desc="List of self-contained and fully contextualized claims in the form 'subject + verb + object' without using pronouns or vague references")
    
class FactsGenerator(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_facts = dspy.ChainOfThought(GenerateFacts)

    def process_facts(self, facts):
        nolist = False

        if "Facts:" in facts:
            facts = facts.split("Facts:")[1]
        elif "facts:" in facts:
            facts = facts.split("facts:")[1]

        try:
            facts = contractions.fix(facts)
        except Exception as e:
            print("Could not expand contrad")
            print(e)

        if '’' in facts:
            facts = facts.replace('’', "'")
        if '“' in facts:  # Changed elif to if
            facts = facts.replace('“', "'")
        if '”' in facts:  # Changed elif to if
            facts = facts.replace('”', "'")

        facts = facts.replace('"',"'")

        if "1." in facts:
            try:
                # Process facts, ensuring non-empty lines
                facts = [re.sub(r'^\d+\.\s*', '', fact).replace('"', "'") for fact in facts.split('\n') if fact.strip()]
                return facts
            except Exception as e:
                print("The error is 1")
                print(e)
                print(facts)
                return facts

        
        if facts.startswith("[") and not facts.endswith("]"):
            facts = facts + "]"
        elif not facts.startswith("[") and facts.endswith("]"):
            facts = "[" + facts
        elif not facts.startswith("[") and not facts.endswith("]"):
            nolist = True
            try:
                facts = [el.strip().replace('"',"'") for el in facts.split(".") if len(el) > 1]
                return facts
            except Exception as e:
                print("The error is 2")
                print(e)
                return facts

    
        try:
            facts = [el.strip().replace('"',"'") for el in facts.split(".") if len(el) > 1]#ast.literal_eval(facts)
        except Exception as e:
            print("The error is 3")
            print(e)
            print(facts)
            
        return facts
            
    def forward(self, passage):
        facts = self.generate_facts(passage=passage).facts
        processed_facts = self.process_facts(facts)
        return dspy.Prediction(facts = processed_facts)

In [398]:
fact_generator = FactsGenerator()
doc = df_tpc.text.iloc[12]
fact_generator(doc)

Prediction(
    facts=['Some physical disabilities affect bone density.', 'Cerebral palsy affects bone density.', 'Anorexia affects bone density.', 'Congenital conditions where osteoporosis is a concern affect bone density.', 'Osteoporosis is characterized by low bone density.', 'For people with disabilities that affect bone density, the shot that temporarily decreases bone density is probably not the best option.']
)

In [399]:
class FactsDataset(Dataset):

    def __init__(
        self,
        data_fpath: str,
        dev_size: Optional[float] = 0.2,
        test_size: Optional[float] = 0.2,
        text_key: str = "passage",
        seed: Optional[int] = 11235,
        *args,
        **kwargs
    ) -> None:
        super().__init__(*args, **kwargs)

        self._train = []
        self._dev = []
        self._test = []

        # Read the training data
        train_data = pd.read_csv( pathlib.Path(data_fpath))

        train_data, temp_data = train_test_split(
            train_data, test_size=dev_size + test_size, random_state=seed)
        dev_data, test_data = train_test_split(
            temp_data, test_size=test_size / (dev_size + test_size), random_state=seed)

        self._train = [
            dspy.Example({**row}).with_inputs(text_key) for row in self._convert_to_json(train_data)
        ]
        self._dev = [
            dspy.Example({**row}).with_inputs(text_key) for row in self._convert_to_json(dev_data)
        ]
        self._test = [
            dspy.Example({**row}).with_inputs(text_key) for row in self._convert_to_json(test_data)
        ]

    def _convert_to_json(self, data: pd.DataFrame):
        if data is not None:
            return data.to_dict(orient='records')

dataset = FactsDataset(data_fpath="facts_gpt4.csv", dev_size=0.1)

In [400]:
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import numpy as np

# Load the SBERT model (you can choose a different pre-trained model if needed)
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def combined_score(example, pred, trace=None):
    def sbert_similarity_score(example, pred, trace=None):
        try:
            scores = []
            
            predicted_lst = pred["facts"]
            try:
                gt_lst = ast.literal_eval(example.facts)
            except Exception as e:
                print("Error in parsing ground truth facts: ", e)
                gt_lst = example.facts.split(".")

            min_facts = min(len(predicted_lst), len(gt_lst))

            # Generate embeddings for predicted and ground truth facts
            predicted_embeddings = model.encode(predicted_lst[:min_facts])
            gt_embeddings = model.encode(gt_lst[:min_facts])

            # Calculate cosine similarity for each pair of embeddings
            for pred_emb, gt_emb in zip(predicted_embeddings, gt_embeddings):
                similarity = 1 - cosine(pred_emb, gt_emb)
                scores.append(similarity)

            # Return the average similarity score
            return np.mean(scores)
            
        except Exception as e:
            print("An error occurred: ", e)
            print("predicted_lst: ", predicted_lst)
            print("gt_lst: ", gt_lst)
            return 0.0

    # Now you can use sbert_similarity_score to calculate the similarity
    return sbert_similarity_score(example, pred, trace)



In [415]:
trainset = dataset._train
devset = dataset._dev
testset = dataset._test


config = dict(max_bootstrapped_demos=4, max_labeled_demos=16, num_candidate_programs=2, max_rounds=1)
teleprompter = BootstrapFewShotWithRandomSearch(metric=combined_score, **config)

compiled_pred = teleprompter.compile(FactsGenerator(), trainset=trainset, valset=devset)

TypeError: BayesianSignatureOptimizer.compile() got an unexpected keyword argument 'model'

In [None]:
tests = []
for el in testset:
    output = compiled_pred(el.passage)
    tests.append([el.passage, el.facts, output["facts"], combined_score(el, output)])

evaluate = Evaluate(
    devset=devset, metric=combined_score, num_threads=1, display_progress=True)
compiled_score = evaluate(compiled_pred)
uncompiled_score = evaluate(FactsGenerator())

In [404]:
pd.DataFrame(tests)

Unnamed: 0,0,1,2,3
0,"Q: I heard that the vitamin K shot might cause cancer. Is this true: A: No. In the early 1990s, a small study in England found an “association” between the vitamin K shot and childhood cancer. An association means that two things are happening at the same time in the same person, but doesn’t tell us whether one causes the other. Figuring out whether vitamin K might cause childhood cancer was very important because every newborn is expected to get a vitamin K shot. If vitamin K was causing cancer, we would expect to see the same association in other groups of children. Scientists looked to see if they could find the same association in other children, but this association between vitamin K and childhood cancer was never found again in any other study.","['In the early 1990s, a small study in England found an association between the vitamin K shot and childhood cancer.', 'An association means that two things are happening at the same time in the same person but does not tell us whether one causes the other.', 'Figuring out whether vitamin K might cause childhood cancer was very important because every newborn is expected to get a vitamin K shot.', 'If vitamin K was causing cancer, scientists would expect to see the same association in other groups of children.', 'Scientists looked to see if they could find the same association between vitamin K and childhood cancer in other children.', 'The association between vitamin K and childhood cancer was never found again in any other study.']","[In the early 1990s, a small study in England found an association between the vitamin K shot and childhood cancer., An association means that two things are happening at the same time in the same person but does not indicate causation., Figuring out whether vitamin K might cause childhood cancer was very important because every newborn is expected to get a vitamin K shot., Scientists did not find the same association between vitamin K and childhood cancer in any other study after the initial study in England.]",0.922812
1,"If donor eggs are being used, the same steps are taken. The egg donor will complete ovarian stimulation and egg retrieval. After fertilization takes place, the embryo is transferred to the person who intends to carry the pregnancy (either with or without various fertility medications).","['If donor eggs are being used, the same steps are taken as with non-donor eggs.', 'The egg donor will complete ovarian stimulation.', 'The egg donor will undergo egg retrieval.', 'After fertilization takes place, the embryo is transferred to the person who intends to carry the pregnancy.', 'The embryo transfer to the person who intends to carry the pregnancy can occur with or without various fertility medications.']","[If donor eggs are being used, the same steps are taken as with non-donor eggs., The egg donor will complete ovarian stimulation., The egg donor will complete egg retrieval., After fertilization takes place, the embryo is transferred to the person who intends to carry the pregnancy., The embryo transfer to the person who intends to carry the pregnancy can occur with or without various fertility medications.]",0.995013
2,"Which children are at risk for X-linked agammaglobulinemia: Women who are carriers have a 1 in 2 chance of passing the faulty X chromosome to a child. This is true for every pregnancy. If a daughter gets the gene, she will likely be a healthy carrier like her mother. If a son gets the gene, he will have X-linked agammaglobulinemia.","['Women who are carriers of the faulty X chromosome have a 1 in 2 chance of passing the faulty X chromosome to a child.', 'Women who are carriers of the faulty X chromosome have a 1 in 2 chance of passing the faulty X chromosome to a child in every pregnancy.', 'If a daughter gets the faulty X chromosome gene, the daughter will likely be a healthy carrier like her mother.', 'If a son gets the faulty X chromosome gene, the son will have X-linked agammaglobulinemia.']","[Women who are carriers of the faulty X chromosome have a 1 in 2 chance of passing the faulty X chromosome to a child., The 1 in 2 chance of passing the faulty X chromosome is true for every pregnancy., If a daughter inherits the faulty X chromosome, she will likely be a healthy carrier like her mother., If a son inherits the faulty X chromosome, he will have X-linked agammaglobulinemia.]",0.954175
3,Make sure your baby is vaccinated. A baby who is fully immunized is at lower risk for SIDS.,"['Parents should make sure their baby is vaccinated.', 'A baby who is fully immunized is at lower risk for Sudden Infant Death Syndrome (SIDS).']","[Vaccinating a baby is important., A fully immunized baby is at lower risk for Sudden Infant Death Syndrome (SIDS).]",0.900159
4,"Your baby can go through periods of increased hunger and fussiness. This increase in hunger means your baby is going through a period of fast growth (a growth spurt). If you breastfeed, you might find your baby wants to eat more often (sometimes every hour!) during certain times of the day. This is called ""cluster feeding."" Formula-fed babies may want to eat more often or will drink more formula than usual during feedings.","['Babies can go through periods of increased hunger and fussiness.', 'An increase in hunger means a baby is going through a period of fast growth, also known as a growth spurt.', 'Breastfed babies might want to eat more often, sometimes every hour, during certain times of the day.', ""The behavior of wanting to eat more often during certain times of the day is called 'cluster feeding.'"", 'Formula-fed babies may want to eat more often during feedings.', 'Formula-fed babies may drink more formula than usual during feedings.']","[Babies can go through periods of increased hunger and fussiness., An increase in hunger means a baby is going through a period of fast growth, known as a growth spurt., Breastfed babies might want to eat more often, sometimes every hour, during certain times of the day., The behavior of wanting to eat more often during certain times of the day is called 'cluster feeding.', Formula-fed babies may want to eat more often during feedings., Formula-fed babies may drink more formula than usual during feedings.]",0.999691


In [405]:
llm.inspect_history(1)





Extract self-contained and fully contextualized facts from the given passage.

---

Follow the following format.

Passage: The passage may contain one or several claims
Reasoning: Let's think step by step in order to ${produce the facts}. We ...
Facts: List of self-contained and fully contextualized claims in the form 'subject + verb + object' without using pronouns or vague references

---

Passage: Some physical disabilities also affect bone density, such as cerebral palsy, anorexia, and congenital conditions where osteoporosis (low bone density) is a concern. For people with these disabilities, the shot, which temporarily decreases bone density while you’re using it, probably isn’t the best option.
Reasoning: Let's think step by step in order to[32m Reasoning: Let's think step by step in order to produce the facts. We need to identify the main claims in the passage and ensure each fact is self-contained and fully contextualized. The passage discusses the relationship between ce

In [393]:
llm.inspect_history(n=1)





Extract self-contained and fully contextualized facts from the given passage.

---

Follow the following format.

Passage: The passage may contain one or several claims
Facts: List of self-contained and fully contextualized claims in the form 'subject + verb + object' without using pronouns or vague references

---

Passage: Some physical disabilities also affect bone density, such as cerebral palsy, anorexia, and congenital conditions where osteoporosis (low bone density) is a concern. For people with these disabilities, the shot, which temporarily decreases bone density while you’re using it, probably isn’t the best option.
Facts:[32m 1. Some physical disabilities affect bone density, such as cerebral palsy, anorexia, and congenital conditions.
2. Osteoporosis (low bone density) is a concern for people with cerebral palsy, anorexia, and congenital conditions.
3. The shot temporarily decreases bone density while it is being used.
4. The shot probably isn’t the best option for peo

In [395]:
compiled_pred("Some physical disabilities also affect bone density, such as cerebral palsy, anorexia, and congenital conditions where osteoporosis (low bone density) is a concern. For people with these disabilities, the shot, which temporarily decreases bone density while you’re using it, probably isn’t the best option.")

Prediction(
    facts=['Some physical disabilities affect bone density, such as cerebral palsy, anorexia, and congenital conditions.', 'Osteoporosis (low bone density) is a concern for people with cerebral palsy, anorexia, and congenital conditions.', 'The shot temporarily decreases bone density while it is being used.', 'The shot probably is not the best option for people with disabilities that affect bone density.']
)

In [386]:
compiled_pred

generate_facts = Predict(<class '__main__.GenerateFacts'>)

In [408]:
compiled_pred.save("compiled_fact.json")

## Question generation