In [1]:
import dspy
from dspy.evaluate import Evaluate
from dspy.teleprompt import BootstrapFewShot, BootstrapFewShotWithRandomSearch, BootstrapFinetune
import os
from dotenv import load_dotenv
import pathlib
import re
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score
import dsp
import numpy as np
from scipy import sparse
import os
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_distances
import faiss
import json
from dsp.utils.utils import deduplicate

lm = dspy.HFClientTGI(model="meta-llama/Meta-Llama-3-8B ", port=8090, url="http://127.0.0.1")
dspy.settings.configure(lm=lm)


def create_faiss_index(df, text_column, id_column, model_name="all-mpnet-base-v2", index_file="faiss_index.index"):
    """
    Create a FAISS index from a DataFrame containing text data.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    text_column (str): The name of the column containing text data.
    id_column (str): The name of the column containing unique identifiers for the texts.
    model_name (str): The name of the SentenceTransformer model to use for embeddings.
    index_file (str): The file path to save the FAISS index.

    Returns:
    index: The FAISS index object.
    model: The SentenceTransformer model used for embeddings.
    ids: List of document IDs.
    texts: List of document texts.
    """
    texts = df[text_column].tolist()
    ids = df[id_column].tolist()

    model = SentenceTransformer(model_name, device="cuda")

    # Calculate embeddings for the texts
    embeddings = model.encode(texts, show_progress_bar=False)

    # Create a FAISS index
    d = embeddings.shape[1]
    index = faiss.IndexFlatIP(d)  

    # Normalize embeddings to unit length and add to index
    faiss.normalize_L2(embeddings)
    index.add(embeddings)

    # Save the index to a file
    faiss.write_index(index, index_file)

    return index, model, ids, texts

def retrieve_similar_documents(query_text, model, index, ids, texts, k=5):
    """
    Retrieve the k most similar documents to the query text.

    Parameters:
    query_text (str): The query text.
    model: The SentenceTransformer model used for embeddings.
    index: The FAISS index object.
    ids (list): List of document IDs.
    texts (list): List of document texts.
    k (int): The number of nearest neighbors to retrieve.

    Returns:
    list: A list of dictionaries containing document IDs, distances, and texts of the k most similar documents.
    """
    # Encode the query text
    query_embedding = model.encode([query_text], show_progress_bar=False)
    faiss.normalize_L2(query_embedding)
    
    # Search the index for the k nearest neighbors
    distances, indices = index.search(query_embedding, k)
    
    # Retrieve the corresponding texts and ids
    results = []
    for i in range(k):
        result = {
            "document_id": ids[indices[0][i]],
            "distance": distances[0][i],
            "text": texts[indices[0][i]]
        }
        results.append(result)
    
    return results


############
# DATA #####
############
path_orig_en = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/corpus_pass_en_tr.parquet")
path_orig_es = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/corpus_pass_es_tr.parquet")
path_source = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/passages/translated_stops_filtered_by_al/df_1.parquet")

path_model = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/models/LDA_FILTERED_AL/rosie_1_20")
path_corpus_en = path_model / "train_data" / "corpus_EN.txt"
path_corpus_es = path_model / "train_data" / "corpus_ES.txt"

persist_directory = (path_model / 'db_contr_mono').as_posix()

raw = pd.read_parquet(path_source)
with path_corpus_en.open("r", encoding="utf-8") as f:
    lines = [line for line in f.readlines()]
corpus_en = [line.rsplit(" 0 ")[1].strip().split() for line in lines]

ids = [line.split(" 0 ")[0] for line in lines]
df_en = pd.DataFrame({"lemmas": [" ".join(doc) for doc in corpus_en]})
df_en["doc_id"] = ids
df_en["len"] = df_en['lemmas'].apply(lambda x: len(x.split()))
df_en["id_top"] = range(len(df_en))
df_en_raw = df_en.merge(raw, how="inner", on="doc_id")[["doc_id", "id_top", "id_preproc", "lemmas_x", "text", "len"]]

# Read thetas 
thetas = sparse.load_npz(path_model.joinpath(f"mallet_output/{'EN'}/thetas.npz")).toarray()
betas = np.load((path_model.joinpath(f"mallet_output/{'EN'}/betas.npy")))
def get_thetas_str(row,thetas):
    return " ".join([f"{id_}|{round(el, 4)}" for id_,el in enumerate(thetas[row]) if el!=0.0])

def get_most_repr_tpc(row,thetas):
    return np.argmax(thetas[row])

# Save thetas in dataframe and "assigned topic"
df_en_raw["thetas"] = df_en_raw.apply(lambda row: get_thetas_str(row['id_top'], thetas), axis=1)
df_en_raw["id_tpc"] = df_en_raw.apply(lambda row: get_most_repr_tpc(row['id_top'], thetas), axis=1)
tpc = 1
df_tpc = df_en_raw[df_en_raw.id_tpc == tpc]

In [249]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [48]:
path_env = pathlib.Path(os.getcwd()).parent.parent / '.env'
load_dotenv(path_env)
api_key = os.getenv("OPENAI_API_KEY")

os.environ["OPENAI_API_KEY"] = api_key

In [111]:
import ast

In [49]:
llm = dspy.OpenAI(
    model="gpt-3.5-turbo" , #"gpt-4-0125-preview",  #gpt-4o-2024-05-13, #"gpt-4-1106-preview", # TODO: try turbo-instruct,
    max_tokens=1000)

# Assumes the Weaviate collection has a text key `content`
dspy.settings.configure(lm=llm, trace=[])

In [68]:
lm = dspy.HFClientTGI(model="meta-llama/Meta-Llama-3-8B ", port=8090, url="http://127.0.0.1")
dspy.settings.configure(lm=lm)

In [198]:
from dspy.primitives.assertions import assert_transform_module, backtrack_handler

In [2]:
class GenerateFacts(dspy.Signature):
    """Extract the claims from the given text."""
    text = dspy.InputField(desc="may contain one or several claims")
    claims = dspy.OutputField(desc="List of sentences in the form 'subject + verb + object'")
    
class GenerateQuestion(dspy.Signature):
    """Form a close-ended question that directly asks the fact."""
    fact = dspy.InputField()
    question = dspy.OutputField(desc="it asks the fact")

class QAGenerator(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_facts = dspy.ChainOfThought(GenerateFacts)#"text -> claims"
        self.generate_question = dspy.ChainOfThought(GenerateQuestion)
        
    def facts_check(self, text):
        correct = False
        try:
            parsed_facts = ast.literal_eval(text)
            if len(parsed_facts[0]) <=1:
                correct = False
            else:
                correct = True
        except Exception as e:
            print(f"-- -- Error: {e}")
            try:
                pattern = r'\d+\.\s'
                parsed_facts = re.split(pattern, text)
                parsed_facts = [s.strip() for s in parsed_facts if s]
                if len(parsed_facts[0]) == 0:
                    correct = False
                else:
                    correct = True
            except Exception as e:
                print(f"-- -- Error: {e}")
                correct = False
        print(f"THIS is hte result from the checking {correct}")
        return correct
    
    def process_facts(self, facts):
        
        print(f"-- -- GENERATED Facts: {facts}")
        #import pdb; pdb.set_trace()
        parsed_facts = []
        try:
            parsed_facts = ast.literal_eval(facts)
        except Exception as e:
            print(f"-- -- Error: {e}")
            try:
                pattern = r'\d+\.\s'
                parsed_facts = re.split(pattern, facts)
                parsed_facts = [s.strip() for s in parsed_facts if s]
            except Exception as e:
                print(f"-- -- Error: {e}")
        return parsed_facts
        
    def forward(self, text):
        #import pdb; pdb.set_trace()
        facts = self.generate_facts(text=text).claims

        print(f"-- -- Facts: {facts}")
        questions = []
        parsed_facts = self.process_facts(facts)
        
        if len(parsed_facts) == 0:
            print(f"-- -- No facts found")
            return questions, parsed_facts
        
        questions = [self.generate_question(fact=fact).question for fact in parsed_facts]
        print(f"-- -- Questions: {questions}")
        return questions, parsed_facts

qa_generator = QAGenerator()

#qa_generator = assert_transform_module(QAGenerator(), backtrack_handler)

In [3]:
doc = df_tpc.iloc[1].text
doc

'Q: I heard that the vitamin K shot might cause cancer. Is this true: A: No. In the early 1990s, a small study in England found an “association” between the vitamin K shot and childhood cancer. An association means that two things are happening at the same time in the same person, but doesn’t tell us whether one causes the other. Figuring out whether vitamin K might cause childhood cancer was very important because every newborn is expected to get a vitamin K shot. If vitamin K was causing cancer, we would expect to see the same association in other groups of children. Scientists looked to see if they could find the same association in other children, but this association between vitamin K and childhood cancer was never found again in any other study.'

In [217]:
questions, facts = qa_generator(text=doc)

-- -- Facts: 1. A small study in England in the early 1990s found an "association" between the vitamin K shot and childhood cancer.
2. An association means that two things are happening at the same time in the same person, but doesn't necessarily mean that one causes the other.
3. Scientists looked for the same association in other groups of children but never found it again in any other study.
4. There is no evidence to support the claim that the vitamin K shot causes cancer.
-- -- GENERATED Facts: 1. A small study in England in the early 1990s found an "association" between the vitamin K shot and childhood cancer.
2. An association means that two things are happening at the same time in the same person, but doesn't necessarily mean that one causes the other.
3. Scientists looked for the same association in other groups of children but never found it again in any other study.
4. There is no evidence to support the claim that the vitamin K shot causes cancer.
-- -- Error: invalid decim

In [219]:
facts

['A small study in England in the early 1990s found an "association" between the vitamin K shot and childhood cancer.',
 "An association means that two things are happening at the same time in the same person, but doesn't necessarily mean that one causes the other.",
 'Scientists looked for the same association in other groups of children but never found it again in any other study.',
 'There is no evidence to support the claim that the vitamin K shot causes cancer.']

In [52]:
class GenerateFacts(dspy.Signature):
    """Divide the text in atomic facts. Facts should have the form 'subject + verb + object'"""
    text = dspy.InputField()
    claims = dspy.OutputField(desc="List of atomic facts")

class FactsGenerator(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_facts = dspy.ChainOfThought(GenerateFacts)

    def forward(self, text):
        facts = self.generate_facts(text=text).claims

        #print(f"-- -- Facts: {facts}")
        #parsed_facts = self.process_facts(facts)
    
        return facts

In [63]:
class GenerateFacts(dspy.Signature):
    """Extract the claims from the given text."""
    text = dspy.InputField(desc="may contain one or several claims")
    claims = dspy.OutputField(desc="List of claims in the form 'subject + verb + object'")

class FactsGenerator(dspy.Module):
    def __init__(self):
        super().__init__()
        self.generate_facts = dspy.ChainOfThought(GenerateFacts)

    def forward(self, text):
        facts = self.generate_facts(text=text).claims

        #print(f"-- -- Facts: {facts}")
        #parsed_facts = self.process_facts(facts)
    
        return facts

In [64]:
fact_generator = FactsGenerator()
fact_generator(doc)

'1. Urine tests can be used to detect specific substances.\n2. Certain substances in the urine may indicate a certain type of microcephaly.'

In [69]:
# llama
fact_generator = FactsGenerator()
fact_generator(doc)

for doc in df_tpc.text.values.tolist()[0:10]:
    print(doc)
    print(fact_generator(doc))

Chorioamnionitis: A condition during pregnancy that can cause unexplained fever with uterine tenderness, a high white blood cell count, rapid heart rate in the fetus, rapid heart rate in the woman, and/or foul-smelling vaginal discharge.
chorioamnionitis is a condition during pregnancy that can cause unexplained fever with uterine tenderness, a high white blood cell count, rapid heart rate in the fetus, rapid heart rate in the woman, and/or foul-smelling vaginal discharge. chorioamnionitis is a condition during pregnancy
Q: I heard that the vitamin K shot might cause cancer. Is this true: A: No. In the early 1990s, a small study in England found an “association” between the vitamin K shot and childhood cancer. An association means that two things are happening at the same time in the same person, but doesn’t tell us whether one causes the other. Figuring out whether vitamin K might cause childhood cancer was very important because every newborn is expected to get a vitamin K shot. If v

In [67]:
# gpt
# TODO: Create training dataset for this
fact_generator = FactsGenerator()
fact_generator(doc)

for doc in df_tpc.text.values.tolist()[0:10]:
    print(doc)
    print(fact_generator(doc))

Chorioamnionitis: A condition during pregnancy that can cause unexplained fever with uterine tenderness, a high white blood cell count, rapid heart rate in the fetus, rapid heart rate in the woman, and/or foul-smelling vaginal discharge.
1. Chorioamnionitis can cause unexplained fever during pregnancy.
2. Chorioamnionitis can cause uterine tenderness.
3. Chorioamnionitis can result in a high white blood cell count.
4. Chorioamnionitis can lead to a rapid heart rate in the fetus.
5. Chorioamnionitis can cause a rapid heart rate in the woman.
6. Chorioamnionitis can result in foul-smelling vaginal discharge.
Q: I heard that the vitamin K shot might cause cancer. Is this true: A: No. In the early 1990s, a small study in England found an “association” between the vitamin K shot and childhood cancer. An association means that two things are happening at the same time in the same person, but doesn’t tell us whether one causes the other. Figuring out whether vitamin K might cause childhood ca

In [60]:
llm.inspect_history(1)





Extract the claims from the given text.

---

Follow the following format.

Text: may contain one or several claims
Reasoning: Let's think step by step in order to ${produce the claims}. We ...
Claims: List of sentences in the form 'subject + verb + object' separated by ';'

---

Text: Urine test. This is done to look for a substance that may show a certain type of microcephaly.
Reasoning: Let's think step by step in order to[32m produce the claims. We know that a urine test is done to look for a specific substance, which may indicate a certain type of microcephaly.
Claims: Urine test looks for a substance; Substance may indicate a certain type of microcephaly.[0m





In [250]:
import pickle; results = pickle.load(open("example_doc0_10.pickle", "rb"))
results[0]

{'question': 'Did a small study in England in the early 1990s find an "association" between the vitamin K shot and childhood cancer?',
 'answer1': 'A small study in England in the early 1990s found an "association" between the vitamin K shot and childhood cancer.',
 'answer2': Prediction(
     rationale='produce the answer. We need to consider the information provided in the context, which clearly states that a small study in the early 1990s suggested a link between the vitamin K shot and cancer. However, larger studies conducted since then have found no connection between vitamin K and cancer. Therefore, based on the evidence presented, we can determine whether the small study in England did find an "association" between the vitamin K shot and childhood cancer.',
     answer='Yes, a small study in England in the early 1990s did find an "association" between the vitamin K shot and childhood cancer, but larger studies conducted since then have not found any connection between vitamin K 

In [225]:
results[0]["answer2"].context

['Q: I heard that the vitamin K shot might cause cancer. Is this true: A: No. In the early 1990s, a small study in England found an “association” between the vitamin K shot and childhood cancer. An association means that two things are happening at the same time in the same person, but doesn’t tell us whether one causes the other. Figuring out whether vitamin K might cause childhood cancer was very important because every newborn is expected to get a vitamin K shot. If vitamin K was causing cancer, we would expect to see the same association in other groups of children. Scientists looked to see if they could find the same association in other children, but this association between vitamin K and childhood cancer was never found again in any other study.',
 "You may have seen or heard comments from parents who don't want to get the shot for their baby. This is because there's still old information going around from a small study in the early 1990s. That study suggested a link between the

In [251]:
class CheckAnswersFaithfulness(dspy.Signature):
    """Verify that answers to the question are not contradictory."""
    question = dspy.InputField()
    answer1 = dspy.InputField()
    answer2 = dspy.InputField()
    faithfulness = dspy.OutputField(desc="boolean indicating if answer2 is faithful to answer1")
    rationale = dspy.OutputField()
    
class QAChecker(dspy.Module):
    def __init__(self):
        super().__init__()
        self.checker = dspy.ChainOfThought(CheckAnswersFaithfulness)#"text -> claims"
    
    def forward(self, answer1, answer2, question):
        response = self.checker(question=question, answer1=answer1, answer2=answer2)
        #print(f"-- -- faithfulness: {faithfulness}")
       
        return response.faithfulness, response.rationale
    
checker = QAChecker()

In [233]:
results[0].keys()

dict_keys(['question', 'answer1', 'answer2', 'text_passage'])

In [252]:
results_faith = []
for el in range(len(results[10:50])):
    faith, rationale = checker(results[el]["answer1"], results[el]["answer2"].answer, results[el]["question"])
    results_faith.append({
        "question": results[el]["question"],
        "answer1": results[el]["answer1"],
        "answer2": results[el]["answer2"].answer,
        "faith": faith,
        "rationale": rationale,
        "context": results[el]["answer2"].context,
        "text_passage": results[el]["answer2"].text_passage
        
    })
results_faith_df = pd.DataFrame(results_faith)

In [253]:
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       'display.max_colwidth',None
                       ):
    display(results_faith_df)

Unnamed: 0,question,answer1,answer2,faith,rationale,context,text_passage
0,"Did a small study in England in the early 1990s find an ""association"" between the vitamin K shot and childhood cancer?","A small study in England in the early 1990s found an ""association"" between the vitamin K shot and childhood cancer.","Yes, a small study in England in the early 1990s did find an ""association"" between the vitamin K shot and childhood cancer, but larger studies conducted since then have not found any connection between vitamin K and cancer.",True,"Answer 2 acknowledges the initial finding of an association between the vitamin K shot and childhood cancer in the small study in England in the early 1990s, but also states that larger studies conducted since then have not found any connection between vitamin K and cancer, which does not contradict the initial finding.","[You may have seen or heard comments from parents who don't want to get the shot for their baby. This is because there's still old information going around from a small study in the early 1990s. That study suggested a link between the vitamin K shot and cancer. Larger studies since then have found no connection between vitamin K and cancer. But the outdated information can still be found online., Is Vitamin K safe: A study from the early 1990’s found a possible link between getting vitamin K and developing childhood cancer. Pediatricians became very concerned about this and have done many studies since then, in many different ways, trying to see if this link was true. None of the studies found this link again, even though doctors and scientists looked very hard for it.]","Q: I heard that the vitamin K shot might cause cancer. Is this true: A: No. In the early 1990s, a small study in England found an “association” between the vitamin K shot and childhood cancer. An association means that two things are happening at the same time in the same person, but doesn’t tell us whether one causes the other. Figuring out whether vitamin K might cause childhood cancer was very important because every newborn is expected to get a vitamin K shot. If vitamin K was causing cancer, we would expect to see the same association in other groups of children. Scientists looked to see if they could find the same association in other children, but this association between vitamin K and childhood cancer was never found again in any other study."
1,Does an association between two things happening at the same time in the same person necessarily mean that one causes the other?,"An association means that two things are happening at the same time in the same person, but doesn't necessarily mean that one causes the other.","No, an association between two things happening at the same time in the same person does not necessarily mean that one causes the other.",True,Both answers convey the same message that an association between two things happening at the same time in the same person does not imply causation.,"[In this five-year study, The National CADDRE Study: Child Development and Autism, a number of factors will be studied for their potential association with ASDs, including:\n- infections or abnormal responses to infections in the child, mother or father\n- genetic factors in the child, mother and father\n- mother′s reproductive history\n- abnormal hormone function in the child, mother or father\n- gastrointestinal problems in the child\n- family history of medical and developmental problems\n- smoking, alcohol and drug use in pregnancy, and\n- parentâ€™s occupation and other socio-demographic factors.., A woman's sexual habits and patterns can increase her risk for developing cervical cancer. Risky sexual practices include:\n- Having sex at an early age\n- Having multiple sexual partners\n- Having a partner or many partners who take part in high-risk sexual activities., Babies might have a higher risk of SIDS if:\n- their mother smoked, drank, or used drugs during pregnancy and after birth\n- their mother had poor prenatal care\n- they were born prematurely or at a low birth weight\n- there's a family history of SIDS\n- their mothers were younger than 20 when they gave birth\n- they are around tobacco smoke after birth\n- they get overheated\n- they sleep on a soft surface\n- they sleep with soft objects or loose blankets and pillows\n- they sleep in a parent’s bed.]","Q: I heard that the vitamin K shot might cause cancer. Is this true: A: No. In the early 1990s, a small study in England found an “association” between the vitamin K shot and childhood cancer. An association means that two things are happening at the same time in the same person, but doesn’t tell us whether one causes the other. Figuring out whether vitamin K might cause childhood cancer was very important because every newborn is expected to get a vitamin K shot. If vitamin K was causing cancer, we would expect to see the same association in other groups of children. Scientists looked to see if they could find the same association in other children, but this association between vitamin K and childhood cancer was never found again in any other study."
2,Did scientists find the same association in other groups of children in any other study?,Scientists looked for the same association in other groups of children but never found it again in any other study.,"Answer: No, scientists did not find the same association between vitamin K and cancer in any other study. Multiple larger studies conducted since the initial study in the early 1990s have found no connection between the vitamin K shot and cancer.",False,"Answer 1 states that scientists never found the same association in any other study, while answer 2 clarifies that multiple larger studies have been conducted since the initial study and found no connection between vitamin K and cancer. This shows a contradiction between the two answers.","[You may have seen or heard comments from parents who don't want to get the shot for their baby. This is because there's still old information going around from a small study in the early 1990s. That study suggested a link between the vitamin K shot and cancer. Larger studies since then have found no connection between vitamin K and cancer. But the outdated information can still be found online., Is Vitamin K safe: A study from the early 1990’s found a possible link between getting vitamin K and developing childhood cancer. Pediatricians became very concerned about this and have done many studies since then, in many different ways, trying to see if this link was true. None of the studies found this link again, even though doctors and scientists looked very hard for it.]","Q: I heard that the vitamin K shot might cause cancer. Is this true: A: No. In the early 1990s, a small study in England found an “association” between the vitamin K shot and childhood cancer. An association means that two things are happening at the same time in the same person, but doesn’t tell us whether one causes the other. Figuring out whether vitamin K might cause childhood cancer was very important because every newborn is expected to get a vitamin K shot. If vitamin K was causing cancer, we would expect to see the same association in other groups of children. Scientists looked to see if they could find the same association in other children, but this association between vitamin K and childhood cancer was never found again in any other study."
3,Is there any evidence to support the claim that the vitamin K shot causes cancer?,There is no evidence to support the claim that the vitamin K shot causes cancer.,"There is no evidence to support the claim that the vitamin K shot causes cancer, as larger studies conducted after the initial small study have found no connection between vitamin K and cancer.",True,"Both answers state that there is no evidence to support the claim that the vitamin K shot causes cancer. Answer 2 provides additional information about larger studies that have been conducted to investigate this claim, which supports the initial statement.","[You may have seen or heard comments from parents who don't want to get the shot for their baby. This is because there's still old information going around from a small study in the early 1990s. That study suggested a link between the vitamin K shot and cancer. Larger studies since then have found no connection between vitamin K and cancer. But the outdated information can still be found online., Is Vitamin K safe: A study from the early 1990’s found a possible link between getting vitamin K and developing childhood cancer. Pediatricians became very concerned about this and have done many studies since then, in many different ways, trying to see if this link was true. None of the studies found this link again, even though doctors and scientists looked very hard for it.]","Q: I heard that the vitamin K shot might cause cancer. Is this true: A: No. In the early 1990s, a small study in England found an “association” between the vitamin K shot and childhood cancer. An association means that two things are happening at the same time in the same person, but doesn’t tell us whether one causes the other. Figuring out whether vitamin K might cause childhood cancer was very important because every newborn is expected to get a vitamin K shot. If vitamin K was causing cancer, we would expect to see the same association in other groups of children. Scientists looked to see if they could find the same association in other children, but this association between vitamin K and childhood cancer was never found again in any other study."
4,"Have you considered having your ""tubes tied"" or undergoing a permanent sterilization procedure if you are certain you never want to be pregnant again?","You may consider having your ""tubes tied"" or undergoing a permanent sterilization procedure if you are certain you never want to be pregnant again.","It is crucial to carefully consider the permanence and potential risks of female sterilization before deciding to have your ""tubes tied"" or undergo a permanent sterilization procedure.",True,"Answer 1 suggests considering the option of permanent sterilization if certain about not wanting to be pregnant again, while Answer 2 emphasizes the importance of careful consideration due to the permanence and risks involved. Both answers align in the sense that they emphasize the need for thoughtful decision-making before undergoing such a procedure.","[What else should I know: Female sterilization is permanent. You should not get permanent sterilization if you feel you may change your mind about wanting to get pregnant in the future. While in very rare cases it can be reversed, this operation is not without risk, and can be very expensive. It doesn’t protect against STDs., Female sterilization is permanent. You should not get permanent sterilization if you feel you may change your mind about wanting to get pregnant in the future. While in very rare cases it can be reversed, this operation is not without risk, and can be very expensive. It doesn’t protect against STDs., Sterilization: Permanent sterilization includes tubal ligation, tubal occlusion, and vasectomy. In women, we commonly call this ""having your tubes tied."" These options are for women who do not want children. We offer sterilization for women, including laparoscopic tubal ligation, and postpartum tubal ligation.]","But what if you know for sure that you never want to be pregnant again, or even pregnant ever? You might be thinking about having your “tubes tied,” or going through a permanent sterilization procedure. But talking to your doctor about sterilization can be tricky! Especially if you’re young, you may worry that your doctor won’t take you seriously or will try to talk you out of it."
5,"Have you found it challenging to talk to your doctor about sterilization, especially if you are young?","Talking to your doctor about sterilization can be tricky, especially if you are young.","It can be challenging for a young person to talk to their doctor about sterilization, especially if they are under the age of 30. This age group is more likely to regret the decision, and there may be societal pressure or judgment from healthcare providers about their choice. Additionally, there may be concerns about the permanence of the procedure and the potential impact on future fertility.",True,"Both answers acknowledge that it can be challenging for a young person to talk to their doctor about sterilization, especially due to societal pressures, potential regret, concerns about permanence, and impact on future fertility. The second answer provides more specific reasons for the challenges faced by young individuals, but both ultimately convey the same message.","[Sterilization is a big decision. Consider this info when you’re contemplating the procedure., Making Decisions Expand All:\n- Sterilization is permanent birth control. It is not meant to be reversible. Before having the procedure, you (and your partner, if appropriate) must be certain that you do not want children in the future.\n- If you have a sterilization procedure and you change your mind after the operation, you can have surgery to try to reverse it. You can also try assisted reproductive technology (ART) to attempt pregnancy. These procedures are expensive and may not be covered by insurance. There is also no guarantee that you will be able to get pregnant afterward.\n- Choosing to have sterilization is a major decision. You should avoid making this choice during times of stress, such as during a divorce or after losing a pregnancy. You should also not make this choice under pressure from a partner or others.\n- Most women who choose sterilization do not regret their decision. But research shows that women younger than age 30 are more likely than older women to regret having the surgery.\n- If you are not sure you want to have sterilization, there are long-acting methods of birth control that allow you to get pregnant when you stop using them. For example, the intrauterine device (IUD) and the birth control implant are birth control methods that\n- are as effective at preventing pregnancy as female sterilization\n- are safer than sterilization\n- last for several years\n- can be removed at any time if you wish to get pregnant\n- Read Long-Acting Reversible Contraception (LARC) to learn more.\n- Sometimes previous surgery, obesity, or other conditions affect which methods can be used.]","But what if you know for sure that you never want to be pregnant again, or even pregnant ever? You might be thinking about having your “tubes tied,” or going through a permanent sterilization procedure. But talking to your doctor about sterilization can be tricky! Especially if you’re young, you may worry that your doctor won’t take you seriously or will try to talk you out of it."
6,Do you worry that your doctor won't take you seriously or will try to talk you out of sterilization?,You may worry that your doctor won't take you seriously or will try to talk you out of sterilization.,"It is valid to have concerns about how your doctor may respond to your desire for sterilization. It is crucial to find a healthcare provider who will listen to your concerns, provide you with accurate information, and support you in making an informed decision that aligns with your reproductive health goals. If you feel that your doctor is not taking you seriously or trying to dissuade you from sterilization, it may be beneficial to seek a second opinion or find a provider who is more supportive of your choices. Your reproductive health decisions should be respected and supported by your healthcare team.",True,Both answers acknowledge the potential concerns and emphasize the importance of finding a supportive healthcare provider who respects your reproductive health decisions. They both encourage seeking a second opinion if needed.,"[Sterilization is a big decision. Consider this info when you’re contemplating the procedure., Of the 492 participants, 137 (28%) reported having had an operation that would keep you from getting pregnant, like having your tubes tied, sterilization, or hysterectomy. Sterilized women in the drug-treatment programs were more likely than nonsterilized women to be older (average age: 37 years (standard deviation (SD)=plus or minus 6.8 years) versus 33 years (SD=plus or minus 6.0 years)) and to have fewer years of education (51% and 40%, respectively, had not completed high school). Sterilized women reported ever having had pelvic inflammatory disease more often than nonsterilized women (34% versus 23%). Sterilized and nonsterilized women in this study were not significantly different with regard to ethnicity., While the number and rate of all tubal sterilizations increased rapidly during the early and mid 1970s, the increase was greater for nonpregnancy-associated sterilizations (those performed on women not pregnant when hospitalized) than for sterilizations performed during hospitalizations to end a pregnancy. For example, in 1970, only 29% of tubal sterilizations were performed on nulliparous women, but by 1976, the percentage had risen to a peak of 58%. This trend did not appear to continue, however, as 49% of sterilizations in 1979 and 47% in 1980 were performed on women who were not pregnant.]","But what if you know for sure that you never want to be pregnant again, or even pregnant ever? You might be thinking about having your “tubes tied,” or going through a permanent sterilization procedure. But talking to your doctor about sterilization can be tricky! Especially if you’re young, you may worry that your doctor won’t take you seriously or will try to talk you out of it."
7,Are urine tests conducted to detect the presence of Zika virus in relation to microcephaly?,Urine tests are conducted to detect a substance related to microcephaly.,"Based on the provided context, there is no mention of urine tests being conducted to detect the presence of Zika virus in relation to microcephaly.",False,"Answer 1 mentions urine tests being conducted to detect a substance related to microcephaly, while answer 2 specifically states that there is no mention of urine tests being conducted to detect the presence of Zika virus in relation to microcephaly. These answers are contradictory.","[Testing tissues: During the first months of the outbreak, NCEZID scientists developed special tests that identified the virus in brain tissues from infants who died from microcephaly and in placentas from women who had miscarriages. This discovery cemented the connection between birth defects and Zika virus infectionExternal during pregnancy. They also found that the virus could make copies of itself in the fetuses’ brains and in women’s placentas and persist in these tissues for months. This may help explain how the virus can cause so much damage., INS maintains ongoing passive, national surveillance in Colombia for both symptomatic Zika virus disease and major birth defects. Surveillance for Zika virus disease based on clinical symptoms and laboratory testing started in August 2015 in Colombia, and following a cluster of laboratory-confirmed cases of Zika virus disease, immediate mandatory reporting began in October 2015. At the time, symptomatic Zika virus disease was defined as illness with fever and at least one additional symptom (rash, nonpurulent conjunctivitis, headache, pruritus, arthralgia, myalgia, or malaise) of unknown etiology. Beginning December 24, 2015, the case definition has included both fever and rash, and at least one of the other symptoms. Colombia’s birth defects surveillance system includes reporting of microcephaly (International Classification of Disease, 10th Revision code Q02) among live births and pregnancy losses (including spontaneous abortions, pregnancy terminations, and stillbirths) from all reporting areas. * Congenital microcephaly in a newborn is defined as having a head circumference below the third percentile for gestational age and sex. The following clinical specimens are requested for all infants and fetuses with microcephaly to ascertain whether the mother was infected with Zika virus during pregnancy: maternal serum, infant serum from cord and peripheral blood specimens, cerebrospinal fluid (if obtained from infant for clinical reasons), and tissues from fetal losses., The Brazil Ministry of Health developed a case definition for Zika virus–related microcephaly (head circumference ≥2 standard deviations [SD] below the mean for sex and gestational age at birth). A task force and registry were established to investigate Zika virus–related cases of microcephaly and to describe the clinical characteristics of cases. Among the first 35 cases of microcephaly reported to the registry, 74% of mothers reported a rash illness during pregnancy, 71% of infants had severe microcephaly (>3 SD below the mean), approximately half had at least one neurologic abnormality, and among 27 who had neuroimaging studies, all were abnormal. Cerebrospinal fluid from all infants is being tested for Zika virus; results are not currently available.]",Urine test. This is done to look for a substance that may show a certain type of microcephaly.
8,What substance in urine may indicate a certain type of microcephaly?,The presence of this substance in urine may indicate a certain type of microcephaly.,A substance in urine that may indicate a certain type of microcephaly is alpha-fetoprotein (AFP). High levels of AFP in the amniotic fluid or maternal blood during pregnancy can be associated with certain types of microcephaly.,True,"Both answers mention that a substance in urine, specifically alpha-fetoprotein (AFP), may indicate a certain type of microcephaly. The second answer provides additional context by specifying that high levels of AFP in amniotic fluid or maternal blood during pregnancy can be associated with certain types of microcephaly, but it does not contradict the first answer.","[Microcephaly may be caused by problems during a woman’s pregnancy. In some cases, it may be caused by inheriting an abnormal gene., The symptoms of microcephaly can be like other health conditions. Make sure your child sees their healthcare provider for a diagnosis.]",Urine test. This is done to look for a substance that may show a certain type of microcephaly.
9,What is the chance that a woman who is a carrier of X-linked agammaglobulinemia will pass the faulty X chromosome to her child?,Women who are carriers of X-linked agammaglobulinemia have a 1 in 2 chance of passing the faulty X chromosome to a child.,The chance that a woman who is a carrier of X-linked agammaglobulinemia will pass the faulty X chromosome to her child is 50%.,True,Both answers convey the same information that carriers of X-linked agammaglobulinemia have a 50% chance of passing the faulty X chromosome to their child.,"[In each pregnancy, if the mother is a carrier and the father has the disease, the expected outcomes are:\n- 25% chance of a healthy boy\n- 25% chance of a boy with the disease\n- 25% chance of a carrier girl\n- 25% chance of a girl with the disease., If the father has the disease and the mother is not a carrier, the expected outcomes are:\n- 50% chance of a having a healthy boy\n- 50% chance of a having a girl without the disease who is a carrier.]","Which children are at risk for X-linked agammaglobulinemia: Women who are carriers have a 1 in 2 chance of passing the faulty X chromosome to a child. This is true for every pregnancy. If a daughter gets the gene, she will likely be a healthy carrier like her mother. If a son gets the gene, he will have X-linked agammaglobulinemia."


In [None]:
# Define device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load model and tokenizer
model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

model.to(device)

In [259]:
predictions = []
for row_id, row in results_faith_df.iterrows():
    premise = row["text_passage"]
    hypothesis = row["context"][0]
    inputs = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        output = model(inputs["input_ids"])
    prediction = torch.softmax(output["logits"][0], -1).tolist()
    label_names = ["entailment", "neutral", "contradiction"]
    #prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
    prediction = label_names[np.argmax(prediction)]
    predictions.append(prediction)

In [261]:
results_faith_df["nli_preds"] = predictions

In [263]:
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       'display.max_colwidth',None
                       ):
    display(results_faith_df[results_faith_df.nli_preds == "contradiction"])

Unnamed: 0,question,answer1,answer2,faith,rationale,context,text_passage,nli_preds
11,Will the daughter likely be a healthy carrier like her mother?,"If a daughter inherits the faulty gene, she will likely be a healthy carrier like her mother.",It is not guaranteed that the daughter will be a healthy carrier like her mother. Genetic testing and counseling are important to assess the risks and implications for the daughter's health.,False,"Answer 1 states that the daughter will likely be a healthy carrier like her mother, while answer 2 acknowledges that it is not guaranteed and that genetic testing and counseling are important to assess the risks. This shows a contradiction in the certainty of the outcome.","[Females pass one of their X chromosomes on to their child of either assigned sex. Therefore, they have a 50% chance of passing XLAS to any of their children., If a genetic test is ordered, the pediatrician or other healthcare professional will take a blood sample for testing. These tests can include:\n- Karyotype or chromosome microarray: This is used to assess the person for the presence of an extra X chromosome and to determine the degree of mosaicism (if any).\n- Prenatal genetic testing: Pregnant women who are at higher risk for having a child with triple X syndrome — mothers of advanced maternal age or women with triple X syndrome themselves — may be advised to undergo prenatal genetic testing, such as non-invasive prenatal testing (NIPT), amniocentesis or chorionic villi sampling (CVS)..]","Which children are at risk for X-linked agammaglobulinemia: Women who are carriers have a 1 in 2 chance of passing the faulty X chromosome to a child. This is true for every pregnancy. If a daughter gets the gene, she will likely be a healthy carrier like her mother. If a son gets the gene, he will have X-linked agammaglobulinemia.",contradiction
23,What are her clinical interests in the field of endocrinology?,"Her clinical interests include reproductive endocrine disorders such as premature ovarian insufficiency, polycystic ovary syndrome, precocious puberty, endometriosis, and complex contraception.","Dr. Camille Powe's clinical interests in the field of endocrinology include studying hormonal changes during pregnancy and their effects on maternal health, as well as applying this knowledge to improve outcomes for pregnant women.",False,"Answer 1 lists specific clinical interests in the field of endocrinology related to reproductive endocrine disorders, while answer 2 focuses on hormonal changes during pregnancy and maternal health. These two answers are not contradictory but rather provide different aspects of Dr. Camille Powe's clinical interests in endocrinology.","[A 34-year-old woman was referred for evaluation of ovarian enlargement and a pituitary mass. She had normal childhood development and regular menses until age 33, when her cycles became more frequent. She and her husband had been trying to conceive for approximately five years, but were unsuccessful., Endocrinologist Dr. Camille Powe, who co-leads the Diabetes in Pregnancy Program at Mass General, envisions a better future for these patients. She has devoted her career to better understanding how a growing baby affects the mother’s biology, and she is committed to applying that knowledge to help pregnant women benefit from these medical advances.]","Jacqueline Maher, M.D., is a reproductive endocrinology and infertility specialist with a focus on pediatric and adolescent gynecology. She leads the female fertility preservation program at Children's National Hospital to counsel patients in egg freezing or ovarian tissue freezing, and post treatment premature ovarian insufficiency, and hormone replacement/ puberty induction. Her other clinical interests include reproductive endocrine disorders such as premature ovarian insufficiency, polycystic ovary syndrome, precocious puberty, endometriosis and complex contraception; as well as surgery for congenital anomalies of the reproductive tract such as vaginal and uterine septum.",contradiction


## Classify questions

In [9]:
df_en = pd.read_csv("questions_rosie/FullTrialQa7152024.csv")
df_filtered = df_en.dropna(subset=['question', 'answerPassageText'])
df_filtered[['question', 'answerPassageText']]

Unnamed: 0,question,answerPassageText
0,How long should your kid nurse on each side?,How Long Does Nursing Take: Newborns may nurse...
1,What do I need to do if I have to be rescreene...,For people who do not receive a diagnosis of d...
2,How many ounces of milk should my 2 week old d...,How much milk should I store to meet my baby’s...
3,Do breastfed or formula babies spit up more?,Spitting up and dribbling milk with burps or a...
4,How do you get a breast pump from your insurance,How can I get a breast pump: Under the Patient...
...,...,...
16104,Thanks,Suggested citation for this article: Lengerich...
16105,What if I have a low lying placenta during pre...,What is the difference between placental abrup...
16106,Can you recommend any pelvic floor therapists ...,Do physical therapy: Your health care professi...
16107,Can I give birth vaginally with a placenta pre...,Can I still have a vaginal delivery with place...


In [14]:
len(df_filtered[['question', 'answerPassageText']].question.values.tolist())

16109

In [15]:
len(list(set(df_filtered[['question', 'answerPassageText']].question.values.tolist())))

11778

In [19]:
from sentence_transformers import SentenceTransformer
from sklearn.svm import OneClassSVM
import numpy as np

# Load pre-trained SentenceTransformer model
model = SentenceTransformer('all-MiniLM-L6-v2')  # You can choose different models like 'paraphrase-MiniLM-L6-v2', etc.

def get_bert_embedding(text, tokenizer, model):
    # Tokenize input text and get embeddings
    inputs = tokenizer(text, return_tensors='pt', max_length=128, truncation=True, padding='max_length')
    with torch.no_grad():
        outputs = model(**inputs)
    # Take the mean of the last hidden state (average of token embeddings)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings.numpy()
# Sample positive questions (these are the ones from mothers)
positive_questions = list(set(df_filtered[['question', 'answerPassageText']].question.values.tolist()))

# Artificially generated questions (to be classified)
new_questions = [
    "Does the shot mentioned in the text temporarily decrease bone density?",
    "Can women improve their chances of having a baby born without a birth defect?",
]

# Generate embeddings for positive questions using SentenceTransformer
positive_embeddings = model.encode(positive_questions)

# Train One-Class SVM on the embeddings of positive questions
ocsvm = OneClassSVM(kernel='rbf', gamma='auto', nu=0.1)
ocsvm.fit(positive_embeddings)

# Generate embeddings for new questions
new_embeddings = model.encode(new_questions)

# Predict on new questions
predictions = ocsvm.predict(new_embeddings)

# Output the predictions
for i, question in enumerate(new_questions):
    if predictions[i] == 1:
        print(f"'{question}' is classified as similar to the positive class (keep).")
    else:
        print(f"'{question}' is classified as not similar to the positive class (discard).")




Batches:   0%|          | 0/369 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'Does the shot mentioned in the text temporarily decrease bone density?' is classified as not similar to the positive class (discard).
'Can women improve their chances of having a baby born without a birth defect?' is classified as similar to the positive class (keep).
