In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
fname = "test_fold_two"
basepath = "./-paraphrase-multilingual-MiniLM-L12-v2_3015"

## Load data

In [3]:
import pandas as pd

def loadData(fname, basepath):
    sample = pd.read_csv(f"{basepath}/{fname}.csv")
    sample['Document'] = sample.apply(lambda row: ': '.join(row.astype(str)), axis=1)
    return sample

## Question Generation

In [4]:
from qa_generation import QAGenerator
from prompt_factory import PromptFactory
from tqdm.notebook import tqdm

template = '''
        Gedraag je als een {role} die online informatie zoekt.
        Patiënten stellen meestal vragen als:
{{
    "qa_list": [
        {{
            "question": "Hoe kan ik een verstopte voedingssonde doorspoelen?",
            "answer": "Om een verstopte voedingssonde door te spoelen, kunt u de volgende stappen proberen: 1. Sluit de spuit rechtstreeks aan op de sonde (niet op het voedingssysteem). Als er een verstopping is in het voedingssysteem, kunt u dit vervangen. 2. Neem een spuit van 10 cc en spuit met lichte druk lauwwarm water door de voedingssonde. Herhaal dit zo nodig nog een keer. 3. Als het oplossen van de verstopping niet lukt, laat dan lauwwarm water 30 minuten inwerken en herhaal de bovenstaande procedure nogmaals. 4. Als u de verstopping kunt zien, kunt u proberen om de voedingssonde op die plek zachtjes te kneden. Als de verstopping dan loskomt, kunt u het doorspuiten met lauwwarm water. Belangrijk: Gebruik nooit een voerdraad of koolzuurhoudend bronwater of frisdranken, omdat dit kan leiden tot perforatie. Daarnaast wordt het afraden om natriumbicarbonaat te gebruiken als medicatie de oorzaak is van de verstopping, omdat dit de verstopping groter kan maken."
        }}
    ]
}}


        in het json formaat: 
        {format}

Document Informatie:
        {document}

        Belangrijke Opmerking: Bij het genereren van vragen, gebruik specifieke termen en benamingen uit het document in plaats van algemene termen zoals 'dit onderzoek' of 'die procedure'. Verwijs direct naar de procedure of het document met de exacte naam om nauwkeurigheid en duidelijkheid in de vragen te waarborgen. Vermijd algemeenheden en zorg ervoor dat elke vraag direct gerelateerd is aan de verstrekte documentinformatie.


        Stel {n} vragen die beantwoord kunnen worden op basis van deze paragraaf in het formaat:
        {format}

        Zorg ervoor dat elke vraag en antwoord paar in een geldig JSON-formaat is. Dit betekent dat vragen en antwoorden tussen dubbele aanhalingstekens moeten staan, en de algemene structuur moet overeenkomen met het vereiste JSON-schema.
        '''

roles = [
    'Patiënt',
    'Nieuwe Ouder',
    'Oudere Patiënt',
    'Persoon die een Tweede Mening Zoekt',
    'Reiziger die Medisch Advies Na Reizen Zoekt',
    'Zorgverlener die Informatie Zoekt']

prompt_factory = PromptFactory(prompt=template,roles=roles)

num_questions_per_doc = 5

QAGenerator = QAGenerator()
# Generate the data
import pandas as pd
from tqdm import tqdm


def generateQuestions(df):
    data = []
    for index, row in tqdm(df.iterrows(), total=df.shape[0],desc="Generating QAs"):
        doc = row["Document"]
        # Do something with col1 and col2
        try:
            for i in range(2):
                qa_list = QAGenerator.generate_qas(prompt_factory.generate_prompt(doc, num_questions_per_doc))
                for qa in qa_list:
                    new_row = row.to_dict()
                    new_row['Question'] = qa.question
                    new_row['Answer'] = qa.answer
                    data.append(new_row)

        except Exception as e:
                print(f"Failed to generate QA for doc: {doc[:100]}. Error: {e}")
                continue
    if(len(data) == 0):
       return pd.DataFrame()
    result = pd.DataFrame(data)
    result = result.dropna(subset=['Document', 'Question', 'Answer'])
    result = result[(result['Document'] != '') & (result['Question'] != '') & (result['Answer'] != '')]
    return result


In [5]:
from fact_qa import FactQAGenerator
from prompt_factory import PromptFactory
import langchain

template = '''
Gedraag je als iemand die online informatie zoekt en stelt vragen gebaseerd op specifieke feiten zoals locatie, telefoonnummer, naam, enzovoort.

Patiënten of gebruikers kunnen bijvoorbeeld vragen stellen als:
{{
    "qa_list": [
        {{
            "question": "Wat is het telefoonnummer van de kliniek voor noodgevallen?",
            "answer": "Het telefoonnummer van de kliniek voor noodgevallen is 012-345-6789."
        }},
        {{
            "question": "Waar is de hoofdingang van het ziekenhuis gelegen?",
            "answer": "De hoofdingang van het ziekenhuis is gelegen aan de Eerste Gezondheidsstraat 123 in Amsterdam."
        }}
        // meer vragen en antwoorden gebaseerd op specifieke feiten
    ]
}}

in het json formaat:
{format}

Document Informatie:
{document}

Belangrijke Opmerking: Bij het genereren van vragen, gebruik specifieke termen en benamingen uit het document in plaats van algemene termen. Verwijs direct naar de specifieke feit (zoals locatie, telefoonnummer, naam) om nauwkeurigheid en duidelijkheid in de vragen te waarborgen. Vermijd algemeenheden.

Maak vragen die beantwoord kunnen worden met de volgende informatie: {fact}

Stel {n} vragen die beantwoord kunnen worden op basis van deze paragraaf in het formaat:
{format}

Zorg ervoor dat elke vraag en antwoord paar in een geldig JSON-formaat is. Dit betekent dat vragen en antwoorden tussen dubbele aanhalingstekens moeten staan, en de algemene structuur moet overeenkomen met het vereiste JSON-schema.
        '''

base_prompt = langchain.PromptTemplate(
            template=template,
            input_variables=['fact', 'n', 'document', 'format']
        )

num_questions_per_doc = 5

FactQAGenerator = FactQAGenerator()

def generateFactQuestions(df):
    data = []
    # Generate the data
    for index, row in tqdm(df.iterrows(), total=df.shape[0],desc="Generating Fact QAs"):
        doc = row['Document']
        # Assuming phone numbers are separated by commas
        facts = row['Context Entities']
        try:
            for i in range(2):
                for entity in extract_desired_entities(facts)[:5]:
                    fact = entity["value"]
                    prompt = base_prompt.partial(
                        n=num_questions_per_doc,
                        document=doc)
                    try:
                        qa_list = FactQAGenerator.generate_question_for_fact(prompt, fact)
                        for qa in qa_list:
                            new_row = row.to_dict()
                            new_row['Question'] = qa.question
                            new_row['Answer'] = qa.answer
                            new_row['Fact'] = entity
                            print(entity)
                            data.append(new_row)
                    except:
                        pass
        except Exception as e:
                print(f"Failed to generate QA for document: {doc[:20]}. Error: {e:20}")

    if(len(data) == 0):
        return pd.DataFrame()

    result = pd.DataFrame(data)
    result = result.dropna(subset=['Document', 'Question', 'Answer'])
    result = result[(result['Document'] != '') & (result['Question'] != '') & (result['Answer'] != '')]
    return result

In [6]:
def update_df_with_lambda(df, column_name, lambda_function):
     # Check if the column does not exist in the DataFrame, if not, initialize it with NA
    if column_name not in df.columns:
        df[column_name] = pd.NA
    
    # Apply the lambda function only to rows where the column value is NA
    df[column_name] = df.apply(lambda row: lambda_function(row) if pd.isna(row[column_name]) else row[column_name], axis=1)
    return df

In [7]:
from answer_context_filter import calculate_bleu_score, calculate_rouge_score

def calculate_scores(df):
    # Calculate BLEU and ROUGE scores
    #df['Question-Context BLEU'] = df.apply(lambda row: calculate_bleu_score(row['Document'], row['Question']), axis=1)
    #df['Answer-Context BLEU'] = df.apply(lambda row: calculate_bleu_score(row['Document'], row['Answer']), axis=1)

    #df['Question-Context ROUGE'] = df.apply(lambda row: calculate_rouge_score(row['Document'], row['Question']), axis=1)
    #df['Answer-Context ROUGE'] = df.apply(lambda row: calculate_rouge_score(row['Document'], row['Answer']), axis=1)
    #df['Question-Context ROUGE-L F1'] = df['Question-Context ROUGE'].apply(lambda x: x[0]['rouge-l']['f'] if x else None)
    #df['Answer-Context ROUGE-L F1'] = df['Answer-Context ROUGE'].apply(lambda x: x[0]['rouge-l']['f'] if x else None)

    df = update_df_with_lambda(
        df, 'Question-Context ROUGE',
        lambda row: calculate_rouge_score(row['Document'], row['Question']) if pd.isna(row.get('Question-Context ROUGE')) else row['Question-Context ROUGE']
    )

    # Update 'Answer-Context ROUGE' only if it is NA
    df = update_df_with_lambda(
        df, 'Answer-Context ROUGE',
        lambda row: calculate_rouge_score(row['Document'], row['Answer']) if pd.isna(row.get('Answer-Context ROUGE')) else row['Answer-Context ROUGE']
    )

    # Update 'Question-Context ROUGE-L F1' based on the 'Question-Context ROUGE' column, only if it is NA
    df = update_df_with_lambda(
        df, 'Question-Context ROUGE-L F1',
        lambda row: row['Question-Context ROUGE'][0]['rouge-l']['f'] if pd.isna(row.get('Question-Context ROUGE-L F1')) and row['Question-Context ROUGE'] else None
    )

    # Update 'Answer-Context ROUGE-L F1' based on the 'Answer-Context ROUGE' column, only if it is NA
    df = update_df_with_lambda(
        df, 'Answer-Context ROUGE-L F1',
        lambda row: row['Answer-Context ROUGE'][0]['rouge-l']['f'] if pd.isna(row.get('Answer-Context ROUGE-L F1')) and row['Answer-Context ROUGE'] else None
    )

    return df


def rougeFilter(df):
    df = calculate_scores(df)
    df = df[df['Answer-Context ROUGE-L F1'] > 0.1]
    return df

## Vector filter

In [8]:
from vector_filter import precompute_embeddings
from vector_filter import filter_dataframe

def vectorFilter(df):
    df["Embedding"] = precompute_embeddings(df, 'Question')
    df = filter_dataframe(df, threshold=0.9)
    return df

## Entity Filter

In [9]:
def contains_all_elements(list1, list2):
    """
    Check if list1 contains all elements of list2.
    Each element is a dictionary with 'entityType' and 'value' keys.
    """
    # Convert each dictionary in the lists to a tuple (entityType, value) for easy comparison
    #set1 = {tuple(d.items()) for d in list1}
    #set2 = {tuple(d.items()) for d in list2}
    set1 = {d['value'] for d in list1 if 'value' in d}
    set2 = {d['value'] for d in list2 if 'value' in d}

    # Check if every element in set2 is also in set1
    return set2.issubset(set1)

def check_entity_match(row):
    return contains_all_elements(row['Context Entities'], row['Answer Entities'])

# Apply the function to each row to create the new column


In [10]:
from extract import extract_all_entities

def extractEntities(df):
    df['Context Entities'] = df['Document'].apply(extract_all_entities)
    df['Question Entities'] = df['Question'].apply(extract_all_entities)
    df['Answer Entities'] = df['Answer'].apply(extract_all_entities)
    df['Entity Match'] = df.apply(check_entity_match, axis=1)
    """  
    df = update_df_with_lambda(df, 'Context Entities', 
                               lambda row: extract_all_entities(row['Document']) 
                                if row.get('Context Entities') is None else row['Context Entities'])

    # Update or create 'Question Entities' column by applying entity extraction function if the value is NA
    df = update_df_with_lambda(df, 'Question Entities', 
                               lambda row: extract_all_entities(row['Question']) 
                               if row.get('Question Entities')is None else row['Question Entities'])

    # Update or create 'Answer Entities' column by applying entity extraction function if the value is NA
    df = update_df_with_lambda(df, 'Answer Entities', 
                               lambda row: extract_all_entities(row['Answer']) 
                               if row.get('Answer Entities') is None else row['Answer Entities'])

    # Update or create 'Entity Match' column by applying the check entity match function if the value is NA
    df = update_df_with_lambda(df, 'Entity Match', 
                               lambda row: check_entity_match(row) 
                               if row.get('Entity Match') is None else row['Entity Match'])
    """
    return df



  return torch._C._cuda_getDeviceCount() > 0
2024-01-18 14:54:17.279287: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-01-18 14:54:17.407383: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-18 14:54:18.102758: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/lib:/usr/local/nvidia/lib:/usr/

2024-01-18 14:54:32,718 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-PER, S-LOC, B-MISC, E-MISC, B-ORG, E-ORG, I-ORG, I-PER, B-LOC, I-LOC, E-LOC, I-MISC, <START>, <STOP>


In [11]:
def contains_desired_entity(entities_list):
    desired_types = {'Phone', 'Email'}
    return any(entity['entityType'] in desired_types for entity in entities_list)


def filterFactQuestions(df):
    df = extractEntities(df)
    return df[~((df['Question'].str.split().str.len() < 30) & df['Answer Entities'].apply(contains_desired_entity))]

def extract_desired_entities(entities_list):
    # Define the desired entity types
    desired_types = {'Phone', 'Email'}
    
    # Extract entities that match the desired types
    matching_entities = [entity for entity in entities_list if entity['entityType'] in desired_types]
    
    return matching_entities

In [12]:
def filterFactQuestionCorrectness(df):
    df['Answer Entities'] = df['Answer'].apply(extract_all_entities)
    # Apply a filter condition to each row
    #filtered_df = df[df.apply(lambda row: row["Fact"] in row["Answer Entities"], axis=1)]
    filtered_df = df[df.apply(lambda row: any( row["Fact"]['value'] == entity['value'] for entity in row["Answer Entities"]) if isinstance(row["Answer Entities"], list) else False, axis=1)]
    return filtered_df

#### LLM Scoring

In [13]:
from llm_filter import estimate_relevance

def llmScore(df):
    return update_df_with_lambda(df, 'LLM Score', lambda row: estimate_relevance(row['Question'], row['Document']))

# Run

In [14]:
import os

sample = loadData(fname, basepath)

df = pd.DataFrame(columns = sample.columns)



for i in tqdm(range(5), desc="Processing"):
    raw_file_path = f"{basepath}/output_raw_{i}_{fname}.csv"

    if os.path.exists(raw_file_path):
        # Load the file into a DataFrame if it exists
        print(f"Skipping generation: {i}")
        new_questions_df = pd.read_csv(raw_file_path)
    else:
        df_gen = sample[~sample['Document'].isin(df['Document'])]
        new_questions_df = generateQuestions(df_gen)
        if df_gen.empty:
            break
    new_questions_df.to_csv(raw_file_path)
    df = pd.concat([df, new_questions_df], ignore_index=True, sort=False)
    df = rougeFilter(df)
    df = vectorFilter(df)
    df = filterFactQuestions(df)
    df = llmScore(df)


## take top one for each doc
idx = df.groupby('Document')['LLM Score'].idxmax()
df = df.loc[idx]

df.to_csv(f"{basepath}/output_full_{fname}.csv")
df.to_csv(f"{basepath}/output_{fname}.csv", columns = ['topic_id', 'filename', 'header', 'section_text', 'Question', 'Answer'])


Processing:   0%|          | 0/5 [00:00<?, ?it/s]
Generating QAs:   0%|          | 0/50 [00:00<?, ?it/s][A
Generating QAs:   2%|▏         | 1/50 [00:12<10:14, 12.54s/it][A
Generating QAs:   4%|▍         | 2/50 [00:26<10:36, 13.25s/it][A
Generating QAs:   6%|▌         | 3/50 [00:46<12:54, 16.48s/it][A
Generating QAs:   8%|▊         | 4/50 [01:05<13:23, 17.47s/it][A
Generating QAs:  10%|█         | 5/50 [01:17<11:35, 15.45s/it][A
Generating QAs:  12%|█▏        | 6/50 [01:31<11:05, 15.12s/it][A
Generating QAs:  14%|█▍        | 7/50 [01:42<09:51, 13.76s/it][A
Generating QAs:  16%|█▌        | 8/50 [01:58<09:59, 14.28s/it][A
Generating QAs:  18%|█▊        | 9/50 [02:18<10:55, 15.99s/it][A
Generating QAs:  20%|██        | 10/50 [02:30<09:59, 14.98s/it][A
Generating QAs:  22%|██▏       | 11/50 [02:58<12:18, 18.93s/it][A
Generating QAs:  24%|██▍       | 12/50 [03:19<12:25, 19.62s/it][A
Generating QAs:  26%|██▌       | 13/50 [03:45<13:11, 21.40s/it][A
Generating QAs:  28%|██▊      

Failed to generate QA for doc: 29: Enteroscopie distaal poliklinisch: Begeleiding na sedatie: - Voor zowel Midazolam sedatie als vo. Error: 'list' object has no attribute 'qa_list'



Generating QAs:  30%|███       | 15/50 [04:20<11:27, 19.64s/it][A
Generating QAs:  32%|███▏      | 16/50 [04:32<09:44, 17.18s/it][A
Generating QAs:  34%|███▍      | 17/50 [04:45<08:48, 16.02s/it][A
Generating QAs:  36%|███▌      | 18/50 [05:13<10:28, 19.64s/it][A
Generating QAs:  38%|███▊      | 19/50 [05:28<09:26, 18.29s/it][A
Generating QAs:  40%|████      | 20/50 [05:55<10:30, 21.01s/it][A
Generating QAs:  42%|████▏     | 21/50 [06:14<09:49, 20.33s/it][A
Generating QAs:  44%|████▍     | 22/50 [06:31<08:56, 19.17s/it][A
Generating QAs:  46%|████▌     | 23/50 [06:51<08:48, 19.57s/it][A
Generating QAs:  48%|████▊     | 24/50 [07:14<08:53, 20.53s/it][A
Generating QAs:  50%|█████     | 25/50 [07:24<07:16, 17.47s/it][A
Generating QAs:  52%|█████▏    | 26/50 [07:38<06:34, 16.45s/it][A
Generating QAs:  54%|█████▍    | 27/50 [07:48<05:32, 14.47s/it][A
Generating QAs:  56%|█████▌    | 28/50 [08:00<05:01, 13.70s/it][A
Generating QAs:  58%|█████▊    | 29/50 [08:09<04:17, 12.28s/i

## Export Q&A Pairs

In [15]:
df.to_csv(f"{basepath}/output_full_{fname}.csv")
df.to_csv(f"{basepath}/output_{fname}.csv", columns = ['topic_id', 'filename', 'header', 'section_text', 'Question', 'Answer'])

In [16]:
df

Unnamed: 0,topic_id,filename,header,section_text,Document,Question,Answer,Question-Context ROUGE,Answer-Context ROUGE,Question-Context ROUGE-L F1,Answer-Context ROUGE-L F1,Embedding,Context Entities,Question Entities,Answer Entities,Entity Match,LLM Score
44,10,Anesthesie volwassenen,Vormen van anesthesie,Algehele anesthesie (narcose) Bij deze vorm va...,10: Anesthesie volwassenen: Vormen van anesthe...,Wat is het verschil tussen algehele en regiona...,Het verschil tussen algehele en regionale anes...,"[{'rouge-1': {'r': 0.03333333333333333, 'p': 0...","[{'rouge-1': {'r': 0.12666666666666668, 'p': 0...",0.062893,0.217143,"[0.022561257416644272, 0.0007618076489396394, ...","[{'entityType': 'MISC', 'value': 'iPod'}]",[],[],True,100.0
127,10,Septumcorrectie (operatie aan neustussenschot),Wat is het neustussenschot,Het neustussenschot (neusseptum of kortweg sep...,10: Septumcorrectie (operatie aan neustussensc...,Wat is het neustussenschot?,Het neustussenschot (neusseptum of kortweg sep...,"[{'rouge-1': {'r': 0.03896103896103896, 'p': 0...","[{'rouge-1': {'r': 0.22077922077922077, 'p': 1...",0.074074,0.361702,"[-0.005245539726644542, -0.014108693282673543,...",[],[],[],True,100.0
97,11,Anesthesie volwassenen,Eten en drinken voor de operatie.,Voor uw veiligheid dient u nuchter te zijn voo...,11: Anesthesie volwassenen: Eten en drinken vo...,Hoe lang moet ik nuchter zijn voor een operatie?,Voor uw veiligheid dient u nuchter te zijn voo...,"[{'rouge-1': {'r': 0.07142857142857142, 'p': 0...","[{'rouge-1': {'r': 0.24285714285714285, 'p': 1...",0.126582,0.390805,"[0.006605985727322925, -0.03711994934170077, 0...",[],[],[],True,100.0
96,13,Azathioprine (reumatologie),Mogelijke bijwerkingen,Bijwerkingen kunnen optreden. Meest voorkomend...,13: Azathioprine (reumatologie): Mogelijke bij...,Hoe vaak komen maag- en darmklachten voor bij ...,Maag- en darmklachten komen vaak voor bij het ...,"[{'rouge-1': {'r': 0.06060606060606061, 'p': 0...","[{'rouge-1': {'r': 0.09090909090909091, 'p': 0...",0.102564,0.155844,"[0.005589070069189393, -0.030945203118361214, ...",[],"[{'entityType': 'MISC', 'value': 'Azathioprine'}]","[{'entityType': 'MISC', 'value': 'Azathioprine'}]",False,85.0
131,13,Botox (ZGT Helon),Andere indicaties,Niet alleen rimpels kunnen behandeld worden me...,13: Botox (ZGT Helon): Andere indicaties: Niet...,Welke indicaties kunnen behandeld worden met b...,Botuline-toxine kan gebruikt worden voor de be...,"[{'rouge-1': {'r': 0.09523809523809523, 'p': 0...","[{'rouge-1': {'r': 0.21428571428571427, 'p': 0...",0.163265,0.305085,"[-0.0030785119180306996, -0.0232077315595189, ...","[{'entityType': 'MISC', 'value': 'Botox'}, {'e...",[],"[{'entityType': 'MISC', 'value': 'Botuline-tox...",False,100.0
105,14,Epoxyhars,Wat zijn de reacties bij een epoxyhars allergie?,De reactie uit zich in een typische allergisch...,14: Epoxyhars: Wat zijn de reacties bij een ep...,Hoe kan huidcontact met verharders worden beha...,Bij huidcontact met verharders moeten de huid ...,"[{'rouge-1': {'r': 0.058823529411764705, 'p': ...","[{'rouge-1': {'r': 0.1323529411764706, 'p': 0....",0.08,0.2,"[0.02238983736009096, -0.012224410984802246, 0...",[],[],[],True,80.0
133,14,Formaldehyde,Behandeling van de contact dermatitis,"Probeer formaldehyde te vermijden, of neem maa...",14: Formaldehyde: Behandeling van de contact d...,Wat is de behandeling voor contact dermatitis ...,De behandeling voor contact dermatitis veroorz...,"[{'rouge-1': {'r': 0.1, 'p': 0.3, 'f': 0.14999...","[{'rouge-1': {'r': 0.4, 'p': 0.5, 'f': 0.44444...",0.15,0.333333,"[0.020079466654988606, -0.013833082667701904, ...",[],[],[],True,100.0
55,15,Aftercare (Nocepta),Medicatie,- Verdovingsvloeistof. De verdovingsvloeistof ...,15: Aftercare (Nocepta): Medicatie: - Verdovin...,Wat zijn mogelijke bijwerkingen van de ontstek...,De ontstekingsremmer kan een aantal bijwerking...,"[{'rouge-1': {'r': 0.034482758620689655, 'p': ...","[{'rouge-1': {'r': 0.12643678160919541, 'p': 0...",0.06383,0.215686,"[-0.01583096234668532, -0.0333633840205851, 0....","[{'entityType': 'ORG', 'value': 'Nocepta'}]",[],[],True,100.0
15,16,"Lymfoedeem, fysiotherapie",Bewegen,Probeer de Nederlandse Norm Gezond Bewegen te ...,"16: Lymfoedeem, fysiotherapie: Bewegen: Probee...",Wat wordt verstaan onder matig intensieve lich...,Matig intensieve lichamelijke activiteit voor ...,"[{'rouge-1': {'r': 0.08620689655172414, 'p': 0...","[{'rouge-1': {'r': 0.29310344827586204, 'p': 1...",0.147059,0.426667,"[0.014031632256061398, -0.02711931882815061, 0...","[{'entityType': 'MISC', 'value': 'Nederlandse ...",[],[],True,100.0
135,17,Anesthesie volwassenen,De dag van de opname,Op de operatiedag krijgt u operatiekleding aan...,17: Anesthesie volwassenen: De dag van de opna...,Moet ik mijn operatiekleding zelf meenemen of ...,Op de operatiedag krijgt u operatiekleding aan.,"[{'rouge-1': {'r': 0.018518518518518517, 'p': ...","[{'rouge-1': {'r': 0.12962962962962962, 'p': 1...",0.03125,0.229508,"[0.008764605165415593, -0.04564820668966094, 0...",[],[],[],True,100.0


# Fact Q&A

In [17]:
sample = loadData(fname, basepath)

sample['Context Entities'] = sample['Document'].apply(extract_all_entities)

new_questions_df = generateFactQuestions(sample)
new_questions_df.to_csv(f"{basepath}/output_fact_raw_{fname}.csv")
df = vectorFilter(new_questions_df)
df = llmScore(df)
df = filterFactQuestionCorrectness(df)

## take top one for each doc
df['Fact Value'] = df['Fact'].apply(lambda x: x.get('value'))
idx = df.groupby(['Document', 'Fact Value'])['LLM Score'].idxmax()
df = df.loc[idx]

Generating Fact QAs: 100%|██████████| 50/50 [00:00<00:00, 15307.68it/s]


KeyError: 'Question'

In [None]:

df.to_csv(f"{basepath}/output_full_fact_{fname}.csv")
df.to_csv(f"{basepath}/output_fact_{fname}.csv", columns = ['topic_id', 'filename', 'header', 'section_text', 'Fact', 'Question', 'Answer'])