In [94]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import os
import re

In [95]:
html = open('search_results.html')
soup = BeautifulSoup(html, 'html.parser')

In [96]:
# Extract all links inside result-item medicine divs
urls = [a['href'] for a in soup.select('.result-item.medicine a[href]')]

In [97]:
base_url ='https://www.farmacotherapeutischkompas.nl'

In [98]:
# Function to fetch and extract full 'recipe' sections containing ' FNA'
def extract_fna_recipe(url):
    full_url = base_url + url
    filename= url.split('/')[-1]
    filepath = f'pages/{filename}.html'       
    try:
        if not os.path.exists(filepath):
            try:
                response = requests.get(full_url)
                file = open(filepath, 'wb')
                file.write(response.content)
                file.close()

            except Exception as e:
                print(e)
            if response.status_code == 200:
                page_soup = BeautifulSoup(response.text, 'html.parser')
            
        else:
            try:
                file = open(filepath)
                page_soup = BeautifulSoup(file.read(), 'html.parser')
                file.close()
            except Exception as e:
                print(e)
            
        fna_recipes = [section for section in page_soup.find_all('section', class_='recipe') if 'FNA' in section.get_text()]

        atc = page_soup.find_all('span',class_='byline-item')
        atc_text = atc[-1].get_text() if atc else ""
            
        return fna_recipes, atc_text
    
    except Exception as e:
        print(e)
        return [],""

In [99]:
def get_ATC():
    pass

In [100]:
columns = ["Brandname", "ActiveIngredient", "Strength", "DosageForm", "ATC", "PrescribingProduct"]

#Initialize library
records = []

# Iterate through URLs and extract relevant content

for url in urls:
    print(base_url+url)
    fna_recipes, atc = extract_fna_recipe(url)

    #print(fna_recipes)
    for section in fna_recipes:
        name = section.find("span", class_="name").text.strip() if section.find("span", class_="name") else ""

        try:
            doses = [dose for dose in section.find_all(class_='doses')]
            #print(doses)

            for dose in doses:
                try:
                    dosage_form = dose.find("dd", title="Toedieningsvorm").text.strip() if dose.find("dd", title="Toedieningsvorm") else ""
                except Exception as e:
                    print(e)
                            
                try:
                    strengths_text =  dose.find("dd", title="Sterkte").text.strip()
                    strengths = re.split(r",\s", strengths_text,flags=re.IGNORECASE)

                except Exception as e:
                    print(e)
                    strengths=[]
                
                if strengths:
                    for strength in strengths:
                        records.append({
                            "Brandname": name,
                            "ActiveIngredient": "",
                            "Strength": strength,
                            "DosageForm": dosage_form,
                            "ATC": atc,
                            "PrescribingProduct": ""
                        })
                else:
                    records.append({
                        "Brandname": name,
                        "ActiveIngredient": "",
                        "Strength": "",
                        "DosageForm": dosage_form,
                        "ATC": atc,
                        "PrescribingProduct": ""
                    })
        ####################################################
        except Exception as e:
            print(e)
        
            try:
                dosage_form = section.find("dd", title="Toedieningsvorm").text.strip() if section.find("dd", title="Toedieningsvorm") else ""
                #dosage_form = section.find_all("dd", title="Toedieningsvorm")

                print(dosage_form)
            except Exception as e:
                print(e)
                        
            try:
                strengths=  dose.find("dd", title="Sterkte").text.strip().split(',')
            except Exception as e:
                print(e)
                strengths=[]
            
            if strengths:
                for strength in strengths:
                    records.append({
                        "Brandname": name,
                        "ActiveIngredient": "",
                        "Strength": strength,
                        "DosageForm": dosage_form,
                        "ATC": atc,
                        "PrescribingProduct":""
                    })
            else:
                records.append({
                    "Brandname": name,
                    "ActiveIngredient": "",
                    "Strength": "",
                    "DosageForm": dosage_form,
                    "ATC": atc,
                    "PrescribingProduct": ""
                })

#Create dataframe
df = pd.DataFrame(records, columns=columns)
# Display DataFrame
#display(df)





https://www.farmacotherapeutischkompas.nl/bladeren/preparaatteksten/h/hydrocortison__systemisch_
https://www.farmacotherapeutischkompas.nl/bladeren/groepsteksten/corticosteroiden__systemisch
https://www.farmacotherapeutischkompas.nl/bladeren/preparaatteksten/c/colecalciferol
https://www.farmacotherapeutischkompas.nl/bladeren/groepsteksten/vitamine_d_en_analoga
https://www.farmacotherapeutischkompas.nl/bladeren/preparaatteksten/i/indifferente__vette__creme
'NoneType' object has no attribute 'text'
'NoneType' object has no attribute 'text'
https://www.farmacotherapeutischkompas.nl/bladeren/groepsteksten/indifferente_middelen
https://www.farmacotherapeutischkompas.nl/bladeren/preparaatteksten/l/lidocaine__gel__aanstipvloeistof_
https://www.farmacotherapeutischkompas.nl/bladeren/groepsteksten/anesthetica__lokaal_via_huid_of_slijmvlies
https://www.farmacotherapeutischkompas.nl/bladeren/preparaatteksten/v/valproinezuur
https://www.farmacotherapeutischkompas.nl/bladeren/groepsteksten/anti_epi

In [101]:
'''Cleaning up dosage forms'''

## Remove values between parantheses, term FNA, percentages, etc.
df['DosageForm']=df['DosageForm'].str.replace(r"\(.*\)|\'.*\'|FNA|\d+\%","", regex=True)


In [102]:
'''Cleaning up Strengths'''
'''No need for standardization. Already standardized.'''

'No need for standardization. Already standardized.'

In [103]:
'''Extract active ingredient'''
df['ActiveIngredient'] = df['Brandname'].apply(lambda x: re.sub(r"FNA","",x) if 'Zure' in x else re.sub(r"\s.*$", "", x))

## Manual replace
dictionary = {
    'drank':'',
    'smeersel':'',
    'poeder':'',
    'Polymyxine':'Polymyxine B',
    'mondspoeling':'',
    'gel':'',
    'crème':'',
    'zalf':'',
}

df['ActiveIngredient'] = df['ActiveIngredient'].replace(dictionary,regex=True)

In [105]:
'''Create prescribing product'''
df['PrescribingProduct'] = df['ActiveIngredient'] + ' - ' + df['Strength'] + ' - ' + df['DosageForm']

In [108]:
from datetime import date
output_filename = f'output/edups_FNA_formularium_v{date.today()}.csv'

df.to_csv(output_filename, index=False, encoding='latin-1',sep=';', decimal=',', errors='ignore')
