In [203]:
import pandas as pd
import re

In [199]:
# Read the pipe-separated source extract into the working DataFrame
file_path = 'source_file_GIB.csv'
data = pd.read_csv(file_path, sep='|')

In [200]:
# Units recognised by optimized_extract_strength. "miligram"/"mililiter" are kept
# exactly as the original list spelled them; the standard Dutch spellings
# "milligram"/"milliliter" are accepted as well.
_STRENGTH_UNITS = (
    'mg', 'miligram', 'milligram', 'g', 'gram', 'ml', 'mililiter', 'milliliter',
    'µg', 'microgram', 'IE', 'internationale eenheden', 'eenheden', 'SQ-E',
    'anti-Xa-eenheden', 'dosis', 'ME', 'uur', '%', 'miljoen internationale eenheden',
    'mmol', 'mol', 'l', 'liter', 'Bq', 'MBq', 'SQ-T', 'DU', 'AU', 'Allerganeenheden',
)

# Regex alternation takes the first alternative that matches, so longer units
# must come first ("gram" before "g", "liter" before "l"). The negative
# lookahead rejects a unit that is immediately followed by another letter, so
# "1 groeimelk" is not read as "1 g".
_STRENGTH_UNIT_RE = (
    '(?:'
    + '|'.join(re.escape(unit) for unit in sorted(_STRENGTH_UNITS, key=len, reverse=True))
    + r')(?![^\W\d_])'
)

# <number>[/<number>...] <unit> [/ [<number>] <unit> ...]
_STRENGTH_PATTERN = re.compile(
    r'\d+[.,]?\d*(?:/\d+[.,]?\d*)*\s?' + _STRENGTH_UNIT_RE
    + r'\s?(?:/\s?\d*[.,]?\d*\s?' + _STRENGTH_UNIT_RE + r')*',
    re.IGNORECASE,
)


# Function to extract various strength formats, including concentrations, percentages, and ratios
def optimized_extract_strength(product_name):
    """Return the first strength expression found in a product name.

    Recognised shapes include a plain amount ("500 mg"), a percentage ("1%"),
    a combined amount ("10/20 mg") and a concentration or rate
    ("250 mg/5 ml", "12 µg/uur"). Matching is case-insensitive and both "."
    and "," are accepted as decimal separators.

    Parameters
    ----------
    product_name : str
        Product name to scan. Any non-string value (e.g. NaN for a missing
        cell) is treated as "no strength available".

    Returns
    -------
    str or None
        The matched text with surrounding whitespace removed, or None when
        nothing matches.
    """
    # Missing cells arrive as float NaN; re.search would raise TypeError on them.
    if not isinstance(product_name, str):
        return None

    match = _STRENGTH_PATTERN.search(product_name)
    if match is None:
        return None
    # The pattern may swallow one trailing space after the unit.
    return match.group(0).strip()
# Derive STERKTE from PRODUCTNAAM. The extraction only needs that one column,
# so it is applied column-wise: this avoids building a Series object for every
# row (DataFrame.apply with axis=1) and still yields a Series when the frame
# is empty.
data['STERKTE'] = data['PRODUCTNAAM'].apply(optimized_extract_strength)



In [201]:
# Start the cleaned ingredient column from the raw one, with every '#' turned into '/'
# (presumably '#' separates multiple active ingredients - TODO confirm against the source spec)
data['WERKZAMESTOFFEN_CLEAN'] = data['WERKZAMESTOFFEN'].replace(to_replace=r'#', value='/', regex=True)


# Regex fragments for Dutch salt / counter-ion / hydration terms that are removed
# from ingredient names. Entries are regex source, not plain text.
common_salts_dutch = [
    "(di)?hydrochloride",
    "nitraat",
    "fosfaat",
    "natrium",
    "kalium",
    "calcium",
    "magnesium",
    "sulfaat",
    "tartraat",
    "mesilaat",
    "acetaat",
    "fumaraat",
    "malaat",
    "chloride",
    "bromide",
    "succinaat",
    "mesylaat",
    "propionaat",
    "xinafoaat",
    "butiraat",
    "butyraat",
    "bes[iy]laat",
    "arginine",
    "(un)?decanoaat",
    "maleaat",
    "citraat",
    "diwaterstof",
    "valeraat",
    "lactobionaat",
    "benzoaat",
    "(-)?(TERT)?(-)?BUTYLAMINE",
    "CILEXETIL",
    "PROPYLEENGLYCOLAAT",
    "etexilaat",
    "carbonaat",
    "oxalaat",
    "--",
]

# Ingredient values that must be left untouched by the cleaning step
exclusion_list = {"carbasalaatcalcium"}

# One case-insensitive alternation over all fragments. The optional prefix
# (di|hydro|X-|waterstof) is used as the join separator, so it precedes every
# fragment except the first, which carries its own "(di)?" prefix.
# NOTE(review): the alternation is not anchored to word boundaries, so a fragment
# is also removed when it is only part of a longer name - confirm this is intended.
salt_pattern = re.compile(
    '({})'.format('|(di|hydro|X-|waterstof)?'.join(common_salts_dutch)),
    flags=re.IGNORECASE,
)

# Hydration suffixes: a number, 'n' or 'X' followed by '-water'
water_pattern = re.compile(r'((\d+([.,]?)\d*)|n|X)-water', flags=re.IGNORECASE)

# Generalized function to remove unwanted patterns (salts and '-water') while respecting the exclusion list
def clean_ingredient(ingredient):
    """Strip salt and hydration terms from an ingredient string.

    The value is split on whitespace and '/' (separators are kept). Every
    piece has ``salt_pattern`` and then ``water_pattern`` matches removed,
    except pieces whose lowercase form is in ``exclusion_list``; those are
    kept verbatim. This protects an excluded name inside a combination value
    ("a/b") as well as when it is the whole value.

    Parameters
    ----------
    ingredient : str
        Ingredient text; callers are expected to have replaced missing
        values with '' beforehand.

    Returns
    -------
    str
        The cleaned text with leading and trailing whitespace removed.
    """
    # A value that is an exclusion entry as a whole is returned unchanged,
    # which also covers multi-word entries should they be added to the list.
    if ingredient.lower() in exclusion_list:
        return ingredient.strip()

    cleaned_pieces = []
    # The capturing group keeps the separators, so joining the pieces again
    # reproduces the original spacing and slashes.
    for piece in re.split(r'(\s+|/)', ingredient):
        if piece.lower() in exclusion_list:
            cleaned_pieces.append(piece)
            continue
        without_salts = salt_pattern.sub('', piece)
        cleaned_pieces.append(water_pattern.sub('', without_salts))

    return ''.join(cleaned_pieces).strip()


# Clean every ingredient value; missing values are replaced by '' first so the
# cleaner always receives a string
data['WERKZAMESTOFFEN_CLEAN'] = data['WERKZAMESTOFFEN_CLEAN'].fillna('').map(clean_ingredient)



In [202]:
# Keep only the text before the first comma of FARMACEUTISCHEVORM, trimmed;
# missing values become ''
data['FARMACEUTISCHEVORM_CLEAN'] = data['FARMACEUTISCHEVORM'].fillna("").apply(
    lambda form: form.partition(',')[0].strip()
)


In [205]:
# Build the label "<ingredients> - <strength> - <form>"; a missing part is
# rendered as an empty string, the " - " separators are always present
data['PRESCRIBING_PRODUCT'] = (
    data['WERKZAMESTOFFEN_CLEAN'].fillna("")
    + " - "
    + data['STERKTE'].fillna("")
    + " - "
    + data['FARMACEUTISCHEVORM_CLEAN'].fillna("")
)


# Derived columns next to the original product name (not referenced again in the visible cells)
temp = data.loc[:, [
    'PRESCRIBING_PRODUCT',
    'PRODUCTNAAM',
    "STERKTE",
    'WERKZAMESTOFFEN_CLEAN',
    'FARMACEUTISCHEVORM_CLEAN',
]]

In [211]:
# Drop duplicates based on the trade product name: the first row per PRODUCTNAAM is kept
data = data.drop_duplicates(subset='PRODUCTNAAM')


In [212]:
# Write the selected columns to a ';'-separated CSV without the index;
# the utf-8-sig codec prepends a byte-order mark to the file
export_path = 'edups_formularium_concept.csv'
data.to_csv(
    export_path,
    columns=[
        'PRESCRIBING_PRODUCT',
        'PRODUCTNAAM',
        "STERKTE",
        'WERKZAMESTOFFEN_CLEAN',
        'FARMACEUTISCHEVORM',
        'AFLEVERSTATUS',
    ],
    sep=';',
    index=False,
    encoding='utf-8-sig',
)

# Echo the output path as the cell result
export_path


'edups_formularium_concept.csv'