## Data Parsing Test Run

In [1]:
# Python code to parse the provided preview lines and extract drug names, forms, and dosages

import re

# Sample drug-entry lines used to exercise the parsing functions in this notebook.
# NOTE(review): presumably copied from a preview of the source file — confirm provenance.
sample_data = [
    "A and B vaccine/PF I",
    "A and D TOPICAL OINT.",
    "A and D/white pet/lanolin",
    "A INJECTION V",
    "A INTRAMUSC",
    "A INTRAMUSC V",
    "A microsphr"
]

# Define a function to parse the data
def parse_drug_info(data):
    """Split each line into a drug name and a trailing form/dosage token.

    The last whitespace-separated token of a line is taken as the
    form/dosage; every token before it, re-joined with single spaces, is the
    drug name. A line with fewer than two tokens has no form/dosage.

    Args:
        data: Iterable of strings, one drug entry per string. Leading and
            trailing whitespace (including newline terminators) is ignored.

    Returns:
        A list with one dict per input line, in input order, each holding
        'drug_name' (str) and 'form_dosage' (str, or None when absent).
    """
    parsed_data = []
    for line in data:
        # str.split() with no separator discards leading/trailing whitespace,
        # so a line ending in "\n" no longer produces an empty last token
        # that would be mistaken for the form/dosage.
        parts = line.split()
        if len(parts) > 1:
            # Assume the last token carries the form/dosage information.
            *name_parts, form_dosage = parts
            drug_name = " ".join(name_parts)
        else:
            form_dosage = None
            drug_name = line.strip()

        parsed_data.append({
            'drug_name': drug_name,
            'form_dosage': form_dosage
        })
    return parsed_data

# Parse the sample data with the last-token splitter; the bare name on the
# final line makes the parsed list the cell's displayed output.
parsed_sample_data = parse_drug_info(sample_data)
parsed_sample_data

[{'drug_name': 'A and B vaccine/PF', 'form_dosage': 'I'},
 {'drug_name': 'A and D TOPICAL', 'form_dosage': 'OINT.'},
 {'drug_name': 'A and D/white', 'form_dosage': 'pet/lanolin'},
 {'drug_name': 'A INJECTION', 'form_dosage': 'V'},
 {'drug_name': 'A', 'form_dosage': 'INTRAMUSC'},
 {'drug_name': 'A INTRAMUSC', 'form_dosage': 'V'},
 {'drug_name': 'A', 'form_dosage': 'microsphr'}]

## Refined Parsing Test

In [2]:
# Improved parsing strategy using regular expressions for better accuracy

# Route / form / unit abbreviations that mark where the drug name ends.
# 'MCG' and 'UNIT' are listed explicitly because they used to be recognised
# only as substring hits on 'MC' and 'UNI'.
_FORM_DOSAGE_KEYWORDS = (
    'TOPICAL', 'INJECTION', 'ORAL', 'SUBCUT', 'INTRAVEN', 'NASAL', 'OTIC',
    'VAGINAL', 'TRANSDERM', 'INTRAUTERI', 'INTRAMUSC', 'CAPSULE', 'TABLET',
    'SOLUTION', 'SUSP', 'GEL', 'CREAM', 'OINT', 'LIQD', 'POWD', 'AERO',
    'DROPS', 'SYRINGE', 'VIAL', 'LOZENGE', 'SHAMPOO', 'FOAM', 'CONC',
    'MD PMP', 'AUTO INJCT', 'CARTRIDGE', 'IMPLANT', 'PEN INJCTR', 'PK',
    'SPR', 'DS PK', 'ER', 'SA', 'DR', 'CAP', 'TAB', 'CHEW', 'G', 'ML', 'MG',
    'MCG', 'MC', 'UNIT', 'UNI', '%',
)

# A keyword only counts when it stands as its own token: it must not be glued
# to a letter on either side (digits and punctuation are fine, so "500MG",
# "1%" and "OINT." still match). An optional plural "S" is tolerated. Without
# these guards, short keywords such as 'G' or 'ER' match inside drug names.
# Compiled once at definition time instead of on every call.
_FORM_DOSAGE_PATTERN = re.compile(
    r'(?<![A-Za-z])(?:'
    + '|'.join(re.escape(keyword) for keyword in _FORM_DOSAGE_KEYWORDS)
    + r')S?(?![A-Za-z])'
)


def improved_parse_drug_info(data):
    """Split each line at the first recognised form/dosage keyword.

    Everything before the first keyword token is the drug name; the keyword
    and everything after it is the form/dosage. Matching is case-sensitive
    (keywords are upper-case) and whole-token only.

    Args:
        data: Iterable of strings, one drug entry per string. Leading and
            trailing whitespace (including newline terminators) is ignored.

    Returns:
        A list with one dict per input line, in input order, each holding
        'drug_name' (str) and 'form_dosage' (str, or None when no keyword
        was found).
    """
    improved_parsed_data = []

    for raw_line in data:
        # Strip first so lines read with readlines() do not leak "\n" into
        # the drug name when no keyword is found.
        line = raw_line.strip()
        match = _FORM_DOSAGE_PATTERN.search(line)
        if match:
            start_pos = match.start()
            drug_name = line[:start_pos].strip()
            form_dosage = line[start_pos:].strip()
        else:
            drug_name = line
            form_dosage = None

        improved_parsed_data.append({
            'drug_name': drug_name,
            'form_dosage': form_dosage
        })
    return improved_parsed_data

# Apply the keyword-based parser to the same sample data so its result can be
# compared with the last-token split; the bare name on the final line makes
# the parsed list the cell's displayed output.
improved_parsed_sample_data = improved_parse_drug_info(sample_data)
improved_parsed_sample_data


[{'drug_name': 'A and B vaccine/PF I', 'form_dosage': None},
 {'drug_name': 'A and D', 'form_dosage': 'TOPICAL OINT.'},
 {'drug_name': 'A and D/white pet/lanolin', 'form_dosage': None},
 {'drug_name': 'A', 'form_dosage': 'INJECTION V'},
 {'drug_name': 'A', 'form_dosage': 'INTRAMUSC'},
 {'drug_name': 'A', 'form_dosage': 'INTRAMUSC V'},
 {'drug_name': 'A microsphr', 'form_dosage': None}]

## Parsing Taha_Output Test File

In [3]:
# Requires 'improved_parse_drug_info' and the 're' import from the earlier
# cells; 'file_path' is defined here.

file_path = 'Taha_Output.txt'
# Step 1: Apply Improved Parsing to Entire File Content
# Strip newline terminators and skip blank lines so they neither leak into
# 'drug_name' nor produce empty entries.
with open(file_path, 'r') as file:
    entire_file_data = [line.strip() for line in file if line.strip()]

parsed_data = improved_parse_drug_info(entire_file_data)

# Step 2: Normalize and Standardize Form Names and Dosage Units
# Example of normalization (simplified for demonstration).
# Each rule matches a whole token, case-insensitively. Plain substring
# replacement never matched the upper-case abbreviations in the data and
# would corrupt already-expanded words ('tablets' -> 'tabletslets').
FORM_NORMALIZATION_RULES = (
    (re.compile(r'\bcaps?\b', re.IGNORECASE), 'capsules'),
    (re.compile(r'\btabs?\b', re.IGNORECASE), 'tablets'),
)


def normalize_form_dosage(form_dosage):
    """Expand abbreviated form names in a form/dosage string.

    Args:
        form_dosage: Non-empty form/dosage string from a parsed entry.

    Returns:
        The string with every standalone 'cap'/'caps' replaced by 'capsules'
        and every standalone 'tab'/'tabs' replaced by 'tablets' (any case).
    """
    for pattern, replacement in FORM_NORMALIZATION_RULES:
        form_dosage = pattern.sub(replacement, form_dosage)
    return form_dosage


# Entries without a form/dosage are passed through unchanged; the others are
# copied with the normalized value so 'parsed_data' itself is not mutated.
normalized_data = [
    {**entry, 'form_dosage': normalize_form_dosage(entry['form_dosage'])}
    if entry['form_dosage'] else entry
    for entry in parsed_data
]


NameError: name 'file_path' is not defined