<h1 style="text-align: center;">EDA</h1>


### Checking the right path for the data

In [1]:
from pathlib import Path

# Go one level up from src/ → DDI-RAG
PROJECT_ROOT = Path().resolve().parent

RAW_FOLDER = PROJECT_ROOT / "data" / "raw"
PROCESSED_FOLDER = PROJECT_ROOT / "data" / "processed"

print("Project root:", PROJECT_ROOT)
print("Raw folder exists:", RAW_FOLDER.exists())

Project root: C:\Users\bc0683\ddi-rag
Raw folder exists: True


## Understadning toatl number of drugs appeared with generic names in the datased

In [2]:
import ijson
from pathlib import Path

PROJECT_ROOT = Path().resolve().parent
RAW_FOLDER = PROJECT_ROOT / "data" / "raw"

all_drug_names = []

for file in RAW_FOLDER.glob("drug-label-*.json"):
    print(f"Processing {file.name}...")
    
    with open(file, "rb") as f:
        for drug in ijson.items(f, "results.item"):
            generic_names = drug.get("openfda", {}).get("generic_name")
            
            if generic_names:
                all_drug_names.append(generic_names[0].lower())

print("\nTotal drug entries collected:", len(all_drug_names))

Processing drug-label-0001-of-0013.json...
Processing drug-label-0002-of-0013.json...
Processing drug-label-0003-of-0013.json...
Processing drug-label-0004-of-0013.json...
Processing drug-label-0005-of-0013.json...
Processing drug-label-0006-of-0013.json...
Processing drug-label-0007-of-0013.json...
Processing drug-label-0008-of-0013.json...
Processing drug-label-0009-of-0013.json...
Processing drug-label-0010-of-0013.json...
Processing drug-label-0011-of-0013.json...
Processing drug-label-0012-of-0013.json...
Processing drug-label-0013-of-0013.json...

Total drug entries collected: 82945


# Understading total number of drugs present in the datset
## . how many unique data labels are present.

## . how many drugs with interaction infomation in available


In [3]:
import ijson

total_drugs = 0
drugs_with_interactions = 0
unique_names = set()

for file in RAW_FOLDER.glob("drug-label-*.json"):
    print(f"Processing {file.name}...")
    
    with open(file, "rb") as f:
        for drug in ijson.items(f, "results.item"):
            total_drugs += 1
            
            generic_names = drug.get("openfda", {}).get("generic_name")
            interactions = drug.get("drug_interactions")
            
            if generic_names:
                unique_names.add(generic_names[0].lower())
            
            if interactions:
                drugs_with_interactions += 1

print("\n===== SUMMARY =====")
print("Total drug entries:", total_drugs)
print("Unique drug names:", len(unique_names))
print("Drugs with interaction info:", drugs_with_interactions)

Processing drug-label-0001-of-0013.json...
Processing drug-label-0002-of-0013.json...
Processing drug-label-0003-of-0013.json...
Processing drug-label-0004-of-0013.json...
Processing drug-label-0005-of-0013.json...
Processing drug-label-0006-of-0013.json...
Processing drug-label-0007-of-0013.json...
Processing drug-label-0008-of-0013.json...
Processing drug-label-0009-of-0013.json...
Processing drug-label-0010-of-0013.json...
Processing drug-label-0011-of-0013.json...
Processing drug-label-0012-of-0013.json...
Processing drug-label-0013-of-0013.json...

===== SUMMARY =====
Total drug entries: 254983
Unique drug names: 14112
Drugs with interaction info: 66695


### . from the two cells we can observe that there are a totoal of *254K enteries in thh dataset*, whith a *83K generic names* , *unique named druges with around 14K* and *drugs with interactions around 66k*  

| Metric  | What It Counts                |
| ------- | ----------------------------- |
| 254,983 | All label records             |
| 82,945  | Records with generic names    |
| 66,695  | Records with interaction data |
| 14,112  | Unique drug names             |


In [4]:
all_drug_names[:50]

['amoxicillin and clavulanate potassium',
 'emtricitabine and tenofovir disoproxil fumarate',
 'triple antibiotic ointment',
 'acetaminophen, aspirin, and caffeine',
 'titanium dioxide, zinc oxide',
 'buspirone hydrochloride',
 'olmesartan medoxomil / amlodipine besylate / hydrochlorothiazide',
 'linagliptin and metformin hydrochloride',
 'angelica archangelica, drosera, grindelia, lamium album, pinus sylvestris, rubus fruticosus, senega officinalis',
 'pertuzumab, trastuzumab, and hyaluronidase-zzxf',
 'guaifenesin and dextromethorphan hbr',
 'octinoxate, zinc oxide',
 'octinoxate, zinc oxide',
 'dextroamphetamine sulfate, dextroamphetamine saccharate, amphetamine sulfate and amphetamine aspartate',
 'isopropyl alcohol',
 'titanium dioxide, zinc oxide',
 'dextroamphetamine saccharate, amphetamine aspartate, dextroamphetamine sulfate and amphetamine sulfate',
 'water',
 'diclofenac sodium and misoprostol',
 'potassium phosphate, monobasic potassium phosphate, dibasic',
 'estradiol',
 '

In [5]:
from collections import Counter

drug_counts = Counter(all_drug_names)

drug_counts.most_common(20)

[('zinc oxide', 1812),
 ('alcohol', 1419),
 ('acetaminophen', 965),
 ('ibuprofen', 916),
 ('salicylic acid', 910),
 ('menthol', 851),
 ('benzalkonium chloride', 794),
 ('sodium fluoride', 784),
 ('titanium dioxide, zinc oxide', 640),
 ('avobenzone, homosalate, octisalate, octocrylene', 595),
 ('nicotine polacrilex', 520),
 ('ethyl alcohol', 517),
 ('isopropyl alcohol', 514),
 ('oxygen', 449),
 ('lidocaine', 430),
 ('calcium carbonate', 422),
 ('gabapentin', 391),
 ('benzocaine', 385),
 ('aspirin', 370),
 ('hydrocortisone', 349)]

In [6]:
list(unique_names)[:20]

['acetaminophen, aspirin (nsaid), caffeine',
 'oxygen',
 'arnica montana, baptisia, echinacea, eugenia jambosa, hepar shulph',
 'wart remover patch',
 'camphora',
 'anhydrous citric acid, aspirin',
 'folliculitis and acne treatment cream',
 'acid phosd12, nat sulphd7, lycopodiumd4, syzygium jambold8, taraxacumd8, uranium nitd6, uva ursid8',
 'pollinosis combination',
 'ret large intestine',
 'pancreas balance',
 'elamipretide hydrochloride',
 'apis mellifica, arsenicum album, belladonna, chamomilla, lachesis mutus, mercurius solubilis, pulsatilla (pratensis), rhus tox, silicea',
 'acetaminophen capsules 500mg (minis-red opaque)',
 'sda 40b alcohol 200 proof',
 'oxybenzone, padimate',
 'apis combination',
 'dl-camphor, l-menthol, methyl salicylate patch',
 'sual wart remover liquid',
 'clindamycin phosphate usp, 1%']

In [7]:

import pandas as pd 
df = pd.DataFrame({"drug_name": all_drug_names})

print(df.head())
print("Total rows:", len(df))

                                         drug_name
0            amoxicillin and clavulanate potassium
1  emtricitabine and tenofovir disoproxil fumarate
2                       triple antibiotic ointment
3             acetaminophen, aspirin, and caffeine
4                     titanium dioxide, zinc oxide
Total rows: 82945


In [8]:
df["drug_name"].value_counts().head(20)

drug_name
zinc oxide                                         1812
alcohol                                            1419
acetaminophen                                       965
ibuprofen                                           916
salicylic acid                                      910
menthol                                             851
benzalkonium chloride                               794
sodium fluoride                                     784
titanium dioxide, zinc oxide                        640
avobenzone, homosalate, octisalate, octocrylene     595
nicotine polacrilex                                 520
ethyl alcohol                                       517
isopropyl alcohol                                   514
oxygen                                              449
lidocaine                                           430
calcium carbonate                                   422
gabapentin                                          391
benzocaine                            

In [9]:
count_interactions = 0

for file in RAW_FOLDER.glob("drug-label-*.json"):
    with open(file, "rb") as f:
        for drug in ijson.items(f, "results.item"):
            if drug.get("drug_interactions"):
                count_interactions += 1

print("Total entries with interactions:", count_interactions)

Total entries with interactions: 66695


In [11]:
import pandas as pd
import ijson

records = []

for file in RAW_FOLDER.glob("drug-label-*.json"):
    with open(file, "rb") as f:
        for drug in ijson.items(f, "results.item"):
            
            record = {}
            
            # Flatten openfda sub-dict
            openfda = drug.get("openfda", {})
            for key, value in openfda.items():
                record[f"openfda_{key}"] = value[0] if isinstance(value, list) else value
            
            # Add all other fields as string
            for key, value in drug.items():
                if key != "openfda":
                    record[key] = str(value)[:1000]  # truncate to prevent huge memory
            
            records.append(record)

df = pd.DataFrame(records)

print("Dataset shape:", df.shape)
df.head()

Dataset shape: (254983, 182)


Unnamed: 0,openfda_application_number,openfda_brand_name,openfda_generic_name,openfda_manufacturer_name,openfda_product_ndc,openfda_product_type,openfda_route,openfda_substance_name,openfda_rxcui,openfda_spl_id,...,calibration_instructions,guaranteed_analysis_of_feed,components_table,diagram_of_device,pharmacogenomics_table,residue_warning,controlled_substance_table,spl_indexing_data_elements_table,safe_handling_warning_table,intended_use_of_the_device_table
0,ANDA201090,Amoxicillin and Clavulanate Potassium,AMOXICILLIN AND CLAVULANATE POTASSIUM,"Asclemed USA, Inc.",76420-859,HUMAN PRESCRIPTION DRUG,ORAL,AMOXICILLIN,617423.0,25322eee-f92a-5838-e063-6394a90aa376,...,,,,,,,,,,
1,NDA021752,Truvada,EMTRICITABINE AND TENOFOVIR DISOPROXIL FUMARATE,"Gilead Sciences, Inc",61958-0701,HUMAN PRESCRIPTION DRUG,ORAL,EMTRICITABINE,476556.0,32422ee8-1cdc-4305-bb3e-0458472513fc,...,,,,,,,,,,
2,M004,"Triple Antibiotic Ointment, Circle K",TRIPLE ANTIBIOTIC OINTMENT,"Lil' Drug Store Products, Inc.",66715-5305,HUMAN OTC DRUG,TOPICAL,BACITRACIN ZINC,204602.0,44e5107e-832b-2ce6-e063-6294a90a711d,...,,,,,,,,,,
3,M013,"Excedrin Extra Strength, TRAVEL BASIX","ACETAMINOPHEN, ASPIRIN, AND CAFFEINE","Lil Drug Store Products, Inc",66715-6410,HUMAN OTC DRUG,ORAL,ACETAMINOPHEN,209468.0,458c59d3-e231-99ac-e063-6394a90ac131,...,,,,,,,,,,
4,M020,ANTI-AGING TINTED SUNSCREEN,"TITANIUM DIOXIDE, ZINC OXIDE","LASER GIRL SKINCARE, LLC",87444-201,HUMAN OTC DRUG,TOPICAL,TITANIUM DIOXIDE,,4b93cd22-128a-29bc-e063-6294a90ab781,...,,,,,,,,,,
