In [0]:
import json
import pandas as pd

In [0]:
# ---- CONFIG ----
INPUT_JSON = "drug-label-0001-of-0013.json"
OUTPUT_CSV = "drug_labels_first_1000_complete.csv"
TARGET_COUNT = 1000

# ---- MANDATORY FIELDS ----
# A record is kept only when every one of these is present AND non-empty.
required_openfda_fields = ["brand_name", "generic_name", "manufacturer_name", "product_ndc", "route"]
required_root_fields = ["id", "set_id", "active_ingredient", "warnings", "indications_and_usage", "purpose"]

# ---- OUTPUT LAYOUT ----
# Fields copied from the nested "openfda" object / from the record root and
# flattened to a single comma-separated string each.
OPENFDA_JOINED_FIELDS = [
    "brand_name",
    "generic_name",
    "manufacturer_name",
    "product_ndc",
    "route",
    "substance_name",
]
ROOT_JOINED_FIELDS = [
    "active_ingredient",
    "purpose",
    "indications_and_usage",
    "warnings",
    "dosage_and_administration",
    "inactive_ingredient",
]
# Column order of the CSV. Passing it to the DataFrame explicitly keeps the
# header row even when no record qualifies.
OUTPUT_COLUMNS = (
    ["id", "set_id"]
    + OPENFDA_JOINED_FIELDS
    + ROOT_JOINED_FIELDS
    + ["effective_time", "version"]
)


def _join_values(value):
    """Flatten one label field into a single string.

    Lists/tuples are joined with ", ". A bare string is returned unchanged
    (joining it would interleave ", " between its characters), and a
    missing/null value becomes "".
    """
    if value is None:
        return ""
    if isinstance(value, (list, tuple)):
        return ", ".join(str(part) for part in value if part is not None)
    return str(value)


def _is_complete(item, openfda):
    """Return True when every mandatory field is present and non-empty."""
    return all(item.get(field) for field in required_root_fields) and all(
        openfda.get(field) for field in required_openfda_fields
    )


def _build_entry(item, openfda):
    """Build one flat CSV row (a dict keyed by OUTPUT_COLUMNS) from a record."""
    entry = {"id": item.get("id"), "set_id": item.get("set_id")}
    for field in OPENFDA_JOINED_FIELDS:
        entry[field] = _join_values(openfda.get(field))
    for field in ROOT_JOINED_FIELDS:
        entry[field] = _join_values(item.get(field))
    entry["effective_time"] = item.get("effective_time")
    entry["version"] = item.get("version")
    return entry


def extract_complete_records(results, limit=TARGET_COUNT):
    """Return up to *limit* flattened rows for records with all mandatory fields.

    Args:
        results: iterable of label records (dicts), in input order.
        limit: maximum number of rows to return; values <= 0 yield no rows.

    Returns:
        A list of dicts, one per accepted record, in input order.
    """
    records = []
    if limit <= 0:
        return records

    for item in results:
        # "openfda" may be absent or explicitly null; treat both as empty so
        # the record is skipped instead of raising.
        openfda = item.get("openfda")
        if not isinstance(openfda, dict):
            openfda = {}

        if not _is_complete(item, openfda):
            continue

        records.append(_build_entry(item, openfda))

        # Stop as soon as enough complete records have been collected.
        if len(records) >= limit:
            break

    return records


# The guard is true when run as a script or notebook cell, so the globals
# below (data, results, records, df) are still defined for later cells;
# importing this code no longer touches the filesystem.
if __name__ == "__main__":
    # ---- LOAD JSON ----
    with open(INPUT_JSON, "r", encoding="utf-8") as f:
        data = json.load(f)

    # `or []` also covers an explicit `"results": null`.
    results = data.get("results") or []

    # ---- EXTRACT RECORDS ----
    records = extract_complete_records(results, TARGET_COUNT)

    # ---- SAVE TO DATAFRAME ----
    df = pd.DataFrame(records, columns=OUTPUT_COLUMNS)
    df.to_csv(OUTPUT_CSV, index=False)

    print(f"✅ Completed! Extracted {len(df)} fully-complete drug records.")
    print(f"📌 Saved to: {OUTPUT_CSV}")
    print(df.head())