# In [None]:
import time
from urllib.parse import quote_plus, urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Output schema: every scraped row is initialised with exactly these keys,
# in this order, so the resulting CSV has a fixed column layout.
columns = [
    # What the resource is and where it lives
    "Resource type",
    "Locality of resource",
    "Description",
    "Additional information on resource",
    "Year published",
    "URL of resource",
    "How to request access to this resource",
    # Point of contact
    "Name of resource's point of contact",
    "Organization of point of contact",
    "Email of resource's point of contact",
    # Technology and format
    "Technology Used",
    "GIS methods",
    "Resource Format (open data)",
    "Resource Format (tools and applications)",
    "Resource Format (case study)",
    # Peacebuilding classification
    "Peace Processes",
    "Peacebuilding outcomes",
    "Conflict Stages",
    "Civil Society Tags",
    "Economic Foundations Tags",
    "Safe Environments Tags",
    "Social Inclusion Tags",
    "Social Inclusion Tags (other)",
    "State Institution Tags",
]

# Search terms, one list per top-level category. Each term is used verbatim
# both as the catalog search query and as the tag written to the output row.
_social_inclusion_terms = [
    "Dispute Resolution",
    "Gender equality behaviour change",
    "Intergroup dialogues",
    "Peace Education",
    "Peace Messaging and Media",
    "Social inclusion/ reintegration",
]

_peace_process_terms = [
    "Peace policy influencing",
    "Support for peace agreement implementation",
    "Transitional justice processes",
]

# NOTE(review): "programms" looks like a misspelling; it is kept as-is here
# because the string is sent as a search query and stored as a tag.
_safe_environment_terms = [
    "Civilian police reform",
    "Conflict-focused early warning systems",
    "Countering violent extremism",
    "Demining",
    "Disarmament, demobilisation, and reintegration (DDR) and gang drop-out programms",
    "Preventative protection measures",
]

_civil_society_terms = [
    "Civic engagement initiatives",
    "Civil society capacity building",
    "Justice and human rights support",
    "Social funds, community-driven development and reconstruction",
]

_state_institution_terms = [
    "Justice system support/reform",
    "Land reform",
    "Public sector governance and institutionalization",
    "Security sector reform",
    "Support for elections",
    "Support for foundational state design processes",
]

_economic_foundation_terms = [
    "Academic catch-up",
    "Cash transfers and subsidies",
    "Community-based natural resource management",
    "Financial inclusion",
    "In-kind transfers and food assistance",
    "Infrastructure development and reconstruction",
    "Jobs creation",
    "Life skills and employment training",
    "Market development",
    "National natural resource benefits sharing",
    "Transboundary water sharing",
]

# Category name -> list of subcategory search terms (insertion order is the
# order in which the categories are crawled).
categories = {
    "Social Inclusion": _social_inclusion_terms,
    "Peace Processes": _peace_process_terms,
    "Safe environment": _safe_environment_terms,
    "Civil society": _civil_society_terms,
    "State Institution": _state_institution_terms,
    "Economic Foundation": _economic_foundation_terms,
}

base_url = "https://catalog.data.gov/dataset"
all_rows = []

# Upper bound (seconds) on any single HTTP request, so one stalled connection
# cannot hang the whole crawl (requests waits indefinitely without a timeout).
REQUEST_TIMEOUT = 30

# Pause (seconds) between requests to be polite to the server.
REQUEST_DELAY = 1

# Output column that receives the subcategory, keyed by top-level category.
CATEGORY_COLUMNS = {
    "Social Inclusion": "Social Inclusion Tags",
    "Peace Processes": "Peace Processes",
    "Safe environment": "Safe Environments Tags",
    "Civil society": "Civil Society Tags",
    "State Institution": "State Institution Tags",
    "Economic Foundation": "Economic Foundations Tags",
}


def _fetch_soup(url):
    """Download *url* and return its body parsed with ``html.parser``.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-success HTTP status (so error pages are never parsed as data).
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")


def _apply_metadata(row, key, val):
    """Copy one metadata label/value pair into the matching column(s) of *row*.

    *key* is the lower-cased label text and *val* the stripped value text.
    The checks are independent, so one label may populate several columns.
    """
    if "publisher" in key:
        row["Organization of point of contact"] = val
    if "release date" in key or "issued" in key or "published" in key:
        row["Year published"] = val
    if "format" in key:
        val_lower = val.lower()
        if any(fmt in val_lower for fmt in ("csv", "json", "zip")):
            row["Resource Format (open data)"] = val
        elif "application" in val_lower:
            row["Resource Format (tools and applications)"] = val
        else:
            row["Resource Format (case study)"] = val
    if "license" in key:
        # NOTE(review): the license text is stored under "Resource type";
        # confirm this mapping is intended.
        row["Resource type"] = val
    if "contact" in key:
        row["Name of resource's point of contact"] = val


def _scrape_dataset(dataset_url, category, subcat):
    """Fetch one dataset page and return its output row.

    The row is a dict with one key per entry in ``columns`` (blank by
    default), filled from the page's description and metadata fields and
    tagged with *subcat* in the column belonging to *category*.

    Raises:
        requests.RequestException: if the page cannot be downloaded.
    """
    dataset_soup = _fetch_soup(dataset_url)

    description = dataset_soup.find("div", class_="notes embedded-content")

    row = dict.fromkeys(columns, "")
    row["Description"] = description.text.strip() if description else ""
    row["URL of resource"] = dataset_url
    # NOTE(review): these defaults name the Humanitarian Data Exchange although
    # the pages being scraped are on catalog.data.gov — confirm they are intended.
    row["Organization of point of contact"] = "Humanitarian Data Exchange"
    row["Email of resource's point of contact"] = "hdx@un.org"

    for field in dataset_soup.find_all("div", class_="metadata-field"):
        key_tag = field.find("div", class_="metadata-label")
        val_tag = field.find("div", class_="metadata-value")
        if not key_tag or not val_tag:
            continue
        _apply_metadata(row, key_tag.text.strip().lower(), val_tag.text.strip())

    # Assign the subcategory to the column for its category.
    tag_column = CATEGORY_COLUMNS.get(category)
    if tag_column:
        row[tag_column] = subcat

    return row


# Loop through each subcategory
for category, subcats in categories.items():
    for subcat in subcats:
        # quote_plus escapes reserved characters (&, #, +, %, ...) as well as
        # spaces, so any search term yields a well-formed query string.
        search_url = f"{base_url}?q={quote_plus(subcat)}"
        print(f"Searching: {search_url}")
        try:
            soup = _fetch_soup(search_url)
        except requests.RequestException as e:
            # A failed search must not abort the crawl and discard the rows
            # gathered so far; skip this term and carry on.
            print(f"Error searching '{subcat}': {e}")
            time.sleep(REQUEST_DELAY)
            continue

        for result in soup.find_all("h3", class_="dataset-heading"):
            try:
                # urljoin copes with absolute and relative hrefs alike.
                dataset_url = urljoin(base_url, result.a['href'])
                row = _scrape_dataset(dataset_url, category, subcat)
            except Exception as e:
                # Best-effort: one bad result should not stop the others.
                print(f"Error processing dataset: {e}")
            else:
                all_rows.append(row)
            finally:
                # Be polite on the failure path too, not only after success.
                time.sleep(REQUEST_DELAY)

# Save all data.
# Passing ``columns`` pins the CSV schema and column order, and guarantees a
# header row is written even when no datasets were scraped (a DataFrame built
# from an empty list would otherwise have no columns at all).
df = pd.DataFrame(all_rows, columns=columns)
if df.empty:
    print("Warning: no datasets were scraped; the CSV will contain only the header row.")
df.to_csv("final_peace_datasets.csv", index=False)
print("✅ Scraping complete. Data saved to 'final_peace_datasets.csv'")
