In [86]:
import json
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Load the link definitions that drive the scrape.
# Point this path at your own JSON file if it lives elsewhere.
with open("./data/links.json", mode="r", encoding="utf-8") as links_file:
    json_data = json.load(links_file)

# Output schema: every scraped row is a dict with exactly these keys.
columns = [
    # Core metadata
    "URL of resource",
    "Description",
    "Year published",
    # Resource format / type
    "Resource Format (open data)",
    "Resource Format (tools and applications)",
    "Resource Format (case study)",
    "Resource type",
    # Point of contact
    "Name of resource's point of contact",
    "Organization of point of contact",
    "Email of resource's point of contact",
    # Thematic tags (one column per category)
    "Social Inclusion Tags",
    "Peace Processes",
    "Safe Environments Tags",
    "Civil Society Tags",
    "State Institution Tags",
    "Economic Foundations Tags",
]

# Accumulator for the scraped rows.
all_rows = []



In [87]:
# Walk every listing URL in the JSON, follow each dataset link found on the
# listing page, and build one row per dataset from its detail page.

# Seconds to wait for any single HTTP response; without a timeout a stalled
# server would block the cell indefinitely.
REQUEST_TIMEOUT_S = 30

# Maps the category names used in the JSON to the output column that receives
# the subtopic as its tag. Categories not listed here get no tag.
TAG_COLUMN_BY_CATEGORY = {
    "Social Inclusion": "Social Inclusion Tags",
    "Peace Processes": "Peace Processes",
    "Safe environment": "Safe Environments Tags",
    "Civil society": "Civil Society Tags",
    "State Institution": "State Institution Tags",
    "Economic Foundation": "Economic Foundations Tags",
}


def _fetch_soup(session, page_url):
    """Download ``page_url`` and return it parsed with html.parser.

    Raises ``requests.RequestException`` on network failure, timeout, or an
    HTTP error status, so an error page is never parsed as if it were content.
    """
    response = session.get(page_url, timeout=REQUEST_TIMEOUT_S)
    response.raise_for_status()
    return BeautifulSoup(response.content, "html.parser")


def _dataset_urls(listing_soup, listing_url):
    """Yield the absolute URL of each dataset linked from a listing page.

    Links are resolved against ``listing_url`` so relative hrefs stay on the
    site the listing came from instead of being forced onto one fixed host.
    """
    for item in listing_soup.find_all("li", class_="dataset-item has-organization"):
        content_div = item.find("div", class_="dataset-content")
        heading = content_div.find("h3", class_="dataset-heading") if content_div is not None else None
        link_tag = heading.find("a") if heading is not None else None
        if link_tag is not None and link_tag.get("href"):
            yield urljoin(listing_url, link_tag["href"])


def _fill_details(row, detail_soup):
    """Copy description, year, publisher and contact e-mail from a dataset page into ``row``.

    Each field is optional: when the expected markup is missing, the
    corresponding entry in ``row`` is left untouched.
    """
    description_div = detail_soup.find("div", class_="notes embedded-content", itemprop="description")
    if description_div is not None:
        row["Description"] = description_div.get_text(strip=True)

    # The dateModified link text looks like "October 4, 2024"; the year is the last token.
    date_span = detail_soup.find("span", itemprop="dateModified")
    date_link = date_span.find("a") if date_span is not None else None
    if date_link is not None:
        date_tokens = date_link.get_text(strip=True).split()
        if date_tokens:  # guard against an empty link text
            row["Year published"] = date_tokens[-1]

    # Publisher name lives in <tr itemprop="publisher"><td><span itemprop="name">.
    publisher_tr = detail_soup.find("tr", itemprop="publisher")
    publisher_td = publisher_tr.find("td") if publisher_tr is not None else None
    publisher_span = publisher_td.find("span", itemprop="name") if publisher_td is not None else None
    if publisher_span is not None:
        row["Organization of point of contact"] = publisher_span.get_text(strip=True)

    # Take the first mailto: link in the contact section, wherever it appears
    # among the section's links.
    contact_section = detail_soup.find("section", class_="module module-narrow contact")
    if contact_section is not None:
        for anchor in contact_section.find_all("a", href=True):
            href = anchor["href"].strip()
            if href.lower().startswith("mailto:"):
                # Drop the scheme and any "?subject=..." suffix.
                row["Email of resource's point of contact"] = href[len("mailto:"):].split("?", 1)[0]
                break


# Start from an empty list so re-running this cell does not append duplicates.
all_rows = []

# One Session reuses connections across the many requests to the same hosts.
with requests.Session() as session:
    for category_obj in json_data:
        for category, subtopics in category_obj.items():
            for subtopic, subcat_data in subtopics.items():
                for url in subcat_data.get("Links", []):
                    print(f"Processing: {url}")
                    try:
                        listing_soup = _fetch_soup(session, url)
                    except Exception as e:
                        # Best effort: a broken listing is reported and skipped.
                        print(f"⚠️ Error processing {url}: {e}")
                        continue

                    for full_url in _dataset_urls(listing_soup, url):
                        row = dict.fromkeys(columns, "")
                        row["URL of resource"] = full_url
                        row["Resource type"] = subcat_data.get("Type", "")

                        tag_column = TAG_COLUMN_BY_CATEGORY.get(category)
                        if tag_column is not None:
                            row[tag_column] = subtopic

                        try:
                            _fill_details(row, _fetch_soup(session, full_url))
                        except Exception as e:
                            # Errors are isolated per dataset: the row is kept
                            # with whatever was filled in, and the remaining
                            # datasets on this listing are still processed.
                            print(f"⚠️ Error processing {full_url}: {e}")

                        all_rows.append(row)


Processing: https://data.humdata.org/dataset/?vocab_Topics=cash+voucher+assistance-cva&vocab_Topics=funding&q=Cash+transfers+and+subsidies&sort=last_modified+desc&ext_page_size=25
Processing: https://catalog.data.gov/dataset?q=Civic+engagement+initiatives+&sort=views_recent+desc&ext_location=&ext_bbox=&ext_prev_extent=
Processing: https://data.humdata.org/dataset/?vocab_Topics=community+engagement&vocab_Topics=development&vocab_Topics=governance+and+civil+society&vocab_Topics=protection&vocab_Topics=gender&q=&sort=last_modified+desc&ext_page_size=25
Processing: https://catalog.data.gov/dataset/?tags=capacity-building
Processing: https://catalog.data.gov/dataset?q=Civil+society+capacity+building&sort=views_recent+desc&ext_location=&ext_bbox=&ext_prev_extent=
Processing: https://data.humdata.org/dataset/?vocab_Topics=development&vocab_Topics=gender&vocab_Topics=governance+and+civil+society&vocab_Topics=protection&vocab_Topics=operational+capacity&q=&sort=last_modified+desc&ext_page_size=

In [88]:
# Preview only the first few rows; echoing the whole list floods the notebook output.
all_rows[:3]


[{'URL of resource': 'https://catalog.data.gov/dataset/2024-annual-technology-baseline-atb-cost-and-performance-data-for-electricity-generation-t',
  'Description': 'These data provide the 2024 update of the Electricity Annual Technology Baseline (ATB). Starting in 2015 NREL has presented the ATB, consisting of detailed cost and performance data, both current and projected, for electricity generation and storage technologies. The ATB products now include data (Excel workbook, Tableau workbooks, and structured summary csv files), as well as documentation and user engagement via a website, presentation, and webinar. Starting in 2021, the data are cloud optimized and provided in the OEDI data lake. The data for 2015 - 2020 are can be found on the NREL Data Search Page. The website documentation can be found on the ATB Website.',
  'Year published': '2025',
  'Resource Format (open data)': '',
  'Resource Format (tools and applications)': '',
  'Resource Format (case study)': '',
  'Resour

In [89]:
# Save to CSV
# Passing columns= pins the column order to the declared schema and still
# produces a header row when no rows were scraped.
df = pd.DataFrame(all_rows, columns=columns)
df.to_csv("final_peace_datasets.csv", index=False)
print("✅ Scraping complete. Data saved to 'final_peace_datasets.csv'")

✅ Scraping complete. Data saved to 'final_peace_datasets.csv'
