In [1]:
from datetime import datetime
import os
import unicodedata

from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Read the saved report page and drop newline characters before parsing, so the
# markup is handed to the parser as one continuous line.
with open('html/ra.html', mode='r', encoding="utf8") as html_file:
    data = html_file.read().replace('\n', '')

# Parsed document tree used by the cells below.
soup = BeautifulSoup(data, 'html.parser')

In [3]:
def get_sections(soup, excluded_agencies=('Test', 'Unreleased')):
    """Collect the distinct section titles (``h3``) found under non-excluded agencies.

    Walks the direct children of the ``div`` with class ``Section1`` in
    document order. Every ``h2`` starts a new agency; every ``h3`` that follows
    is a section title belonging to that agency.

    Parameters
    ----------
    soup
        Parsed document; must expose ``find(name, attrs)`` returning the
        container whose ``children`` are iterated.
    excluded_agencies : iterable of str, optional
        Agency names whose sections are ignored. Defaults to
        ``('Test', 'Unreleased')``.

    Returns
    -------
    set of str
        NFKD-normalised, stripped section titles.

    Raises
    ------
    ValueError
        If the document has no ``div`` with class ``Section1``.
    """
    div = soup.find('div', {'class': 'Section1'})
    if div is None:
        raise ValueError("no <div class='Section1'> container found in the document")

    excluded = set(excluded_agencies)
    # Track the agency of the most recent h2 directly. Looking at the tail of a
    # de-duplicated list goes stale when an agency heading appears a second time.
    current_agency = None
    sections = set()
    # `.children` replaces the deprecated `childGenerator()` alias.
    for child in div.children:
        tag_name = getattr(child, 'name', None)
        if tag_name == 'h2':
            current_agency = unicodedata.normalize("NFKD", child.text).strip()
        elif tag_name == 'h3':
            # A section that precedes every agency heading has no owner: skip it.
            if current_agency is None or current_agency in excluded:
                continue
            sections.add(unicodedata.normalize("NFKD", child.text).strip())

    return sections

In [4]:
# Build `data`, a nested mapping {agency: {section: text-or-DataFrame}}, by
# walking the direct children of the Section1 container in document order:
#   h2                    -> starts a new agency
#   h3                    -> starts a new section inside the current agency
#   div.data_table        -> the section's content becomes a DataFrame
#   p                     -> text appended to the section's content
div = soup.find('div', {'class': 'Section1'})
if div is None:
    raise ValueError("no <div class='Section1'> container found in the document")
# `.children` replaces the deprecated `childGenerator()` alias.
div_children_gen = div.children
agencies = [x.get_text() for x in soup.find_all('h2')]
data = {}
agency_tracker = []    # agencies in first-seen order
section_tracker = []   # sections in the order they were first registered
# Remember the heading we are currently under. Reading the tail of the
# de-duplicated tracker lists goes stale whenever a heading repeats.
current_agency = None
current_section = None
for child in div_children_gen:
    tag_name = getattr(child, 'name', None)
    if tag_name is None:
        # Bare text nodes and comments have no tag name or attributes.
        continue
    tag_class = " ".join(child.attrs.get('class') or [])
    if tag_name == 'h2':
        agency = unicodedata.normalize("NFKD", child.text).strip()
        current_agency = agency
        # Content before the agency's first h3 must not leak into the
        # previous agency's last section.
        current_section = None
        if agency not in agency_tracker:
            data[agency] = {}
            agency_tracker.append(agency)
    elif tag_name == 'h3':
        if current_agency is None:
            # Section heading before any agency heading: nothing to attach it to.
            continue
        section = unicodedata.normalize("NFKD", child.text).strip()
        current_section = section
        if section not in data[current_agency]:
            data[current_agency][section] = ''
            section_tracker.append(section)
    elif tag_name == 'div' and tag_class == 'data_table':
        if current_section is None:
            continue
        table = child.find('table')
        if table is None:
            continue
        table_headers = [th.text for th in table.find_all('th')]
        table_rows = table.find_all('tr')
        rows = []
        for tr in table_rows:
            cells = [td.text for td in tr.find_all('td')]
            # Header-only rows have no <td>; keeping them would add an all-None row.
            if cells:
                rows.append(cells)
        df = pd.DataFrame(rows, columns=table_headers or None)
        # A table replaces whatever text the section had accumulated so far.
        data[current_agency][current_section] = df
    elif tag_name == 'p':
        if current_section is None:
            continue
        existing = data[current_agency][current_section]
        # Only extend plain text. `DataFrame += str` would be applied to every
        # cell of a table already stored for this section.
        if isinstance(existing, str):
            p_text = unicodedata.normalize("NFKD", child.text).strip()
            data[current_agency][current_section] = existing + f'{p_text} '



In [5]:
# Write one workbook per agency into ./data, with one sheet per section.
out_path = os.path.join(os.getcwd(), 'data')
os.makedirs(out_path, exist_ok=True)
now = datetime.now().strftime("%m-%d-%Y")
# Section titles of the first agency (empty list when nothing was parsed).
sheet_names = list(next(iter(data.values()), {}).keys())
# Long section titles -> short worksheet names used in the workbooks.
sheet_name_map = {'Part IA: Grants Management Standard Functions and Activities': 'Part IA Functions Activities',
                  'Part IB: Grants Management Additional Capabilities': 'Part IB Additional Capabilities',
                  'Part IIB: Readiness for Change in Grants Management (Questions 1-2)': 'Part IIB - Readiness for Change',
                  'Part IIA: Grants Management Systems Inventory': 'Part IIA Systems Inventory',
                  "2) Do you have any suggestions for improving existing shared grants systems that you use?": '2) Suggestions',
                  '3) Validation of Agency OCIO Concurrence': '3) OCIO Validation',
                  "3) Does your agency's current organizational governance structure enable effective discussion and decision making across the C-Suite (CFO, CIO, CHCO, CAO, etc) and across agency bureaus and components?": '3) Organizational Environment',
                  '4) Does your agency have agency-wide standard data definitions for Grants Management across the sub-components that are in use today? If "yes", please upload separately.': '4) Standard Data Definitions',
                  "5. If a project to improve grants management services or systems were undertaken at your agency, what pain point(s) could be addressed at your agency?": "5) Pain Points",
                  '7) General Comments': '7) Comments',
                  "Part III: Additional Grants Mgmt Question #1": 'Part III Question #1',
                  "Part III: Additional Grants Mgmt Questions #2": 'Part III Question #2',
                  'Document Uploads': 'Document Uploads',
                  '6) If applicable, please provide details of any statute, regulation or policy impeding the further adoption, standardization, or consolidation including name, type, and any additional relevant information.': '6) Policy Impediments'}
# The set of sections seen across agencies does not depend on the agency being
# written, so compute it once instead of once per workbook.
all_sections = get_sections(soup)
for agency in data:
    if agency in {'Test', 'Unreleased'}:
        continue
    # A path separator inside an agency name would otherwise be read as a directory.
    file_agency = agency
    for sep in filter(None, (os.sep, os.altsep)):
        file_agency = file_agency.replace(sep, '-')
    # NOTE(review): newer pandas releases may have no writer engine for '.xls';
    # confirm against the installed version before switching the extension.
    xls_path = os.path.join(out_path, f'Grants Management Readiness Assessment - {file_agency} - {now}.xls')
    # The context manager saves and closes the workbook on exit, so no explicit
    # writer.save() call is needed.
    with pd.ExcelWriter(xls_path) as writer:
        agency_sections = data[agency]
        # Sorted so the sheet order is the same on every run (set order is not).
        sections = sorted(set(agency_sections.keys()) | all_sections)
        for section in sections:
            # Raises KeyError for a section title missing from sheet_name_map.
            sheet_name = sheet_name_map[section]
            # Sections this agency lacks are written as a single empty cell.
            data_to_write = agency_sections.get(section, '')
            if isinstance(data_to_write, pd.DataFrame):
                df = data_to_write.dropna(how='all')
            else:
                df = pd.DataFrame([data_to_write])
                df.columns = [section]
            df.to_excel(writer, sheet_name=sheet_name, index=False)