In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import os
import re
from bs4 import BeautifulSoup
import fuzzywuzzy

In [10]:
import os
import re
import pandas as pd
from bs4 import BeautifulSoup

# Utility to clean whitespace

def clean_text(text):
    return re.sub(r"\s+", " ", text).strip()

# Extraction mapping for scheme-level fields
extraction_dict = {
    'Name of the Scheme': 'div class="col-sm-18"', #/Project
    'Whether scheme': 'div class="col-sm-6"', # /project is new/ongoing
    'Whether the scheme is funded by NABARD': 'div class="col-sm-6"', 
    'Proposal Number': 'div class="form-group"', 
    'Proposal Date': 'div class="col-sm-6"', 
    'Issued Number': 'div class="form-group"', 
    'Valid Upto': 'div class="col-sm-6"', 
    'Admin Department File Number': 'div class="col-sm-6"', 
    'Brief nature of the scheme': 'div class="col-sm-6"', #/project and the time period required for implementation/execution
    'Aim, objectives and benefit expected from the scheme/project': 'div class="col-sm-6"', 
    'Whether included in the priority list approved by the Departmental Minister': 'div class="col-sm-6"', 
    'Ddo Details': 'div class="col-sm-6"', 
    'Common Proposal': 'div class="col-sm-6"', 
    'Project Profile Id:': 'div class="col-sm-6"', 
    'Project Classification': 'span class="text-success h6"', 
    'Type of the project:': '<div class="col-lg-12">', 
    'Opening and Closing dates:': 'div class="col-sm-6"', 
    'Project Beneficiary :': 'div class="col-sm-6" style="margin-top: 10px;"', 
    'Procurement Activity Type:': 'div class="col-sm-8" style="margin-top: 10px;"', 
    'Beneficiary Share Amount': 'div class="col-sm-12"', 
    'Whether the scheme is centrally sponsored scheme': 'div class="col-sm-12"', 
    'Total Administrative Approval Amount (Project Cost)(In Lakhs)': 'div class="col-sm-12"', 
    'DDO Code': 'div class="col-sm-6"', 
    'Treasury Name': 'div class="col-sm-6"', 
    'DDO Office Name': 'div class="col-sm-18"', 
    'Permission from local bodies for construction/establishment wherever necessary': 'div class="col-sm-18"', 
    'Non-Objection certificate': 'div class="col-sm-18"', 
    'Non-Duplicity certificate': 'div class="col-sm-18"', 
    'Views of Department': 'div class="col-sm-18"', 
    'Details information in case of observation made': 'div class="form-group"', # by T&D in last year\'s endorsement
    'Sanction letter copy of GOI in case of Central Sector Schemes': 'div class="col-sm-6"', 
    'Views of Sr': 'div class="col-sm-18"', #.F.A/F.A for non-admissibility of the instant project under DFPR
    'Funding pattern of the scheme': 'div class="col-sm-6"', 
    'The year from which the scheme is being implemented': 'div class="col-sm-6"', 
    'The quantum of central share allocated': 'div class="col-sm-6"', #/released for the year along with the supporting document
    'The criteria of proposing the amount of state share': 'div class="col-sm-6"'
}

# Convert pattern strings to CSS-like selector

def pattern_to_selector(pattern):
    pattern = pattern.strip()
    if pattern.startswith("<") and pattern.endswith(">"):
        pattern = pattern[1:-1].strip()
    tag_match = re.match(r'^(\w+)', pattern)
    tag = tag_match.group(1) if tag_match else ""
    class_match = re.search(r'class="([^"]+)"', pattern)
    if class_match:
        classes = class_match.group(1).split()
        return tag + ''.join(f'.{c}' for c in classes)
    return tag

# Extract a field based on label text and selector fingerprint

def extract_value_for_key(soup, key, pattern):
    cleaned_key = re.sub(r"\s*\([^)]*\)", "", key).strip()
    label = soup.find(text=re.compile(re.escape(cleaned_key), re.IGNORECASE))
    if not label:
        return None
    selector = pattern_to_selector(pattern)
    def matches(tag):
        if not tag.name:
            return False
        parts = selector.split('.')
        if tag.name != parts[0]:
            return False
        return all(c in tag.get('class', []) for c in parts[1:])
    found = label.find_next(matches)
    return clean_text(found.get_text()) if found else None

# Identify HOA tables by header keywords

def find_hoa_tables(soup):
    tables = []
    for tbl in soup.find_all('table'):
        first = tbl.find('tr')
        if not first:
            continue
        # only consider direct <th> children for header detection
        headers = [clean_text(th.get_text()) for th in first.find_all('th', recursive=False)]
        low = [h.lower() for h in headers]
        # require both 'head of account' and 'si no.' in direct headers
        if 'head of account' in low and any('si no' in h for h in low):
            tables.append(tbl)
    return tables

# Parse a single HOA table into flat records

def parse_hoa_table(tbl, scheme_data):
    records = []
    body = tbl.find('tbody') or tbl
    for tr in body.find_all('tr'):
        # skip header or total rows
        if tr.find('th'):
            continue
        # detect total row
        total_in = tr.find('input', id='totalHoaAmount')
        if total_in:
            scheme_data['Total HOA Amount'] = total_in['value']
            continue
        cells = tr.find_all('td')
        # need at least 7 cells: sl no, HOA cell, grant, current, total, est, green tag
        if len(cells) < 7:
            continue
        # Extract HOA number and name
        hoacell = cells[1]
        label = hoacell.find('label')
        name = clean_text(label.get_text()) if label else None
        # remove label(s) to isolate number
        for lab in hoacell.find_all('label'):
            lab.extract()
        number = clean_text(hoacell.get_text())
        # Extract financials
        current = clean_text(cells[3].get_text())
        total_est = clean_text(cells[4].get_text())
        estimated = clean_text(cells[5].get_text())
        # Green Budget Tag (first of two cells)
        green = clean_text(cells[6].get_text())
        # Build record
        rec = scheme_data.copy()
        rec['HOA_Number'] = number
        rec['HOA_Name'] = name
        rec["Current Year's Budget Provision (In Lakhs)"] = current
        rec["Total Estimated Cost of the Project (In Lakhs)"] = total_est
        rec["Estimated Expenditure in (Ct FY) (In Lakhs)"] = estimated
        rec['Green Budget Tag'] = green
        records.append(rec)
    return records

# Process a single HTML file

def process_file(path):
    with open(path, 'r', encoding='utf-8') as f:
        html = f.read()
    soup = BeautifulSoup(html, 'html.parser')
    # Normalize whitespace
    for t in soup.find_all(text=True):
        t.replace_with(clean_text(t))
    # Extract scheme-level fields
    scheme_data = {k: extract_value_for_key(soup, k, p) for k, p in extraction_dict.items()}
    # Parse any HOA tables
    recs = []
    for tbl in find_hoa_tables(soup):
        recs.extend(parse_hoa_table(tbl, scheme_data))
    # Fallback to single record if no table
    return recs or [scheme_data]

# Batch-run and export

def main():
    root = r"D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\Deployment\flood-data-ecosystem-Assam\Sources\SDRF\scripts\scraping_experiments\downloaded_pages"
    files = []
    for yr in os.listdir(root):
        path = os.path.join(root, yr)
        if os.path.isdir(path) and re.match(r'\d{4}-\d{2}', yr):
            for fn in os.listdir(path):
                if fn.lower().endswith(('.html', '.htm')):
                    files.append((yr, os.path.join(path, fn)))
    total = len(files)
    print(f"Total files to process: {total}")

    # Export a sample for 2023-24
    sample = next(((y, f) for y, f in files if y == '2023-24'), None)
    if sample:
        y, fpath = sample
        print(f"\nSample [{y}]: {fpath}")
        sample_recs = process_file(fpath)
        print(sample_recs, "\n")
        pd.DataFrame(sample_recs).to_csv(
            os.path.join(os.path.dirname(fpath), 'all_extracted_data_sample_2023-24.csv'),
            index=False)
        print("Sample exported.\n")

    all_recs = []
    proc = fail = 0
    for y, fpath in files:
        try:
            recs = process_file(fpath)
            for r in recs:
                r['Financial Year'] = y
                r['Source File'] = fpath
                all_recs.append(r)
            proc += 1
        except Exception as e:
            fail += 1
            print(f"Error parsing {fpath}: {e}")
        print(f"Finished {proc}/{total} files")

    print(f"Done: {proc} succeeded, {fail} failed.")
    df = pd.DataFrame(all_recs)
    out = r"D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\Deployment\flood-data-ecosystem-Assam\Sources\SDRF\data\all_extracted_data_v2.csv"
    df.to_csv(out, index=False)
    print(f"Data saved to {out}")

if __name__ == '__main__':
    main()

Total files to process: 6916

Sample [2023-24]: D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\Deployment\flood-data-ecosystem-Assam\Sources\SDRF\scripts\scraping_experiments\downloaded_pages\2023-24\AA-05-2023-24-6740.html


  for t in soup.find_all(text=True):
  label = soup.find(text=re.compile(re.escape(cleaned_key), re.IGNORECASE))


[{'Name of the Scheme': 'Repairing & Renovation Santipur PWSS damaged during Flood 2022 at Hajo LAC & Jalukbari LAC Under Hajo PHE Sub Division', 'Whether scheme': 'New project', 'Whether the scheme is funded by NABARD': 'No', 'Proposal Number': 'AA-05-2023-24-6740', 'Proposal Date': '26-06-2023', 'Issued Number': 'AA/05_2023-24(I)_5243', 'Valid Upto': '26-06-2026', 'Admin Department File Number': '289051', 'Brief nature of the scheme': 'FDR SCHEME', 'Aim, objectives and benefit expected from the scheme/project': 'Repairing & Renovation Santipur PWSS', 'Whether included in the priority list approved by the Departmental Minister': 'No', 'Ddo Details': 'AKM/DA/001', 'Common Proposal': 'No', 'Project Profile Id:': '', 'Project Classification': 'State Own Priority Scheme-GOI Special Scheme', 'Type of the project:': '3 Year Project', 'Opening and Closing dates:': '26-06-202326-06-2026', 'Project Beneficiary :': 'PUBLIC', 'Procurement Activity Type:': '--Not Available--', 'Beneficiary Share 

### All pages script

In [8]:
import os
import re
import pandas as pd
from bs4 import BeautifulSoup

# Extraction dictionary mapping column names to a string fingerprint for the element.
extraction_dict = {
    'Name of the Scheme': 'div class="col-sm-18"', #/Project
    'Whether scheme': 'div class="col-sm-6"', # /project is new/ongoing
    'Whether the scheme is funded by NABARD': 'div class="col-sm-6"', 
    'Proposal Number': 'div class="form-group"', 
    'Proposal Date': 'div class="col-sm-6"', 
    'Issued Number': 'div class="form-group"', 
    'Valid Upto': 'div class="col-sm-6"', 
    'Admin Department File Number': 'div class="col-sm-6"', 
    'Brief nature of the scheme': 'div class="col-sm-6"', #/project and the time period required for implementation/execution
    'Aim, objectives and benefit expected from the scheme/project': 'div class="col-sm-6"', 
    'Whether included in the priority list approved by the Departmental Minister': 'div class="col-sm-6"', 
    'Ddo Details': 'div class="col-sm-6"', 
    'Common Proposal': 'div class="col-sm-6"', 
    'Project Profile Id:': 'div class="col-sm-6"', 
    'Project Classification': 'span class="text-success h6"', 
    'Type of the project:': '<div class="col-lg-12">', 
    'Opening and Closing dates:': 'div class="col-sm-6"', 
    'Project Beneficiary :': 'div class="col-sm-6" style="margin-top: 10px;"', 
    'Procurement Activity Type:': 'div class="col-sm-8" style="margin-top: 10px;"', 
    'Beneficiary Share Amount': 'div class="col-sm-12"', 
    'Whether the scheme is centrally sponsored scheme': 'div class="col-sm-12"', 
    'Total Administrative Approval Amount (Project Cost)(In Lakhs)': 'div class="col-sm-12"', 
    'DDO Code': 'div class="col-sm-6"', 
    'Treasury Name': 'div class="col-sm-6"', 
    'DDO Office Name': 'div class="col-sm-18"', 
    'Permission from local bodies for construction/establishment wherever necessary': 'div class="col-sm-18"', 
    'Non-Objection certificate': 'div class="col-sm-18"', 
    'Non-Duplicity certificate': 'div class="col-sm-18"', 
    'Views of Department': 'div class="col-sm-18"', 
    'Details information in case of observation made': 'div class="form-group"', # by T&D in last year\'s endorsement
    'Sanction letter copy of GOI in case of Central Sector Schemes': 'div class="col-sm-6"', 
    'Views of Sr': 'div class="col-sm-18"', #.F.A/F.A for non-admissibility of the instant project under DFPR
    'Funding pattern of the scheme': 'div class="col-sm-6"', 
    'The year from which the scheme is being implemented': 'div class="col-sm-6"', 
    'The quantum of central share allocated': 'div class="col-sm-6"', #/released for the year along with the supporting document
    'The criteria of proposing the amount of state share': 'div class="col-sm-6"'
}

def pattern_to_selector(pattern):
    """
    Converts a string like 'div class="col-sm-18"' or '<div class="col-lg-12">' 
    into a CSS selector, e.g. 'div.col-sm-18' or 'div.col-lg-12'.
    """
    pattern = pattern.strip()
    if pattern.startswith("<") and pattern.endswith(">"):
        pattern = pattern[1:-1].strip()
    tag_match = re.match(r'^(\w+)', pattern)
    tag = tag_match.group(1) if tag_match else ""
    class_match = re.search(r'class="([^"]+)"', pattern)
    if class_match:
        classes = class_match.group(1).split()
        selector = tag + ''.join(['.' + cls for cls in classes])
    else:
        selector = tag
    return selector

def extract_value_for_key(soup, key, pattern):
    """
    Matches the text node that contains the cleaned key (with parentheses removed)
    and returns the text from the first subsequent element that matches the CSS fingerprint derived from pattern.
    The matching is done in a case-insensitive manner.
    """
    # Remove parentheses and the text contained within them from the key
    cleaned_key = re.sub(r'\s*\([^)]*\)', '', key).strip()
    label_element = soup.find(text=re.compile(re.escape(cleaned_key), re.IGNORECASE))
    if label_element:
        selector = pattern_to_selector(pattern)
        def matches(tag):
            if not tag.name:
                return False
            sel_parts = selector.split('.')
            sel_tag = sel_parts[0]
            sel_classes = sel_parts[1:] if len(sel_parts) > 1 else []
            if tag.name != sel_tag:
                return False
            tag_classes = tag.get("class", [])
            return all(cls in tag_classes for cls in sel_classes)
        candidate = label_element.find_next(matches)
        if candidate:
            return re.sub(r'\s+', ' ', candidate.get_text(strip=True))
    return None

def process_file(file_path):
    """
    Processes one HTML file: cleans text nodes, extracts data based on extraction_dict,
    and then splits the combined 'Opening and Closing dates:' field into 'Starting Date' and 'Closing Date'.
    Returns a dictionary of extracted values.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        file_contents = file.read()

    soup = BeautifulSoup(file_contents, 'html.parser')
    
    # Pre-clean all text nodes to remove extra whitespace/newlines
    for text_node in soup.find_all(text=True):
        cleaned = re.sub(r'\s+', ' ', text_node)
        text_node.replace_with(cleaned)
    
    extracted_data = {}
    for key, pattern in extraction_dict.items():
        value = extract_value_for_key(soup, key, pattern)
        extracted_data[key] = value

    # Split the "Opening and Closing dates:" entry into two separate columns
    if "Opening and Closing dates:" in extracted_data and extracted_data["Opening and Closing dates:"] is not None:
        dates_value = extracted_data["Opening and Closing dates:"]
        # Use regex to extract date patterns of the form dd-mm-yyyy
        dates = re.findall(r'\d{2}-\d{2}-\d{4}', dates_value)
        if len(dates) >= 2:
            extracted_data["Starting Date"] = dates[0]
            extracted_data["Closing Date"] = dates[1]
        else:
            extracted_data["Starting Date"] = dates_value
            extracted_data["Closing Date"] = None
        # Remove the original combined field
        extracted_data.pop("Opening and Closing dates:", None)
    else:
        extracted_data["Starting Date"] = None
        extracted_data["Closing Date"] = None
    
    return extracted_data

def main():
    # Set the root folder containing subfolders named as financial years (e.g. "2021-22")
    root_folder = r"D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR-Assam\Sources\SDRF\scripts\scraping_experiments\downloaded_pages"
    all_data = []
    
    # Iterate over each subfolder matching the financial year pattern
    for subfolder in os.listdir(root_folder):
        subfolder_path = os.path.join(root_folder, subfolder)
        if os.path.isdir(subfolder_path) and re.match(r'\d{4}-\d{2}', subfolder):
            financial_year = subfolder  # e.g. "2021-22"
            # Process both .html and .htm files in this subfolder
            for file in os.listdir(subfolder_path):
                if file.lower().endswith((".html", ".htm")):
                    file_path = os.path.join(subfolder_path, file)
                    data = process_file(file_path)
                    data["Financial Year"] = financial_year
                    data["Source File"] = file_path
                    all_data.append(data)
    
    # Create a single DataFrame from all the records
    df = pd.DataFrame(all_data)
    print("Extracted DataFrame (first few rows):")
    print(df.head())
    
    # Save the DataFrame to a CSV file
    output_csv = r"D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR-Assam\Sources\SDRF\data\all_extracted_data.csv"
    df.to_csv(output_csv, index=False)
    print(f"Data saved to {output_csv}")

if __name__ == "__main__":
    main()


  for text_node in soup.find_all(text=True):
  label_element = soup.find(text=re.compile(re.escape(cleaned_key), re.IGNORECASE))


Extracted DataFrame (first few rows):
  Name of the Scheme Whether scheme Whether the scheme is funded by NABARD  \
0                              None                                   None   
1                              None                                   None   
2                              None                                   None   
3                              None                                   None   
4                              None                                   None   

  Proposal Number Proposal Date Issued Number Valid Upto  \
0            None          None          None       None   
1            None          None          None       None   
2            None          None          None       None   
3            None          None          None       None   
4            None          None          None       None   

  Admin Department File Number Brief nature of the scheme  \
0                         None                       None   
1         

extraction_dict mapping:
| Key                                                                                                                     | Element                                  |
| ----------------------------------------------------------------------------------------------------------------------- | ---------------------------------------- |
| Name of the Scheme/Project proposed                                                                                     | div class="col-sm-18"                     |
| Whether scheme/project is new/ongoing                                                                                   | div class="col-sm-6"                      |
| Whether the scheme is funded by NABARD                                                                                  | div class="col-sm-6"                      |
| Proposal Number                                                                                                         | div class="form-group"                    |
| Proposal Date                                                                                                           | div class="col-sm-6"                      |
| Issued Number                                                                                                           | div class="form-group"                    |
| Valid Upto(Auto generated after Issue)                                                                                  | div class="col-sm-6"                      |
| Admin Department File Number                                                                                            | div class="col-sm-6"                      |
| Brief nature of the scheme/project and the time period required for implementation/execution                          | div class="col-sm-6"                      |
| Aim, objectives and benefit expected from the scheme/project                                                            | div class="col-sm-6"                      |
| Whether included in the priority list approved by the Departmental Minister                                            | div class="col-sm-6"                      |
| Ddo Details                                                                                                             | div class="col-sm-6"                      |
| Common Proposal(If the documents in the Proposal Needs a Referance in other Proposals)                                   | div class="col-sm-6"                      |
| Project Profile Id:                                                                                                     | div class="col-sm-6"                      |
| Project Classification :                                                                                                | span class="text-success h6"              |
| Type of the project:                                                                                                    | <div class="col-lg-12">                    |
| Opening and Closing dates:                                                                                              | div class="col-sm-6"                      |
| Project Beneficiary :                                                                                                   | div class="col-sm-6" style="margin-top: 10px;" |
| Procurement Activity Type:                                                                                              | div class="col-sm-8" style="margin-top: 10px;" |
| Beneficiary Share Amount(In Lakhs)                                                                                      | div class="col-sm-12"                     |
| Whether the scheme is centrally sponsored scheme                                                                      | div class="col-sm-12"                     |
| Total Administrative Approval Amount (Project Cost)(In Lakhs)                                                          | div class="col-sm-12"                     |
| DDO Code                                                                                                                | div class="col-sm-6"                      |
| Treasury Name(Auto generated)                                                                                           | div class="col-sm-6"                      |
| DDO Office Name(Auto generated)                                                                                         | div class="col-sm-18"                     |
| Permission from local bodies for construction/establishment wherever necessary                                          | div class="col-sm-18"                     |
| Non-Objection certificate                                                                                                | div class="col-sm-18"                     |
| Non-Duplicity certificate                                                                                                | div class="col-sm-18"                     |
| Views of Department                                                                                                     | div class="col-sm-18"                     |
| Details information in case of observation made by T&D in last year's endorsement                                       | div class="form-group"                    |
| Sanction letter copy of GOI in case of Central Sector Schemes                                                          | div class="col-sm-6"                      |
| Views of Sr.F.A/F.A for non-admissibility of the instant project under DFPR                                              | div class="col-sm-18"                     |
| Funding pattern of the scheme                                                                                           | div class="col-sm-6"                      |
| The year from which the scheme is being implemented                                                                     | div class="col-sm-6"                      |
| The quantum of central share allocated/released for the year along with the supporting document                         | div class="col-sm-6"                      |
| The criteria of proposing the amount of state share                                                                     | div class="col-sm-6"                      |
| attachments required 'Other Documents( if required )'                                                                   |                                          |
| attachments required 'SOPD Justification Document*'                                                                     |                                          |
| Detailed Project Report (DPR) / Project Estimate approved by the Competent Authority                                    |                                          |
| attachmenbts required 'Views of Senior Most Secretary in the Department'                                                | div class="col-sm-18"                     |
| (attachments required) 'Departments view on the feasibility of the proposal by the Departmental Authority'                | div class="col-sm-18"                     |
| (attachments required) 'Specific View of F.A/Sr.F.A and Checklist of F.A'                                                | div class="col-sm-18"                     |
| Beneficiary Name                                                                                                        | div class="col-sm-12"                     |
| Procurement activity                                                                                                    |                                          |
| Unique Number                                                                                                           |                                          |

In [4]:
folder = r"D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR-Assam\Sources\SDRF\scripts\scraping_experiments\downloaded_pages"


In [None]:
def extract_field_by_label(soup, label_text):
    """
    Searches for an element (e.g., <td>, <th>, <div>, or <span>) that contains the given label text,
    and returns the text from its next sibling element.
    
    You might need to update the logic based on your file's actual structure.
    """
    # Find an element containing the label text
    label_element = soup.find(lambda tag: tag.name in ['td', 'th', 'div', 'span'] and label_text in tag.get_text())
    if label_element:
        # Get the next element which should contain the corresponding value
        value_element = label_element.find_next()
        if value_element:
            return value_element.get_text(strip=True)
    return None

def extract_fields(soup):
    """
    Extracts the required fields from the BeautifulSoup-parsed HTML.
    Adjust the label texts below to match your HTML document.
    """
    record = {}
    record["Name of the Scheme/Project Proposed"] = extract_field_by_label(soup, "Name of the Scheme/Project Proposed")
    record["Whether the scheme is funded by NABARD"] = extract_field_by_label(soup, "NABARD")
    record["Brief nature of the scheme/project and the time period required for implementation/execution"] = extract_field_by_label(soup, "Brief nature")
    record["Type of the project"] = extract_field_by_label(soup, "Type of the project")
    record["Procurement Activity Type"] = extract_field_by_label(soup, "Procurement Activity Type")
    record["Aim, objectives and benefit expected from the scheme/project"] = extract_field_by_label(soup, "Aim, objectives")
    record["Project Classification"] = extract_field_by_label(soup, "Project Classification")
    record["Total Administrative Approval Amount (Project Cost) (In Lakhs)"] = extract_field_by_label(soup, "Total Administrative Approval Amount")
    return record

def main():
    # Specify the test HTML file (ensure it's in the current directory or provide the full path)
    test_file = r"D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR-Assam\Sources\SDRF\scripts\scraping_experiments\downloaded_pages\2023-24\AA-05-2023-24-6741.html"
    
    if not os.path.exists(test_file):
        print(f"File {test_file} not found. Please ensure the file exists in the directory.")
        return

    with open(test_file, "r", encoding="utf-8") as f:
        html_content = f.read()

    soup = BeautifulSoup(html_content, "html.parser")
    data_record = extract_fields(soup)
    
    # Create a DataFrame with one row using the extracted data
    df = pd.DataFrame([data_record])
    
    print("Extracted DataFrame:")
    df
    
    # Optionally, save the DataFrame to a CSV file
    df.to_csv("test_extracted_data.csv", index=False)
    print("DataFrame saved to test_extracted_data.csv")

if __name__ == "__main__":
    main()

test_file = r"D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR-Assam\Sources\SDRF\scripts\scraping_experiments\downloaded_pages\2023-24\AA-05-2023-24-6741.html"
with open(test_file, 'r', encoding='utf-8') as file:
    file_contents = file.read()

soup = BeautifulSoup(file_contents, 'html.parser')

print(soup.prettify())
cleaned_text = [div.get_text(strip=True) for div in soup.find_all("div", class_="col-sm-6 active")]
print([re.sub(r'\s+', ' ', text) for text in cleaned_text])

Extracted DataFrame:
DataFrame saved to test_extracted_data.csv


In [3]:
test_file = r"D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR-Assam\Sources\SDRF\scripts\scraping_experiments\downloaded_pages\2018-19\AA-05-2018-19-0036.html"
with open(test_file, 'r', encoding='utf-8') as file:
    file_contents = file.read()

soup = BeautifulSoup(file_contents, 'html.parser')

print(soup.prettify())
cleaned_text = [div.get_text(strip=True) for div in soup.find_all("div", class_="col-sm-6 active")]
print([re.sub(r'\s+', ' ', text) for text in cleaned_text])

<html lang="en">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="Public Finance Management System - Government of Assam" name="description"/>
  <link href="/assamfinance/resources/icons/favicon.ico" rel="icon" type="image/x-icon"/>
  <link href="/assamfinance/resources/icons/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
  <link href="/assamfinance/resources/style/kranstyle.css" rel="stylesheet" type="text/css"/>
  <link href="/assamfinance/resources/style/jquery-ui.css" rel="stylesheet" type="text/css"/>
  <!-- BOOTSTRAP -->
  <link href="/assamfinance/resources/bootstrap/css/bootstrap.min.css" rel="stylesheet" type="text/css"/>
  <link href="/assamfinance/resources/font-awesome/css/font-awesome.min.css" rel="stylesheet" type="text/css"/>
  <!-- CALENDAR  -->
  <link href="/assamfinance/resources/js/cal

In [None]:
import os
import re
import pandas as pd
from bs4 import BeautifulSoup

# Your extraction dictionary mapping column names to CSS fingerprint strings
extraction_dict = {
    'Name of the Scheme/Project proposed': 'div class="col-sm-18"', 
    'Whether scheme/project is new/ongoing': 'div class="col-sm-6"', 
    'Whether the scheme is funded by NABARD': 'div class="col-sm-6"', 
    'Proposal Number': 'div class="form-group"', 
    'Proposal Date': 'div class="col-sm-6"', 
    'Issued Number': 'div class="form-group"', 
    'Valid Upto(Auto generated after Issue)': 'div class="col-sm-6"', 
    'Admin Department File Number': 'div class="col-sm-6"', 
    'Brief nature of the scheme/project and the time period required for implementation/execution': 'div class="col-sm-6"', 
    'Aim, objectives and benefit expected from the scheme/project': 'div class="col-sm-6"', 
    'Whether included in the priority list approved by the Departmental Minister': 'div class="col-sm-6"', 
    'Ddo Details': 'div class="col-sm-6"', 
    'Common Proposal(If the documents in the Proposal Needs a Referance in other Proposals)': 'div class="col-sm-6"', 
    'Project Profile Id:': 'div class="col-sm-6"', 
    'Project Classification :': 'span class="text-success h6"', 
    'Type of the project:': '<div class="col-lg-12">', 
    'Opening and Closing dates:': 'div class="col-sm-6"', 
    'Project Beneficiary :': 'div class="col-sm-6" style="margin-top: 10px;"', 
    'Procurement Activity Type:': 'div class="col-sm-8" style="margin-top: 10px;"', 
    'Beneficiary Share Amount': 'div class="col-sm-12"', 
    'Whether the scheme is centrally sponsored scheme': 'div class="col-sm-12"', 
    'Total Administrative Approval Amount': 'div class="col-sm-12"', 
    'DDO Code': 'div class="col-sm-6"', 
    'Treasury Name(Auto generated)': 'div class="col-sm-6"', 
    'DDO Office Name(Auto generated)': 'div class="col-sm-18"', 
    'Permission from local bodies for construction/establishment wherever necessary': 'div class="col-sm-18"', 
    'Non-Objection certificate': 'div class="col-sm-18"', 
    'Non-Duplicity certificate': 'div class="col-sm-18"', 
    'Views of Department': 'div class="col-sm-18"', 
    "Details information in case of observation made by T&D in last year's endorsement": 'div class="form-group"', 
    'Sanction letter copy of GOI in case of Central Sector Schemes': 'div class="col-sm-6"', 
    'Views of Sr.F.A/F.A for non-admissibility of the instant project under DFPR': 'div class="col-sm-18"', 
    'Funding pattern of the scheme': 'div class="col-sm-6"', 
    'The year from which the scheme is being implemented': 'div class="col-sm-6"', 
    'The quantum of central share allocated/released for the year along with the supporting document': 'div class="col-sm-6"', 
    'The criteria of proposing the amount of state share': 'div class="col-sm-6"'
}

def pattern_to_selector(pattern):
    """
    Converts a string like 'div class="col-sm-18"' or '<div class="col-lg-12">' 
    into a CSS selector, e.g. 'div.col-sm-18' or 'div.col-lg-12'.
    """
    pattern = pattern.strip()
    if pattern.startswith("<") and pattern.endswith(">"):
        pattern = pattern[1:-1].strip()
    tag_match = re.match(r'^(\w+)', pattern)
    tag = tag_match.group(1) if tag_match else ""
    class_match = re.search(r'class="([^"]+)"', pattern)
    if class_match:
        classes = class_match.group(1).split()
        selector = tag + ''.join(['.' + cls for cls in classes])
    else:
        selector = tag
    return selector

def extract_value_for_key(soup, key, pattern):
    """
    Finds the text node containing the label (key) and returns the text from the first
    subsequent element that matches the CSS fingerprint derived from pattern.
    """
    label_element = soup.find(text=re.compile(re.escape(key)))
    if label_element:
        selector = pattern_to_selector(pattern)
        def matches(tag):
            if not tag.name:
                return False
            sel_parts = selector.split('.')
            sel_tag = sel_parts[0]
            sel_classes = sel_parts[1:] if len(sel_parts) > 1 else []
            if tag.name != sel_tag:
                return False
            tag_classes = tag.get("class", [])
            return all(cls in tag_classes for cls in sel_classes)
        candidate = label_element.find_next(matches)
        if candidate:
            # Clean the extracted text by replacing multiple whitespace/newlines with a single space
            return re.sub(r'\s+', ' ', candidate.get_text(strip=True))
    return None

def main():
    # Path to your test HTML file
    test_file = r"D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR-Assam\Sources\SDRF\scripts\scraping_experiments\downloaded_pages\2023-24\AA-05-2023-24-6741.html"
    if not os.path.exists(test_file):
        print(f"File not found: {test_file}")
        return

    with open(test_file, 'r', encoding='utf-8') as file:
        file_contents = file.read()

    soup = BeautifulSoup(file_contents, 'html.parser')
    
    # Pre-clean every text node in the soup to remove newlines and extra spaces
    for text_node in soup.find_all(text=True):
        cleaned = re.sub(r'\s+', ' ', text_node)
        text_node.replace_with(cleaned)
    
    # Extract data using the extraction dictionary
    extracted_data = {}
    for key, pattern in extraction_dict.items():
        value = extract_value_for_key(soup, key, pattern)
        extracted_data[key] = value

    # Create a one-row DataFrame from the extracted data
    df = pd.DataFrame([extracted_data])
    print("Extracted DataFrame:")
    print(df)
    
    # Optionally, save the DataFrame to a CSV file
    df.to_csv("extracted_data.csv", index=False)
    print("Data saved to extracted_data.csv")

if __name__ == "__main__":
    main()


Extracted DataFrame:
                 Name of the Scheme/Project proposed  \
0  Repairing & Renovation Jalah No-1 PWSS damaged...   

  Whether scheme/project is new/ongoing  \
0                           New project   

  Whether the scheme is funded by NABARD     Proposal Number Proposal Date  \
0                                     No  AA-05-2023-24-6741    26-06-2023   

           Issued Number Valid Upto(Auto generated after Issue)  \
0  AA/05_2023-24(I)_5242                                   None   

  Admin Department File Number  \
0                       289051   

  Brief nature of the scheme/project and the time period required for implementation/execution  \
0                                         FDR SCHEME                                             

  Aim, objectives and benefit expected from the scheme/project  ...  \
0             Repairing & Renovation Jalah No-1 PWSS            ...   

  Non-Objection certificate       Non-Duplicity certificate  \
0              

  for text_node in soup.find_all(text=True):
  label_element = soup.find(text=re.compile(re.escape(key)))


In [5]:
# V1 of all page extractor

import os
import re
import pandas as pd
from bs4 import BeautifulSoup

# Extraction dictionary mapping column names to a string fingerprint for the element.
extraction_dict = { 'Name of the Scheme/Project proposed': 'div class="col-sm-18"', 
    'Whether scheme/project is new/ongoing': 'div class="col-sm-6"', 
    'Whether the scheme is funded by NABARD': 'div class="col-sm-6"', 
    'Proposal Number': 'div class="form-group"', 
    'Proposal Date': 'div class="col-sm-6"', 
    'Issued Number': 'div class="form-group"', 
    'Valid Upto': 'div class="col-sm-6"', 
    'Admin Department File Number': 'div class="col-sm-6"', 
    'Brief nature of the scheme/project and the time period required for implementation/execution': 'div class="col-sm-6"', 
    'Aim, objectives and benefit expected from the scheme/project': 'div class="col-sm-6"', 
    'Whether included in the priority list approved by the Departmental Minister': 'div class="col-sm-6"', 
    'Ddo Details': 'div class="col-sm-6"', 
    'Common Proposal': 'div class="col-sm-6"', 
    'Project Profile Id:': 'div class="col-sm-6"', 
    'Project Classification': 'span class="text-success h6"', 
    'Type of the project:': '<div class="col-lg-12">', 
    'Opening and Closing dates:': 'div class="col-sm-6"', 
    'Project Beneficiary :': 'div class="col-sm-6" style="margin-top: 10px;"', 
    'Procurement Activity Type:': 'div class="col-sm-8" style="margin-top: 10px;"', 
    'Beneficiary Share Amount(In Lakhs)': 'div class="col-sm-12"', 
    'Whether the scheme is centrally sponsored scheme': 'div class="col-sm-12"', 
    'Total Administrative Approval Amount (Project Cost)(In Lakhs)': 'div class="col-sm-12"', 
    'DDO Code': 'div class="col-sm-6"', 
    'Treasury Name': 'div class="col-sm-6"', 
    'DDO Office Name': 'div class="col-sm-18"', 
    'Permission from local bodies for construction/establishment wherever necessary': 'div class="col-sm-18"', 
    'Non-Objection certificate': 'div class="col-sm-18"', 
    'Non-Duplicity certificate': 'div class="col-sm-18"', 
    'Views of Department': 'div class="col-sm-18"', 
    'Details information in case of observation made by T&D in last year\'s endorsement': 'div class="form-group"', 
    'Sanction letter copy of GOI in case of Central Sector Schemes': 'div class="col-sm-6"', 
    'Views of Sr.F.A/F.A for non-admissibility of the instant project under DFPR': 'div class="col-sm-18"', 
    'Funding pattern of the scheme': 'div class="col-sm-6"', 
    'The year from which the scheme is being implemented': 'div class="col-sm-6"', 
    'The quantum of central share allocated/released for the year along with the supporting document': 'div class="col-sm-6"', 
    'The criteria of proposing the amount of state share': 'div class="col-sm-6"'
}

def pattern_to_selector(pattern):
    """
    Converts a string like 'div class="col-sm-18"' or '<div class="col-lg-12">' 
    into a CSS selector (e.g. 'div.col-sm-18' or 'div.col-lg-12').
    """
    pattern = pattern.strip()
    if pattern.startswith("<") and pattern.endswith(">"):
        pattern = pattern[1:-1].strip()
    tag_match = re.match(r'^(\w+)', pattern)
    tag = tag_match.group(1) if tag_match else ""
    class_match = re.search(r'class="([^"]+)"', pattern)
    if class_match:
        classes = class_match.group(1).split()
        selector = tag + ''.join(['.' + cls for cls in classes])
    else:
        selector = tag
    return selector

def extract_value_for_key(soup, key, pattern):
    """
    Finds the text node that matches the label (key) and then returns the text from the first subsequent element
    that matches the CSS fingerprint derived from pattern.
    """
    label_element = soup.find(text=re.compile(re.escape(key)))
    if label_element:
        selector = pattern_to_selector(pattern)
        def matches(tag):
            if not tag.name:
                return False
            sel_parts = selector.split('.')
            sel_tag = sel_parts[0]
            sel_classes = sel_parts[1:] if len(sel_parts) > 1 else []
            if tag.name != sel_tag:
                return False
            tag_classes = tag.get("class", [])
            return all(cls in tag_classes for cls in sel_classes)
        candidate = label_element.find_next(matches)
        if candidate:
            # Clean the extracted text by reducing whitespace/newlines to single spaces.
            return re.sub(r'\s+', ' ', candidate.get_text(strip=True))
    return None

def process_file(file_path):
    """
    Processes one HTML file: cleans the text nodes, extracts data based on extraction_dict,
    and returns a dictionary with the extracted values.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        file_contents = file.read()

    soup = BeautifulSoup(file_contents, 'html.parser')
    
    # Pre-clean every text node to remove extra whitespace/newlines.
    for text_node in soup.find_all(text=True):
        cleaned = re.sub(r'\s+', ' ', text_node)
        text_node.replace_with(cleaned)
    
    extracted_data = {}
    for key, pattern in extraction_dict.items():
        value = extract_value_for_key(soup, key, pattern)
        extracted_data[key] = value
    return extracted_data

def main():
    # Set the root folder that contains subfolders named as financial years ("YYYY-YY")
    root_folder = r"D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR-Assam\Sources\SDRF\scripts\scraping_experiments\downloaded_pages"
    all_data = []
    
    # Loop over each subfolder in the root folder.
    for subfolder in os.listdir(root_folder):
        subfolder_path = os.path.join(root_folder, subfolder)
        # Check if it's a directory and matches the financial year pattern
        if os.path.isdir(subfolder_path) and re.match(r'\d{4}-\d{2}', subfolder):
            financial_year = subfolder  # e.g. "2021-22"
            # Process both .html and .htm files in this subfolder
            for file in os.listdir(subfolder_path):
                if file.lower().endswith((".html", ".htm")):
                    file_path = os.path.join(subfolder_path, file)
                    data = process_file(file_path)
                    # Add the financial year and optionally the source file path
                    data["Financial Year"] = financial_year
                    data["Source File"] = file_path
                    all_data.append(data)
    
    # Create a single DataFrame from all extracted records.
    df = pd.DataFrame(all_data)
    print("Extracted DataFrame (first few rows):")
    print(df.head())
    
    # Save the DataFrame to a CSV file.
    output_csv = r"D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR-Assam\Sources\SDRF\data\all_extracted_data.csv"
    df.to_csv(output_csv, index=False)
    print(f"Data saved to {output_csv}")

if __name__ == "__main__":
    main()


  for text_node in soup.find_all(text=True):
  label_element = soup.find(text=re.compile(re.escape(key)))


KeyboardInterrupt: 

### Test single - page script

In [23]:
import os
import re
import pandas as pd
from bs4 import BeautifulSoup

# Extraction dictionary: keys are the labels (parentheses will be removed in matching)
extraction_dict = {
    'Name of the Scheme/Project proposed': 'div class="col-sm-18"', 
    'Whether scheme/project is new/ongoing': 'div class="col-sm-6"', 
    'Whether the scheme is funded by NABARD': 'div class="col-sm-6"', 
    'Proposal Number': 'div class="form-group"', 
    'Proposal Date': 'div class="col-sm-6"', 
    'Issued Number': 'div class="form-group"', 
    'Valid Upto': 'div class="col-sm-6"', 
    'Admin Department File Number': 'div class="col-sm-6"', 
    'Brief nature of the scheme/project and the time period required for implementation/execution': 'div class="col-sm-6"', 
    'Aim, objectives and benefit expected from the scheme/project': 'div class="col-sm-6"', 
    'Whether included in the priority list approved by the Departmental Minister': 'div class="col-sm-6"', 
    'Ddo Details': 'div class="col-sm-6"', 
    'Common Proposal': 'div class="col-sm-6"', 
    'Project Profile Id:': 'div class="col-sm-6"', 
    'Project Classification': 'span class="text-success h6"', 
    'Type of the project:': '<div class="col-lg-12">', 
    'Opening and Closing dates:': 'div class="col-sm-6"', 
    'Project Beneficiary :': 'div class="col-sm-6" style="margin-top: 10px;"', 
    'Procurement Activity Type:': 'div class="col-sm-8" style="margin-top: 10px;"', 
    'Beneficiary Share Amount(In Lakhs)': 'div class="col-sm-12"', 
    'Whether the scheme is centrally sponsored scheme': 'div class="col-sm-12"', 
    'Total Administrative Approval Amount (Project Cost)(In Lakhs)': 'div class="col-sm-12"', 
    'DDO Code': 'div class="col-sm-6"', 
    'Treasury Name': 'div class="col-sm-6"', 
    'DDO Office Name': 'div class="col-sm-18"', 
    'Permission from local bodies for construction/establishment wherever necessary': 'div class="col-sm-18"', 
    'Non-Objection certificate': 'div class="col-sm-18"', 
    'Non-Duplicity certificate': 'div class="col-sm-18"', 
    'Views of Department': 'div class="col-sm-18"', 
    "Details information in case of observation made by T&D in last year's endorsement": 'div class="form-group"', 
    'Sanction letter copy of GOI in case of Central Sector Schemes': 'div class="col-sm-6"', 
    'Views of Sr.F.A/F.A for non-admissibility of the instant project under DFPR': 'div class="col-sm-18"', 
    'Funding pattern of the scheme': 'div class="col-sm-6"', 
    'The year from which the scheme is being implemented': 'div class="col-sm-6"', 
    'The quantum of central share allocated/released for the year along with the supporting document': 'div class="col-sm-6"', 
    'The criteria of proposing the amount of state share': 'div class="col-sm-6"'
}

def pattern_to_selector(pattern):
    """
    Converts a string like 'div class="col-sm-18"' or '<div class="col-lg-12">' 
    into a CSS selector such as 'div.col-sm-18' or 'div.col-lg-12'.
    """
    pattern = pattern.strip()
    if pattern.startswith("<") and pattern.endswith(">"):
        pattern = pattern[1:-1].strip()
    tag_match = re.match(r'^(\w+)', pattern)
    tag = tag_match.group(1) if tag_match else ""
    class_match = re.search(r'class="([^"]+)"', pattern)
    if class_match:
        classes = class_match.group(1).split()
        selector = tag + ''.join(['.' + cls for cls in classes])
    else:
        selector = tag
    return selector

def extract_value_for_key(soup, key, pattern):
    """
    Matches a text node containing the cleaned key (parentheses removed) in a case-insensitive way,
    then returns the text from the first subsequent element that matches the CSS selector derived from pattern.
    """
    # Remove any parentheses and contained text from the key
    cleaned_key = re.sub(r'\s*\([^)]*\)', '', key).strip()
    label_element = soup.find(text=re.compile(re.escape(cleaned_key), re.IGNORECASE))
    if label_element:
        selector = pattern_to_selector(pattern)
        def matches(tag):
            if not tag.name:
                return False
            sel_parts = selector.split('.')
            sel_tag = sel_parts[0]
            sel_classes = sel_parts[1:] if len(sel_parts) > 1 else []
            if tag.name != sel_tag:
                return False
            tag_classes = tag.get("class", [])
            return all(cls in tag_classes for cls in sel_classes)
        candidate = label_element.find_next(matches)
        if candidate:
            return re.sub(r'\s+', ' ', candidate.get_text(strip=True))
    return None

def process_file(file_path):
    """
    Processes one HTML file: cleans text nodes, extracts data using extraction_dict,
    and returns a dictionary of the extracted values.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        file_contents = file.read()

    soup = BeautifulSoup(file_contents, 'html.parser')
    
    # Pre-clean every text node to remove extra whitespace/newlines
    for text_node in soup.find_all(text=True):
        cleaned = re.sub(r'\s+', ' ', text_node)
        text_node.replace_with(cleaned)
    
    extracted_data = {}
    for key, pattern in extraction_dict.items():
        value = extract_value_for_key(soup, key, pattern)
        extracted_data[key] = value
    return extracted_data

def main():
    # Specify the test HTML file path.
    test_file = r"D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR-Assam\Sources\SDRF\scripts\scraping_experiments\downloaded_pages\2023-24\AA-05-2023-24-6741.html"
    # Manually set a financial year for testing (or extract it from the folder name)
    financial_year = "2021-22"
    
    if not os.path.exists(test_file):
        print(f"File not found: {test_file}")
        return
    
    data = process_file(test_file)
    data["Financial Year"] = financial_year
    data["Source File"] = test_file
    
    # Create a DataFrame with one row using the extracted data.
    df = pd.DataFrame([data])
    print("Extracted DataFrame:")
    print(df)
    
    # Optionally, save the DataFrame to a CSV file.
    output_csv = "single_file_extracted_data.csv"
    df.to_csv(output_csv, index=False)
    print(f"Data saved to {output_csv}")

if __name__ == "__main__":
    main()


Extracted DataFrame:
                 Name of the Scheme/Project proposed  \
0  Repairing & Renovation Jalah No-1 PWSS damaged...   

  Whether scheme/project is new/ongoing  \
0                           New project   

  Whether the scheme is funded by NABARD     Proposal Number Proposal Date  \
0                                     No  AA-05-2023-24-6741    26-06-2023   

           Issued Number  Valid Upto Admin Department File Number  \
0  AA/05_2023-24(I)_5242  26-06-2026                       289051   

  Brief nature of the scheme/project and the time period required for implementation/execution  \
0                                         FDR SCHEME                                             

  Aim, objectives and benefit expected from the scheme/project  ...  \
0             Repairing & Renovation Jalah No-1 PWSS            ...   

  Views of Department  \
0                       

  Details information in case of observation made by T&D in last year's endorsement  \
0    

  for text_node in soup.find_all(text=True):
  label_element = soup.find(text=re.compile(re.escape(cleaned_key), re.IGNORECASE))


Extract basic html info

In [21]:
import pandas as pd
from bs4 import BeautifulSoup

html_doc = open(r'D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR-Assam\Sources\SDRF\scripts\scraping_experiments\downloaded_pages\2023-24\AA-05-2023-24-6742.html')
# html_doc = open(r'D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR-Assam\Sources\SDRF\scripts\scraping_experiments\downloaded_pages\2017-18\AA-05-2017-18-0001.html')
soup = BeautifulSoup(html_doc, 'html.parser')

print(soup.prettify())


<html lang="en">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="Public Finance Management System - Government of Assam" name="description"/>
  <link href="/assamfinance/resources/icons/favicon.ico" rel="icon" type="image/x-icon"/>
  <link href="/assamfinance/resources/icons/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
  <link href="/assamfinance/resources/style/kranstyle.css" rel="stylesheet" type="text/css"/>
  <link href="/assamfinance/resources/style/jquery-ui.css" rel="stylesheet" type="text/css"/>
  <!-- BOOTSTRAP -->
  <link href="/assamfinance/resources/bootstrap/css/bootstrap.min.css" rel="stylesheet" type="text/css"/>
  <link href="/assamfinance/resources/font-awesome/css/font-awesome.min.css" rel="stylesheet" type="text/css"/>
  <!-- CALENDAR  -->
  <link href="/assamfinance/resources/js/cal

In [16]:
import os
import re
import difflib
import pandas as pd
from bs4 import BeautifulSoup

# Your dictionary mapping original titles to the CSS fingerprint for the next element
extraction_dict = {
    'Name of the Scheme/Project proposed': 'div class="col-sm-18"', 
    'Whether scheme/project is new/ongoing': 'div class="col-sm-6"', 
    'Whether the scheme is funded by NABARD': 'div class="col-sm-6"', 
    'Proposal Number': 'div class="form-group"', 
    'Proposal Date': 'div class="col-sm-6"', 
    'Issued Number': 'div class="form-group"', 
    'Valid Upto': 'div class="col-sm-6"', 
    'Admin Department File Number': 'div class="col-sm-6"', 
    'Brief nature of the scheme/project and the time period required for implementation/execution': 'div class="col-sm-6"', 
    'Aim, objectives and benefit expected from the scheme/project': 'div class="col-sm-6"', 
    'Whether included in the priority list approved by the Departmental Minister': 'div class="col-sm-6"', 
    'Ddo Details': 'div class="col-sm-6"', 
    'Common Proposal': 'div class="col-sm-6"', 
    'Project Profile Id:': 'div class="col-sm-6"', 
    'Project Classification': 'span class="text-success h6"', 
    'Type of the project:': '<div class="col-lg-12">', 
    'Opening and Closing dates:': 'div class="col-sm-6"',  # This field will later be split into starting and closing dates
    'Project Beneficiary :': 'div class="col-sm-6" style="margin-top: 10px;"', 
    'Procurement Activity Type:': 'div class="col-sm-8" style="margin-top: 10px;"', 
    'Beneficiary Share Amount(In Lakhs)': 'div class="col-sm-12"', 
    'Whether the scheme is centrally sponsored scheme': 'div class="col-sm-12"', 
    'Total Administrative Approval Amount (Project Cost)(In Lakhs)': 'div class="col-sm-12"', 
    'DDO Code': 'div class="col-sm-6"', 
    'Treasury Name': 'div class="col-sm-6"', 
    'DDO Office Name': 'div class="col-sm-18"', 
    'Permission from local bodies for construction/establishment wherever necessary': 'div class="col-sm-18"', 
    'Non-Objection certificate': 'div class="col-sm-18"', 
    'Non-Duplicity certificate': 'div class="col-sm-18"', 
    'Views of Department': 'div class="col-sm-18"', 
    "Details information in case of observation made by T&D in last year's endorsement": 'div class="form-group"', 
    'Sanction letter copy of GOI in case of Central Sector Schemes': 'div class="col-sm-6"', 
    'Views of Sr.F.A/F.A for non-admissibility of the instant project under DFPR': 'div class="col-sm-18"', 
    'Funding pattern of the scheme': 'div class="col-sm-6"', 
    'The year from which the scheme is being implemented': 'div class="col-sm-6"', 
    'The quantum of central share allocated/released for the year along with the supporting document': 'div class="col-sm-6"', 
    'The criteria of proposing the amount of state share': 'div class="col-sm-6"'
}

def pattern_to_selector(pattern):
    """
    Converts a string like 'div class="col-sm-18"' or '<div class="col-lg-12">' 
    into a CSS selector (e.g. 'div.col-sm-18' or 'div.col-lg-12').
    """
    pattern = pattern.strip()
    if pattern.startswith("<") and pattern.endswith(">"):
        pattern = pattern[1:-1].strip()
    tag_match = re.match(r'^(\w+)', pattern)
    tag = tag_match.group(1) if tag_match else ""
    class_match = re.search(r'class="([^"]+)"', pattern)
    if class_match:
        classes = class_match.group(1).split()
        selector = tag + ''.join(['.' + cls for cls in classes])
    else:
        selector = tag
    return selector

def extract_value_for_key(soup, key, pattern):
    """
    Searches for a text node containing the cleaned key (parentheses removed) in a case-insensitive way,
    then returns the text from the first subsequent element that matches the CSS selector derived from pattern.
    """
    # Remove any parentheses and contained text from the key
    cleaned_key = re.sub(r'\s*\([^)]*\)', '', key).strip()
    label_element = soup.find(text=re.compile(re.escape(cleaned_key), re.IGNORECASE))
    if label_element:
        selector = pattern_to_selector(pattern)
        def matches(tag):
            if not tag.name:
                return False
            sel_parts = selector.split('.')
            sel_tag = sel_parts[0]
            sel_classes = sel_parts[1:] if len(sel_parts) > 1 else []
            if tag.name != sel_tag:
                return False
            tag_classes = tag.get("class", [])
            return all(cls in tag_classes for cls in sel_classes)
        candidate = label_element.find_next(matches)
        if candidate:
            return re.sub(r'\s+', ' ', candidate.get_text(strip=True))
    return None

def get_candidate_titles(soup):
    """
    Extracts candidate title elements from the new HTML file.
    For this example, we use all non-empty <div> elements (you can adjust this selector as needed).
    Returns a list of tuples: (text, element)
    """
    candidates = []
    for div in soup.find_all("div"):
        text = div.get_text(strip=True)
        if text and len(text) > 3:  # filter very short strings
            candidates.append((text, div))
    return candidates

def extract_new_value_from_element(elem):
    """
    Given an element (assumed to be the matched title in the new file),
    find the next element that likely contains the corresponding value and clean its text.
    """
    if elem:
        next_elem = elem.find_next()
        if next_elem:
            return re.sub(r'\s+', ' ', next_elem.get_text(strip=True))
    return None

def main():
    # Paths to your original and new HTML files (update these paths as needed)
    original_file = r"D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR-Assam\Sources\SDRF\scripts\scraping_experiments\downloaded_pages\2024-25\AA-05-2024-25-7194.html"
    new_file = r"D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR-Assam\Sources\SDRF\scripts\scraping_experiments\downloaded_pages\2017-18\AA-05-2017-18-7101.html"
    
    if not os.path.exists(original_file):
        print(f"Original file not found: {original_file}")
        return
    if not os.path.exists(new_file):
        print(f"New file not found: {new_file}")
        return

    # Load both files into BeautifulSoup and pre-clean all text nodes
    with open(original_file, 'r', encoding='utf-8') as f:
        original_soup = BeautifulSoup(f.read(), 'html.parser')
    with open(new_file, 'r', encoding='utf-8') as f:
        new_soup = BeautifulSoup(f.read(), 'html.parser')
    
    # Pre-clean all text nodes in both soups
    for soup in [original_soup, new_soup]:
        for text_node in soup.find_all(text=True):
            cleaned = re.sub(r'\s+', ' ', text_node)
            text_node.replace_with(cleaned)
    
    # Extract values from the original file using the provided dictionary
    original_extracted = {}
    for key, pattern in extraction_dict.items():
        original_extracted[key] = extract_value_for_key(original_soup, key, pattern)
    
    # In the new file, extract candidate title elements (this is a simple example using all divs)
    candidate_titles = get_candidate_titles(new_soup)
    candidate_texts = [text for text, elem in candidate_titles]
    
    # Prepare a list to collect DataFrame rows
    rows = []
    
    # For each original title (key), perform fuzzy matching to find the closest candidate title in new file
    for key in extraction_dict.keys():
        cleaned_key = re.sub(r'\s*\([^)]*\)', '', key).strip()
        best_matches = difflib.get_close_matches(cleaned_key, candidate_texts, n=1, cutoff=0.4)
        if best_matches:
            new_title = best_matches[0]
            # Get the candidate element corresponding to the matched title
            candidate_elem = next((elem for text, elem in candidate_titles if text == new_title), None)
            new_value = extract_new_value_from_element(candidate_elem)
        else:
            new_title = None
            new_value = None
        
        # Get the original value from the original file
        original_value = original_extracted.get(key)
        
        # Append the row (each row corresponds to one original title/key)
        rows.append({
            "Original Column": key,
            "New HTML Column": new_title,
            "Element in Original File": original_value,
            "Element in New File": new_value
        })
    
    # Create the DataFrame
    df = pd.DataFrame(rows)
    print("Extracted DataFrame:")
    #print(df)
    
    # Optionally, save the DataFrame to a CSV file
    output_csv = "comparison_extracted_data.csv"
    df.to_csv(output_csv, index=False)
    print(f"Data saved to {output_csv}")
    return df

if __name__ == "__main__":
    main()


Extracted DataFrame:
Data saved to comparison_extracted_data.csv


  for text_node in soup.find_all(text=True):
  label_element = soup.find(text=re.compile(re.escape(cleaned_key), re.IGNORECASE))


# EDA

In [17]:
all_data = pd.read_csv(r"D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR-Assam\Sources\SDRF\data\all_extracted_data.csv")
all_data

Unnamed: 0,Name of the Scheme,Whether scheme,Whether the scheme is funded by NABARD,Proposal Number,Proposal Date,Issued Number,Valid Upto,Admin Department File Number,Brief nature of the scheme,"Aim, objectives and benefit expected from the scheme/project",...,Sanction letter copy of GOI in case of Central Sector Schemes,Views of Sr,Funding pattern of the scheme,The year from which the scheme is being implemented,The quantum of central share allocated,The criteria of proposing the amount of state share,Starting Date,Closing Date,Financial Year,Source File
0,Immediate measures for closing of breach and r...,New project,No,AA-05-2019-20-0197,06-06-2019,AA/05_2019-20(I)_11,20-07-2022,RGR(RRR)386/2019,Immediateestoration of flood damages of river ...,To protect the people of the greater Dhakuakha...,...,No,,,,,,----,,2019-20,D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR...
1,Immediate measures for restoration of damaged ...,New project,No,AA-05-2019-20-0198,06-06-2019,AA/05_2019-20(I)_10,19-07-2022,RGr(RRR)388/2019,Immediate measures for repairing and restoration.,To protect the people of Greater Dhakuakhana area,...,No,,,,,,----,,2019-20,D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR...
2,Restoration of flood damaged road from 6th KM ...,New project,No,AA-05-2019-20-0200,14-06-2019,AA/05_2019-20(I)_01,14-06-2022,RGR(RRR)951/2018,Restoration of flood damaged work,"It is the only road for movement to schools, c...",...,No,,,,,,----,,2019-20,D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR...
3,Immediate measures for restoration of damages ...,New project,No,AA-05-2019-20-0205,26-06-2019,AA/05_2019-20(I)_28,06-12-2022,RGR(RRR)548/2019,The proposed scheme has been framed to prevent...,The proposed scheme has been framed to prevent...,...,No,,,,,,----,,2019-20,D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR...
4,"""Immediate measures to Ring Bund at villagae T...",New project,No,AA-05-2019-20-0206,26-06-2019,AA/05_2019-20(I)_09,19-07-2024,RGR(RRR)819/2018,,To protect the life and property of the people...,...,No,,,,,,----,,2019-20,D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5826,CONSTRUCTION OF FRONT BUILDING (GROUND + THREE...,Ongoing Project(Parent Proposal No. AA-SDM-202...,,,06-12-2024,,,-N/A-,-N/A-,-N/A-,...,,,,,,,,,2024-25,D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR...
5827,CONSTRUCTION OF FRONT BUILDING (GROUND + THREE...,Ongoing Project(Parent Proposal No. AA-SDM-202...,,,17-12-2024,,,-N/A-,-N/A-,-N/A-,...,,,,,,,,,2024-25,D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR...
5828,Procurement of Machinery and Trolley Mounted P...,New project,No,AA-SDM-2024-25-0007,20-01-2025,AA/SDM_2024-25(I)_03,29-01-2030,499824,Procurement of Machinery and Trolley Mounted P...,1. The pumps will aid in mitigation of urban f...,...,No,,FROM SDRF-PREPAREDNESS & CAPACITY BUILDING WINDOW,2024-25,9940 LAKHS,,29-01-2025,29-01-2030,2024-25,D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR...
5829,"Construction of DEOC building, Kamrup, Amingao...",New project,No,AA-SDM-2024-25-0008,24-02-2025,AA/SDM_2024-25(I)_04,11-03-2028,,"Construction of DEOC building, Kamrup, Amingao...","Construction of DEOC building, Kamrup, Amingao...",...,No,,FROM SDRF-PREPAREDNESS & CAPACITY BUILDING WINDOW,2023-24,9940 LAKHS,,11-03-2025,11-03-2028,2024-25,D:\CivicDataLab_IDS-DRR\IDS-DRR_Github\IDS-DRR...


In [20]:
all_data['Issued Number'].nunique()

5658

In [22]:
# Create a regex pattern to match "PWD", "public works department", or "pwd" (case-insensitive)
pattern = r"(?i)\b(?:pwd|public works department)\b"

# Apply the pattern across all columns and get a boolean mask for rows that contain the pattern
mask = all_data.apply(lambda col: col.astype(str).str.contains(pattern, regex=True, na=False)).any(axis=1)

# Filter the DataFrame based on the mask
pwd_rows = all_data[mask]

print(pwd_rows)

                                     Name of the Scheme  \
2     Restoration of flood damaged road from 6th KM ...   
6     Immediate restoration of flood damaged road fr...   
8     Immediate restoration of Udmari Salkocha road ...   
9     Immediate restoration of the flood damaged roa...   
10    Immediate restoration of flood damaged road fr...   
...                                                 ...   
5614  Immediate Restoration of 7 No line tiniali roa...   
5618  Immediate Restoration of Dillibari PWD Road to...   
5665  Temporary Restoration of Velakoba Durahati PWD...   
5693  Road from Gunahati PWD at Kalakata to SIRD Com...   
5715  Restoration of road from Kanaibazar- Anipur PW...   

                                         Whether scheme  \
2                                           New project   
6                                           New project   
8                                           New project   
9                                           New project