In [None]:
!pip install requests beautifulsoup4

In [None]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from bs4 import Tag

In [None]:
def _inline_links(element, service_link):
    """Rewrite every descendant ``<a href>`` of *element* as Markdown link text.

    Each anchor is replaced in the parsed tree by the plain string
    ``[link text](absolute url)`` so that a later ``get_text()`` call on
    *element* includes the link target. Relative hrefs are resolved against
    *service_link*. The tree is modified in place.
    """
    for a in element.find_all("a", href=True):
        absolute_url = urljoin(service_link, a['href'])
        a.replace_with(f"[{a.get_text(strip=True)}]({absolute_url})")


def element_to_md(element, service_link = "https://gic.gov.lk/", indent=0):
    """Convert a parsed HTML element into a Markdown string.

    Handled elements:

    * ``p``           -> its text (links rewritten as ``[text](url)``) plus a blank line
    * ``ul`` / ``ol`` -> one ``- `` or ``N. `` line per direct ``li`` child
    * ``li``          -> its children concatenated; nested ``p``/``ul``/``ol`` recurse
    * ``table``       -> pipe-separated rows with a ``---`` separator after the
      first row; a table whose text contains ``Organization Information`` is skipped
    * ``div`` / ``span`` -> the concatenation of their children's conversions

    Any other element (and any node whose ``name`` is not one of the above)
    contributes an empty string.

    Parameters
    ----------
    element
        A node of the parsed document; only its ``name``, ``children``,
        ``find_all`` and ``get_text`` members are used (plus ``has_attr`` and
        item access for anchors).
    service_link : str
        Base URL used to turn relative ``href`` values into absolute ones.
    indent : int
        Nesting depth; each level adds two spaces of prefix to paragraphs and
        list items.

    Returns
    -------
    str
        The Markdown text for *element* (possibly empty).

    Notes
    -----
    Link rewriting replaces ``<a>`` tags inside the parsed tree, so *element*
    is mutated by this call.
    """
    md = ""
    prefix = "  " * indent

    if element.name == "p":
        # handle links inside <p>
        _inline_links(element, service_link)
        text = element.get_text(" ", strip=True)
        md += f"{prefix}{text}\n\n"

    elif element.name in ["ul", "ol"]:
        for i, li in enumerate(element.find_all("li", recursive=False), start=1):
            # Nested lists
            sub_md = element_to_md(li, service_link, indent + 1)
            if element.name == "ul":
                md += f"{prefix}- {sub_md.strip()}\n"
            else:
                md += f"{prefix}{i}. {sub_md.strip()}\n"
        md += "\n"

    elif element.name == "li":
        # process li children
        for child in element.children:
            if isinstance(child, Tag):
                if child.name in ["p", "ul", "ol"]:
                    md += "\n" + element_to_md(child, service_link, indent + 1)
                elif child.name == "a" and child.has_attr("href"):
                    # find_all() only looks at descendants, so an anchor that
                    # is itself the direct child of the <li> has to be
                    # converted here or its URL would be lost.
                    absolute_url = urljoin(service_link, child["href"])
                    md += f"[{child.get_text(strip=True)}]({absolute_url}) "
                else:
                    # process links nested inside any other inline tag
                    _inline_links(child, service_link)
                    md += child.get_text(" ", strip=True) + " "
            else:
                # child is a string
                md += str(child).strip() + " "

    elif element.name == "table":
        # Skip any table whose text mentions 'Organization Information'
        if "Organization Information" in element.get_text():
            return ""
        rows = element.find_all("tr")
        table_md = ""
        for r, row in enumerate(rows):
            cols = row.find_all(["th", "td"])
            row_text = " | ".join([c.get_text(" ", strip=True) for c in cols])
            table_md += f"{row_text}\n"
            if r == 0:
                # Add Markdown table separator after header
                table_md += " | ".join(["---"] * len(cols)) + "\n"
        md += table_md + "\n"

    elif element.name in ["div", "span"]:
        # Recursive call for children
        for child in element.children:
            md += element_to_md(child, service_link, indent)

    return md

In [None]:
def scrape_service_page(service_name, service_link):
    """Download one service page, convert its content to Markdown and save it.

    The page's ``<div id="jf_content_print">`` is converted with
    ``element_to_md`` and written to ``<output_dir>/<safe_name>.md``, where
    ``safe_name`` is *service_name* with every non-alphanumeric character
    replaced by ``_``. ``output_dir`` is read from the notebook's global
    namespace and must be defined before this function is called.

    Parameters
    ----------
    service_name : str
        Human-readable service title; used in progress messages and for the
        output file name.
    service_link : str
        URL of the page to fetch; also passed on as the base for resolving
        relative links.

    Returns
    -------
    str
        Path of the written Markdown file, or ``""`` when the request fails,
        the content div is missing, or the converted text is shorter than
        10 characters.
    """
    print(f"Scraping: {service_name}")
    try:
        resp = requests.get(service_link, timeout=30)
        resp.raise_for_status()
    except Exception as e:
        # Best effort: report the failure and let the caller move on.
        print(f"Failed to fetch {service_link}: {e}")
        return ""

    soup = BeautifulSoup(resp.text, "html.parser")
    content_div = soup.find("div", id="jf_content_print")
    if not content_div:
        print(f"No content div found for {service_link}")
        # Return "" like every other failure path (this used to return None).
        return ""

    md_text = element_to_md(content_div, service_link)

    if len(md_text) < 10:
        print(f"No content found for {service_link}")
        return ""

    # Save to markdown file
    safe_name = "".join([c if c.isalnum() else "_" for c in service_name])
    md_file_path = os.path.join(output_dir, f"{safe_name}.md")
    with open(md_file_path, "w", encoding="utf-8") as f:
        f.write(md_text)
    print(f"Saved to {md_file_path}: {len(md_text)}")
    return md_file_path


## Scrape data for each main category

In [None]:
# Scraping is performed one main category at a time so that each batch can be
# verified by a human before moving on. Set main_category_id to the category
# to process (the values 1 through 11 were the earlier choices listed in this
# cell; 12 is the one currently selected) and re-run the cells below.
main_category_id = 12

In [None]:
import pandas as pd

# Load the CSV numbered with the previous category id; the to_csv cell further
# down writes this run's result as gic_data_content_{main_category_id}.csv.
content = pd.read_csv(f'gic_data_content_{main_category_id-1}.csv')

In [None]:
# Markdown files for this category go into their own folder.
output_dir = f"service_md_{main_category_id}"
os.makedirs(output_dir, exist_ok=True)

for row_idx, record in content.iterrows():
    try:
        # Rows that belong to another main category are left untouched.
        if record["main_category_id"] != main_category_id:
            continue

        # A non-blank content_file means this service was already handled.
        existing_file = record["content_file"]
        if pd.notna(existing_file) and str(existing_file).strip():
            continue

        content.at[row_idx, "content_file"] = scrape_service_page(
            record["service"], record["serviceLink"]
        )

    except Exception as e:
        # Record the failure as an empty path so the row shows up as unscraped.
        print(f"Failed to scrape {record['serviceLink']}: {e}")
        content.at[row_idx, "content_file"] = ""
    print("\n----------------------------------------------------\n")

Scraping: Cooked Meals for People Subject to Distress
Saved to service_md_12/Cooked_Meals_for_People_Subject_to_Distress.md: 1012

----------------------------------------------------

Scraping: Funeral Aids In Respect of Those Who Died of Disaster
Saved to service_md_12/Funeral_Aids_In_Respect_of_Those_Who_Died_of_Disaster.md: 1371

----------------------------------------------------

Scraping: Obtaining Relief for Damages Caused to Crops Due to Any Disaster or Attack by Wild Elephant
Saved to service_md_12/Obtaining_Relief_for_Damages_Caused_to_Crops_Due_to_Any_Disaster_or_Attack_by_Wild_Elephant.md: 1212

----------------------------------------------------

Scraping: Obtaining Relief for Disasters Faces by Small Scale Entrepreneurs  and the Self Employed
Saved to service_md_12/Obtaining_Relief_for_Disasters_Faces_by_Small_Scale_Entrepreneurs__and_the_Self_Employed.md: 996

----------------------------------------------------

Scraping: Dry Rations Meals for People are Subject to D

In [None]:
import shutil

# Zip the contents of this category's output folder into <output_dir>.zip
# and display the path of the archive that was created.
archive_path = shutil.make_archive(output_dir, "zip", root_dir=output_dir)
archive_path

'/content/service_md_12.zip'

In [None]:
# Persist the updated frame under this category's id; the load cell above
# reads gic_data_content_{main_category_id-1}.csv, so the next category's run
# picks this file up.
content.to_csv(f"gic_data_content_{main_category_id}.csv", index=False)

In [None]:
import numpy as np

# Work on a copy so `content` keeps its raw content_file values.
df = content.copy()
# Treat empty strings the same as missing values.
df["content_file"] = df["content_file"].replace("", np.nan)
df["is_scraped"] = df["content_file"].notna()

# Count scraped and not-scraped services per main category.
summary = (
    df.groupby(["main_category_id", "main_category"], as_index=False)
      .agg(
          scraped=("is_scraped", lambda flags: flags.sum()),
          not_scraped=("is_scraped", lambda flags: (~flags).sum()),
      )
)
summary

Unnamed: 0,main_category_id,main_category,scraped,not_scraped
0,1,Education & Training,66,2
1,2,"Housing, Property & Utilities",45,0
2,3,"Banking, Tax & Insurance",48,7
3,4,"Travel, Tourism & Lesiure",41,6
4,5,"Justice, Law & Rights",62,1
5,6,Employment Information,32,5
6,7,"Health, Well Being & Social Service",93,0
7,8,"Agriculture, Livestock & Fisheries",101,2
8,9,Citizen's Registrations,36,1
9,10,"Trade, Business & Industry",127,2


In [None]:
# Add the per-category counts up into overall totals.
column_totals = summary[["scraped", "not_scraped"]].sum()
total_scraped = column_totals["scraped"]
total_not_scraped = column_totals["not_scraped"]

total_summary = dict(
    scraped=total_scraped,
    not_scraped=total_not_scraped,
    total=total_scraped + total_not_scraped,
)

print(total_summary)

{'scraped': np.int64(733), 'not_scraped': np.int64(28), 'total': np.int64(761)}


In [None]:
# A service counts as unscraped when content_file is missing or empty.
is_missing = content["content_file"].isna() | content["content_file"].eq("")
unscraped = content[is_missing]

# List each unscraped service with its category and link for manual follow-up.
for _, row in unscraped.iterrows():
    print(
        row["main_category"],
        f'{row["service"]} ->',
        row["serviceLink"],
        "\n----------------------------------------\n",
        sep="\n",
    )

Education & Training
Official Languages Proficiencies Exam ->
https://gic.gov.lk/gic/index.php/en/component/info/?id=601&catid=20&task=info

----------------------------------------

Education & Training
Proficiencies Exam for Other Languages ->
https://gic.gov.lk/gic/index.php/en/component/info/?id=606&catid=20&task=info

----------------------------------------

Banking, Tax & Insurance
Safe Deposit Lockers ->
https://gic.gov.lk/gic/index.php/en/component/info/?id=772&catid=90&task=info

----------------------------------------

Banking, Tax & Insurance
BOC I-net Facility ->
https://gic.gov.lk/gic/index.php/en/component/info/?id=790&catid=90&task=info

----------------------------------------

Banking, Tax & Insurance
Issuing Crackers for Public ->
https://gic.gov.lk/gic/index.php/en/component/info/?id=1101&catid=90&task=info

----------------------------------------

Banking, Tax & Insurance
Obtain Educational Loans ->
https://gic.gov.lk/gic/index.php/en/component/info/?id=813&catid

## Scraped data stats after manually reviewing and adding content for missing services

In [None]:
import pandas as pd

# Load the final dataset; per the section heading, this file includes the
# content that was added by hand for services the scraper missed.
final_data = pd.read_csv("gic_data_content_final.csv")
final_data.head()

Unnamed: 0,main_category_id,main_category,subcategory,service,serviceLink,content_file
0,1,Education & Training,Education Publications,Sales Outlets of Books,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Sales_Outlets_of_Books.md
1,1,Education & Training,Education Publications,Museum Publications,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Museum_Publications.md
2,1,Education & Training,Education Publications,Services of Establishment Unit of Educational ...,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Services_of_Establishment_Unit_of...
3,1,Education & Training,Education Publications,Services of Information Technology Unit of Edu...,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Services_of_Information_Technolog...
4,1,Education & Training,Education Publications,Warehouses of Educational Publications Depart...,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Warehouses_of_Educational__Public...


In [None]:
# In the final CSV a content_file of "-" marks a service without content;
# every other value is counted as scraped.
group_keys = ["main_category_id", "main_category"]
summary = (
    final_data.groupby(group_keys, as_index=False)
      .agg(
          scraped=("content_file", lambda files: files.ne("-").sum()),
          not_scraped=("content_file", lambda files: files.eq("-").sum()),
      )
)

summary

Unnamed: 0,main_category_id,main_category,scraped,not_scraped
0,1,Education & Training,66,2
1,2,"Housing, Property & Utilities",45,0
2,3,"Banking, Tax & Insurance",52,3
3,4,"Travel, Tourism & Lesiure",46,1
4,5,"Justice, Law & Rights",63,0
5,6,Employment Information,34,3
6,7,"Health, Well Being & Social Service",93,0
7,8,"Agriculture, Livestock & Fisheries",102,1
8,9,Citizen's Registrations,37,0
9,10,"Trade, Business & Industry",127,2


In [None]:
# Overall totals across all main categories for the final dataset.
column_totals = summary[["scraped", "not_scraped"]].sum()
total_scraped = column_totals["scraped"]
total_not_scraped = column_totals["not_scraped"]

total_summary = dict(
    scraped=total_scraped,
    not_scraped=total_not_scraped,
    total=total_scraped + total_not_scraped,
)

print(total_summary)

{'scraped': np.int64(747), 'not_scraped': np.int64(14), 'total': np.int64(761)}


In [None]:
# Save the per-category scraped / not-scraped counts computed above.
summary.to_csv("gic_data_content_final_summary.csv", index=False)

## Extract subcategory id and service id for each service

In [None]:
import pandas as pd

# Re-read the final CSV (the same file loaded in the stats section) so this
# section starts from the data as stored on disk.
final_data = pd.read_csv("gic_data_content_final.csv")
final_data.head()

Unnamed: 0,main_category_id,main_category,subcategory,service,serviceLink,content_file
0,1,Education & Training,Education Publications,Sales Outlets of Books,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Sales_Outlets_of_Books.md
1,1,Education & Training,Education Publications,Museum Publications,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Museum_Publications.md
2,1,Education & Training,Education Publications,Services of Establishment Unit of Educational ...,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Services_of_Establishment_Unit_of...
3,1,Education & Training,Education Publications,Services of Information Technology Unit of Edu...,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Services_of_Information_Technolog...
4,1,Education & Training,Education Publications,Warehouses of Educational Publications Depart...,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Warehouses_of_Educational__Public...


In [None]:
# Add placeholder id columns (filled in by a later cell), then display one
# serviceLink to see which query parameters carry the ids.
final_data = final_data.assign(sub_category_id=0, service_id=0)
final_data['serviceLink'].iloc[0]

'https://gic.gov.lk/gic/index.php/en/component/info/?id=665&catid=87&task=info'

In [None]:
from urllib.parse import urlparse, parse_qs
import pandas as pd

def extract_ids(service_url):
    """Pull the service and sub-category ids out of a service link.

    The ids are the ``id`` and ``catid`` query parameters of the URL, e.g.
    ``.../info/?id=665&catid=87&task=info`` gives ``"665"`` and ``"87"``.

    Parameters
    ----------
    service_url : str or missing value
        The service link. Anything that is not a string (for example the
        NaN pandas uses for an empty cell) is treated as having no ids.

    Returns
    -------
    pandas.Series
        Two entries, ``service_id`` and ``sub_category_id``. Each is the
        parameter's first value as a string, or ``None`` when the parameter
        is absent.
    """
    if isinstance(service_url, str):
        params = parse_qs(urlparse(service_url).query)
    else:
        # urlparse() raises on non-string input such as NaN; report "no ids".
        params = {}

    return pd.Series({
        "service_id": params.get("id", [None])[0],
        "sub_category_id": params.get("catid", [None])[0],
    })

# Parse every serviceLink once (row by row) into a two-column frame of ids.
extracted_ids = final_data["serviceLink"].apply(extract_ids)

# Store each id as a number; values that cannot be parsed become NaN.
for id_column in ("service_id", "sub_category_id"):
    final_data[id_column] = pd.to_numeric(extracted_ids[id_column], errors="coerce")

In [None]:
# Spot-check: sub_category_id and service_id should now hold the numbers
# parsed from each row's serviceLink.
final_data.head()

Unnamed: 0,main_category_id,main_category,subcategory,service,serviceLink,content_file,sub_category_id,service_id
0,1,Education & Training,Education Publications,Sales Outlets of Books,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Sales_Outlets_of_Books.md,87,665
1,1,Education & Training,Education Publications,Museum Publications,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Museum_Publications.md,87,1481
2,1,Education & Training,Education Publications,Services of Establishment Unit of Educational ...,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Services_of_Establishment_Unit_of...,87,662
3,1,Education & Training,Education Publications,Services of Information Technology Unit of Edu...,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Services_of_Information_Technolog...,87,659
4,1,Education & Training,Education Publications,Warehouses of Educational Publications Depart...,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Warehouses_of_Educational__Public...,87,664


In [None]:
# Save the dataset with the sub_category_id and service_id columns added.
final_data.to_csv("gic_data_final_ids_extracted.csv", index=False)

In [None]:
# Composite key "<main_category_id>-<sub_category_id>-<service_id>".
final_data["key"] = (
    final_data["main_category_id"].astype(str).str.cat(
        [
            final_data["sub_category_id"].astype(str),
            final_data["service_id"].astype(str),
        ],
        sep="-",
    )
)

# Show every row whose key occurs more than once (an empty result means the
# key is unique).
is_duplicate_key = final_data.duplicated(subset="key", keep=False)
duplicates = final_data[is_duplicate_key]

duplicates

Unnamed: 0,main_category_id,main_category,subcategory,service,serviceLink,content_file,sub_category_id,service_id,key


In [None]:
# Final export, now including the composite `key` column.
final_data.to_csv("gic_data_final_with_key.csv", index=False)