In [None]:
!pip install requests beautifulsoup4

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin
import pandas as pd

# Root of the GIC site; page URLs are built from it and relative links are resolved against it.
BASE_URL = "https://gic.gov.lk/gic/"
# Headers sent with every HTTP request made by the scraper.
HEADERS = {"User-Agent": "Mozilla/5.0"}

In [None]:
def scrape_main_category(main_category_id):
    """Scrape every service link listed on one GIC main-category page.

    The page for ``main_category_id`` is downloaded and parsed. The main
    category name is read from the ``a.componentheading4`` element, and each
    ``a.componentindex4`` element is treated as a subcategory. A subcategory's
    services are the ``<a href=...>`` elements inside the ``div`` whose id is
    the argument of the ``show(...)`` call in the subcategory link's
    ``onmouseover`` attribute.

    Args:
        main_category_id: Value placed in the ``id`` query parameter of the
            category URL.

    Returns:
        A list of dicts, one per service link, with the keys
        ``main_category_id``, ``main_category``, ``subcategory``, ``service``
        and ``serviceLink`` (the link resolved against ``BASE_URL``). The list
        is empty when the page has no main-category heading.

    Raises:
        Whatever ``requests.get`` or ``response.raise_for_status()`` raise for
        a failed or non-success HTTP request.
    """
    url = f"{BASE_URL}index.php/en/component/info/?id={main_category_id}&task=cat"
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    rows = []

    main_category_el = soup.select_one("a.componentheading4")
    if not main_category_el:
        print(f"No main category found for ID: {main_category_id}")
        return rows

    main_category_name = main_category_el.get_text(strip=True)

    subcategory_links = soup.select("a.componentindex4")
    print(f"Found {len(subcategory_links)} subcategories for ID: {main_category_id}")

    for sub_link in subcategory_links:
        subcategory_name = sub_link.get_text(strip=True)
        print(f"Scraping subcategory: {subcategory_name}")

        onmouseover = sub_link.get("onmouseover", "") or ""
        if "show(" not in onmouseover:
            continue

        # Take the argument of show(...) and strip whichever quote style the
        # page used, so show("x"), show('x') and show( "x" ) all yield "x"
        # instead of raising IndexError on anything but show("x").
        _, _, after_call = onmouseover.partition("show(")
        div_id = after_call.split(")", 1)[0].strip().strip("\"'")

        # An empty id cannot name a div, so treat it like a missing div.
        service_div = soup.find("div", id=div_id) if div_id else None

        if not service_div:
            print(f"No service div found for subcategory: {subcategory_name}")
            continue

        service_links = service_div.find_all("a", href=True)
        print(f"Found {len(service_links)} services for subcategory: {subcategory_name}")
        for service_link in service_links:
            service_name = service_link.get_text(strip=True)
            # hrefs may be relative; resolve them against the site root.
            service_url = urljoin(BASE_URL, service_link["href"])

            rows.append({
                "main_category_id": main_category_id,
                "main_category": main_category_name,
                "subcategory": subcategory_name,
                "service": service_name,
                "serviceLink": service_url,
            })

    return rows

In [None]:
def scrape_all(main_category_ids, output_csv="gic_services.csv"):
    """Scrape several main categories and combine the rows into one DataFrame.

    Args:
        main_category_ids: Iterable of IDs, each passed to
            ``scrape_main_category``.
        output_csv: Path the combined table is written to (without the index
            column). Pass ``None`` or an empty string to skip writing.

    Returns:
        A ``pandas.DataFrame`` holding every scraped row, in the order the
        categories were scraped. It is the full table, not a preview.
    """
    all_rows = []

    for mc_id in main_category_ids:
        print(f"Scraping main category ID: {mc_id}")
        main_category_response = scrape_main_category(mc_id)
        print(f"Scraped {len(main_category_response)} rows")
        all_rows.extend(main_category_response)

    df = pd.DataFrame(all_rows)

    # Honour the output_csv parameter, which was previously accepted but ignored.
    if output_csv:
        df.to_csv(output_csv, index=False)

    # Return the whole frame: returning df.head() silently dropped all but
    # the first five rows for every caller.
    return df


In [None]:
# Main-category IDs 1 through 12, each scraped in turn.
main_category_ids = list(range(1, 13))
df = scrape_all(main_category_ids)
df

Scraping main category ID: 1
Found 9 subcategories for ID: 1
Scraping subcategory: Education Publications
Found 13 services for subcategory: Education Publications
Scraping subcategory: Pre-School Education
Found 0 services for subcategory: Pre-School Education
Scraping subcategory: School Education
Found 1 services for subcategory: School Education
Scraping subcategory: Higher Education & University Education
Found 3 services for subcategory: Higher Education & University Education
Scraping subcategory: Vocational Education & Training
Found 38 services for subcategory: Vocational Education & Training
Scraping subcategory: Education for Differently-abled
Found 1 services for subcategory: Education for Differently-abled
Scraping subcategory: Distance Education
Found 1 services for subcategory: Distance Education
Scraping subcategory: Admissions
Found 0 services for subcategory: Admissions
Scraping subcategory: Education for Government Officers
Found 11 services for subcategory: Educatio

Unnamed: 0,main_category_id,main_category,subcategory,service,serviceLink
0,1,Education & Training,Education Publications,Sales Outlets of Books,https://gic.gov.lk/gic/index.php/en/component/...
1,1,Education & Training,Education Publications,Museum Publications,https://gic.gov.lk/gic/index.php/en/component/...
2,1,Education & Training,Education Publications,Services of Establishment Unit of Educational ...,https://gic.gov.lk/gic/index.php/en/component/...
3,1,Education & Training,Education Publications,Services of Information Technology Unit of Edu...,https://gic.gov.lk/gic/index.php/en/component/...
4,1,Education & Training,Education Publications,Warehouses of Educational Publications Depart...,https://gic.gov.lk/gic/index.php/en/component/...
...,...,...,...,...,...
758,12,Environment,Weather Service,Handling the Weather and Climate Scientific St...,https://gic.gov.lk/gic/index.php/en/component/...
759,12,Environment,Weather Service,Contribution for the Meteorological Exhibitions,https://gic.gov.lk/gic/index.php/en/component/...
760,12,Environment,Wildlife,Reserve a Circuit Bungalow,https://gic.gov.lk/gic/index.php/en/component/...
761,12,Environment,Wildlife,Mega Constructions in Buffer Zones of National...,https://gic.gov.lk/gic/index.php/en/component/...


In [None]:
# Number of scraped rows.
df.shape[0]

763

In [None]:
# Missing-value count per column.
df.isnull().sum()

Unnamed: 0,0
main_category_id,0
main_category,0
subcategory,0
service,0
serviceLink,0


In [None]:
# Number of distinct service URLs.
df.loc[:, "serviceLink"].nunique()

761

In [None]:
# Keep the first row for each service URL and renumber the index from 0.
df_unique = (
    df
    .drop_duplicates(subset=["serviceLink"], keep="first")
    .reset_index(drop=True)
)

In [None]:
# Both numbers should match: rows after de-duplication vs. distinct URLs before it.
print(df_unique.shape[0], df.loc[:, "serviceLink"].nunique())

761 761


In [None]:
# Save the de-duplicated table without the index column.
df_unique.to_csv(path_or_buf="gic_data_links_unique.csv", index=False)

In [None]:
# Unique services per main category, largest category first.
category_counts = df_unique.groupby("main_category").size().sort_values(ascending=False)

# Turn the Series into a two-column frame: main_category, count.
df_category_count = category_counts.reset_index(name="count")

# index=False keeps the meaningless 0..n-1 row index out of the file,
# matching the other CSV exports in this notebook.
df_category_count.to_csv("gic_main_category_stats.csv", index=False)

df_category_count.head()

In [None]:
# Unique services per (main category, subcategory) pair.
category_counts2 = df_unique.groupby(by=["main_category", "subcategory"]).size()

# Flatten to columns, order by category then subcategory, and renumber the rows.
df_category_count2 = (
    category_counts2
    .reset_index(name="count")
    .sort_values(by=["main_category", "subcategory"])
    .reset_index(drop=True)
)

df_category_count2.to_csv("gic_sub_category_stats.csv", index=False)

df_category_count2.head()


Unnamed: 0,main_category,subcategory,count
0,"Agriculture, Livestock & Fisheries","Agriculture, Livestock & Fisheries Development",60
1,"Agriculture, Livestock & Fisheries",Approval & Registration,7
2,"Agriculture, Livestock & Fisheries",Permits & Licenses,7
3,"Agriculture, Livestock & Fisheries","Subsidies & Assistance for Agriculture, Livest...",4
4,"Agriculture, Livestock & Fisheries",Training & Extension Services,25
