In [None]:
# Mount Google Drive so that files under /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Root directory on the mounted Drive; read_md_file joins file names onto this path.
BASE_DIR = "/content/drive/MyDrive/fyrp/gic_data_2/data_scraper/data"

## Function to read a single markdown file safely

In [None]:
def read_md_file(main_category_no, filename):
    """
    Read one markdown file located under BASE_DIR and return its text.

    Parameters
    ----------
    main_category_no : int
        Kept for backward compatibility only. It is NOT used to build the
        path: `filename` is joined onto BASE_DIR as-is, so it must already
        contain any sub-folder (e.g. "service_md_1/Sales_Outlets_of_Books.md").
    filename : str or os.PathLike
        Path of the markdown file, relative to BASE_DIR.

    Returns
    -------
    str or None
        The file content decoded as UTF-8, or None when `filename` is not a
        usable path, the file does not exist, or it cannot be read/decoded.
        A short message is printed in every None case.
    """
    # A missing CSV cell reaches us as NaN (a float) or None; os.path.join
    # would raise TypeError on those, so reject them up front.
    if not isinstance(filename, (str, os.PathLike)) or not str(filename).strip():
        print(f"Invalid filename: {filename!r}")
        return None

    file_path = os.path.join(BASE_DIR, filename)

    # isfile (not exists) so that a directory is reported as "not found"
    # rather than failing later inside open().
    if not os.path.isfile(file_path):
        print(f"File not found: {file_path}")
        return None

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    except (OSError, UnicodeError) as e:
        # I/O problems and invalid UTF-8 are expected failure modes; anything
        # else is a programming error and should surface.
        print(f"Failed to read {file_path}: {e}")
        return None

## Read files

In [None]:
import pandas as pd

# Load the service catalogue: one row per service, with its category ids,
# the relative path of its markdown file (`content_file`) and its `key`.
content_df = pd.read_csv("gic_data_final_with_key.csv")
content_df.head()

Unnamed: 0,main_category_id,main_category,subcategory,service,serviceLink,content_file,sub_category_id,service_id,key
0,1,Education & Training,Education Publications,Sales Outlets of Books,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Sales_Outlets_of_Books.md,87,665,1-87-665
1,1,Education & Training,Education Publications,Museum Publications,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Museum_Publications.md,87,1481,1-87-1481
2,1,Education & Training,Education Publications,Services of Establishment Unit of Educational ...,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Services_of_Establishment_Unit_of...,87,662,1-87-662
3,1,Education & Training,Education Publications,Services of Information Technology Unit of Edu...,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Services_of_Information_Technolog...,87,659,1-87-659
4,1,Education & Training,Education Publications,Warehouses of Educational Publications Depart...,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Warehouses_of_Educational__Public...,87,664,1-87-664


## Imports

In [None]:
!pip install langchain-google-genai langchain_core

In [None]:
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.prompts import PromptTemplate
from google.colab import userdata

## Context segmentation

In [None]:
# Gemini client used for segmentation.
# The API key is read from Colab's Secrets panel (secret name: GOOGLE_API_KEY)
# instead of being stored in the notebook. Any key that was previously
# committed in this cell must be treated as leaked and revoked.
llm = GoogleGenerativeAI(
    model="gemini-2.0-flash",
    google_api_key=userdata.get("GOOGLE_API_KEY"),
)

# Prompt that asks the model to split one service page into 250-400 word
# contexts and to answer with a single JSON object {"contexts": [...]}.
# Literal braces of the JSON example are doubled ({{ }}) so that
# PromptTemplate only substitutes {service_content}.
prompt_templete = PromptTemplate(
  template="""You are a content segmentation engine for Retrieval-Augmented Generation (RAG).
Your task is to divide the given service content into strictly size-controlled, meaningful contexts suitable for vector embedding.

---

You are given the full content of a government service page in Markdown format.
Your task is to divide this content into multiple meaningful contexts suitable for a Retrieval-Augmented Generation (RAG) system.
Follow these rules strictly:

1. SEGMENTATION RULES (MANDATORY)
  - Each context MUST be between 250 and 400 words.
  - Contexts must NEVER exceed 400 words. This rule is strict.
  - If the entire content is less than 250 words, return exactly ONE context.
  - If any logical section exceeds 400 words:
  - You MUST split it into multiple contexts.
  - Preserve meaning and continuity.
  - Use (Part 1), (Part 2) in the title.
  - It is better to split than to exceed the word limit.
  - Do NOT merge unrelated topics to reach the word limit.
  - Do NOT summarize or paraphrase — preserve original wording.
  - Do NOT add new information.

2. Title generation:
   - Generate a clear, concise title for each context.
   - The title must reflect the main topic of that context.
   - Titles should be informative (e.g., "Eligibility Criteria and Required Documents", "Application Procedure and Fees").

3. Formatting:
   - Preserve bullet points, numbered lists, and tables as plain text.
   - Keep links in Markdown format: [text](url).

4. Output format:
   - Return ONLY valid JSON.
   - Do NOT include explanations, comments, or additional text.

Use the following exact JSON structure:

{{
  "contexts": [
    {{
      "context_id": 1,
      "title": "Context title here",
      "content": "Context content here"
    }}
  ]
}}

---

Here is the service content:

{service_content}
""",
  input_variables=["service_content"],
)

In [None]:
# Per-service context counter, initialised to 0 for every row.
# Note: re-running this cell resets the whole column, including any values
# already written by the segmentation loop.
content_df['no_of_contexts'] = 0
content_df.head()

In [None]:
import json


def _parse_contexts(response):
    """Extract the list of context dicts from the model's raw text reply.

    The reply should contain one JSON object with a "contexts" list. It may
    be wrapped in a Markdown code fence or surrounded by stray text, so the
    object is located by its outermost braces instead of by splitting on
    "```" / "json" (which fails when the fence is missing and truncates the
    payload when the word "json" occurs inside the content).

    Returns
    -------
    list[dict]
        The contexts; every item is a dict with string "title" and "content".

    Raises
    ------
    ValueError
        If no JSON object can be decoded (json.JSONDecodeError is a
        ValueError subclass) or if the decoded object has the wrong shape.
    """
    start = response.find("{")
    end = response.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in model response")

    parsed = json.loads(response[start:end + 1])

    contexts = parsed.get("contexts", [])
    if not isinstance(contexts, list):
        raise ValueError('"contexts" is not a list')

    for ctx in contexts:
        if (
            not isinstance(ctx, dict)
            or not isinstance(ctx.get("title"), str)
            or not isinstance(ctx.get("content"), str)
        ):
            raise ValueError('context without string "title"/"content"')

    return contexts


contexts_list = []

# Batch window (row labels of content_df, both ends inclusive).
start_index = 217
end_index = len(content_df)

# Status codes written to content_df["no_of_contexts"]:
#   >0  number of contexts produced
#   -1  service has no content file ("-")
#   -2  content file missing or unreadable
#   -3  model reply could not be parsed into the expected JSON shape
#   -4  model returned an empty context list
for idx, row in content_df.iterrows():

    if idx < start_index or idx > end_index:
        continue

    print(f"Processing {idx}: {row['key']}....")

    if row["content_file"] == "-":
        content_df.at[idx, "no_of_contexts"] = -1
        print("content file is -")
        continue

    text = read_md_file(
        main_category_no=row["main_category_id"],
        filename=row["content_file"]
    )

    if text is None:
        content_df.at[idx, "no_of_contexts"] = -2
        print("text is None")
        continue

    prompt = prompt_templete.format(service_content=text)
    response = llm.invoke(prompt)
    try:
        contexts = _parse_contexts(response)
    except ValueError as exc:
        # One malformed reply must not abort the whole batch: record it and
        # move on so the row can be retried later.
        print(f"JSONDecodeError: {exc}")
        content_df.at[idx, "no_of_contexts"] = -3
        continue

    if len(contexts) > 0:
        print(f"Found {len(contexts)} contexts")
        print("Text length: ", len(text))
        for i, ctx in enumerate(contexts):
            print(f"Context {i+1} length: ", len(ctx["content"]))
            # Context ids are "<main>-<sub>-<service>-<n>" with n starting at 1;
            # the model's own "context_id" field is ignored.
            context_id = (
                f"{row['main_category_id']}-"
                f"{row['sub_category_id']}-"
                f"{row['service_id']}-"
                f"{i+1}"
            )

            contexts_list.append({
                "context_id": context_id,
                "title": ctx["title"],
                "content": ctx["content"]
            })

        content_df.at[idx, "no_of_contexts"] = len(contexts)
    else:
        print("no contexts")
        content_df.at[idx, "no_of_contexts"] = -4
    print("\n--------------------------------------\n")



In [None]:
# Turn the accumulated `contexts_list` records into a DataFrame and report how many there are.
contexts_df = pd.DataFrame(contexts_list)
print(len(contexts_df))
contexts_df.head()

In [None]:
# Save the contexts to a CSV whose name records start_index and end_index.
contexts_df.to_csv(f"contexts_{start_index}_{end_index}.csv", index=False)

In [None]:
# Preview content_df, which carries the no_of_contexts column.
content_df.head()

In [None]:
# Persist the service table together with its no_of_contexts column.
# (Plain string: the previous f-string had no placeholders.)
content_df.to_csv("gic_data_contexts_count_updated.csv", index=False)

In [None]:
# Per-batch context files, listed in batch order.
batch_files = [
    "contexts_0_0.csv",
    "contexts_1_1.csv",
    "contexts_2_2.csv",
    "contexts_3_50.csv",
    "contexts_51_100.csv",
    "contexts_101_216.csv",
    "contexts_217_761.csv",
]

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# every batch keeps its own 0-based labels and the index contains duplicates.
finl_contexts_df = pd.concat(
    [pd.read_csv(path) for path in batch_files],
    ignore_index=True,
)
print(len(finl_contexts_df))
finl_contexts_df.head()

In [None]:
# Write the combined contexts to a single CSV (index column omitted).
finl_contexts_df.to_csv("gic_data_contexts_final.csv", index=False)

## Add metadata to contexts

In [None]:
import pandas as pd

# Reload the combined contexts from disk (columns: context_id, title, content).
contexts_df = pd.read_csv("gic_data_contexts_final.csv")
contexts_df.head(1)

Unnamed: 0,context_id,title,content
0,1-87-665-1,Educational Publications Department Book Sales...,"There are 6 sales outlets of books, successful..."


In [None]:
# Service catalogue used as the metadata source; `key` identifies one service.
service_df = pd.read_csv("gic_data_final_with_key.csv")
service_df.head(1)

Unnamed: 0,main_category_id,main_category,subcategory,service,serviceLink,content_file,sub_category_id,service_id,key
0,1,Education & Training,Education Publications,Sales Outlets of Books,https://gic.gov.lk/gic/index.php/en/component/...,service_md_1/Sales_Outlets_of_Books.md,87,665,1-87-665


### Add metadata columns to contexts df

In [None]:
# Derive the service key by cutting everything after the last "-" of context_id
# (e.g. "1-87-665-1" -> "1-87-665"), so it can be matched against service_df["key"].
contexts_df["service_key"] = contexts_df["context_id"].str.rsplit("-", n=1).str[0]
contexts_df.head()


Unnamed: 0,context_id,title,content,service_key
0,1-87-665-1,Educational Publications Department Book Sales...,"There are 6 sales outlets of books, successful...",1-87-665
1,1-87-1481-1,Publication Prices - Sinhala Publications,Publication Price Sinhala Publications Rs. Cts...,1-87-1481
2,1-87-1481-2,Publication Prices - English Publications,English Publications\n21. Some Sinhala Combati...,1-87-1481
3,1-87-1481-3,Publication Prices - English and Tamil Publica...,36. The Pleistocene of Ceylon 390 00\n37. Colo...,1-87-1481
4,1-87-662-1,Departmental Administration and Record Keeping,1. Maintaining the personal files of the offic...,1-87-662


In [None]:
# Columns of service_df that are copied onto every context row.
service_meta_cols = [
    "key",
    "main_category_id", "main_category",
    "subcategory", "sub_category_id",
    "service", "service_id", "serviceLink",
]

# Left join: every context row is kept, and picks up the metadata of the
# service whose `key` equals the context's `service_key`.
contexts_df = pd.merge(
    contexts_df,
    service_df.loc[:, service_meta_cols],
    how="left",
    left_on="service_key",
    right_on="key",
)

contexts_df.head()

Unnamed: 0,context_id,title,content,service_key,key,main_category_id,main_category,subcategory,sub_category_id,service,service_id,serviceLink
0,1-87-665-1,Educational Publications Department Book Sales...,"There are 6 sales outlets of books, successful...",1-87-665,1-87-665,1,Education & Training,Education Publications,87,Sales Outlets of Books,665,https://gic.gov.lk/gic/index.php/en/component/...
1,1-87-1481-1,Publication Prices - Sinhala Publications,Publication Price Sinhala Publications Rs. Cts...,1-87-1481,1-87-1481,1,Education & Training,Education Publications,87,Museum Publications,1481,https://gic.gov.lk/gic/index.php/en/component/...
2,1-87-1481-2,Publication Prices - English Publications,English Publications\n21. Some Sinhala Combati...,1-87-1481,1-87-1481,1,Education & Training,Education Publications,87,Museum Publications,1481,https://gic.gov.lk/gic/index.php/en/component/...
3,1-87-1481-3,Publication Prices - English and Tamil Publica...,36. The Pleistocene of Ceylon 390 00\n37. Colo...,1-87-1481,1-87-1481,1,Education & Training,Education Publications,87,Museum Publications,1481,https://gic.gov.lk/gic/index.php/en/component/...
4,1-87-662-1,Departmental Administration and Record Keeping,1. Maintaining the personal files of the offic...,1-87-662,1-87-662,1,Education & Training,Education Publications,87,Services of Establishment Unit of Educational ...,662,https://gic.gov.lk/gic/index.php/en/component/...


In [None]:
# The two join-key columns are no longer needed once the metadata is attached.
contexts_df = contexts_df.drop(columns=["service_key", "key"])

In [None]:
# Show the first row to check the remaining columns.
contexts_df.head(1)

Unnamed: 0,context_id,title,content,main_category_id,main_category,subcategory,sub_category_id,service,service_id,serviceLink
0,1-87-665-1,Educational Publications Department Book Sales...,"There are 6 sales outlets of books, successful...",1,Education & Training,Education Publications,87,Sales Outlets of Books,665,https://gic.gov.lk/gic/index.php/en/component/...


In [None]:
# Give the context and service columns their final, consistently named form.
contexts_df = contexts_df.rename(columns={
    "title": "context_title",
    "content": "context_content",
    "main_category": "main_category_name",
    "subcategory": "sub_category_name",
    "service": "service_name",
    "serviceLink": "service_link",
})
contexts_df.head(1)

Unnamed: 0,context_id,context_title,context_content,main_category_id,main_category_name,sub_category_name,sub_category_id,service_name,service_id,service_link
0,1-87-665-1,Educational Publications Department Book Sales...,"There are 6 sales outlets of books, successful...",1,Education & Training,Education Publications,87,Sales Outlets of Books,665,https://gic.gov.lk/gic/index.php/en/component/...


### Prepend metadata to each context

In [None]:
# Metadata header: one "Label: value" line each for service, main category,
# subcategory and context title.
metadata_header = (
    "Service: " + contexts_df["service_name"]
    + "\nMain Category: " + contexts_df["main_category_name"]
    + "\nSubcategory: " + contexts_df["sub_category_name"]
    + "\nTopic: " + contexts_df["context_title"]
)

# Final text = header, a "---" separator line, then the context body.
contexts_df["context_with_metadata"] = (
    metadata_header + "\n---\n" + contexts_df["context_content"]
)
contexts_df.head(1)

Unnamed: 0,context_id,context_title,context_content,main_category_id,main_category_name,sub_category_name,sub_category_id,service_name,service_id,service_link,context_with_metadata
0,1-87-665-1,Educational Publications Department Book Sales...,"There are 6 sales outlets of books, successful...",1,Education & Training,Education Publications,87,Sales Outlets of Books,665,https://gic.gov.lk/gic/index.php/en/component/...,Service: Sales Outlets of Books\nMain Category...


In [None]:
# Write the contexts, now including metadata columns and context_with_metadata, to CSV.
contexts_df.to_csv("gic_data_contexts_final_with_meta.csv", index=False)