In [1]:

import fitz  # PyMuPDF
import re

pdf_path = "constitution of India.pdf"
doc = fitz.open(pdf_path)

# Step 1: raw extraction (all pages) with cleaning and metadata capture
pages_and_chunks_raw = []

for page_num in range(len(doc)):
    text = doc[page_num].get_text("text")
    
    # Remove unwanted lines: headers and standalone digits
    cleaned_lines = []
    for line in text.splitlines():
        stripped_line = line.strip()
        if not re.search(r"THE\s+CONSTITUTION\s+OF\s+INDIA", stripped_line, re.IGNORECASE) and not stripped_line.isdigit():
            cleaned_lines.append(line)
    cleaned_text = "\n".join(cleaned_lines).strip()
    
    # Separate footnotes: look for a line of underscores (e.g., ________________) as separator
    lines = cleaned_text.splitlines()
    main_text_lines = []
    footnotes_lines = []
    in_footnotes = False
    separator_pattern = r'^[_]{5,}$'  # Matches lines with 5 or more underscores (adjust if needed)
    
    for line in lines:
        if re.match(separator_pattern, line.strip()):
            in_footnotes = True
            continue  # Skip the separator line itself
        if in_footnotes:
            footnotes_lines.append(line)
        else:
            main_text_lines.append(line)
    
    # Rebuild main text and footnotes
    main_text = "\n".join(main_text_lines).strip()
    f_notes = "\n".join(footnotes_lines).strip() if footnotes_lines else ""  # Empty if no footnotes
 
    # Now, extract "Section" metadata: look for pattern like "(...)" at the beginning (first few lines) of main_text
    section = ""  # Default to empty if not found
    main_lines = main_text.splitlines()[:5]  # Check only the beginning (first 5 lines)
    section_pattern = r"^\((.+?)\)$"  # Matches lines that are exactly "(content)"
    
    for line in main_lines:
        stripped = line.strip()
        match = re.match(section_pattern, stripped)
        if match:
            section = match.group(1).strip()  # Capture the text inside parentheses
            break  # Stop after finding the first match
    
    # If no match found, ignore (keep as empty) as per query
    
    # Store the page with cleaned main text, footnotes, and metadata
    pages_and_chunks_raw.append({
        "page_number": page_num + 1,
        "text": main_text,
        "section": section,
        "foot notes": f_notes  # Stored as metadata
    })

# Step 2: filter pages (skip 1–31 and 390–400)
# pages_and_chunks_1 = [item for item in pages_and_chunks_raw if item["page_number"] >= 32 and not (390 <= item["page_number"] <= 400)]
pages_and_chunks_1 = [
    item for item in pages_and_chunks_raw
    if item["page_number"] >= 32
    and not (390 <= item["page_number"] <= 400)
    and item["page_number"] != 142
]
doc.close()


In [2]:
#this wont affect 1 pages section because as per the algorithm empty section inherits the next sections values since the next page after 1 pages section is the first page of the section hence empty so the 1 paged section inherits this emptiness
for i in range(len(pages_and_chunks_1)-2):  # Stop before the last two
        if not pages_and_chunks_1[i]["section"] :  # If empty ("" or falsy)
            # Fill from the next one
            pages_and_chunks_1[i]["section"] = pages_and_chunks_1[i + 1]["section"]
    


In [3]:
empty_pages = []
for item in pages_and_chunks_1:
        if not item.get("section", "").strip():  # Check if empty or whitespace-only
            empty_pages.append(item["page_number"])

empty_pages

[32, 56, 283, 401, 402]

In [4]:
pages_and_chunks_1[109]

{'page_number': 141,
 'text': '(Part VI.—The States) \n235. Control over subordinate courts.—The control over district \ncourts and courts subordinate thereto including the posting and promotion of, \nand the grant of leave to, persons belonging to the judicial service of a State \nand holding any post inferior to the post of district judge shall be vested in the \nHigh Court, but nothing in this article shall be construed as taking away from \nany such person any right of appeal which he may have under the law \nregulating the conditions of his service or as authorising the High Court to deal \nwith him otherwise than in accordance with the conditions of his service \nprescribed under such law. \n236. Interpretation.—In this Chapter—\n(a) the expression “district judge” includes judge of a city civil court, \nadditional district judge, joint district judge, assistant district judge, chief \njudge of a small cause court, chief presidency magistrate, additional \nchief presidency magist

In [5]:
pages_and_chunks_1[401-11-32-1]["section"] ,pages_and_chunks_1[402-11-32-1]["section"]  ,pages_and_chunks_1[358-32-1]["section"] ,pages_and_chunks_1[341-32-1]["section"] 

('', '', 'Article 31B', 'Article 246')

In [6]:
# Example: Direct index assignment (if pages_and_chunks_1[0] is page 32) also we skipped pages 390 to 400 so do the math to get exact page numbers
pages_and_chunks_1[0]["section"] = "Preamble"  # Page 32
pages_and_chunks_1[24]["section"] = "PART IVA FUNDAMENTAL DUTIES"  # Page 56
#pages_and_chunks_1[110]["section"] = "PART VII The States in Part B of the First Schedule, omitted section by constitutional amendment."  # Page 142
pages_and_chunks_1[250]["section"] = "PART XXII SHORT TITLE, COMMENCEMENT, AUTHORITATIVE TEXT IN HINDI AND REPEALS"  # Page 283
pages_and_chunks_1[308]["section"] = 'Seventh Schedule' 
pages_and_chunks_1[325]["section"] = 'Ninth Schedule'
pages_and_chunks_1[347]["section"] = 'Eleventh Schedule'
pages_and_chunks_1[348]["section"] = 'Twelfth Schedule'
pages_and_chunks_1[357]["section"] = "APPENDIX II"  # Page 401
pages_and_chunks_1[358]["section"] = "APPENDIX III"  # Page 402


## section split and consolidation

In [7]:
# Assuming pages_and_chunks_1 is already defined as a list of dicts with 'page_number', 'text', 'section'
# Create a dictionary to hold consolidated text by section
consolidated_sections = {}

# Temporary variable to build text for each group
current_section = None
current_text = []
current_page_numbers = []
current_footnotes = {}  # Will be a dict of {page_number: footnotes}

# Iterate through the list (sorted by page_number)
for page in pages_and_chunks_1:
    section = page['section'].strip()  # No need for standalone fill; all sections are non-empty
    
    if section != current_section:
        # Save the previous group if it exists
        if current_section is not None:
            consolidated_sections[current_section] = {
                "consolidated_text": "\n".join(current_text).strip(),
                "page_numbers": current_page_numbers,  # List of page numbers for this section
                "footnotes": current_footnotes  # Dict of footnotes keyed by page_number
            }
        
        # Start new group
        current_section = section
        current_text = [page['text']]
        current_page_numbers = [page['page_number']]
        current_footnotes = {page['page_number']: page['foot notes']}
    else:
        # Append to current group
        current_text.append(page['text'])
        current_page_numbers.append(page['page_number'])
        current_footnotes[page['page_number']] = page['foot notes']

# Save the last group
if current_section is not None:
    consolidated_sections[current_section] = {
        "consolidated_text": "\n".join(current_text).strip(),
        "page_numbers": current_page_numbers,
        "footnotes": current_footnotes
    }

# To preview (example: first section's details)
if consolidated_sections:
    first_sec = list(consolidated_sections.keys())[0]
    print(f"Section: {first_sec}")
    print(f"Page Numbers: {consolidated_sections[first_sec]['page_numbers']}")
    print(f"Footnotes Preview: {list(consolidated_sections[first_sec]['footnotes'].items())[:2]}...")  # First 2 footnotes
    print(f"Text Preview: {consolidated_sections[first_sec]['consolidated_text'][:200]}...\n")

Section: Preamble
Page Numbers: [32]
Footnotes Preview: [(32, '1. Subs. by the Constitution (Forty-second Amendment) Act, 1976, s.2, for "SOVEREIGN \nDEMOCRATIC REPUBLIC" (w.e.f. 3-1-1977).\n2. Subs. by s. 2, ibid., for "Unity of the Nation" (w.e.f. 3-1-1977).')]...
Text Preview: PREAMBLE
WE, THE PEOPLE OF INDIA, having solemnly resolved to constitute 
India into a 
1[SOVEREIGN SOCIALIST SECULAR DEMOCRATIC 
REPUBLIC] and to secure to all its citizens:
JUSTICE, social, economic...



In [8]:

sections_list = []
for section_name, data in consolidated_sections.items():
    text = data["consolidated_text"]
    sections_list.append({
        "section": section_name,
        "consolidated_text": text,
        "section_character_count": len(text),
        "section_token_count": len(text) / 4,
        "page_numbers": data["page_numbers"],
        "footnotes": data["footnotes"]
    })


In [9]:
sections_list[:5]

[{'section': 'Preamble',
  'consolidated_text': 'PREAMBLE\nWE, THE PEOPLE OF INDIA, having solemnly resolved to constitute \nIndia into a \n1[SOVEREIGN SOCIALIST SECULAR DEMOCRATIC \nREPUBLIC] and to secure to all its citizens:\nJUSTICE, social, economic and political;\n \nLIBERTY of thought, expression, belief, faith and worship;\nEQUALITY of status and of opportunity;\nand to promote among them all\nFRATERNITY assuring the dignity of the individual and the 2[unity \nand integrity of the Nation];\nIN OUR CONSTITUENT ASSEMBLY this twenty-sixth day of \nNovember, 1949, do HEREBY ADOPT, ENACT AND GIVE TO \nOURSELVES THIS CONSTITUTION.',
  'section_character_count': 577,
  'section_token_count': 144.25,
  'page_numbers': [32],
  'footnotes': {32: '1. Subs. by the Constitution (Forty-second Amendment) Act, 1976, s.2, for "SOVEREIGN \nDEMOCRATIC REPUBLIC" (w.e.f. 3-1-1977).\n2. Subs. by s. 2, ibid., for "Unity of the Nation" (w.e.f. 3-1-1977).'}},
 {'section': 'Part I.—Union and its territo

## chapter split and consolidation

In [10]:

import re
import uuid

sections_list_split_chapters = []

# Regex pattern to detect chapters - e.g., "CHAPTER I.—GENERAL" or "CHAPTER II.—PARLIAMENT"
chapter_pattern = re.compile(r"^(CHAPTER\s+[IVXLCDM]+\.?—?.*)", re.MULTILINE | re.IGNORECASE)

for section_dict in sections_list:
    section_name = section_dict['section']
    text = section_dict['consolidated_text']

    # Find all chapter matches and their positions
    matches = list(chapter_pattern.finditer(text))

    if not matches:
        # No chapters found - keep the section as is
        sections_list_split_chapters.append({
            'section': section_name,
            'chapter': None,
            'consolidated_text': text,
            "character_count" : len(text),
            "token_count" : len(text)/4,
            'chunk_id': str(uuid.uuid4()),
        })
    else:
        # Chapters found - split text into chapter chunks
        for i, match in enumerate(matches):
            start_pos = match.start()
            end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(text)
            chapter_text = text[start_pos:end_pos].strip()
            chapter_title = match.group(1).strip()

            sections_list_split_chapters.append({
                'section': section_name,
                'chapter': chapter_title,
                'consolidated_text': chapter_text,
                "character_count" : len(chapter_text),
                "token_count" : len(chapter_text)/4,
                'chunk_id': str(uuid.uuid4()),
            })

# Verify number of splits and some samples
len(sections_list_split_chapters), sections_list_split_chapters[:3]


(57,
 [{'section': 'Preamble',
   'chapter': None,
   'consolidated_text': 'PREAMBLE\nWE, THE PEOPLE OF INDIA, having solemnly resolved to constitute \nIndia into a \n1[SOVEREIGN SOCIALIST SECULAR DEMOCRATIC \nREPUBLIC] and to secure to all its citizens:\nJUSTICE, social, economic and political;\n \nLIBERTY of thought, expression, belief, faith and worship;\nEQUALITY of status and of opportunity;\nand to promote among them all\nFRATERNITY assuring the dignity of the individual and the 2[unity \nand integrity of the Nation];\nIN OUR CONSTITUENT ASSEMBLY this twenty-sixth day of \nNovember, 1949, do HEREBY ADOPT, ENACT AND GIVE TO \nOURSELVES THIS CONSTITUTION.',
   'character_count': 577,
   'token_count': 144.25,
   'chunk_id': '8c0ed553-08db-4908-af99-97f131c6d185'},
  {'section': 'Part I.—Union and its territory',
   'chapter': None,
   'consolidated_text': 'PART I\nTHE UNION AND ITS TERRITORY\n1. Name and territory of the Union.—(1) India, that is Bharat, \nshall be a Union of State

In [11]:
sections_list_split_chapters[:5]

[{'section': 'Preamble',
  'chapter': None,
  'consolidated_text': 'PREAMBLE\nWE, THE PEOPLE OF INDIA, having solemnly resolved to constitute \nIndia into a \n1[SOVEREIGN SOCIALIST SECULAR DEMOCRATIC \nREPUBLIC] and to secure to all its citizens:\nJUSTICE, social, economic and political;\n \nLIBERTY of thought, expression, belief, faith and worship;\nEQUALITY of status and of opportunity;\nand to promote among them all\nFRATERNITY assuring the dignity of the individual and the 2[unity \nand integrity of the Nation];\nIN OUR CONSTITUENT ASSEMBLY this twenty-sixth day of \nNovember, 1949, do HEREBY ADOPT, ENACT AND GIVE TO \nOURSELVES THIS CONSTITUTION.',
  'character_count': 577,
  'token_count': 144.25,
  'chunk_id': '8c0ed553-08db-4908-af99-97f131c6d185'},
 {'section': 'Part I.—Union and its territory',
  'chapter': None,
  'consolidated_text': 'PART I\nTHE UNION AND ITS TERRITORY\n1. Name and territory of the Union.—(1) India, that is Bharat, \nshall be a Union of States.\n1[(2) The 

## Article split and consolidation

In [12]:

import re

# Assuming 'sections_list_split_chapters' is your list of dictionaries

# Regex pattern for splits
pattern = re.compile(r'\n\s*(\d+\.|\d*\[\d+\.)')

# New list for article splits
article_split = []

# Global counter starting from 1
counter = 0

# Iterate over each dictionary in the list
for entry in sections_list_split_chapters:
    section = entry.get('section')
    chapter = entry.get('chapter')
    consolidated_text = entry.get('consolidated_text', '')
    
    # Find all split positions
    splits = [(m.start(), m.group()) for m in pattern.finditer(consolidated_text)]
    
    
    # If there are splits
    if splits:
        initial_chunk = ''
        # Check for initial chunk before first split
        first_start = splits[0][0]
        if first_start > 0:
            initial_chunk = consolidated_text[0:first_start].strip()
        
        # Now handle the split chunks
        for i in range(len(splits)):
            counter+= 1
            # Skip 238 if reached
            if counter == 238:
                counter += 1
            start_pos = splits[i][0]
            end_pos = splits[i + 1][0] if i + 1 < len(splits) else len(consolidated_text)
            chunk = consolidated_text[start_pos:end_pos].strip()
            
            # Append initial chunk to the first split's chunk
            if i == 0 and initial_chunk:
                chunk = initial_chunk + '\n' + chunk  # Using newline to preserve structure
            
            if chunk:
                article_split.append({'Article number': str(counter)  ,'text': chunk, 'section': section, 'chapter': chapter,'character count':len(chunk) ,'token count': len(chunk)/4})
                
    else:
        # No splits, add the whole text if not empty
        whole_text = consolidated_text.strip()
        if whole_text:
            article_split.append({'Article number':str(counter),'text': whole_text, 'section': section, 'chapter': chapter,'character count':len(whole_text) ,'token count': len(whole_text)/4})
            



In [13]:
article_split[:5]

[{'Article number': '0',
  'text': 'PREAMBLE\nWE, THE PEOPLE OF INDIA, having solemnly resolved to constitute \nIndia into a \n1[SOVEREIGN SOCIALIST SECULAR DEMOCRATIC \nREPUBLIC] and to secure to all its citizens:\nJUSTICE, social, economic and political;\n \nLIBERTY of thought, expression, belief, faith and worship;\nEQUALITY of status and of opportunity;\nand to promote among them all\nFRATERNITY assuring the dignity of the individual and the 2[unity \nand integrity of the Nation];\nIN OUR CONSTITUENT ASSEMBLY this twenty-sixth day of \nNovember, 1949, do HEREBY ADOPT, ENACT AND GIVE TO \nOURSELVES THIS CONSTITUTION.',
  'section': 'Preamble',
  'chapter': None,
  'character count': 577,
  'token count': 144.25},
 {'Article number': '1',
  'text': 'PART I\nTHE UNION AND ITS TERRITORY\n1. Name and territory of the Union.—(1) India, that is Bharat, \nshall be a Union of States.\n1[(2) The States and the territories thereof shall be as specified in \nthe First Schedule.]\n(3) The terri

## Further Data Exploration 

Excluding some data for simplicity: just taking the articles excluding the schedules and appendices

In [14]:
data = article_split[:399]
data[:2]

[{'Article number': '0',
  'text': 'PREAMBLE\nWE, THE PEOPLE OF INDIA, having solemnly resolved to constitute \nIndia into a \n1[SOVEREIGN SOCIALIST SECULAR DEMOCRATIC \nREPUBLIC] and to secure to all its citizens:\nJUSTICE, social, economic and political;\n \nLIBERTY of thought, expression, belief, faith and worship;\nEQUALITY of status and of opportunity;\nand to promote among them all\nFRATERNITY assuring the dignity of the individual and the 2[unity \nand integrity of the Nation];\nIN OUR CONSTITUENT ASSEMBLY this twenty-sixth day of \nNovember, 1949, do HEREBY ADOPT, ENACT AND GIVE TO \nOURSELVES THIS CONSTITUTION.',
  'section': 'Preamble',
  'chapter': None,
  'character count': 577,
  'token count': 144.25},
 {'Article number': '1',
  'text': 'PART I\nTHE UNION AND ITS TERRITORY\n1. Name and territory of the Union.—(1) India, that is Bharat, \nshall be a Union of States.\n1[(2) The States and the territories thereof shall be as specified in \nthe First Schedule.]\n(3) The terri

In [15]:
articles= {}
for item in data :
    # if item['token count'] > 400 :
        article = list(item.keys())[0] +' ' +item['Article number']
        articles[article] = item['token count']
        
articles

{'Article number 0': 144.25,
 'Article number 1': 102.75,
 'Article number 2': 75.5,
 'Article number 3': 343.25,
 'Article number 4': 187.25,
 'Article number 5': 116.0,
 'Article number 6': 304.5,
 'Article number 7': 186.75,
 'Article number 8': 205.5,
 'Article number 9': 73.0,
 'Article number 10': 65.5,
 'Article number 11': 71.75,
 'Article number 12': 85.75,
 'Article number 13': 290.0,
 'Article number 14': 39.0,
 'Article number 15': 697.25,
 'Article number 16': 680.25,
 'Article number 17': 57.0,
 'Article number 18': 150.0,
 'Article number 19': 763.0,
 'Article number 20': 133.0,
 'Article number 21': 86.5,
 'Article number 22': 720.5,
 'Article number 23': 125.25,
 'Article number 24': 56.75,
 'Article number 25': 276.5,
 'Article number 26': 111.25,
 'Article number 27': 72.0,
 'Article number 28': 232.0,
 'Article number 29': 108.5,
 'Article number 30': 228.25,
 'Article number 31': 1250.25,
 'Article number 32': 265.5,
 'Article number 33': 206.5,
 'Article number 34

In [16]:
filtered_articles ={}
for item in data :
    if item['token count'] > 400 :
        filtered_article = list(item.keys())[0]+' ' +item['Article number']
        filtered_articles[filtered_article] = item['token count']
filtered_articles

{'Article number 15': 697.25,
 'Article number 16': 680.25,
 'Article number 19': 763.0,
 'Article number 22': 720.5,
 'Article number 31': 1250.25,
 'Article number 55': 516.5,
 'Article number 75': 434.25,
 'Article number 81': 455.25,
 'Article number 101': 451.75,
 'Article number 108': 669.25,
 'Article number 110': 507.0,
 'Article number 112': 541.75,
 'Article number 124': 1266.0,
 'Article number 134': 482.0,
 'Article number 145': 783.75,
 'Article number 148': 481.5,
 'Article number 164': 728.0,
 'Article number 170': 552.5,
 'Article number 171': 618.25,
 'Article number 190': 467.75,
 'Article number 197': 410.25,
 'Article number 199': 514.0,
 'Article number 202': 423.0,
 'Article number 213': 737.0,
 'Article number 217': 744.75,
 'Article number 224': 498.25,
 'Article number 226': 620.0,
 'Article number 233': 510.25,
 'Article number 239': 2610.25,
 'Article number 243': 3110.25,
 'Article number 244': 675.25,
 'Article number 246': 468.5,
 'Article number 249': 401

In [17]:
sorted_dict = dict(sorted(filtered_articles.items(), key=lambda x: x[1], reverse=True))
sorted_dict

{'Article number 371': 7119.25,
 'Article number 243': 3110.25,
 'Article number 338': 2888.25,
 'Article number 239': 2610.25,
 'Article number 366': 1978.5,
 'Article number 352': 1367.75,
 'Article number 323': 1342.25,
 'Article number 356': 1295.5,
 'Article number 279': 1275.0,
 'Article number 124': 1266.0,
 'Article number 31': 1250.25,
 'Article number 332': 1086.5,
 'Article number 312': 1085.0,
 'Article number 361': 1075.5,
 'Article number 372': 995.25,
 'Article number 320': 909.5,
 'Article number 145': 783.75,
 'Article number 19': 763.0,
 'Article number 269': 755.0,
 'Article number 275': 753.25,
 'Article number 217': 744.75,
 'Article number 213': 737.0,
 'Article number 359': 735.75,
 'Article number 164': 728.0,
 'Article number 22': 720.5,
 'Article number 370': 698.75,
 'Article number 15': 697.25,
 'Article number 16': 680.25,
 'Article number 244': 675.25,
 'Article number 108': 669.25,
 'Article number 334': 662.0,
 'Article number 360': 649.0,
 'Article numb

### splitting sub articles

In [18]:
import re
import string

# Assuming 'data' is your list of dictionaries (article_split[:399])

# New pattern for sub-article splits
pattern = re.compile(r'\n\s*(\d+[A-Z-]*\.|\d*\[\d+[A-Z-]*\.)')

# New list for sub-article splits
sub_article_split = []

for article in data:
    token_count = article.get('token count', 0)
    text = article.get('text', '')
    section = article.get('section')
    chapter = article.get('chapter')
    article_number = article.get('Article number')
    
    if token_count < 500:
        # Append without splitting, add sub-article None
        sub_article_split.append({
            'Article number': article_number,
            'text': text,
            'section': section,
            'chapter': chapter,
            'sub-article': str(0),
            'token_count' : len(text)/4
        })
    else:
        # Find splits
        splits = [(m.start(), m.group()) for m in pattern.finditer(text)]
        
        if splits:
            # Handle initial chunk
            initial_chunk = ''
            first_start = splits[0][0]
            if first_start > 0:
                initial_chunk = text[0:first_start].strip()
            
            # Uppercase letters for labeling
            sub_labels = list(string.ascii_uppercase)
            sub_counter = 0
            
            for i in range(len(splits)):
                start_pos = splits[i][0]
                end_pos = splits[i + 1][0] if i + 1 < len(splits) else len(text)
                chunk = text[start_pos:end_pos].strip()
                
                if i == 0 and initial_chunk:
                    chunk = initial_chunk + '\n' + chunk
                
                if chunk:
                    sub_label = sub_labels[sub_counter % len(sub_labels)]
                    sub_article_split.append({
                        'Article number': article_number,
                        'text': chunk,
                        'section': section,
                        'chapter': chapter,
                        'sub-article': sub_label,
                        'token_count' : len(chunk)/4
                    })
                    sub_counter += 1
        else:
            # No sub-splits found, treat as non-split
            sub_article_split.append({
                'Article number': article_number,
                'text': text,
                'section': section,
                'chapter': chapter,
                'sub-article': str(0),
                'token_count' : len(text)/4
            })



In [19]:
sub_article_split[:5]


[{'Article number': '0',
  'text': 'PREAMBLE\nWE, THE PEOPLE OF INDIA, having solemnly resolved to constitute \nIndia into a \n1[SOVEREIGN SOCIALIST SECULAR DEMOCRATIC \nREPUBLIC] and to secure to all its citizens:\nJUSTICE, social, economic and political;\n \nLIBERTY of thought, expression, belief, faith and worship;\nEQUALITY of status and of opportunity;\nand to promote among them all\nFRATERNITY assuring the dignity of the individual and the 2[unity \nand integrity of the Nation];\nIN OUR CONSTITUENT ASSEMBLY this twenty-sixth day of \nNovember, 1949, do HEREBY ADOPT, ENACT AND GIVE TO \nOURSELVES THIS CONSTITUTION.',
  'section': 'Preamble',
  'chapter': None,
  'sub-article': '0',
  'token_count': 144.25},
 {'Article number': '1',
  'text': 'PART I\nTHE UNION AND ITS TERRITORY\n1. Name and territory of the Union.—(1) India, that is Bharat, \nshall be a Union of States.\n1[(2) The States and the territories thereof shall be as specified in \nthe First Schedule.]\n(3) The territory

In [20]:
sub_articles= {}
for item in sub_article_split :
    # if item['token count'] > 400 :
    article = 'Article '+item['Article number']
    sub_article = item['sub-article']
    id  = article+'_' + sub_article
    sub_articles[id] = item['token_count']

In [21]:
sorted_SA = dict(sorted(sub_articles.items(), key=lambda x: x[1], reverse=True))
sorted_SA

{'Article 371_A': 2086.25,
 'Article 366_0': 1978.5,
 'Article 371_D': 1959.5,
 'Article 338_A': 1891.25,
 'Article 371_F': 1584.5,
 'Article 239_B': 1500.75,
 'Article 352_A': 1367.75,
 'Article 356_0': 1295.5,
 'Article 279_A': 1274.75,
 'Article 332_A': 1086.5,
 'Article 312_A': 1085.0,
 'Article 338_B': 996.75,
 'Article 372_A': 995.25,
 'Article 320_0': 909.5,
 'Article 323_B': 821.75,
 'Article 31_A': 821.0,
 'Article 145_0': 783.75,
 'Article 19_0': 763.0,
 'Article 269_A': 755.0,
 'Article 275_0': 753.25,
 'Article 217_0': 744.75,
 'Article 213_A': 737.0,
 'Article 359_A': 735.75,
 'Article 164_0': 728.0,
 'Article 22_0': 720.5,
 'Article 124_A': 717.75,
 'Article 370_0': 698.75,
 'Article 15_0': 697.25,
 'Article 16_0': 680.25,
 'Article 108_0': 669.25,
 'Article 334_A': 661.75,
 'Article 360_0': 649.0,
 'Article 324_A': 635.25,
 'Article 226_A': 620.0,
 'Article 171_0': 618.25,
 'Article 316_0': 617.0,
 'Article 363_A': 584.5,
 'Article 243_P': 576.75,
 'Article 374_0': 573.0

In [22]:
x =[]
for article,token in sorted_SA.items() :
    if token > 500 :
        x.append((article,token))
x

[('Article 371_A', 2086.25),
 ('Article 366_0', 1978.5),
 ('Article 371_D', 1959.5),
 ('Article 338_A', 1891.25),
 ('Article 371_F', 1584.5),
 ('Article 239_B', 1500.75),
 ('Article 352_A', 1367.75),
 ('Article 356_0', 1295.5),
 ('Article 279_A', 1274.75),
 ('Article 332_A', 1086.5),
 ('Article 312_A', 1085.0),
 ('Article 338_B', 996.75),
 ('Article 372_A', 995.25),
 ('Article 320_0', 909.5),
 ('Article 323_B', 821.75),
 ('Article 31_A', 821.0),
 ('Article 145_0', 783.75),
 ('Article 19_0', 763.0),
 ('Article 269_A', 755.0),
 ('Article 275_0', 753.25),
 ('Article 217_0', 744.75),
 ('Article 213_A', 737.0),
 ('Article 359_A', 735.75),
 ('Article 164_0', 728.0),
 ('Article 22_0', 720.5),
 ('Article 124_A', 717.75),
 ('Article 370_0', 698.75),
 ('Article 15_0', 697.25),
 ('Article 16_0', 680.25),
 ('Article 108_0', 669.25),
 ('Article 334_A', 661.75),
 ('Article 360_0', 649.0),
 ('Article 324_A', 635.25),
 ('Article 226_A', 620.0),
 ('Article 171_0', 618.25),
 ('Article 316_0', 617.0),
 (

In [23]:
len(x)

52

## Clause level split(optional) :

im gonna use the sub_article_split for embedding . but clause level split helps to reduce the chunks with more than 500 tokens.

In [24]:
import re

# Assuming 'sub_article_split' is your list of dictionaries from previous step

# Pattern for clause splits: digit followed by optional spaces and a dot, capturing the digit
# pattern = re.compile(r'(?:\n)?—?\((\d)\)\s*')
pattern = re.compile(r'(\n\(\d+\)\s*)')


# New list for clause level splits
clause_level_split = []

for entry in sub_article_split:
    token_count = entry.get('token_count')  # Assuming token count is present
    text = entry.get('text', '')
    section = entry.get('section')
    chapter = entry.get('chapter')
    article_number = entry.get('Article number')
    sub_article = entry.get('sub-article')
    
    if token_count > 500:
        # Find all split positions, capturing the digit
        splits = [(m.start(), m.group(1)) for m in pattern.finditer(text)]
        
        if splits:
            # Handle initial chunk before first split
            initial_chunk = ''
            first_start = splits[0][0]
            if first_start > 0:
                initial_chunk = text[0:first_start].strip()
            
            for i in range(len(splits)):
                start_pos = splits[i][0]
                end_pos = splits[i + 1][0] if i + 1 < len(splits) else len(text)
                clause_digit = splits[i][1]  # The captured digit
                
                chunk = text[start_pos:end_pos].strip()
                
                # Append initial chunk to the first clause's chunk
                if i == 0 and initial_chunk:
                    chunk = initial_chunk + '\n' + chunk
                
                if chunk:
                    clause_level_split.append({
                        'Article number': article_number,
                        'text': chunk,
                        'section': section,
                        'chapter': chapter,
                        'sub-article': sub_article,
                        'clause': clause_digit,
                        'token_count' : len(chunk)/4
                    })
        else:
            # No splits found even though token count > 500, add with clause '0'
            clause_level_split.append({
                'Article number': article_number,
                'text': text,
                'section': section,
                'chapter': chapter,
                'sub-article': sub_article,
                'clause': '0',
                'token_count' :token_count
            })
    else:
        # Token count <= 500 (including <400), add unchanged with clause '0'
        clause_level_split.append({
            'Article number': article_number,
            'text': text,
            'section': section,
            'chapter': chapter,
            'sub-article': sub_article,
            'clause': '0',
            'token_count' : token_count
        })




In [25]:
clause_level_split[:5]


[{'Article number': '0',
  'text': 'PREAMBLE\nWE, THE PEOPLE OF INDIA, having solemnly resolved to constitute \nIndia into a \n1[SOVEREIGN SOCIALIST SECULAR DEMOCRATIC \nREPUBLIC] and to secure to all its citizens:\nJUSTICE, social, economic and political;\n \nLIBERTY of thought, expression, belief, faith and worship;\nEQUALITY of status and of opportunity;\nand to promote among them all\nFRATERNITY assuring the dignity of the individual and the 2[unity \nand integrity of the Nation];\nIN OUR CONSTITUENT ASSEMBLY this twenty-sixth day of \nNovember, 1949, do HEREBY ADOPT, ENACT AND GIVE TO \nOURSELVES THIS CONSTITUTION.',
  'section': 'Preamble',
  'chapter': None,
  'sub-article': '0',
  'clause': '0',
  'token_count': 144.25},
 {'Article number': '1',
  'text': 'PART I\nTHE UNION AND ITS TERRITORY\n1. Name and territory of the Union.—(1) India, that is Bharat, \nshall be a Union of States.\n1[(2) The States and the territories thereof shall be as specified in \nthe First Schedule.]\n

In [26]:
clauses= {}
for item in clause_level_split :
    # if item['token count'] > 400 :
    article = 'Article '+item['Article number']
    sub_article = item['sub-article']
    clause = item['clause']
    id  = article+'_' + sub_article + '_' + clause
    clauses[id] = item['token_count']

In [27]:
sorted_clauses = dict(sorted(clauses.items(), key=lambda x: x[1], reverse=True))
sorted_clauses

{'Article 371_F_0': 1584.5,
 'Article 31_A_\n(2) ': 821.0,
 'Article 371_A_\n(2) ': 797.25,
 'Article 275_0_\n(2) ': 753.25,
 'Article 217_0_\n(2) ': 744.5,
 'Article 372_A_\n(3) ': 680.0,
 'Article 359_A_\n(2) ': 668.0,
 'Article 356_0_\n(4) ': 641.0,
 'Article 226_A_\n(2) ': 620.0,
 'Article 363_A_\n(2) ': 584.5,
 'Article 370_0_\n(2) ': 574.25,
 'Article 164_0_\n(2) ': 564.25,
 'Article 320_0_\n(3) ': 561.25,
 'Article 316_0_\n(2) ': 561.0,
 'Article 371_D_\n(2) ': 551.5,
 'Article 213_A_\n(2) ': 551.0,
 'Article 368_A_\n(5) ': 547.0,
 'Article 239_B_\n(2)': 525.75,
 'Article 15_0_\n(3) ': 519.5,
 'Article 224_0_0': 498.25,
 'Article 334_A_\n(2) ': 498.25,
 'Article 348_0_0': 498.25,
 'Article 290_0_0': 495.0,
 'Article 378_0_0': 490.5,
 'Article 317_0_0': 485.25,
 'Article 330_A_0': 484.75,
 'Article 134_0_0': 482.0,
 'Article 148_0_0': 481.5,
 'Article 323_A_\n(2) ': 477.0,
 'Article 246_0_0': 468.5,
 'Article 243_E_0': 468.0,
 'Article 190_0_0': 467.75,
 'Article 311_0_0': 467.25

## Chunking

In [29]:
import tiktoken
from sentence_transformers import SentenceTransformer, util
import spacy



In [30]:
# Initialize models
tokenizer = tiktoken.get_encoding("cl100k_base")  # For token counting
nlp = spacy.load("en_core_web_sm")  # Load spaCy model for sentence segmentation (use 'en_core_web_sm' or larger for better accuracy)

def count_tokens(text):
    return len(tokenizer.encode(text))

In [31]:
# Initialize models
tokenizer = tiktoken.get_encoding("cl100k_base")  # For token counting
nlp = spacy.load("en_core_web_sm")  # Load spaCy model for sentence segmentation

def count_tokens(text):
    return len(tokenizer.encode(text))

def sentence_split(text, max_tokens=500):
    """Split large text into sentence-based chunks <= max_tokens using spaCy, with fallback for very long sentences."""
    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents]  # Use spaCy for sentence segmentation
    if not sentences:
        return [text]
    
    chunks = []
    current_chunk = []
    current_tokens = 0
    
    for sent in sentences:
        sent_tokens = count_tokens(sent)
        
        # Fallback: If a single sentence exceeds max_tokens, split it on '.\n'
        if sent_tokens > max_tokens:
            subclauses = [clause.strip() for clause in sent.split(';') if clause]
            for clause in subclauses:
                clause_tokens = count_tokens(clause)
                if current_tokens + clause_tokens > max_tokens and current_chunk:
                    chunks.append(' '.join(current_chunk))
                    current_chunk = []
                    current_tokens = 0
                current_chunk.append(clause)
                current_tokens += clause_tokens
        else:
            if current_tokens + sent_tokens > max_tokens and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_tokens = 0
            
            current_chunk.append(sent)
            current_tokens += sent_tokens
    
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    
    return chunks

In [32]:
def merge_consecutive_pairs(chunks, max_tokens=500):
    """Merge consecutive pairs of chunks if their combined token count is < max_tokens. Repeat until no more merges possible. Preserves order."""
    if len(chunks) < 2:
        return chunks

    changed = True
    current_list = chunks[:]

    while changed:
        changed = False
        merged = []
        i = 0
        while i < len(current_list):
            if i == len(current_list) - 1:
                # Last element left as is if odd
                merged.append(current_list[i])
                i += 1
            else:
                first = current_list[i]
                second = current_list[i + 1]
                combined_tokens = first['token_count'] + second['token_count']

                if combined_tokens < max_tokens:
                    # Merge
                    merged_text = first['text'] + ' ' + second['text']
                    merged_dict = {
                        'Article number': first['Article number'],
                        'text': merged_text,
                        'token_count': count_tokens(merged_text),  # Recalculate for accuracy
                        'section': first['section'],
                        'chapter': first['chapter'],
                        #'sub-article': f"{first['sub-article']} + {second['sub-article']}"
                    }
                    merged.append(merged_dict)
                    changed = True
                    i += 2
                else:
                    merged.append(first)
                    i += 1
        current_list = merged
    return current_list

In [37]:
sub_article_split[:3]

[{'Article number': '0',
  'text': 'PREAMBLE\nWE, THE PEOPLE OF INDIA, having solemnly resolved to constitute \nIndia into a \n1[SOVEREIGN SOCIALIST SECULAR DEMOCRATIC \nREPUBLIC] and to secure to all its citizens:\nJUSTICE, social, economic and political;\n \nLIBERTY of thought, expression, belief, faith and worship;\nEQUALITY of status and of opportunity;\nand to promote among them all\nFRATERNITY assuring the dignity of the individual and the 2[unity \nand integrity of the Nation];\nIN OUR CONSTITUENT ASSEMBLY this twenty-sixth day of \nNovember, 1949, do HEREBY ADOPT, ENACT AND GIVE TO \nOURSELVES THIS CONSTITUTION.',
  'section': 'Preamble',
  'chapter': None,
  'sub-article': '0',
  'token_count': 144.25},
 {'Article number': '1',
  'text': 'PART I\nTHE UNION AND ITS TERRITORY\n1. Name and territory of the Union.—(1) India, that is Bharat, \nshall be a Union of States.\n1[(2) The States and the territories thereof shall be as specified in \nthe First Schedule.]\n(3) The territory

In [38]:

# Process the data
post_split_groups = {}  # To hold post-split chunks per article

# Group by article and handle splitting
article_groups = {}
for item in sub_article_split:
    art_num = item['Article number']
    if art_num not in article_groups:
        article_groups[art_num] = []
    article_groups[art_num].append(item)

for art_num, group in article_groups.items():
    post_split = []
    for item in group:
        if item['token_count'] > 500:
            splits = sentence_split(item['text'])
            for j, split_text in enumerate(splits):
                post_split.append({
                    'Article number': art_num,
                    'text': split_text,
                    'section': item['section'],
                    'chapter': item['chapter'],
                    'sub-article': f"{item['sub-article']}_split{j}",
                    'token_count': count_tokens(split_text)
                })
        else:
            post_split.append(item)
    post_split_groups[art_num] = post_split



In [39]:
token_counts3= []
for key,value in post_split_groups.items():
    for dict in value :
        token_counts3.append(dict['token_count'])
        if dict['token_count'] > 500 :
            print(f'art : {dict['Article number']} section :{dict['section']},chapter : {dict['chapter']} ,subarticle : {dict['sub-article']}')
    # print(item['token_count'])

print(sorted(token_counts3,reverse=True))

[498.25, 498.25, 495.0, 495, 494, 494, 491, 491, 490.5, 489, 487, 485.25, 485, 485, 484.75, 482, 482.0, 482, 481.5, 481, 479, 478, 475, 475, 472, 472, 471, 470, 470, 469, 468.5, 468, 468.0, 468, 467.75, 467.25, 466, 464.0, 464, 463, 462, 461, 459, 459, 458, 455.25, 454, 451.75, 451, 450, 450, 449, 448.75, 448.75, 448, 447.75, 447, 446, 445, 445, 440, 439, 438, 435.0, 434.25, 434.0, 434, 433.25, 431, 426, 424, 424, 423.0, 423, 421, 420, 418, 414, 410.25, 410, 403.0, 401.75, 400.0, 398.0, 394.75, 394, 393.5, 389, 386, 384, 383.75, 383.25, 379.0, 378.25, 377.75, 377, 375.25, 373.75, 372, 369.5, 369, 369, 368.75, 367.0, 367, 366.75, 366.25, 366.25, 365.75, 365, 363.5, 363.25, 361.25, 360, 359, 358, 357.5, 356, 355, 352.75, 349.25, 349, 348.25, 347.5, 345.75, 343.75, 343.25, 342, 340.75, 340.25, 340, 339.25, 338.75, 338.25, 337.25, 337.25, 336.5, 335.5, 335.0, 334.0, 332.5, 331.75, 328.0, 327.25, 327.0, 326, 324.0, 323.0, 319.75, 315.75, 314.5, 311, 310, 309.25, 306.75, 306.5, 306, 305.25, 

In [40]:
# Now apply merging to each article's post-split list
final_chunks = []
for art_num, group_list in post_split_groups.items():
    if group_list:
        merged = merge_consecutive_pairs(group_list)
        final_chunks.extend(merged)

# Output or save final_chunks (list of dicts)
print(f"Processed {len(final_chunks)} chunks.")

Processed 501 chunks.


In [41]:
token_counts= []
for item in final_chunks:
    token_counts.append(item['token_count'])
    # print(item['token_count'])

print(sorted(token_counts,reverse=True))

[498.25, 498.25, 495.0, 495, 494, 494, 491, 491, 490.5, 489, 487, 485.25, 485, 485, 484.75, 482, 482.0, 482, 482, 481.5, 481, 479, 478, 475, 475, 475, 475, 472, 472, 471, 470, 470, 469, 468.5, 468, 468.0, 468, 467.75, 467.25, 466, 464.0, 464, 463, 462, 461, 459, 459, 458, 455.25, 454, 451.75, 451, 450, 450, 449, 448.75, 448, 447.75, 447, 446, 446, 445, 445, 440, 439, 438, 435, 435.0, 434.25, 434.0, 434, 433.25, 433, 431, 426, 424, 424, 423.0, 423, 421, 420, 418, 414, 410.25, 410, 404, 403.0, 401.75, 400.0, 394.75, 394, 393.5, 392, 389, 386, 386, 386, 384, 383.75, 383.25, 381, 379.0, 378.25, 377.75, 377, 375.25, 373.75, 372, 369.5, 369, 368.75, 367.0, 367, 366.75, 366.25, 366.25, 365.75, 365, 363.25, 361.25, 360, 360, 359, 358, 357.5, 356, 356, 355, 352.75, 349, 349, 348.25, 347.5, 343.75, 343.25, 342, 340.75, 340.25, 340, 339.25, 338.75, 338.25, 337.25, 337.25, 336.5, 335.5, 335, 335.0, 334.0, 332.5, 331.75, 328.0, 328, 327.25, 327.0, 326, 324.0, 319.75, 315.75, 314.5, 312, 311, 310, 3

## Merge based on chapters

this is beacuse some articles are too small and can be merged using the same logic we used to merge the sub articles. Its a good thing that we dont need to split anymore bcoz all greater than 500 token chunks have been split into smaller chunks.


In [42]:
def merge_consecutive_pairs2(chunks, max_tokens=500):
    """Merge consecutive pairs of chunks if their combined token count is < max_tokens. Repeat until no more merges possible. Preserves order."""
    if len(chunks) < 2:
        return chunks

    changed = True
    current_list = chunks[:]

    while changed:
        changed = False
        merged = []
        i = 0
        while i < len(current_list):
            if i == len(current_list) - 1:
                # Last element left as is if odd
                merged.append(current_list[i])
                i += 1
            else:
                first = current_list[i]
                second = current_list[i + 1]
                combined_tokens = first['token_count'] + second['token_count']

                if combined_tokens < max_tokens:
                    # Merge
                    merged_text = first['text'] + ' ' + second['text']
                    merged_dict = {
                        #'Article number': first['Article number'],
                        #'chapter': first['chapter'],
                        'section': first['section'],
                        'text': merged_text,
                        'token_count': count_tokens(merged_text) # Recalculate for accuracy
                        #'sub-article': f"{first['sub-article']} + {second['sub-article']}"
                    }
                    merged.append(merged_dict)
                    changed = True
                    i += 2
                else:
                    merged.append(first)
                    i += 1
        current_list = merged
    return current_list

In [43]:
#only big changes are here

# Group by article and handle splitting
chapter_groups = {}
for item in final_chunks:
    chapter_num = item['chapter']
    if chapter_num not in chapter_groups:
        chapter_groups[chapter_num] = []
    chapter_groups[chapter_num].append(item)


# Now apply merging to each article's post-split list
final_chunks2 = []
for chapter_num, group_list in chapter_groups.items():
    if group_list:
        merged = merge_consecutive_pairs2(group_list)
        final_chunks2.extend(merged)

# Output or save final_chunks (list of dicts)
print(f"Processed {len(final_chunks2)} chunks.")

Processed 328 chunks.


In [44]:
token_counts2= []
for item in final_chunks2:
    token_counts2.append(item['token_count'])
    # print(item['token_count'])

print(sorted(token_counts2,reverse=True))

[498.25, 498.25, 496, 495.0, 495, 494, 494, 491, 491, 490.5, 489, 487, 485.25, 485, 485, 484.75, 482, 482, 482, 482.0, 481.5, 481, 481, 479, 478, 477, 477, 475, 475, 475, 475, 472, 472, 472, 471, 470, 470, 469, 469, 468.5, 468, 468, 468.0, 468, 467.75, 467.25, 466, 464.0, 464, 463, 462, 461, 459, 459, 458, 458, 456, 455.25, 454, 453, 453, 451.75, 451, 451, 450, 450, 449, 449, 448.75, 448, 447.75, 447, 446, 446, 445, 445, 440, 440, 439, 439, 438, 437, 436, 435, 435.0, 434.25, 434.0, 433.25, 433, 431, 428, 426, 424, 424, 423, 423.0, 423, 421, 420, 420, 419, 418, 418, 417, 417, 414, 412, 412, 410.25, 410, 407, 406, 406, 406, 405, 404, 404, 403, 403.0, 402, 401.75, 400.0, 396, 394.75, 394, 393.5, 392, 392, 390, 390, 389, 387, 386, 386, 386, 384, 383, 382, 381, 381, 381, 379, 379, 379.0, 378.25, 378, 377.75, 377, 375.25, 375, 375, 373, 372, 372, 372, 369.5, 369, 367, 367.0, 367, 366.75, 366.25, 366, 365.75, 365, 363, 361.25, 361, 361, 360, 360, 359, 359, 359, 358, 358, 357, 356, 356, 355, 3

In [45]:
print(token_counts2==token_counts)

False


In [46]:
chaps = set()
for item in final_chunks2:
    chaps.add(item['section'])

In [47]:
chaps,len(chaps)

({'PART XXII SHORT TITLE, COMMENCEMENT, AUTHORITATIVE TEXT IN HINDI AND REPEALS',
  'Part I.—Union and its territory',
  'Part II.—Citizenship',
  'Part III.—Fundamental Rights',
  'Part IV.— Directive Principles of State Policy',
  'Part IX.—The Panchayats',
  'Part IXA.—The Municipalities',
  'Part IXB.—Co-operative Societies',
  'Part V.—The Union',
  'Part VI.—The States',
  'Part VIII.—The Union Territories',
  'Part X.—The Scheduled and Tribal Areas',
  'Part XI.—Relations between the Union and the States',
  'Part XII.—Finance, Property, Contracts and Suits',
  'Part XIII.—Trade, Commerce and Intercourse within the Territory of India',
  'Part XIV.—Services under the Union and the States',
  'Part XIVA.—Tribunals',
  'Part XIX.—MISCELLANEOUS',
  'Part XV.—Elections',
  'Part XVI.—Special Provisions Relating to Certain Classes',
  'Part XVIII.—EMERGENCY PROVISIONS',
  'Part XVII—LANGUAGE',
  'Part XX.—Amendment of the Constitution',
  'Part XXI.—Temporary, Transitional and Specia

## Final data

In [48]:
final_data = final_chunks2

In [49]:
#removing redundant metadata
keys_to_remove = ['chapter', 'Article number', 'sub-article']

for d in final_data:   # your_list = list of dicts
    for key in keys_to_remove:
        d.pop(key, None)   # safely remove key if it exists

In [50]:
final_data[:2]

[{'section': 'Preamble',
  'text': 'PREAMBLE\nWE, THE PEOPLE OF INDIA, having solemnly resolved to constitute \nIndia into a \n1[SOVEREIGN SOCIALIST SECULAR DEMOCRATIC \nREPUBLIC] and to secure to all its citizens:\nJUSTICE, social, economic and political;\n \nLIBERTY of thought, expression, belief, faith and worship;\nEQUALITY of status and of opportunity;\nand to promote among them all\nFRATERNITY assuring the dignity of the individual and the 2[unity \nand integrity of the Nation];\nIN OUR CONSTITUENT ASSEMBLY this twenty-sixth day of \nNovember, 1949, do HEREBY ADOPT, ENACT AND GIVE TO \nOURSELVES THIS CONSTITUTION. PART I\nTHE UNION AND ITS TERRITORY\n1. Name and territory of the Union.—(1) India, that is Bharat, \nshall be a Union of States.\n1[(2) The States and the territories thereof shall be as specified in \nthe First Schedule.]\n(3) The territory of India shall comprise—\n(a) the territories of the States; \n2[(b) the Union territories specified in the First Schedule; \nand

# Embedding

In [51]:
from sentence_transformers import SentenceTransformer

#i want the model to take 512 i/p tokens
model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")
print(model.max_seq_length)  # should print 512


512


In [52]:
#test
single_sentence = "This is all about the Constitution of India"
single_embedding = model.encode(single_sentence)
print(f"Sentence: {single_sentence}")
print(f"Embedding:\n{single_embedding}")
print(f"Embedding size: {single_embedding.shape}")

Sentence: This is all about the Constitution of India
Embedding:
[-2.29483880e-02 -2.45491207e-01 -3.11872005e-01  2.16670521e-02
 -1.54332280e-01 -1.99742481e-01 -6.04187138e-02  1.80678025e-01
  2.79928632e-02  2.46636719e-01  1.54098526e-01 -3.19197588e-02
 -1.36623830e-01 -9.63764936e-02 -9.26896930e-02 -1.76162988e-01
  2.06279047e-02  3.02298874e-01 -5.07821031e-02  2.18498915e-01
 -1.75143689e-01 -8.95166695e-02 -3.94101560e-01 -4.48017865e-02
 -9.99092385e-02  1.76205635e-01  4.89419028e-02  3.98151278e-02
 -5.03560483e-01 -3.29972617e-02 -1.54742345e-01  9.80408862e-02
 -2.03792639e-02  1.48961574e-01 -8.70389849e-05 -3.25626820e-01
  2.79523253e-01 -3.00019747e-04 -5.36120981e-02  4.33210611e-01
 -2.36822009e-01  2.06530616e-01 -1.66841477e-01 -1.54242843e-01
 -2.46435598e-01  3.18585694e-01  9.69069600e-02  3.91059779e-02
 -4.47299443e-02  4.38098252e-01  4.29562360e-01 -9.93021727e-02
  1.63305998e-01 -3.36457342e-01 -3.23792584e-02  1.70630783e-01
 -1.15379952e-02 -1.10062

In [53]:
import tqdm

In [54]:
%%time
import torch
# Send the model to the GPU
model.to("cuda") 

# Create embeddings one by one on the GPU - by looping
for item in final_data:
    item["embedding"] = model.encode(item["text"])

CPU times: total: 1min 30s
Wall time: 11.3 s


In [55]:
final_data[:5]

[{'section': 'Preamble',
  'text': 'PREAMBLE\nWE, THE PEOPLE OF INDIA, having solemnly resolved to constitute \nIndia into a \n1[SOVEREIGN SOCIALIST SECULAR DEMOCRATIC \nREPUBLIC] and to secure to all its citizens:\nJUSTICE, social, economic and political;\n \nLIBERTY of thought, expression, belief, faith and worship;\nEQUALITY of status and of opportunity;\nand to promote among them all\nFRATERNITY assuring the dignity of the individual and the 2[unity \nand integrity of the Nation];\nIN OUR CONSTITUENT ASSEMBLY this twenty-sixth day of \nNovember, 1949, do HEREBY ADOPT, ENACT AND GIVE TO \nOURSELVES THIS CONSTITUTION. PART I\nTHE UNION AND ITS TERRITORY\n1. Name and territory of the Union.—(1) India, that is Bharat, \nshall be a Union of States.\n1[(2) The States and the territories thereof shall be as specified in \nthe First Schedule.]\n(3) The territory of India shall comprise—\n(a) the territories of the States; \n2[(b) the Union territories specified in the First Schedule; \nand

In [56]:
%%time

#faster embedding while done as batches
# Turn text chunks into a single list => this is a list of all sentence chunks
text_chunks = [item["text"] for item in final_chunks2]



# Embed all texts in batches
text_chunk_embeddings = model.encode(text_chunks,
                                               batch_size=32, # you can use different batch sizes here for speed/performance, I found 32 works well for this use case
                                               convert_to_tensor=True) # optional to return embeddings as tensor instead of array

text_chunk_embeddings

CPU times: total: 25.3 s
Wall time: 9.76 s


tensor([[-0.0840, -0.1364, -0.0833,  ..., -0.4912, -0.1376, -0.0175],
        [ 0.1100, -0.4856, -0.0691,  ..., -0.3096, -0.1628, -0.0545],
        [ 0.1024,  0.0083, -0.1829,  ..., -0.2138, -0.7527, -0.0408],
        ...,
        [ 0.3618, -0.0226, -0.1912,  ..., -0.2770, -0.6494, -0.0676],
        [ 0.0970,  0.0466, -0.1205,  ..., -0.0234, -0.4428, -0.0338],
        [ 0.1455, -0.2216, -0.1993,  ..., -0.0603, -0.3044, -0.2186]],
       device='cuda:0')

## Saving Embeddings to a file

In [57]:
import pandas as pd
# Save embeddings to file
embeddings_df = pd.DataFrame(final_data)
save_path = "constitution_embeddings.csv"
embeddings_df.to_csv(save_path, index=False)

In [58]:
embeddings_df[:3]

Unnamed: 0,section,text,token_count,embedding
0,Preamble,"PREAMBLE\nWE, THE PEOPLE OF INDIA, having sole...",262.0,"[-0.08396467, -0.13635199, -0.08326967, 0.1866..."
1,Part I.—Union and its territory,2. Admission or establishment of new States.—P...,402.0,"[0.10995457, -0.4855723, -0.06913098, 0.099049..."
2,Part I.—Union and its territory,4. Laws made under articles 2 and 3 to provide...,270.0,"[0.10243828, 0.008272459, -0.18292052, 0.14246..."


In [59]:
# Import saved file and view
constitution = pd.read_csv(save_path)
constitution.head()

Unnamed: 0,section,text,token_count,embedding
0,Preamble,"PREAMBLE\nWE, THE PEOPLE OF INDIA, having sole...",262.0,[-8.39646682e-02 -1.36351988e-01 -8.32696706e-...
1,Part I.—Union and its territory,2. Admission or establishment of new States.—P...,402.0,[ 1.09954573e-01 -4.85572308e-01 -6.91309795e-...
2,Part I.—Union and its territory,4. Laws made under articles 2 and 3 to provide...,270.0,[ 1.02438278e-01 8.27245880e-03 -1.82920516e-...
3,Part II.—Citizenship,6. Rights of citizenship of certain persons wh...,440.0,[-7.83850774e-02 4.36395667e-02 -1.12444967e-...
4,Part II.—Citizenship,8. Rights of citizenship of certain persons of...,353.0,[ 5.27101196e-03 7.31148869e-02 -1.04214944e-...


### The end of data preprocessing !