In [None]:
import boto3
import re
import pandas as pd
import logging
import time

# Module-level state shared by process_text_file() and the driver loop below.

# Column-oriented results (column name -> list of values); rebuilt by the
# driver just before the DataFrame is created.
final_result = {}
anomaly_detected = False  # Flag for detecting anomalies
anomaly_pages = []  # List to track pages with anomalies
global_counter = 1  # Running serial number assigned to each voter ID found
# Section names keyed first by assembly constituency, then by section number
assembly_data = {}

# Textract client.
# Access keys are deliberately NOT passed here: secrets written into a
# notebook end up in version control and shared copies. With the key
# arguments omitted, boto3 resolves credentials through its default provider
# chain (e.g. AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment
# variables, the shared credentials file, or an attached IAM role).
client = boto3.client('textract', region_name='ap-south-1')

# Header values remembered across pages for the file being processed
assembly_constituency_no_and_name = ""
part_no = ""
section_type = ""
section_value = ""

# --- Header patterns: matched against the raw OCR line (colons still present) ---
_AC_HEADER_RE = re.compile(r'Assembly Constituency No and Name\s*:?\s*([\dA-Za-z\s-]+)')
# The dot after "No" is literal and optional. An unescaped "." here would
# swallow one arbitrary character, e.g. the first digit of "Part No12".
_PART_NO_RE = re.compile(r'Part No\.?\s*:?\s*(\d+)')
# The separator between section number and name may be a hyphen, en dash or
# em dash (OCR output is presumably not consistent about which one it emits).
_SECTION_HEADER_RE = re.compile(
    r'Section\s+No\s+and\s+Name\s*[:.]*\s*(\d+)\s*[-\u2013\u2014]\s*(.*)'
)

# --- Voter-field patterns: matched against the cleaned line (colons removed) ---
_VOTER_ID_RE = re.compile(r'^([A-Z]{3}\d{7})')
_NAME_RE = re.compile(r'^Name\s*[: ]*(.*)')
_AGE_GENDER_RE = re.compile(r'^Age\s*(\d+)\s*Gender\s*(\w+)')
_HOUSE_NUMBER_RE = re.compile(r'^House Number\s*(.*)')
_FIRST_NUMBER_RE = re.compile(r'\d+')
# (pattern, value stored under "relation type"), checked in this order
_RELATION_PATTERNS = (
    (re.compile(r"^Others\s*(.*)"), "Other"),
    (re.compile(r"^Father's Name\s*(.*)"), "Father"),
    (re.compile(r"^Mother's Name\s*(.*)"), "Mother"),
    (re.compile(r"^Husband's Name\s*(.*)"), "Husband"),
)

_DEFAULT_POLLING_STATION = "126-Sanpada"
_DEFAULT_BOOTH_LOCATION = "Jerupia English Medium School, Ground Floor, Room No 3, Sector 4, Sanpada, Navi Mumbai, 400705 "


def process_text_file(data, page_number, *,
                      polling_station=_DEFAULT_POLLING_STATION,
                      booth_location=_DEFAULT_BOOTH_LOCATION):
    """Extract voter fields from the OCR text of one page.

    The text is scanned twice. The first pass reads the page headers
    (assembly constituency, part number, section number/name) and stores them
    in module-level state. The second pass looks at every line and emits one
    small dict for each line that matches a voter field.

    Args:
        data: Text of the page, one OCR line per text line.
        page_number: Page being processed. Currently unused by the function;
            kept so existing callers do not need to change.
        polling_station: Value written to the 'polling station' key of every
            voter-ID entry.
        booth_location: Value written to the 'booth location' key of every
            voter-ID entry.

    Returns:
        A list of dicts in line order. Each dict holds the keys produced by a
        single line: a voter-ID line yields 'No', 'polling station', 'ID',
        'Section Number', 'Section Name', 'Assembly Constituency No and Name'
        and 'booth location'; other lines yield 'Name', 'relation name' +
        'relation type', 'Age' + 'Gender', or 'House Number'. Fields of one
        voter are therefore spread over several dicts.

    Side effects:
        Updates the globals ``assembly_constituency_no_and_name``, ``part_no``
        and ``assembly_data`` from header lines, and increments
        ``global_counter`` once per voter ID found.
    """
    global assembly_constituency_no_and_name, part_no, global_counter, assembly_data

    lines = data.splitlines()

    # Section header found on this page, if any (the last one wins)
    section_number = None
    section_name = None

    # Pass 1: page headers
    for line in lines:
        ac_match = _AC_HEADER_RE.match(line)
        if ac_match:
            assembly_constituency_no_and_name = ac_match.group(1).strip()

        part_match = _PART_NO_RE.match(line)
        if part_match:
            part_no = part_match.group(1)

        section_match = _SECTION_HEADER_RE.match(line)
        if section_match:
            section_number = section_match.group(1).strip()
            section_name = section_match.group(2).strip()
            # Remember the name so later pages can look it up by number
            sections = assembly_data.setdefault(assembly_constituency_no_and_name, {})
            sections[section_number] = section_name

    # Pass 2: voter fields
    results = []
    previous_line = ""  # cleaned text of the line above the current one
    for raw_line in lines:
        # Drop the "Photo"/"Available" placeholder words and all colons so the
        # field patterns only have to deal with "<label> <value>".
        line = raw_line.replace("Available", "").replace("Photo", "").replace(":", "").strip()

        current_entry = {}

        id_match = _VOTER_ID_RE.match(line)
        if id_match:
            fallback_number = ""
            fallback_name = ""
            if not section_number:
                # No section header on this page: take the first number on the
                # previous line as the section number and look its name up in
                # what earlier pages recorded.
                prev_number_match = _FIRST_NUMBER_RE.search(previous_line)
                if prev_number_match:
                    fallback_number = prev_number_match.group(0)
                    fallback_name = assembly_data.get(
                        assembly_constituency_no_and_name, {}
                    ).get(fallback_number, "Unknown Section Name")

            # Key order is kept stable because it determines column order
            # when the caller builds its table.
            current_entry['No'] = global_counter
            global_counter += 1
            current_entry['polling station'] = polling_station
            current_entry['ID'] = id_match.group(1)
            current_entry['Section Number'] = section_number or fallback_number or "Unknown Section No"
            current_entry['Section Name'] = section_name or fallback_name or "Unknown Section Name"
            current_entry['Assembly Constituency No and Name'] = assembly_constituency_no_and_name
            current_entry['booth location'] = booth_location

        name_match = _NAME_RE.match(line)
        if name_match:
            current_entry['Name'] = name_match.group(1).strip()

        for pattern, relation_type in _RELATION_PATTERNS:
            relation_match = pattern.match(line)
            if relation_match:
                current_entry["relation name"] = relation_match.group(1).strip()
                current_entry["relation type"] = relation_type

        age_gender_match = _AGE_GENDER_RE.match(line)
        if age_gender_match:
            current_entry['Age'] = age_gender_match.group(1).strip()
            current_entry['Gender'] = age_gender_match.group(2).strip()

        house_number_match = _HOUSE_NUMBER_RE.match(line)
        if house_number_match:
            current_entry['House Number'] = house_number_match.group(1).strip()

        # Lines that matched nothing contribute no entry
        if current_entry:
            results.append(current_entry)

        previous_line = line

    return results

# Driver: OCR each page image with Textract, parse it, and write one CSV.

# Per-line entries from every page, in processing order
all_results = []
# Pages that produced no data (error, or still throttled after every attempt)
failed_pages = []

# Retry policy for Textract throttling
MAX_ATTEMPTS = 3
RETRY_DELAY_SECONDS = 5

OUTPUT_CSV = '126-Sanpada.csv'

# Loop through page numbers (adjust range as needed)
for page_number in range(3, 49):
    image_name = f'page_{page_number}.png'
    print(f"Processing {image_name}...")

    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            # Textract API call to detect document text
            response = client.detect_document_text(
                Document={
                    'S3Object': {
                        'Bucket': 'callince',
                        'Name': image_name
                    }
                }
            )

            # Keep only LINE blocks, one OCR line per text line
            data = "\n".join(
                block["Text"] for block in response["Blocks"] if block["BlockType"] == "LINE"
            )

            all_results.extend(process_text_file(data, page_number))
            break  # page done, no retry needed

        except client.exceptions.ThrottlingException:
            # Throttling is transient: wait and try the same page again
            # instead of dropping it.
            if attempt == MAX_ATTEMPTS:
                logging.error(
                    "Giving up on %s: still throttled after %d attempts",
                    image_name, MAX_ATTEMPTS,
                )
                failed_pages.append(page_number)
            else:
                print(f"Rate limit hit while processing page {page_number}. Retrying...")
                time.sleep(RETRY_DELAY_SECONDS)

        except Exception as e:
            # Any other failure is not retried; record it and move on so one
            # bad page does not abort the whole run.
            logging.error(f"Error processing {image_name}: {e}")
            failed_pages.append(page_number)
            break

# Pivot the per-line entries into columns: the i-th value seen for a key
# becomes row i of that column.
final_result = {}
for entry in all_results:
    for key, value in entry.items():
        final_result.setdefault(key, []).append(value)

# Ensure uniform lengths for DataFrame creation. default=0 keeps this from
# raising when no page produced any data.
max_length = max((len(values) for values in final_result.values()), default=0)
short_columns = {
    key: len(values) for key, values in final_result.items() if len(values) != max_length
}
if short_columns:
    # Padding goes at the end of a column, so a field that was missing for a
    # voter in the middle shifts every later value in that column up a row.
    logging.warning(
        "Column lengths differ (longest is %d): %s. Short columns are padded "
        "with None; their values may not line up with the correct rows.",
        max_length, short_columns,
    )
for values in final_result.values():
    values.extend([None] * (max_length - len(values)))

# Create DataFrame and save to CSV
try:
    df = pd.DataFrame.from_dict(final_result)
    df.to_csv(OUTPUT_CSV, index=False)
    print(f"Data successfully saved to '{OUTPUT_CSV}'.")
except Exception as e:
    print(f"Error while converting to DataFrame or saving to CSV: {e}")

# Report pages that contributed nothing to the CSV
if failed_pages:
    print(f"Pages that could not be processed: {failed_pages}")

# Output pages with anomalies if any
if anomaly_pages:
    print(f"Pages with anomalies: {anomaly_pages}")
else:
    print("No anomalies detected.")

Processing page_3.png...
Processing page_4.png...
Processing page_5.png...
Processing page_6.png...
Processing page_7.png...
Processing page_8.png...
Processing page_9.png...
Processing page_10.png...
Processing page_11.png...
Processing page_12.png...
Processing page_13.png...
Processing page_14.png...
Processing page_15.png...
Processing page_16.png...
Processing page_17.png...
Processing page_18.png...
Processing page_19.png...
Processing page_20.png...
Processing page_21.png...
Processing page_22.png...
Processing page_23.png...
Processing page_24.png...
Processing page_25.png...
Processing page_26.png...
Processing page_27.png...
Processing page_28.png...
Processing page_29.png...
Processing page_30.png...
Processing page_31.png...
Processing page_32.png...
Processing page_33.png...
Processing page_34.png...
Processing page_35.png...
Processing page_36.png...
Processing page_37.png...
Processing page_38.png...
Processing page_39.png...
Processing page_40.png...
Processing page_41.