In [1]:
import re
import pandas as pd
from openpyxl import Workbook

In [2]:
# Load the whole input text file into memory as a single string
with open('65027 XII.txt', mode='r') as file:
    text = file.read()

In [3]:
# Regular expressions for the file-level fields: the date, the school code
# followed by the school name, and the region.
date_pattern = r"DATE:- (\d{2}/\d{2}/\d{4})"
school_pattern = r"SCHOOL : - (\d+) (.+)"
region_pattern = r"REGION: (.+)"

# Header row of the subject table: "SUB CD" six times, separated by whitespace.
subject_header_pattern = r"\s+".join(["SUB CD"] * 6)


In [4]:
# Search the text once for each file-level field (first occurrence only)
date_match, school_match, region_match = (
    re.search(pattern, text)
    for pattern in (date_pattern, school_pattern, region_pattern)
)

# Fall back to an empty string for any field that was not found.
# The school pattern has two groups: the numeric code, then the name.
date = "" if date_match is None else date_match.group(1)
if school_match is None:
    school_code, school_name = "", ""
else:
    school_code, school_name = school_match.groups()
region = "" if region_match is None else region_match.group(1)


In [5]:
# Locate the subject table: its data begins right after the first header row.
# subject_table_start stays None when the header row is not present.
subject_header_match = re.search(subject_header_pattern, text)
if subject_header_match:
    subject_table_start = subject_header_match.end()
else:
    subject_table_start = None


In [12]:
# Extract one row per candidate from the text that follows the subject header
# and write it, together with the file-level fields (date, school, region),
# to 'output_file.xlsx'.
# subject_table_start is None when the header was not found, so test for None
# explicitly instead of relying on the truthiness of a character offset.
if subject_table_start is not None:
    subject_data = text[subject_table_start:].strip()

    # Pattern for a single candidate record. It captures 11 groups:
    #   0     roll number (digits)
    #   1     gender (one word character)
    #   2     candidate name (lazy match preceding the subject codes)
    #   3-8   six 3-digit subject codes
    #   9     three single word characters separated by single spaces
    #   10    one trailing word
    # NOTE(review): group 9 only accepts one-character tokens; confirm this
    # matches the grade notation actually used in the input file.
    candidate_pattern = r"(\d+)\s+(\w)\s+(\w.+?)\s+(\d{3})\s+(\d{3})\s+(\d{3})\s+(\d{3})\s+(\d{3})\s+(\d{3})\s+(\w \w \w)\s+(\w+)\s*"

    # re.findall returns one 11-tuple of strings per matched candidate
    matches = re.findall(candidate_pattern, subject_data)

    # The sheet layout below declares six grade columns
    grade_column_count = 6

    data = []
    for match in matches:
        roll_number = match[0]
        gender = match[1]
        name = match[2].strip()
        subjects_list = match[3:9]

        # The grades arrive as ONE space-separated group, so split it and pad
        # with blanks to fill the six declared grade columns.
        grades_list = match[9].split()
        grades_list += [""] * (grade_column_count - len(grades_list))

        # The last captured group is the word that follows the grades.
        result = match[10]

        # The pattern has no group for the 'Comp Subject' column; leave it
        # blank rather than indexing past the end of the match tuple.
        comp_subject = ""

        row_data = [roll_number, gender, name, *subjects_list, *grades_list, result, comp_subject]
        data.append(row_data)

    # Create a pandas DataFrame (17 columns; zero rows if nothing matched)
    df = pd.DataFrame(data, columns=['Roll No.', 'Gender', 'Student Name',
                                     'Subject 1', 'Subject 2', 'Subject 3', 'Subject 4', 'Subject 5', 'Subject 6',
                                     'Grade 1', 'Grade 2', 'Grade 3', 'Grade 4', 'Grade 5', 'Grade 6', 'Result', 'Comp Subject'])

    # Create a new Excel workbook and write to its default sheet
    workbook = Workbook()
    sheet = workbook.active

    # Rows 1-2: file-level information (Date, School Code, School Name, Region)
    sheet['A1'] = 'Date'
    sheet['B1'] = 'School Code'
    sheet['C1'] = 'School Name'
    sheet['D1'] = 'Region'

    sheet['A2'] = date
    sheet['B2'] = school_code
    sheet['C2'] = school_name
    sheet['D2'] = region

    # Row 4: column headers of the candidate table
    for c_idx, column in enumerate(df.columns, start=1):
        sheet.cell(row=4, column=c_idx, value=column)

    # Rows 6 onward: one row per candidate (row 5 is left empty)
    for r_idx, row in enumerate(df.values, start=6):
        for c_idx, value in enumerate(row, start=1):
            sheet.cell(row=r_idx, column=c_idx, value=value)

    # Save the Excel workbook
    workbook.save('output_file.xlsx')

    print("Excel file 'output_file.xlsx' has been created.")
else:
    print("Subject data table not found in the text file.")

Excel file 'output_file.xlsx' has been created.
