In [2]:
import re
import pandas as pd
import numpy as np

input_string = '''DATE:- 12/05/2023    C.B.S.E. - SENIOR SCHOOL CERTIFICATE EXAMINATION (MAIN)-2023      REGION: PATNA         PAGE:-       1
                                                ***** (SCHOOL / ROLL NO WISE GAZETTE) *****


------------------------------------------------------------------------------------------------------------------------------------------------------
ROLL    F S NAME OF CANDIDATE                                   --------------------SUBJECTS------------------  INT-SB-GRD   RES                COMP
NO      L X                                                     SUB CD  SUB CD  SUB CD  SUB CD  SUB CD  SUB CD  GR1 GR2 GR3                     SUB
                                                                MKS GR  MKS GR  MKS GR  MKS GR  MKS GR  MKS GR 
------------------------------------------------------------------------------------------------------------------------------------------------------


SCHOOL : - 65027   DAV PUBLIC SCHOOL CANTONMENT AREA GAYA BIHAR    


22614342   M ABHIMANYU KUMAR                                     301     302     041     042     043     048      B1 A2 B2    PASS                      
                                                                 076  B2 080  B1 051  D1 063  C1 052  D2 083  B1


22614343   F AKANKSHA PRIYA                                      301     302     041     042     043     048      B1 B1 B1    PASS                      
                                                                 086  A2 076  B2 075  B1 079  A2 074  B1 094  A1
22614344   F AKANKSHI KUMARI                                     301     322     041     042     043     048      A2 A2 A2    PASS                      
                                                                 084  B1 098  A1 095  A1 095  A1 095  A1 089  A2
                                                                 
DATE:- 12/05/2023    C.B.S.E. - SENIOR SCHOOL CERTIFICATE EXAMINATION (MAIN)-2023      REGION: PATNA         PAGE:-       2
                                                ***** (SCHOOL / ROLL NO WISE GAZETTE) *****


------------------------------------------------------------------------------------------------------------------------------------------------------
ROLL    F S NAME OF CANDIDATE                                   --------------------SUBJECTS------------------  INT-SB-GRD   RES                COMP
NO      L X                                                     SUB CD  SUB CD  SUB CD  SUB CD  SUB CD  SUB CD  GR1 GR2 GR3                     SUB
                                                                MKS GR  MKS GR  MKS GR  MKS GR  MKS GR  MKS GR 
------------------------------------------------------------------------------------------------------------------------------------------------------


SCHOOL : - 65027   DAV PUBLIC SCHOOL CANTONMENT AREA GAYA BIHAR    


22614365   F KHUSHI                                              301     048     041     042     043     065      A2 A2 A2    PASS                      
                                                                 084  B1 084  B1 079  A2 082  A2 082  A2 088  B2'''

# ---------------------------------------------------------------------------
# Regular expressions used by the gazette parser
# ---------------------------------------------------------------------------

# Page boilerplate: from a "DATE:" banner, through the dashed rulers, down to
# the end of the following "SCHOOL" line (applied with re.DOTALL below).
unwanted_pattern = r'DATE:.*?\n.*?-----.*?\n\nSCHOOL.*?\n'

# Candidate row: digits (roll), one word character (gender), then a run of
# upper-case letters and spaces (name, including its trailing padding).
roll_gender_name_pattern = r"(\d+)\s+(\w)\s+([A-Z ]+)"

# Six three-digit subject codes separated by whitespace.
subject_codes_pattern = r"(\d{3}\s+){5}\d{3}"

# Result keyword appearing as a whole word.
result_pattern = r"\b(PASS|FAIL|COMP)\b"

# A single "marks grade" pair such as "076  B2".
marks_grades_pattern = r"\d{3}\s+[A-D][1-9]"

# ---------------------------------------------------------------------------
# Parser state
# ---------------------------------------------------------------------------

# One dictionary per output row; filled by add_student_data.
students_data = []

# Most recently seen candidate row and marks row.
current_student_info = None
current_student_grades = None

# Copy of the gazette text with the page boilerplate stripped out.
# NOTE(review): nothing in this cell reads input_string_cleaned afterwards;
# the lines parsed below come from the raw input_string.
input_string_cleaned = re.compile(unwanted_pattern, re.DOTALL).sub('', input_string)

# Raw gazette text, one entry per physical line.
lines = input_string.strip().split('\n')

# Function to process and add student data to students_data list
def add_student_data(roll, gender, name, subject_codes, result, marks_grades):
    """Append one candidate to the module-level ``students_data`` list.

    Each candidate contributes two dictionaries with the same keys
    (``Roll``, ``Gender``, ``Name``, ``Sub_1`` .. ``Sub_6``, ``Result``):

    * the first holds the identity fields, the six subject codes and the result;
    * the second holds the six marks/grade strings under ``Sub_1`` .. ``Sub_6``
      and NaN in every other field.

    Args:
        roll: Roll number, stored as given.
        gender: Gender flag, stored as given.
        name: Candidate name; surrounding whitespace is stripped.
        subject_codes: Sequence whose first six items become ``Sub_1`` .. ``Sub_6``
            of the first row.
        result: Result value, stored as given.
        marks_grades: Sequence whose first six items become ``Sub_1`` .. ``Sub_6``
            of the second row.

    Raises:
        IndexError: If ``subject_codes`` or ``marks_grades`` has fewer than six
            items. Nothing is appended in that case.
    """
    subject_columns = ['Sub_1', 'Sub_2', 'Sub_3', 'Sub_4', 'Sub_5', 'Sub_6']

    first_row_data = {
        'Roll': roll,
        'Gender': gender,
        'Name': name.strip(),
        **{column: subject_codes[i] for i, column in enumerate(subject_columns)},
        'Result': result,
    }

    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
    # and raises AttributeError there.
    second_row_data = {
        'Roll': np.nan,
        'Gender': np.nan,
        'Name': np.nan,
        **{column: marks_grades[i] for i, column in enumerate(subject_columns)},
        'Result': np.nan,
    }

    # Both rows are built before either is appended, so a short input never
    # leaves a half-written candidate behind.
    students_data.append(first_row_data)
    students_data.append(second_row_data)

# Walk the gazette line by line. A candidate occupies two physical lines: a
# candidate row (roll, gender, name, subject codes, result) followed by a marks
# row ("marks grade" pairs). The candidate row is remembered until its marks row
# arrives, then both are parsed together.

# Rows that could not be parsed, as (candidate row, marks row, reason) tuples.
# They are reported after the loop instead of aborting the whole cell.
skipped_records = []

for line in lines:
    line = line.strip()

    # Candidate row: starts with the roll number.
    if re.match(roll_gender_name_pattern, line):
        current_student_info = line
        continue

    # Everything that does not start with a "marks grade" pair (page headers,
    # rulers, blank lines) is ignored.
    if not re.match(marks_grades_pattern, line):
        continue

    current_student_grades = line

    # A marks row with no candidate row before it has nothing to attach to.
    if current_student_info is None:
        skipped_records.append((None, current_student_grades,
                                'marks row before any candidate row'))
        continue

    # current_student_info was stored only after matching this pattern at its
    # start, so this search always succeeds.
    roll, gender, name = re.search(roll_gender_name_pattern, current_student_info).groups()

    codes_match = re.search(subject_codes_pattern, current_student_info)
    result_match = re.search(result_pattern, current_student_info)
    marks_grades = re.findall(marks_grades_pattern, current_student_grades)

    # Validate before use: calling .group() on a failed search raises
    # AttributeError, and add_student_data needs six marks/grade pairs.
    if codes_match is None:
        skip_reason = 'candidate row does not contain six subject codes'
    elif result_match is None:
        skip_reason = 'candidate row has no PASS/FAIL/COMP result'
    elif len(marks_grades) < 6:
        skip_reason = 'marks row has fewer than six marks/grade pairs'
    else:
        skip_reason = None

    if skip_reason is not None:
        skipped_records.append((current_student_info, current_student_grades, skip_reason))
        continue

    # Split the matched run of subject codes into the individual codes.
    subject_codes_string = codes_match.group()
    subject_codes = re.findall(r"\d{3}", subject_codes_string)

    result = result_match.group()

    add_student_data(roll, gender, name, subject_codes, result, marks_grades)

if skipped_records:
    print("Skipped", len(skipped_records),
          "record(s) that could not be parsed; inspect skipped_records for details.")

# Build the frame in one go from the collected row dictionaries. A freshly
# constructed frame already has a 0..n-1 index, so no reset is needed. NaN
# placeholders become empty strings for display.
df = pd.DataFrame(students_data).fillna("")
df

Unnamed: 0,Roll,Gender,Name,Sub_1,Sub_2,Sub_3,Sub_4,Sub_5,Sub_6,Result
0,22614342.0,M,ABHIMANYU KUMAR,301,302,041,042,043,048,PASS
1,,,,076 B2,080 B1,051 D1,063 C1,052 D2,083 B1,
2,22614343.0,F,AKANKSHA PRIYA,301,302,041,042,043,048,PASS
3,,,,086 A2,076 B2,075 B1,079 A2,074 B1,094 A1,
4,22614344.0,F,AKANKSHI KUMARI,301,322,041,042,043,048,PASS
5,,,,084 B1,098 A1,095 A1,095 A1,095 A1,089 A2,
6,22614365.0,F,KHUSHI,301,048,041,042,043,065,PASS
7,,,,084 B1,084 B1,079 A2,082 A2,082 A2,088 B2,
