# Initialize

In [8]:
import re
import pymupdf4llm
import pandas as pd
import fitz
import os
import numpy as np
import fitz
from tqdm import tqdm

# Define Functions

In [9]:
def clean_markdown(text):
    """Normalise markdown extracted from a PDF page.

    The steps, in order:

    1. Insert a blank line after a ``#`` run that is preceded by whitespace
       when the next line starts with neither whitespace nor ``#``.
    2. Replace "empty" emphasis (``**`` ... ``**`` or ``_`` ... ``_`` that
       wraps only non-word characters on a single line) with one space.
    3. Surround lines that consist of a single bold or italic run with
       blank lines so they become blocks of their own.
    4. Collapse runs of spaces to one space and runs of three or more
       newlines to exactly two.

    Args:
        text: Markdown text of one page.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    # NOTE: the former latin1/ascii encode->decode round trips were removed.
    # Encoding and decoding with the same codec either returns the identical
    # string or raises (which was silently swallowed), so they never changed
    # the text.

    # Heading followed directly by a text line: add a blank line after it.
    text = re.sub(r'(\s#.*)(?=\n[^\s#])', r'\1\n', text)

    # Emphasis markers that wrap only non-word characters carry no content.
    # Python's `re` has no Java-style class intersection (`[\W&&[^\n]]`), so
    # the intent "non-word but not newline" is spelled `[^\w\n]`.
    # The whitespace lookarounds make sure the first marker really opens a
    # run and the second really closes it; without them the closing marker
    # of one run and the opening marker of the next (e.g. `**a**: **b**`)
    # would be mistaken for an empty pair and the text between them lost.
    text = re.sub(r'(?<!\S)\*\*[^\w\n]+\*\*(?!\S)', ' ', text)
    text = re.sub(r'(?<!\S)_[^\w\n]+_(?!\S)', ' ', text).strip()

    # Put a line that is entirely bold on its own block.
    pattern = r'(^|\s*\n)(\*\*[^\n]*?\*\*)( ?\n)((\s*[^\*])|$)'
    replacement = r'\1\n\2\n\3\4'
    text = re.sub(pattern, replacement, text, flags=re.MULTILINE)
    # Same for a line that is entirely italic.
    pattern = r'(^|\s*\n)(_[^\n]*?_)( ?\n)((\s*[^_])|$)'
    replacement = r'\1\n\2\n\3\4'
    text = re.sub(pattern, replacement, text, flags=re.MULTILINE)

    # Collapse repeated spaces, then limit vertical gaps to one blank line.
    text = re.sub(r' {2,}', ' ', text)
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()


def create_document_df(doc, filename):
    """Split every page of a converted document into text blocks.

    Each page's markdown is cleaned with ``clean_markdown`` and split on
    blank lines. Blocks without any alphanumeric character (this includes
    empty blocks) are discarded; the remaining ones become one row each.

    Args:
        doc: Iterable of page mappings; only the ``'text'`` entry is read.
        filename: Path of the source file. The part of its last
            ``/``-separated component before the first ``.`` is used as
            ``doc_id``.

    Returns:
        A DataFrame with the columns ``doc_id``, ``page_number``,
        ``pos_in_page``, ``md_text``, ``clean_text``, ``item``, ``type``,
        ``id``, ``emb``, ``is_noise`` and ``noise_type``. ``id`` grows in
        steps of 10 over the whole document, starting at 10.
    """
    column_order = ['doc_id', 'page_number', 'pos_in_page', 'md_text', 'clean_text', 'item', 'type' ,'id', 'emb']

    base_name = filename.rsplit('/', 1)[-1]
    doc_id = base_name.partition('.')[0].strip()

    records = []
    for page_number, page in enumerate(doc):
        stripped_blocks = (
            chunk.strip() for chunk in clean_markdown(page['text']).split('\n\n')
        )
        # `any` over an empty string is False, so empty blocks are dropped
        # together with blocks that hold only punctuation/markup.
        kept_blocks = [
            block for block in stripped_blocks
            if any(ch.isalnum() for ch in block)
        ]

        # Positions count only the kept blocks of the page.
        for pos_in_page, md_text in enumerate(kept_blocks):
            # Plain-text variant: markdown symbols and dashes become spaces,
            # whitespace is collapsed, everything is lower-cased.
            flattened = re.sub(r'[\#\*\_\-]', ' ', md_text)
            flattened = re.sub(r' {2,}', ' ', flattened)
            flattened = re.sub(r'\s+', ' ', flattened)

            records.append({
                'doc_id': doc_id,
                'page_number': page_number,
                'pos_in_page': pos_in_page,
                'md_text': md_text,
                'clean_text': flattened.strip().lower(),
                # One step of 10 per row already stored.
                'id': 10 * (len(records) + 1),
                'item': None,
                'type': None,
                'emb': None,
            })

    df = pd.DataFrame(records, columns=column_order)

    # Noise labels start out unset.
    df['is_noise'] = False
    df['noise_type'] = None

    return df

def create_df(filename):
    """Convert a PDF into a block-level DataFrame indexed by block id.

    The PDF is converted to per-page markdown with ``pymupdf4llm`` and then
    split into blocks by ``create_document_df``.

    Args:
        filename: Path of the PDF file.

    Returns:
        The DataFrame produced by ``create_document_df`` with its ``id``
        column moved into the index.
    """
    # Only the width of the first page is needed. Open the document in a
    # context manager so the file handle is released right away instead of
    # leaking one handle per processed PDF.
    with fitz.open(filename) as pdf:
        width_points = pdf[0].rect.width

    # Options tried earlier and left disabled: pages, page_height,
    # force_text, table_strategy, extract_words.
    doc = pymupdf4llm.to_markdown(
        filename,
        hdr_info=False,
        margins=[0, 0, 0, 50],     # NOTE(review): presumably left/top/right/bottom - confirm
        page_width=width_points,   # use the real width of the first page
        page_chunks=True,          # the result is consumed page by page below
        show_progress=False,
    )

    df = create_document_df(doc, filename)

    return df.set_index('id', drop=True)

# Markdown symbols that are blanked out before header texts are compared.
_MD_NOTATION_RE = re.compile(r'[\#\*\_]')
_MULTI_SPACE_RE = re.compile(r' {2,}')
# Standalone 4-digit numbers (years) are ignored when comparing headers.
_FOUR_DIGIT_YEAR_RE = re.compile(r'\b\d{4}\b')
# Runs of one to three digits, as used by "mixed" markers such as "Page 12".
_SHORT_NUMBER_RE = re.compile(r'\d{1,3}')
# Canonical (subtractive) roman numerals from 1 to 3999.
_CANONICAL_ROMAN_RE = re.compile(
    r'M{0,3}(?:CM|CD|D?C{0,3})(?:XC|XL|L?X{0,3})(?:IX|IV|V?I{0,3})'
)
_ROMAN_VALUES = {
    'M': 1000, 'CM': 900, 'D': 500, 'CD': 400,
    'C': 100, 'XC': 90, 'L': 50, 'XL': 40,
    'X': 10, 'IX': 9, 'V': 5, 'IV': 4, 'I': 1,
}
# When a block belongs to several marker sequences, the value of the
# marker type with the lowest number wins.
_MARKER_PRIORITY = {'alpha': 0, 'roman': 1, 'numeric': 2, 'mixed': 3}


def _strip_md_notation(text):
    """Replace ``#``, ``*`` and ``_`` with spaces and collapse space runs."""
    text = _MD_NOTATION_RE.sub(' ', text)
    text = _MULTI_SPACE_RE.sub(' ', text)
    return text.strip()


def _roman_to_int(roman):
    """Return the value of a canonical roman numeral, or -1 if invalid.

    The check is case-insensitive. Strings that merely consist of roman
    letters but are not well-formed numerals ("IIII", "VX", "LID") are
    rejected, as is the empty string.
    """
    numeral = roman.upper()
    if not numeral or _CANONICAL_ROMAN_RE.fullmatch(numeral) is None:
        return -1

    total, i = 0, 0
    while i < len(numeral):
        pair = numeral[i:i + 2]
        if len(pair) == 2 and pair in _ROMAN_VALUES:
            total += _ROMAN_VALUES[pair]
            i += 2
        else:
            total += _ROMAN_VALUES[numeral[i]]
            i += 1
    return total


def _parse_marker(text):
    """Return every way *text* can be read as a page marker.

    Each reading is a tuple ``(marker_type, marker_text, marker_value)``:

    * ``('alpha', '', n)``   - a single letter, A/a = 1 ... Z/z = 26
    * ``('roman', '', n)``   - a canonical roman numeral
    * ``('numeric', '', n)`` - a number of one to three digits
    * ``('mixed', static, n)`` - text containing exactly one run of one to
      three digits; ``static`` is the text with that run removed

    A single letter such as "v" is both a letter and a roman numeral, so
    both readings are returned; which one applies is decided later by the
    sequence it forms with the neighbouring pages. An empty list means the
    text is not a marker.
    """
    txt = text.strip()

    readings = []
    if re.fullmatch(r'[A-Za-z]', txt):
        readings.append(('alpha', '', ord(txt.upper()) - ord('A') + 1))
    roman_value = _roman_to_int(txt)
    if roman_value > 0:
        readings.append(('roman', '', roman_value))
    if readings:
        return readings

    if re.fullmatch(r'\d{1,3}', txt):
        return [('numeric', '', int(txt))]

    # "Chapter 1", "Page 12", ...: exactly one short digit run plus text.
    numbers = _SHORT_NUMBER_RE.findall(txt)
    if len(numbers) == 1:
        static_part = txt.replace(numbers[0], '', 1).strip()
        if static_part:
            return [('mixed', static_part, int(numbers[0]))]

    return []


def _find_repeated_headers(pages, positions, md_texts):
    """Flag blocks that repeat a header of the immediately preceding page.

    A block is a header candidate when its page number is >= 3, its
    position in the page is <= 3 and its text (markdown symbols removed)
    does not contain "item" in any letter case. Texts are compared after
    removing standalone 4-digit numbers. A candidate is flagged when the
    previous block with the same normalised text was also a candidate and
    sits exactly one page earlier; the first occurrence is never flagged.

    Returns:
        A list of booleans, one per input block, in input order.
    """
    last_seen = {}  # normalised text -> (page_number, was_candidate)
    flags = []
    for page, pos, md_text in zip(pages, positions, md_texts):
        plain = _strip_md_notation(md_text)
        is_candidate = page >= 3 and pos <= 3 and 'item' not in plain.lower()
        key = _FOUR_DIGIT_YEAR_RE.sub('', plain).strip()

        previous = last_seen.get(key)
        flags.append(bool(
            is_candidate
            and previous is not None
            and previous[1]
            and page - previous[0] == 1
        ))
        last_seen[key] = (page, is_candidate)
    return flags


def _find_page_markers(pages, positions, md_texts):
    """Detect page-number-like blocks that count up over consecutive pages.

    Only the two blocks with the highest position on each page are
    considered. Their texts are parsed with ``_parse_marker`` and grouped
    by ``(marker_type, marker_text)``. Inside a group, two neighbouring
    entries whose page numbers and marker values both differ by exactly one
    form a link, and both ends of every link are reported.

    Returns:
        A list with one entry per input block: the detected marker value,
        or ``None`` when the block is not part of a sequence.
    """
    # The two last blocks per page; ties in position keep input order.
    rows_by_page = {}
    for row, (page, pos) in enumerate(zip(pages, positions)):
        rows_by_page.setdefault(page, []).append((-pos, row))
    candidate_rows = sorted(
        row
        for entries in rows_by_page.values()
        for _, row in sorted(entries)[:2]
    )

    # (marker_type, marker_text) -> [(page_number, row, marker_value), ...]
    chains = {}
    for row in candidate_rows:
        for marker_type, marker_text, value in _parse_marker(md_texts[row]):
            chains.setdefault((marker_type, marker_text), []).append(
                (pages[row], row, value)
            )

    detected = [None] * len(pages)
    for key in sorted(chains, key=lambda k: _MARKER_PRIORITY[k[0]]):
        chain = sorted(chains[key], key=lambda entry: entry[0])
        for (prev_page, prev_row, prev_value), (page, row, value) in zip(chain, chain[1:]):
            if page == prev_page + 1 and value == prev_value + 1:
                # Keep the value of the first (highest priority) sequence.
                if detected[prev_row] is None:
                    detected[prev_row] = prev_value
                if detected[row] is None:
                    detected[row] = value
    return detected


def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Label repeated headers and running page numbers as noise.

    Rows with an empty ``clean_text`` are removed first. The remaining rows
    are then labelled:

    * ``noise_type == 'header'`` for blocks repeating a header of the
      previous page (see ``_find_repeated_headers``).
    * ``noise_type == 'page_number'`` for blocks that are part of a page
      marker sequence (see ``_find_page_markers``); this label takes
      precedence over ``'header'``.

    Args:
        df: Block-level frame with at least the columns ``page_number``,
            ``pos_in_page``, ``md_text`` and ``clean_text``. It is not
            modified. Frames without rows, or without any marker-like
            block, are handled and simply get no labels.

    Returns:
        A new DataFrame holding the kept rows with ``is_noise`` (bool) and
        ``noise_type`` (``'header'``, ``'page_number'`` or ``None``)
        overwritten, plus ``detected_page_number`` holding the marker value
        of page-number blocks and ``None`` elsewhere.
    """
    out = df[df['clean_text'] != ''].copy()

    pages = out['page_number'].tolist()
    positions = out['pos_in_page'].tolist()
    md_texts = out['md_text'].tolist()

    is_header = _find_repeated_headers(pages, positions, md_texts)
    detected = _find_page_markers(pages, positions, md_texts)
    is_marker = [value is not None for value in detected]

    noise_type = [
        'page_number' if marker else ('header' if header else None)
        for header, marker in zip(is_header, is_marker)
    ]

    # Object arrays keep the None placeholders and plain Python ints as-is.
    out['is_noise'] = np.array(
        [header or marker for header, marker in zip(is_header, is_marker)],
        dtype=bool,
    )
    out['noise_type'] = np.array(noise_type, dtype=object)
    out['detected_page_number'] = np.array(detected, dtype=object)

    return out



# Execute

In [10]:
# Convert every training PDF into a labelled block table and store it as
# outputs/train/<doc id>/initial_df.parquet.
folder = 'database/pdfs_train'

# Keep only PDF files and strip the '.pdf' suffix to get the doc ids.
file_names = sorted(f[:-4] for f in os.listdir(folder) if f.endswith('.pdf'))

# exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs('outputs/train', exist_ok=True)

for file_name in tqdm(file_names):
    df = create_df(f'{folder}/' + file_name + '.pdf')

    # Every document gets its own output folder.
    os.makedirs(f'outputs/train/{file_name}', exist_ok=True)

    df = clean_df(df)
    df.to_parquet(f'outputs/train/{file_name}/initial_df.parquet')



100%|██████████| 100/100 [1:05:17<00:00, 39.18s/it]
