In [1]:
import re
import os
from tika import parser

In [2]:
# Lines matching any alternative of this pattern are discarded by clean_text().
# IGNORECASE is enabled for the whole pattern (so "Figure 3" is caught as well
# as "FIGURE 3"), but the two-letter rule turns it off locally with (?-i:...):
# otherwise ordinary two-letter words such as "to" or "of" that end up alone
# on a line would be treated as junk and silently dropped from the text.
junk = re.compile(r'''^\s*\d+(?:\.\d+)*\s+.+?\.{2,}\s*\d+\s*$   # remove content index
                | ^[\|\s oO]+$                                # just bars or spaces or o/O
                | ^\s*(?-i:[A-Z]{2})\s*$                      # two-letter all-caps (case-sensitive)
                | ^\s*FIGURE\s+\d+                            # remove image captions
                | ^(?:[-+]?\d+(\.\d+)?\s+){2,}[-+]?\d+(\.\d+)?\s*$  # remove numbers
                ''', re.IGNORECASE | re.VERBOSE)

# Matches a line that consists of the same fragment written twice,
# separated by whitespace (e.g. "Annual Report Annual Report").
duplicate_fragment = re.compile(r'^(.+)\s+\1$')

# Captures a lowercase letter immediately followed by an uppercase letter,
# so a space can be inserted between them ("fooBar" -> "foo Bar").
camel_split = re.compile(r'([a-z])([A-Z])')

def clean_text(raw_text):
    """Turn raw extracted text into newline-separated, letters-only lines.

    Processing order:
      1. Normalize line endings and re-join words hyphenated across a
         line break ("exam-" + newline + "ple" -> "example").
      2. Insert a space at every lowercase-to-uppercase boundary using
         the module-level ``camel_split`` pattern.
      3. Drop blank lines; skip lines matching the module-level ``junk``
         pattern.
      4. If a line does not end in one of ``. ? ! ; :`` it is merged with
         the line that follows it (one line of look-ahead only).
      5. Replace every character that is not an ASCII letter or
         whitespace with a space, then collapse runs of whitespace.
      6. Discard the result if the (possibly merged) source line is
         shorter than 20 characters, if nothing is left after step 5, if
         it equals the previously kept line, or if it matches
         ``duplicate_fragment``.

    Args:
        raw_text: The text to clean. ``None`` or an empty string yields
            an empty result.

    Returns:
        The kept lines joined by newline characters (possibly empty).
    """
    if not raw_text:
        return ""

    # Convert CRLF first so a Windows line break becomes a single newline.
    # Mapping '\r' on its own would turn "\r\n" into two newlines, and the
    # de-hyphenation regex below would then never see "-\n".
    text = raw_text.replace('\r\n', '\n').replace('\r', '\n')
    text = re.sub(r'(\w)-\n(\w)', r'\1\2', text)

    text = camel_split.sub(r'\1 \2', text)

    # Blank lines are dropped here, so no separate newline-collapsing pass
    # is needed beforehand.
    lines = [line for line in text.split('\n') if line.strip()]

    cleaned = []
    last_kept = None
    i = 0
    while i < len(lines):
        line = lines[i]

        if junk.match(line):
            i += 1
            continue

        # Strip trailing whitespace before testing, otherwise a line such as
        # "End of sentence.  " would be treated as unfinished and merged.
        ends_sentence = re.search(r'[.?!;:]$', line.rstrip())
        if not ends_sentence and i + 1 < len(lines):
            # NOTE(review): the merged-in line is not tested against `junk`,
            # so a junk line directly after an unfinished line is kept.
            line = f"{line.strip()} {lines[i + 1].strip()}"
            i += 2
        else:
            i += 1

        # Keep only letters and spaces, then normalize spaces.
        alpha = re.sub(r'[^A-Za-z\s]', ' ', line)
        alpha = re.sub(r'\s+', ' ', alpha).strip()

        # The length test deliberately uses the source line (before the
        # letters-only filtering). `not alpha` prevents emitting empty
        # lines for input made only of digits/symbols.
        if (len(line) < 20
                or not alpha
                or alpha == last_kept
                or duplicate_fragment.match(alpha)):
            continue

        cleaned.append(alpha)
        last_kept = alpha

    return "\n".join(cleaned)


if __name__ == "__main__":
    # Script entry point: read consecutively numbered PDFs (1.pdf, 2.pdf, ...)
    # from the raw PDF folder, clean the text extracted from each one with
    # clean_text(), and write it to the training corpus (files 1..10) or the
    # validation corpus (files 11 and up). Processing stops at the first
    # missing number.

    raw_pdf_dir = "E:/Projects/Seq2Seq/data/raw_pdf/"
    train_path = "E:/Projects/Seq2Seq/data/corpus/Final_Train_Text.txt"
    val_path = "E:/Projects/Seq2Seq/data/corpus/Final_Val_Text.txt"
    train_count = 10  # PDFs numbered 1..train_count go to the training file

    # Opening with 'w' truncates both corpus files up front (even when no PDF
    # is found); the context manager guarantees they are closed on any error.
    with open(train_path, 'w', encoding='utf-8') as train_out, \
            open(val_path, 'w', encoding='utf-8') as val_out:
        i = 1
        while True:
            pdf_path = raw_pdf_dir + str(i) + '.pdf'
            if not os.path.exists(pdf_path):
                break

            # 'content' may be missing or None; fall back to an empty string.
            raw = parser.from_file(pdf_path).get('content', '') or ''
            result = clean_text(raw)

            target = train_out if i <= train_count else val_out
            target.write(result + '\n')

            i = i + 1


2025-07-11 18:22:32,388 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...
