# Adding in PDF parsing of text for other agencies


In [None]:
import pandas as pd
import importlib
import shutil
import os

import re

from engine.gather import PDFParser


In [None]:
# Incrementally extract raw text from every PDF in report_dir, skipping
# reports that are already in the cached pickle, then persist the merged cache.
parsed_file_name = 'parsed_reports.pkl'
report_dir = '../../output/report_pdfs'
importlib.reload(PDFParser)  # re-import the parser module so recent edits take effect
reports = []
parsed_pdfs = pd.read_pickle(parsed_file_name)
# Build the lookup once rather than querying the DataFrame for every file.
already_parsed = set(parsed_pdfs['report_id'])
pdf_names = os.listdir(report_dir)
for pdf in pdf_names:
    if not pdf.endswith('.pdf'):
        continue
    report_id = pdf[:-4]  # file name without the '.pdf' suffix
    if report_id in already_parsed:
        continue
    try:
        # Must be os.path.join: os.join does not exist, and the resulting
        # AttributeError was swallowed by the handler below for every file.
        text = PDFParser.extractTextFromPDF(os.path.join(report_dir, pdf))

    except Exception as e:
        # Best effort: report the failure and carry on with the next file.
        print(f"Error parsing {pdf}: {e}")
        continue
    reports.append({
        'report_id': report_id,
        'text': text
    })

parsed_pdfs = pd.concat([parsed_pdfs, pd.DataFrame(reports)], ignore_index=True)
parsed_pdfs.to_pickle(parsed_file_name)
# The denominator counts every directory entry, not only the '.pdf' files.
print(f"There are {len(parsed_pdfs)} reports in total out of {len(pdf_names)}")
parsed_pdfs

In [None]:
# Reload the cached extraction results and order the rows by report id.
parsed_pdfs = pd.read_pickle(parsed_file_name).sort_values(by='report_id')
parsed_pdfs

The first problem is that ATSB and TAIC have some duplicated reports. We don't want to have duplicated documents inside the searcher. So for now it will be best if we just throw away all of the occurrence reports and just keep the regular reports.

In [None]:
# Keep only the rows whose extracted text occurs at most three times in the cache.
counts = parsed_pdfs['text'].value_counts()
repeated_texts = counts[counts > 3].index
is_repeated = parsed_pdfs['text'].isin(repeated_texts)
deduped_pdfs = parsed_pdfs.loc[~is_repeated]

deduped_pdfs

## ATSB

After parsing, the PDFs need to have readable text as well as identifiable page numbers.

I will start by having a look at the different structures that are present.

**aviation**
2000 - ATSB_a_2001_710
Really basic without a content section

### Running new parse

In [None]:
# Select every report whose ID starts with 'ATSB' and re-run the cleaning and
# formatting steps, storing the formatted text and the page-number validity flag.
importlib.reload(PDFParser)  # re-import the parser module so recent edits take effect
atsb_text = parsed_pdfs[parsed_pdfs['report_id'].str.startswith('ATSB')].reset_index(drop=True)
# Pair the two needed columns explicitly: unpacking whole records breaks as
# soon as the cached frame has a different number of columns.
atsb_text[['text', 'valid_page_numbers']] = [
    PDFParser.formatText(PDFParser.cleanText(text), report_id)
    for report_id, text in zip(atsb_text['report_id'], atsb_text['text'])
]
atsb_text

### Checking to see if page numbers are valid

In [None]:
# Reports whose page numbers did not validate. Take a copy so that adding the
# 'year' column writes to an independent frame instead of a slice of atsb_text
# (which triggers pandas' SettingWithCopyWarning).
failed = atsb_text[atsb_text['valid_page_numbers'] == False].copy()
# Characters 7-10 of an ATSB id such as 'ATSB_a_2001_710' hold the year.
failed['year'] = failed['report_id'].map(lambda x: int(x[7:11]))

atsb_text.valid_page_numbers.value_counts()

In [None]:
# NOTE(review): 65 is presumably the number of invalid reports read off the
# value_counts output above - confirm; being hard-coded, it goes stale when the data changes.
65 / atsb_text.shape[0]

## TSB

In [None]:
# Select every report whose ID starts with 'TSB' and re-run the cleaning and
# formatting steps, storing the formatted text and the page-number validity flag.
importlib.reload(PDFParser)  # re-import the parser module so recent edits take effect
tsb_text = parsed_pdfs[parsed_pdfs['report_id'].str.startswith('TSB')].reset_index(drop=True)
# Pair the two needed columns explicitly: unpacking whole records breaks as
# soon as the cached frame has a different number of columns.
tsb_text[['text', 'valid_page_numbers']] = [
    PDFParser.formatText(PDFParser.cleanText(text), report_id)
    for report_id, text in zip(tsb_text['report_id'], tsb_text['text'])
]
# Characters 6-9 of a TSB id such as 'TSB_a_2011_F0012' hold the year.
tsb_text['year'] = tsb_text['report_id'].map(lambda x: int(x[6:10]))
tsb_text

In [None]:
# Collect the TSB reports whose page numbers did not validate, then show the
# valid/invalid tally for the whole TSB set.
tsb_invalid_mask = tsb_text['valid_page_numbers'] == False
failed = tsb_text[tsb_invalid_mask]

tsb_text['valid_page_numbers'].value_counts()

In [None]:
# Dump one TSB report before and after formatting so the two can be compared by hand.
text = parsed_pdfs.query('report_id == "TSB_a_2011_F0012"').text.values[0]
# Write as UTF-8 explicitly: the extracted text may contain characters that the
# platform's default encoding cannot represent.
with open("individual.txt", "w", encoding="utf-8") as f:
    f.write(text)
importlib.reload(PDFParser)  # re-import the parser module so recent edits take effect
# Only the formatted text is needed here; the second return value is discarded.
parsed_text, _ = PDFParser.formatTSBText(text, "test")

with open("individual-parsed.txt", 'w', encoding="utf-8") as f:
    f.write(parsed_text)

## TAIC

I should also move TAIC over to this system; then I can merge them together.

In [None]:
# Select every report whose ID starts with 'TAIC' and re-run the cleaning and
# formatting steps, storing the formatted text and the page-number validity flag.
importlib.reload(PDFParser)  # re-import the parser module so recent edits take effect
taic_text = parsed_pdfs[parsed_pdfs['report_id'].str.startswith('TAIC')].reset_index(drop=True)
# Pair the two needed columns explicitly: unpacking whole records breaks as
# soon as the cached frame has a different number of columns.
taic_text[['text', 'valid_page_numbers']] = [
    PDFParser.formatText(PDFParser.cleanText(text), report_id)
    for report_id, text in zip(taic_text['report_id'], taic_text['text'])
]
# Characters 7-10 of a TAIC id such as 'TAIC_r_2004_103' hold the year.
taic_text['year'] = taic_text['report_id'].map(lambda x: int(x[7:11]))
taic_text

In [None]:
# Collect the TAIC reports whose page numbers did not validate, then show the
# valid/invalid tally for the whole TAIC set.
taic_invalid_mask = taic_text['valid_page_numbers'] == False
failed = taic_text[taic_invalid_mask]

taic_text['valid_page_numbers'].value_counts()

## Checking results

In [None]:
# Write a reproducible random sample of failed reports to parsed_reports/,
# each as its formatted text next to a copy of the source PDF, for manual
# inspection. `failed` is whichever failure frame was assigned most recently.
shutil.rmtree('parsed_reports/', ignore_errors=True)

os.makedirs('parsed_reports/', exist_ok=True)
# Cap the sample size: DataFrame.sample raises when asked for more rows than exist.
sample_size = min(10, len(failed))
sampled = failed[['report_id', 'text']].sample(sample_size, random_state=42)
for sampled_id, text in zip(sampled['report_id'], sampled['text']):
    # Write as UTF-8 explicitly: the text may contain characters that the
    # platform's default encoding cannot represent.
    with open(os.path.join('parsed_reports', f'{sampled_id}.txt'), 'w', encoding='utf-8') as f:
        f.write(text)
    shutil.copy(os.path.join(report_dir, f"{sampled_id}.pdf"), os.path.join('parsed_reports', f"{sampled_id}.pdf"))

In [None]:
# Dump one report before and after formatting so the two can be compared by hand.
report_id = "TAIC_r_2004_103"
text = parsed_pdfs.query(f'report_id == "{report_id}"').text.values[0]
# Write as UTF-8 explicitly: the extracted text may contain characters that the
# platform's default encoding cannot represent.
with open("individual.txt", "w", encoding="utf-8") as f:
    f.write(text)
importlib.reload(PDFParser)  # re-import the parser module so recent edits take effect
# Only the formatted text is needed here; the second return value is discarded.
parsed_text, _ = PDFParser.formatText(text, report_id)

with open("individual-parsed.txt", 'w', encoding="utf-8") as f:
    f.write(parsed_text)

In [None]:
# Run the parser module's batch conversion over report_dir, presumably writing
# its results to the shared pickle under ../../output (the next cell reads it).
# NOTE(review): the meaning of the third positional argument (True) is not
# visible from this notebook - confirm against PDFParser.convertPDFToText.
PDFParser.convertPDFToText(report_dir, '../../output/parsed_reports.pkl', True)

In [None]:
# Report what share of the batch-converted PDFs were flagged as invalid.
all_processed_pdfs = pd.read_pickle('../../output/parsed_reports.pkl')
# Count the False entries directly. value_counts() is ordered by frequency, so
# positional indexing would pick the wrong bucket whenever invalid reports
# outnumber valid ones, and would raise if only one value is present.
# Missing values are dropped, matching value_counts()' default.
valid_flags = all_processed_pdfs['valid'].dropna()
invalid_share = (valid_flags == False).mean()
print(f"{invalid_share * 100:.2f}% of the pdfs are invalid")

### Making test set

In [None]:
# Create the test pdfs: copy a fixed selection of reports into the test data
# directory, replacing whatever was there before.

test_pdfs = [
    "ATSB_r_2021_010",
    "ATSB_r_2021_004",
    "ATSB_a_2007_030",
    "ATSB_a_2002_646",
    "TSB_a_2022_O0118",
    "TSB_m_2021_A0041",
    "TSB_a_2011_F0012",
    "TAIC_r_2014_103",
    "TAIC_r_2004_121",
    "TAIC_a_2019_006",
]

test_report_dfs = '../../tests/data/output/report_pdfs/'

shutil.rmtree(test_report_dfs, ignore_errors=True)
# makedirs rather than mkdir: creates missing parent directories as well, and
# tolerates the directory still existing if the rmtree above could not remove it.
os.makedirs(test_report_dfs, exist_ok=True)
for report_id in test_pdfs:
    shutil.copy(os.path.join(report_dir, f'{report_id}.pdf'), os.path.join(test_report_dfs, f'{report_id}.pdf'))

### Problems found with the extraction

Some of the reports can't be extracted and need to be repaired with their page names.

I will move forward and potentially add the page numbers repairer later:

| report | problem |
| --- | --- |
| ATSB_a_2001_348 | Not matching any of the roman numerals. Faulty first match of int. |
| ATSB_a_2000_157 | No good regex matches of page numbers. Only matching roman numerals. |
| ATSB_a_2002_328 | There is a random 65 that messes up the order when being filled in. By checking the number of pages and the suggested amount it could be fixed. |
| ATSB_a_2008_052 | There is a missing roman numeral in the report and so it is causing the incorrect labelling of pages as the auto fill only works off anchors. |
| ATSB_a_2023_012 | Not handling case of duplicate page number correctly. The solution should have the most amount of pages labelled correctly. |
| ATSB_m_2004_203 | Early mistake in regex matching causes rest of page numbers to be off. Having some tier of a match and its quality could help fix this one by working from the back down. |
| ATSB_m_2004_201 | Matching random non roman numerals. This could be fixed by counting the length of the pdf and fixing it. |
| ATSB_r_2003_005 | Same as above. Could filter out page numbers that imply a document that is too long. |
| ATSB_r_2006_010 | Same as above |
| ATSB_r_2021_002 | Not handling duplicate roman numerals correctly. | 
| ATSB_m_2022_001 | Not handling duplicate page numbers correctly. |
| ATSB_r_2021_002 & ATSB_m_2021_001 & ATSB_a_2022_009 & ATSB_a_2022_007 & ATSB_a_2022_001 | Appendices are messing up the page numbers. |
| ATSB_a_2022_068 | Need to match as many as possible page numbers correctly |
| TAIC_a_2019_006 | The page one match is off as there is a missing space. This could be fixed by looking at the later page matches. |
| TAIC_m_2009_203 | The roman numerals have an error in the pdf. The raw matches are more accurate and it only fails when it does the syncing. Maybe cancel out the syncing if it is already valid? |
| TAIC_r_2004_103 | This one has problems where there is an internal error in the numbering. Therefore the syncing makes it less accurate than the simple regex matching. |
