# Raw Data Extraction

This notebook extract raw data from the pdf file and store the text in .txt files.

Below code create seperate files for seperate pdfs

In [10]:
import fitz  # PyMuPDF
import os

# Function to extract text, tables, and hyperlinks
def extract_pdf_content_with_links(pdf_path):
    extracted_text = ""
    links_data = []

    # Open the PDF with PyMuPDF
    pdf_document = fitz.open(pdf_path)

    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        extracted_text += f"Page {page_num + 1}:\n{page.get_text('text')}\n\n"
        
        # Extracting link annotations (uri and surrounding text)
        links = page.get_links()
        for link in links:
            if 'uri' in link:
                uri = link["uri"]
                rect = link["from"]  # Rectangular area of the link
                # Extract the text within the link's rectangle
                link_text = page.get_textbox(rect)
                if not link_text:
                    link_text = "Unknown Text"
                links_data.append((link_text, uri))

    pdf_document.close()
    
    return extracted_text, links_data
        

# Function to process PDF files from the subfolders and save in 'data_txt'
def process_pdf_files(main_folder_path, output_folder):
    # Ensure the output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Loop through each of the 24 subfolders in the main folder
    for subfolder in os.listdir(main_folder_path):
        subfolder_path = os.path.join(main_folder_path, subfolder)

        # Process only if it's a directory
        if os.path.isdir(subfolder_path):
            for filename in os.listdir(subfolder_path):
                if filename.endswith(".pdf"):
                    pdf_path = os.path.join(subfolder_path, filename)
                    # Extract the text and hyperlinks
                    text, links = extract_pdf_content_with_links(pdf_path)
                    # Display the extracted text
                    #print("Extracted Text:\n", text)
                
                    # Display extracted hyperlinks
                    # print("\nExtracted Hyperlinks:")
                    # for link_text, uri in links:
                    #     print(f"Text: {link_text}, Link: {uri}")
                    
                    # Save the extracted text into a .txt file with the same name in 'data_txt' folder
                    txt_filename = os.path.splitext(filename)[0] + ".txt"
                    txt_path = os.path.join(output_folder, txt_filename)
                    
                    with open(txt_path, 'w', encoding='utf-8') as txt_file:
                        txt_file.write(text)
                    
                    print(f"Processed {filename} from {subfolder} and saved to {txt_filename}")


# Call the function for a specific directory
main_folder_path = "../data/data_pdf"
output_folder = "../data/data_txt"
process_pdf_files(main_folder_path, output_folder)



Processed CAO-05 Services for Multilingual Learner Students.pdf from Academics (CAO) and saved to CAO-05 Services for Multilingual Learner Students.txt
Processed CAO-25 International Field Trips Guidelines & Forms.pdf from Academics (CAO) and saved to CAO-25 International Field Trips Guidelines & Forms.txt
Processed CAO-08 Grading Requirements.pdf from Academics (CAO) and saved to CAO-08 Grading Requirements.txt
Processed CAO-24 Domestic Overnight Field Trip Guidelines.pdf from Academics (CAO) and saved to CAO-24 Domestic Overnight Field Trip Guidelines.txt
Processed CAO-01 Promotion Policy.pdf from Academics (CAO) and saved to CAO-01 Promotion Policy.txt
Processed CAO-22 General Field Trip Guidelines.pdf from Academics (CAO) and saved to CAO-22 General Field Trip Guidelines.txt
Processed CAO-23 Day Field Trip Guidelines.pdf from Academics (CAO) and saved to CAO-23 Day Field Trip Guidelines.txt
Processed CAO-07 Graduation Requirements.pdf from Academics (CAO) and saved to CAO-07 Gradua

Below code creates one text file for all pdfs

In [13]:
import fitz  # PyMuPDF
import os

# Function to extract text, tables, and hyperlinks
def extract_pdf_content_with_links(pdf_path):
    extracted_text = ""
    links_data = []

    # Open the PDF with PyMuPDF
    pdf_document = fitz.open(pdf_path)

    for page_num in range(len(pdf_document)):
        page = pdf_document.load_page(page_num)
        extracted_text += f"Page {page_num + 1}:\n{page.get_text('text')}\n\n"
        
        # Extracting link annotations (uri and surrounding text)
        links = page.get_links()
        for link in links:
            if 'uri' in link:
                uri = link["uri"]
                rect = link["from"]  # Rectangular area of the link
                # Extract the text within the link's rectangle
                link_text = page.get_textbox(rect)
                if not link_text:
                    link_text = "Unknown Text"
                links_data.append((link_text, uri))

    pdf_document.close()
    
    return extracted_text, links_data
        

# Function to process PDF files from the subfolders and save in one text file
def process_pdf_files(main_folder_path, output_file):
    # Open the output file for writing (creates or overwrites the file)
    with open(output_file, 'w', encoding='utf-8') as output_txt:
        # Loop through each of the 24 subfolders in the main folder
        for subfolder in os.listdir(main_folder_path):
            subfolder_path = os.path.join(main_folder_path, subfolder)

            # Process only if it's a directory
            if os.path.isdir(subfolder_path):
                for filename in os.listdir(subfolder_path):
                    if filename.endswith(".pdf"):
                        pdf_path = os.path.join(subfolder_path, filename)
                        # Extract the text and hyperlinks
                        text, links = extract_pdf_content_with_links(pdf_path)

                        # Display the extracted text
                        #print("Extracted Text:\n", text)
                    
                        # Display extracted hyperlinks
                        # print("\nExtracted Hyperlinks:")
                        # for link_text, uri in links:
                        #     print(f"Text: {link_text}, Link: {uri}")
                        
                        # Write the extracted text into the output text file
                        output_txt.write(text)

                        print(f"Processed {filename} from {subfolder}")

# Call the function for the main folder and specify the output text file
main_folder_path = "../data/data_pdf"
output_file = "../data/data_txt/_merged_corpus.txt"
process_pdf_files(main_folder_path, output_file)



Processed CAO-05 Services for Multilingual Learner Students.pdf from Academics (CAO)
Processed CAO-25 International Field Trips Guidelines & Forms.pdf from Academics (CAO)
Processed CAO-08 Grading Requirements.pdf from Academics (CAO)
Processed CAO-24 Domestic Overnight Field Trip Guidelines.pdf from Academics (CAO)
Processed CAO-01 Promotion Policy.pdf from Academics (CAO)
Processed CAO-22 General Field Trip Guidelines.pdf from Academics (CAO)
Processed CAO-23 Day Field Trip Guidelines.pdf from Academics (CAO)
Processed CAO-07 Graduation Requirements.pdf from Academics (CAO)
Processed CAO-27 Water Activities on Field Trips.pdf from Academics (CAO)
Processed CAO-06  GPA Calculation Method.pdf from Academics (CAO)
Processed CAO-03 Textbook Management.pdf from Academics (CAO)
Processed FAM-03 Student Government.pdf from Family and Community Advancement (FAM)
Processed FAM-01 School Parent Councils.pdf from Family and Community Advancement (FAM)
Processed FAM-08 Translation and Interpreta