## PDF Text Extraction using the PyPDF2 package

### Importing required packages

In [25]:
import PyPDF2
import os

### Function for extracting text from a PDF using PyPDF2's `PdfReader` class

In [26]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page in the PDF at ``pdf_path`` as one string.

    The file is opened in binary mode and handed to ``PyPDF2.PdfReader``; the
    result of ``extract_text()`` for each page is concatenated in page order,
    with no separator inserted between pages. A PDF with no pages yields ``''``.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The concatenated text of all pages.
    """
    with open(pdf_path, 'rb') as stream:
        reader = PyPDF2.PdfReader(stream)
        # Build the result inside the ``with`` block so the file is still open
        # while each page is being extracted.
        return ''.join(page.extract_text() for page in reader.pages)

### Function to iterate through all PDF files in a directory, extract the text from each, and save the output as a .txt file

In [32]:
def process_pdfs_in_directory(directory_path, output_dir):
    """Extract text from every PDF in ``directory_path`` and save it as .txt.

    Each PDF is expected to be named ``<year>-<level>-....pdf``. Its text is
    obtained with ``extract_text_from_pdf`` and written (UTF-8) to
    ``PyPDF_RR_<year>_<level>_combined.txt`` inside ``output_dir``.

    Notes:
        * ``output_dir`` is created if it does not already exist.
        * The ``.pdf`` extension is matched case-insensitively.
        * PDFs whose name has fewer than two ``-``-separated fields are
          reported and skipped instead of aborting the whole run.
        * Only the first two fields go into the output name, so PDFs sharing
          the same year and level write to the same file; the later one
          (in sorted filename order) overwrites the earlier one.

    Args:
        directory_path: Directory to scan for PDF files (not recursive).
        output_dir: Directory that receives the generated .txt files.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Make sure the destination exists so the first write cannot fail with
    # FileNotFoundError on a fresh checkout.
    os.makedirs(output_dir, exist_ok=True)

    # Sort so the processing (and log) order is the same on every run;
    # os.listdir() makes no ordering guarantee.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.lower().endswith(".pdf"):
            continue

        # Split the name without its extension so a two-field name such as
        # "2024-l1.pdf" yields level "l1" rather than "l1.pdf".
        name_parts = os.path.splitext(filename)[0].split("-")
        if len(name_parts) < 2:
            print("Skipping file ", filename, ": name does not match the <year>-<level>-... pattern")
            continue
        year, level = name_parts[0], name_parts[1]

        pdf_file_path = os.path.join(directory_path, filename)
        print("Parsing file ",filename, " saved at path ", pdf_file_path)
        pdf_content = extract_text_from_pdf(pdf_file_path)

        if not pdf_content:
            print("No content for this file")
            continue

        # save to text file
        new_name = "PyPDF_RR_"+year+"_"+level+"_combined.txt"
        output_file_path = os.path.join(output_dir, new_name)
        print("Saving PyPDF txt file ",new_name, " at path ", output_file_path)
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write(pdf_content)
        print("Saved the txt file to " + output_file_path)
        

### Main function definition

In [33]:
def main():
    """Run the PDF-to-text extraction with the notebook's default folders.

    PDFs are read from ``../data`` and the resulting .txt files are written to
    ``../sample_output/PyPDF``. Edit the two paths below to point at your own
    input and output directories.
    """
    source_dir, target_dir = "../data", "../sample_output/PyPDF"
    process_pdfs_in_directory(source_dir, target_dir)

### Driver Code

In [35]:
# Entry point: call main() only when this code runs as the top-level module
# (``__name__`` equals "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

Parsing file  2024-l1-topics-combined-2.pdf  saved at path  ../data\2024-l1-topics-combined-2.pdf
Saving PyPDF txt file  PyPDF_RR_2024_l1_combined.txt  at path  ../sample_output/PyPDF\PyPDF_RR_2024_l1_combined.txt
Saved the txt file to ../sample_output/PyPDF\PyPDF_RR_2024_l1_combined.txt
Parsing file  2024-l2-topics-combined-2.pdf  saved at path  ../data\2024-l2-topics-combined-2.pdf
Saving PyPDF txt file  PyPDF_RR_2024_l2_combined.txt  at path  ../sample_output/PyPDF\PyPDF_RR_2024_l2_combined.txt
Saved the txt file to ../sample_output/PyPDF\PyPDF_RR_2024_l2_combined.txt
Parsing file  2024-l3-topics-combined-2.pdf  saved at path  ../data\2024-l3-topics-combined-2.pdf
Saving PyPDF txt file  PyPDF_RR_2024_l3_combined.txt  at path  ../sample_output/PyPDF\PyPDF_RR_2024_l3_combined.txt
Saved the txt file to ../sample_output/PyPDF\PyPDF_RR_2024_l3_combined.txt
