# In [None]:
import os
import pandas as pd
from PyPDF2 import PdfReader

# Isotope labels searched for on page 3 of each primary PDF. Order matters:
# a line is attributed to the first label it contains.
_ISOTOPES = ('Pb-210', 'Bi-214', 'Pb-214')

# Filename prefix of the companion "point source" PDFs.
_PTSRC_PREFIX = "PtSrc_"

# Numeric columns of the output CSV, in output order (after the 'File' column).
_NUMERIC_COLUMNS = [
    'Pb-210', 'Pb-210 error',
    'Bi-214', 'Bi-214 error',
    'Pb-214', 'Pb-214 error',
    'ptsrc_pb210', 'ptsrc_pb210 error',
]


def _read_third_page_values(pdf_path, isotopes):
    """Return ``{isotope: (value_text, error_text)}`` read from page 3 of a PDF.

    For every text line on the third page that contains one of *isotopes*, the
    last two whitespace-separated tokens of the line are taken as the value and
    its error. If several lines mention the same isotope, the last one wins.
    Returns ``None`` when the PDF has fewer than 3 pages.
    """
    reader = PdfReader(pdf_path)
    if len(reader.pages) < 3:
        return None

    # Guard against a page with no extractable text.
    text = reader.pages[2].extract_text() or ""

    found = {}
    for line in text.split('\n'):
        for isotope in isotopes:
            if isotope in line:
                # Raises ValueError if the line has fewer than two tokens; the
                # caller reports that as a per-file processing error.
                value_text, error_text = line.split()[-2:]
                found[isotope] = (value_text, error_text)
                break  # one isotope per line, first label in `isotopes` wins
    return found


def extract_pdf_values(folder_path, output_csv_path, parse_numbers=False):
    """
    Extract Pb-210, Bi-214, Pb-214 and PtSrc Pb-210 values (each with its error)
    from the PDF files in a folder and save them into a single CSV file.

    For every primary PDF ``<name>.pdf`` (extension matched case-insensitively)
    the values are read from its third page, and the PtSrc Pb-210 value is read
    from the third page of the companion file ``PtSrc_<name>.PDF`` (or ``.pdf``)
    in the same folder. Files whose name starts with ``PtSrc_`` are treated as
    companions only and never as primary inputs. A row is written only when all
    four values are found; otherwise a message is printed and the file is
    skipped. Files are processed in sorted filename order.

    Parameters:
    folder_path: the path to the folder containing PDF files
    output_csv_path: the path to the output CSV file where the combined data will be saved
    parse_numbers: kept for backward compatibility. Values such as 6.2E-002 are
    always converted to floats (0.062) before being written; when True, the
    numeric columns are additionally passed through ``pandas.to_numeric``.
    returns: None
    """
    rows = []  # One dict per successfully processed primary PDF

    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue  # Not a PDF
        if filename.startswith(_PTSRC_PREFIX):
            continue  # Companion file; read together with its primary PDF

        stem = os.path.splitext(filename)[0]

        # Per-file boundary: any failure is reported and the batch carries on.
        try:
            main_values = _read_third_page_values(
                os.path.join(folder_path, filename), _ISOTOPES
            )
            if main_values is None:
                print(f"PDF file '{filename}' has less than 3 pages. Skipping.")
                continue

            missing = [iso for iso in _ISOTOPES if iso not in main_values]
            if missing:
                raise ValueError(f"no value found for {', '.join(missing)}")

            # Accept either extension case so the lookup also works on
            # case-sensitive filesystems.
            candidates = (f"{_PTSRC_PREFIX}{stem}.PDF", f"{_PTSRC_PREFIX}{stem}.pdf")
            ptsrc_pdf_name = next(
                (c for c in candidates
                 if os.path.isfile(os.path.join(folder_path, c))),
                None,
            )
            if ptsrc_pdf_name is None:
                raise FileNotFoundError(
                    f"PtSrc PDF file '{candidates[0]}' not found"
                )

            ptsrc_values = _read_third_page_values(
                os.path.join(folder_path, ptsrc_pdf_name), ('Pb-210',)
            )
            if ptsrc_values is None:
                print(f"PtSrc PDF file '{ptsrc_pdf_name}' has less than 3 pages. Skipping.")
                continue
            if 'Pb-210' not in ptsrc_values:
                raise ValueError(
                    f"no value found for Pb-210 in '{ptsrc_pdf_name}'"
                )

            # Build the row with the filename as the first column.
            row = {'File': filename}
            for isotope in _ISOTOPES:
                value_text, error_text = main_values[isotope]
                row[isotope] = float(value_text)
                row[f'{isotope} error'] = float(error_text)
            value_text, error_text = ptsrc_values['Pb-210']
            row['ptsrc_pb210'] = float(value_text)
            row['ptsrc_pb210 error'] = float(error_text)
            rows.append(row)
        except Exception as e:
            print(f"Error processing PDF file '{filename}': {e}")

    # Fix the column set explicitly so that an empty result still yields a
    # well-formed CSV header.
    combined_df = pd.DataFrame(rows, columns=['File'] + _NUMERIC_COLUMNS)

    if parse_numbers:
        # The columns already hold floats, so the string accessor (.str) must
        # not be used here; to_numeric is safe on numeric and empty columns.
        for column in _NUMERIC_COLUMNS:
            combined_df[column] = pd.to_numeric(combined_df[column])

    # Save the combined data to a CSV file
    combined_df.to_csv(output_csv_path, index=False)

    
  
# Example usage: replace the placeholder arguments with the path to your PDF
# folder and the path of the CSV file to create.
# On Windows, prefix each path string with r (a raw string, e.g. r"C:\data\pdfs")
# so that backslashes in the path are not interpreted as escape sequences.
# The guard keeps the example from running when this file is imported as a module.
if __name__ == "__main__":
    extract_pdf_values("folderpath", "corename_canberradata.csv")



