# Welcome to the 210Pb age model script!

### <div style="text-align: right"> Last modified by A.A. Lehrmann 18 November 2024 </div>


### The script below will extract radioisotope data from Canberra PDFs, run the age model (based on the Wellner Lab Group Excel model; Appleby, 2001; Boldt et al., 2013), and plot the age model.

### Important instructions before you begin:

    1. NEVER edit raw data. Do not delete Canberra PDFs. Do not remove sediment weights from original lab notebook excel sheet.

    2. Make a /CORE_AgeModelOutput/ folder to hold all of your script's outputs

    3. When copying folder paths, make sure to remove quotation marks

    4. Always add the extension .csv to your output files

## First, extract radioisotope data from Canberra PDFs
Run cell (press triangle that says run) and follow instructions.

In [None]:
import os
import pandas as pd
from PyPDF2 import PdfReader

def extract_pdf_values(folder_path, output_csv_path, parse_numbers=False):
    """Extract radioisotope readings from Canberra PDF reports into one CSV.

    Every file in ``folder_path`` whose name ends in ``.pdf`` (any letter
    case) is opened and the text of its third page is scanned line by line.
    On a matching line, the last two whitespace-separated tokens are taken
    as the value and its error.

    * Files whose name starts with ``PtSrc_`` contribute only ``Pb-210`` and
      ``Pb-210 error`` (taken from the first line containing ``Pb-210``).
    * All other files must provide Pb-210, Bi-214 and Pb-214 lines; when an
      isotope appears on several lines the last one wins.

    Files that cannot be processed, have fewer than three pages, or lack a
    required isotope are reported with ``print`` and skipped.

    Rows are sorted by the integer found between the last underscore and the
    following dot of the file name; names without such an integer sort last.

    Args:
        folder_path: Directory containing the Canberra PDF files.
        output_csv_path: Destination path of the CSV file.
        parse_numbers: When true, re-coerce the ``Pb-210``, ``Bi-214`` and
            ``Pb-214`` columns with ``pd.to_numeric(errors='coerce')``.

    Returns:
        None. The result is written to ``output_csv_path``. If nothing could
        be extracted, a CSV containing only the header row is written.
    """
    columns = [
        'File',
        'Pb-210', 'Pb-210 error',
        'Bi-214', 'Bi-214 error',
        'Pb-214', 'Pb-214 error',
    ]
    combined_data = []

    for filename in os.listdir(folder_path):
        if not filename.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(folder_path, filename)

        # Reset every per-file value so nothing leaks from the previous file.
        pb210 = pb210error = Bi214 = Bi214error = Pb214 = Pb214error = None
        ptsrc_pb210 = ptsrc_pb210error = None

        # Best-effort per file: any failure is reported and the file skipped.
        try:
            reader = PdfReader(file_path)
            if len(reader.pages) < 3:
                print(f"PDF file '{filename}' has less than 3 pages. Skipping.")
                continue
            text = reader.pages[2].extract_text() or ""
            lines = text.split('\n')

            if filename.startswith("PtSrc_"):
                # Point-source files: only the first Pb-210 line is used.
                for line in lines:
                    if 'Pb-210' in line:
                        ptsrc_pb210, ptsrc_pb210error = line.split()[-2:]
                        break
                if ptsrc_pb210 is None or ptsrc_pb210error is None:
                    print(f"Pb-210 not found in '{filename}'. Skipping.")
                    continue
                data = {
                    'File': filename,
                    'Pb-210': float(ptsrc_pb210),
                    'Pb-210 error': float(ptsrc_pb210error),
                }
            else:
                # Sample files: scan all lines, later matches overwrite earlier ones.
                for line in lines:
                    if 'Pb-210' in line:
                        pb210, pb210error = line.split()[-2:]
                    elif 'Bi-214' in line:
                        Bi214, Bi214error = line.split()[-2:]
                    elif 'Pb-214' in line:
                        Pb214, Pb214error = line.split()[-2:]

                # All three isotopes are required for a sample row.
                if pb210 is None or pb210error is None:
                    print(f"Pb-210 not found in '{filename}'. Skipping.")
                    continue
                if Bi214 is None or Bi214error is None:
                    print(f"Bi-214 not found in '{filename}'. Skipping.")
                    continue
                if Pb214 is None or Pb214error is None:
                    print(f"Pb-214 not found in '{filename}'. Skipping.")
                    continue

                data = {
                    'File': filename,
                    'Pb-210': float(pb210),
                    'Pb-210 error': float(pb210error),
                    'Bi-214': float(Bi214),
                    'Bi-214 error': float(Bi214error),
                    'Pb-214': float(Pb214),
                    'Pb-214 error': float(Pb214error),
                }

            combined_data.append(data)
        except Exception as e:
            print(f"Error processing PDF file '{filename}': {e}")

    # Nothing extracted: still produce a well-formed (header-only) CSV
    # instead of failing on the missing 'File' column below.
    if not combined_data:
        print(f"No radioisotope data extracted from '{folder_path}'. Writing a header-only CSV.")
        pd.DataFrame(columns=columns).to_csv(output_csv_path, index=False)
        return

    combined_df = pd.DataFrame(combined_data)

    def extract_numeric_suffix(file_name):
        """Return the integer between the last '_' and the next '.', or NaN."""
        try:
            suffix = file_name.split('_')[-1].split('.')[0]
            return int(suffix)
        except ValueError:
            return float('nan')  # non-numeric suffixes sort to the end

    # Sort on a temporary key column, then drop it before saving.
    combined_df['File_order'] = combined_df['File'].apply(extract_numeric_suffix)
    combined_df = combined_df.sort_values(by='File_order')
    combined_df = combined_df.drop(columns=['File_order'])

    # Optional: coerce the isotope value columns to numeric.
    if parse_numbers:
        for col in ['Pb-210', 'Bi-214', 'Pb-214']:
            if col in combined_df.columns:
                combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce')

    combined_df.to_csv(output_csv_path, index=False)

# Prompt for the PDF folder and the destination CSV, then run the extraction
folder_path = input("Enter the folder path of Canberra PDFs: ")
output_csv_path = input("Enter the output CSV file path (e.g. CORE_CanberraData_DATE.csv): ")
extract_pdf_values(folder_path=folder_path, output_csv_path=output_csv_path)


### Make note of which samples are missing data! This will be important when we plot!

# Open the output .csv file

Check to make sure all radioisotope data translated correctly. 

# Create two new columns
- ptsrc_pb210
- ptsrc_pb210 error

# Move Point Source Lead 210 data to ptsrc_pb210 and uncertainty to ptsrc_pb210 error of associated samples

# CHECK the following

Radioisotope data should have the following headings

 ### | File    | Pb-210   | Pb-210 error    | Bi-214  | Bi-214 error   | Pb-214    |Pb-214 error |  ptsrc_pb210    | ptsrc_pb210 error  | 


# Create a new .csv file from lab notebook for the sample weight data

Sample weights should have the following headings: 

### | Core    | Top of interval (cm)   | Center point of interval    |Base of interval (cm)  | sediment weight (g)    | 




Run cell below

In [None]:
import pandas as pd
import numpy as np
import re

# Constants used by the age model.
# NOTE(review): 151031.56 is the divisor applied to the point-source Pb-210
# reading; its origin is not shown in this notebook -- confirm.
PTSRC_REFERENCE_VALUE = 151031.56
# NOTE(review): 0.03114 is presumably the Pb-210 decay constant in 1/yr -- confirm.
PB210_DECAY_CONSTANT = 0.03114


def _center_point_from_filename(file_name):
    """Median of all integers found after the last underscore of a file name."""
    suffix = file_name.split('_')[-1]
    numbers = [int(token) for token in re.findall(r'\d+', suffix)]
    return np.median(numbers)


# Ask for the two input files and the output file
csv1_path = input("Enter the path to the sample weight CSV file (e.g., /path/weights.csv): ")
csv2_path = input("Enter the path to the Canberra data CSV file (e.g., /path/canberra.csv): ")
output_file_name = input("Enter the path for the output CSV file (e.g., CORE_AgeModel_DATE.csv): ")

# Read both tables
csv1 = pd.read_csv(csv1_path)
csv2 = pd.read_csv(csv2_path)

# Derive the merge key for the Canberra table from its file names
csv2['Center point of interval'] = csv2['File'].apply(_center_point_from_filename)

# Keep every sample-weight row; attach the Canberra readings where the key matches
data = csv1.merge(csv2, on='Center point of interval', how='left')

# Coring year, used to turn ages into calendar years
year_of_core = int(input("Enter the year of core (e.g., 2023): "))

# Per-gram activities
sediment_weight = data['sediment weight (g)']
data['Pb-210 activity (Bq/g)'] = data['Pb-210'] / sediment_weight
data['Pb-210 correction factor'] = data['ptsrc_pb210'] / PTSRC_REFERENCE_VALUE
data['Self absorb. Corrected Pb-210 activity (Bq/g)'] = (
    data['Pb-210 activity (Bq/g)'] / data['Pb-210 correction factor']
)
data['Bi-214 activity (Bq/g)'] = data['Bi-214'] / sediment_weight
data['Pb-214 activity (Bq/g)'] = data['Pb-214'] / sediment_weight

# Supported activity: mean of the Bi-214 and Pb-214 activities
supported_sum = data['Bi-214 activity (Bq/g)'] + data['Pb-214 activity (Bq/g)']
data['Averaged supported activity of Bi-214 and Pb-214 (Bq/g)'] = supported_sum / 2

# Background uncertainty: mean of the two errors, per gram of sediment
mean_background_error = (data['Bi-214 error'] + data['Pb-214 error']) / 2
data['Background activity uncertainty (Bq/g)'] = mean_background_error / sediment_weight

# Excess Pb-210 = corrected Pb-210 activity minus supported activity
data['Excess Pb-210 (Bq/g)'] = (
    data['Self absorb. Corrected Pb-210 activity (Bq/g)']
    - data['Averaged supported activity of Bi-214 and Pb-214 (Bq/g)']
)

# Surface activity is the excess Pb-210 of the first row, repeated on every row
data['Surface activity'] = data['Excess Pb-210 (Bq/g)'].iloc[0]

# Age from the log ratio of surface activity to excess activity
activity_ratio = data['Surface activity'] / data['Excess Pb-210 (Bq/g)']
data['Age bp'] = (1 / PB210_DECAY_CONSTANT) * np.log(activity_ratio)

# Convert the age into a calendar year relative to the coring year
data['calendar years pre year of core'] = year_of_core - data['Age bp']

# Write the result table to the user-chosen file
data.to_csv(output_file_name, index=False)

print(f"Calculations completed, data exported to '{output_file_name}'")


# Check the output data. Make sure the data isn't *fishy*
Look at the column labeled 'Age bp'. Are the ages within the realm of possibility? If not, ask Asmara for help!

# Now plot it!
Run cell below 

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np
import pandas as pd  # Assuming you are using pandas for data handling

# Plot the results of the age-model cell: this cell reads the `data` table
# created there and writes one PDF per figure.
import os  # needed only to build the output paths below


def _parse_depths(raw):
    """Turn a comma-separated string of depths into floats, ignoring blank entries."""
    tokens = (part.strip() for part in raw.split(","))
    return [float(token) for token in tokens if token]


# Get the core name from user input
core_name = input("Enter the core name for the title: ")

# Ask user for depths to label "calendar years pre year of core"
depths_to_label_input = input("Enter the depths (comma-separated) where 'calendar years pre year of core' should be labeled (or type 'all' to label all intervals): ")

if depths_to_label_input.strip().lower() == 'all':
    # Label every depth interval
    depths_to_label = data['Center point of interval'].tolist()
else:
    depths_to_label = _parse_depths(depths_to_label_input)

# Ask user if there are intervals with undetectable radioisotopes
missing_data_input = input("Are there any intervals with undetectable amounts of radioisotopes? (yes/no): ").strip().lower()

if missing_data_input in ('yes', 'y'):
    missing_depths_input = input("Enter the depths (comma-separated) with undetectable radioisotopes: ")
    missing_depths = _parse_depths(missing_depths_input)
else:
    missing_depths = []

# Define colors for the series and lighter shades for error bars
excess_pb210_color = 'black'
excess_pb210_error_color = mcolors.to_rgba(excess_pb210_color, alpha=0.3)
supported_activity_color = 'grey'
supported_activity_error_color = mcolors.to_rgba(supported_activity_color, alpha=0.3)

# Ask user where to save the plot PDFs. Each figure gets its own file so the
# second savefig no longer overwrites the first.
save_location = input("Enter the full path where you want to save the plot PDF (e.g., /path/to/your/directory/): ")
plot_filename = f"{core_name}_Age_Model.pdf"
save_path = os.path.join(save_location, plot_filename)
activity_plot_filename = f"{core_name}_Uncorrected_Activity.pdf"
activity_save_path = os.path.join(save_location, activity_plot_filename)

# Plot 1: uncorrected activity
# NOTE(review): the title says "210 Pb" but the plotted series is the Pb-214
# activity -- confirm which column is intended.
# NOTE(review): xerr uses the raw 'Pb-214 error' column while x is per gram;
# confirm whether the error should also be divided by the sediment weight.
plt.figure(figsize=(3, 5))
plt.errorbar(data['Pb-214 activity (Bq/g)'], data['Center point of interval'],
             xerr=data['Pb-214 error'], fmt='-', color=excess_pb210_color, label='Pb-214 activity (Bq/unit)',
             capsize=5, linewidth=1, ecolor=excess_pb210_error_color)

# Highlight missing depths with brown spans; only the first span gets a legend entry
for index, y in enumerate(missing_depths):
    plt.axhspan(y - 0.5, y + 0.5, alpha=0.5, color='brown',
                label='Undetectable radioisotope' if index == 0 else None)

plt.title(f"{core_name} 210 Pb Uncorrected Activity", fontsize=18)
plt.xlabel("Bq/g", fontsize=14)
plt.ylabel("Depth (cm)", fontsize=14)
plt.gca().invert_yaxis()  # Invert y-axis to show depth from surface
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=1)  # Legend below plot
plt.tight_layout()
plt.savefig(activity_save_path, format='pdf', bbox_inches='tight')
plt.show()

# Plot 2: excess Pb-210 and background activity with error bars
plt.figure(figsize=(5, 10))

# Series 1: Excess Pb-210 (Bq/g)
# NOTE(review): xerr uses the raw 'Pb-210 error' column while x is a corrected
# per-gram activity -- confirm the intended uncertainty.
plt.errorbar(data['Excess Pb-210 (Bq/g)'], data['Center point of interval'],
             xerr=data['Pb-210 error'], fmt='-', color=excess_pb210_color, label='Excess Pb-210',
             capsize=5, linewidth=1, ecolor=excess_pb210_error_color)

# Series 2: Averaged supported activity of Bi-214 and Pb-214 (Bq/g)
plt.errorbar(data['Averaged supported activity of Bi-214 and Pb-214 (Bq/g)'], data['Center point of interval'],
             xerr=data['Background activity uncertainty (Bq/g)'], fmt='-', color=supported_activity_color,
             label='Background Activity', capsize=5, linewidth=1,
             ecolor=supported_activity_error_color)

# Highlight missing depths with brown spans; only the first span gets a legend entry
for index, y in enumerate(missing_depths):
    plt.axhspan(y - 0.5, y + 0.5, alpha=0.5, color='brown',
                label='Undetectable radioisotope' if index == 0 else None)

# Label the selected depths with the "calendar years pre year of core" value
for i, depth in enumerate(data['Center point of interval']):
    if depth in depths_to_label:
        year_value = data['calendar years pre year of core'].iloc[i]

        # Rows without a computed year (NaN) get no label
        if not pd.isna(year_value):
            year = int(year_value)
            # Shift the label 0.05 to the right of the data point
            plt.text(data['Excess Pb-210 (Bq/g)'].iloc[i] + 0.05, depth,
                     f'{year}', fontsize=14, color='black', verticalalignment='center')

plt.title(f"{core_name} Age Model", fontsize=18)
plt.xlabel("Bq/unit", fontsize=14)
plt.ylabel("Depth (cm)", fontsize=14)
plt.gca().invert_yaxis()  # Invert y-axis to show depth from surface
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=1)  # Legend below plot
plt.tight_layout()
plt.savefig(save_path, format='pdf', bbox_inches='tight')
plt.show()


# Well done!

#### When you've finished, go to Cell > All Output > Clear to be ready for the next user of this script.