In [None]:
import fnmatch
import os
# Rough run-time estimate: count the CSV files in the input directory and
# scale by 1.1 per file (units are whatever the author calibrated 1.1 in).
csv_names = fnmatch.filter(os.listdir('./individual_day_files/'), '*.csv')
length = len(csv_names)
approx_time = 1.1 * length
print(approx_time)

In [None]:
import pandas as pd
import numpy as np
import os
import time
from datetime import datetime, timedelta
import pyproj
from sklearn.neighbors import BallTree

# Decorator function to measure the execution time of any function it wraps
def measure_execution_time(func):
    """Return a wrapper around *func* that prints how long each call took.

    The wrapper forwards all positional and keyword arguments to *func*,
    prints the elapsed time with 8 decimal places, and returns *func*'s
    result unchanged. If *func* raises, the exception propagates and nothing
    is printed.

    Args:
        func: The callable to time.

    Returns:
        A callable with the same signature, name and docstring as *func*.
    """
    # Local import so this block does not depend on a file-level import.
    import functools

    # functools.wraps keeps func.__name__/__doc__ on the wrapper, so
    # introspection and help() still describe the decorated function.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic and high resolution; time.time() can jump
        # if the system clock is adjusted while the function is running.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        execution_time = time.perf_counter() - start_time
        print(f"Function {func.__name__} took {execution_time:.8f} seconds to execute.")
        return result
    return wrapper

@measure_execution_time
def process_data_with_nested_functions(directory_path, po_file_path, utm_zone=18, utm_hemisphere='N', k=6):
    """
    Match PO4 observations to the nearest model point for the same day.

    Pipeline:
      1. Read every ``*.csv`` file in ``directory_path``. File names must look
         like ``<prefix>_<day_of_year>_<year>.csv``; the first two lines of each
         file are a header line and a units line, merged into column names of
         the form ``"<header> (<unit>)"``.
      2. Give every model row a per-file ``site_id`` (its row number as a
         string) and convert ``"X (UTM)"`` / ``"Y (UTM)"`` to ``lon_model`` /
         ``lat_model``.
      3. Build the unique (Latitude, Longitude, MonitoringStation) locations
         from the PO rows whose ``Depth`` is <= 1.0.
      4. For each model file, find the nearest model row to each location
         (haversine distance on a BallTree).
      5. For each PO row, look up the match by ``SampleDate`` and ``Station``
         and attach the corresponding model row.

    Args:
        directory_path: Directory containing the daily model CSV files.
        po_file_path: CSV with at least the columns ``Depth``, ``Latitude``,
            ``Longitude``, ``MonitoringStation``, ``SampleDate`` and ``Station``.
            ``SampleDate`` is compared as-is against the model date formatted
            as ``%m/%d/%Y``.
        utm_zone: UTM zone number of the model coordinates.
        utm_hemisphere: ``'N'`` or ``'S'``. Any value starting with ``S``
            (case-insensitive) selects the southern hemisphere.
        k: The tree is queried for ``k + 1`` neighbours, but only the single
            nearest one is kept in the output.

    Returns:
        pandas.DataFrame with one row per (PO row, matching model row). Each
        row is the PO row, the BallTree match (``cs_*`` / ``os_*`` columns)
        and the model row concatenated. Empty DataFrame when nothing matches.

    Raises:
        ValueError: if no model CSV files are found, a file name does not
            split into three ``_``-separated parts, or a model file lacks the
            ``"X (UTM)"`` / ``"Y (UTM)"`` columns.
    """

    # Function to read and preprocess all CSV files in the given directory
    def fix_headers(directory_path):
        # Sorted so the processing order does not depend on the file system.
        files = sorted(f for f in os.listdir(directory_path) if f.endswith('.csv'))

        dataframes = []
        for file_name in files:
            file_path = os.path.join(directory_path, file_name)

            # File name encodes the date: <prefix>_<day_of_year>_<year>.csv
            base_name = os.path.splitext(file_name)[0]
            _, day_of_year, year = base_name.split('_')
            date = datetime(int(year), 1, 1) + timedelta(days=int(day_of_year) - 1)
            date_str = date.strftime('%m/%d/%Y')

            # First line holds column headers, second line their units.
            with open(file_path, 'r') as handle:
                header_line = handle.readline().strip().split()
                units_line = handle.readline().strip().split()

            merged_header = [f"{header} ({unit})" for header, unit in zip(header_line, units_line)]
            df = pd.read_csv(file_path, delimiter=r'\s+', skiprows=2, names=merged_header)

            # Every row of a file shares the date derived from its name.
            df['date'] = date_str
            dataframes.append(df)

        return dataframes

    # Function to add site IDs and convert UTM coordinates to latitude and longitude
    def add_site_id_and_convert_utm_to_latlon(dataframes, utm_zone=18, utm_hemisphere='N'):
        # PROJ selects the southern hemisphere with the `south` flag; there is
        # no `north` parameter, so only add the flag when it is needed.
        utm_kwargs = dict(proj='utm', zone=utm_zone, ellps='WGS84', datum='WGS84', units='m')
        if str(utm_hemisphere).strip().upper().startswith('S'):
            utm_kwargs['south'] = True
        utm_proj = pyproj.Proj(**utm_kwargs)
        lat_lon_proj = pyproj.Proj(proj='latlong', datum='WGS84')

        # Build the transformer once and reuse it for every file.
        # always_xy=True pins the (x, y) -> (lon, lat) axis order.
        transformer = pyproj.Transformer.from_proj(utm_proj, lat_lon_proj, always_xy=True)

        processed_dataframes = []
        for df in dataframes:
            if "X (UTM)" not in df.columns or "Y (UTM)" not in df.columns:
                raise ValueError("Columns 'X (UTM)' and 'Y (UTM)' not found in DataFrame")

            # Work on a re-indexed copy so positional and label indexing agree.
            df = df.reset_index(drop=True)
            # site_id is the row position within its file, stored as a string.
            df['site_id'] = [str(i) for i in range(len(df))]

            lon_model, lat_model = transformer.transform(df["X (UTM)"].values, df["Y (UTM)"].values)
            df['lon_model'] = lon_model
            df['lat_model'] = lat_model

            processed_dataframes.append(df)

        return processed_dataframes

    # Function to find the nearest model row for each location using BallTree
    def find_k_neighbors(dfs, df2, k=6):
        if not dfs:
            raise ValueError("No model DataFrames to search; check that the directory contains .csv files")

        # Loop-invariant work: the query points and the prefixed station
        # columns are the same for every model DataFrame.
        coords2 = np.deg2rad(np.c_[df2['Latitude'], df2['Longitude']])
        df2_prefixed = df2.add_prefix('os_')

        all_results = []
        for df1 in dfs:
            # Haversine expects (lat, lon) in radians.
            coords1 = np.deg2rad(df1[['lat_model', 'lon_model']].values)
            tree = BallTree(coords1, metric='haversine')
            _, indices = tree.query(coords2, k=k + 1)

            # Column 0 is the nearest model row for each query location.
            closest = indices[:, 0]
            nearest_rows = df1.loc[closest].reset_index(drop=True).add_prefix('cs_')

            # Row i pairs location i (os_*) with its nearest model row (cs_*).
            all_results.append(pd.concat([nearest_rows, df2_prefixed], axis=1))

        return pd.concat(all_results, axis=0).reset_index(drop=True)

    # Function to process the PO4 data by matching dates and site IDs
    def process_po4_data(po4_df, ball_tree_df, model_data_df):
        # Collect one-row frames and concatenate once at the end; growing a
        # DataFrame inside the loop copies all previous rows each time.
        combined_frames = []

        for _, po4_row in po4_df.iterrows():
            sample_date = po4_row['SampleDate']

            matching_dates = ball_tree_df[ball_tree_df['cs_date'] == sample_date]
            if matching_dates.empty:
                continue

            matching_stations = matching_dates[matching_dates['os_MonitoringStation'] == po4_row['Station']]
            if matching_stations.empty:
                continue

            # Only the first station match for that date is used.
            station_match = matching_stations.iloc[0]
            cs_site_id = station_match['cs_site_id']

            matching_model_data = model_data_df[(model_data_df['date'] == sample_date) &
                                                (model_data_df['site_id'] == cs_site_id)]

            for _, model_data_row in matching_model_data.iterrows():
                combined_row = pd.concat([po4_row, station_match, model_data_row], axis=0)
                combined_frames.append(combined_row.to_frame().T)

        if not combined_frames:
            return pd.DataFrame()
        return pd.concat(combined_frames, ignore_index=True)

    # Step 1: Fix headers in the data files and add a date column
    dataframes = fix_headers(directory_path)

    # Step 2: Load the PO file and filter rows where Depth is less than or equal to 1.0
    po = pd.read_csv(po_file_path)
    filtered_po = po[po['Depth'] <= 1.0]

    # Step 3: Add site_id and convert UTM coordinates to latitude/longitude for all DataFrames
    processed_dataframes = add_site_id_and_convert_utm_to_latlon(dataframes, utm_zone, utm_hemisphere)

    # Step 4: Prepare a unique set of locations from filtered PO data.
    # drop_duplicates keeps first-seen order, so results are reproducible
    # between runs (a set of tuples has no stable order).
    locations_CBP = (
        filtered_po[['Latitude', 'Longitude', 'MonitoringStation']]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    # Station labels are kept as strings and coordinates as numbers.
    locations_CBP["MonitoringStation"] = locations_CBP["MonitoringStation"].astype(str)
    locations_CBP["Longitude"] = pd.to_numeric(locations_CBP["Longitude"])
    locations_CBP["Latitude"] = pd.to_numeric(locations_CBP["Latitude"])

    # Step 5: Find the nearest model row for each location in the processed data
    final_result = find_k_neighbors(processed_dataframes, locations_CBP, k=k)

    # Step 6: Match every PO row (all depths) against the BallTree results.
    # `po` is reused as-is; the depth filter above produced a separate frame.
    matching_po4_data = process_po4_data(po, final_result, pd.concat(processed_dataframes))

    return matching_po4_data

In [None]:
# Run the pipeline on the daily model files and the PO4 observations,
# then show the matched rows.
result = process_data_with_nested_functions(
    directory_path='./individual_day_files/',
    po_file_path='./PO4_wq.csv',
)
print(result)
#result.to_csv('new_result.csv')