In [3]:
import pandas as pd
import re
import os
import numpy as np

# Input files: one continuous-sensor CSV per measured parameter, all exported
# into the same data-lake folder and sharing the same file-name pattern.
_SOURCE_DIR = "../../../data-lake/HT/WaterQuality/WoodsLake_Data Request_20240724"
_PARAMETERS = (
    "chlorophylla",
    "conductivity",
    "dissolvedoxygen(%)",
    "dissolvedoxygen(mgperL)",
    "pH",
    "phycocyanin",
    "salinity",
    "temperature",
    "turbidity",
)
files = [
    f"{_SOURCE_DIR}/462.5_WoodsLakeMiddleSurface_{parameter}_continuous.csv"
    for parameter in _PARAMETERS
]

# Load the mapping keys.
# Later code in this file reads the 'Params.Name', 'Conv' and 'Key Value'
# columns of this table. The path is relative, so it is resolved against the
# current working directory.
mapping_keys_df = pd.read_csv("mapping_keys.csv")

def process_data(file):
    """Parse one continuous-sensor export into a date-sorted DataFrame.

    Only the first CSV column of ``file`` is read (no header row is assumed).
    Each cell is split on runs of whitespace: token 0 is taken as the date,
    token 2 as the time and token 3 as the measured value.

    Parameters
    ----------
    file : str
        Path to the export. The variable name is taken from the part of the
        path matching ``_<name>_continuous.csv`` at its end.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date`` (datetime64, parsed as ``%d/%m/%Y %H:%M:%S``),
        ``Data`` (numeric), ``Depth`` (always 0), ``QC`` (always ``"N"``) and
        ``Variable`` (the extracted name, or ``'Unknown'``). Rows are sorted
        by ``Date`` and keep their original row index. Dates or values that
        cannot be parsed become ``NaT`` / ``NaN`` instead of raising.
    """
    raw = pd.read_csv(file, header=None, usecols=[0], encoding='unicode_escape')

    # Split the single raw text column into whitespace-delimited tokens.
    tokens = raw.iloc[:, 0].str.split(r'\s+', expand=True)

    # Build a brand-new frame rather than assigning onto a column slice of the
    # raw one (which triggers pandas' chained-assignment warning), and parse
    # the whole column in one vectorised call instead of once per row.
    df = pd.DataFrame({
        'Date': pd.to_datetime(
            tokens[0] + ' ' + tokens[2],
            format='%d/%m/%Y %H:%M:%S',
            errors='coerce',
        ),
        'Data': pd.to_numeric(tokens[3], errors='coerce'),
        'Depth': 0,
        'QC': 'N',
    })
    df = df.sort_values(by='Date')

    # Extract the variable name from the file name.
    variable_match = re.search(r'_([^_]+)_continuous\.csv$', file)
    df['Variable'] = variable_match.group(1) if variable_match else 'Unknown'

    return df

def filter_and_save_data(df, variable_name, output_filename,
                         directory='../../../data-warehouse/csv/ht/wlwq',
                         mapping=None):
    """Write the rows of ``df`` for one variable to a CSV file.

    Rows whose ``Variable`` equals ``variable_name`` are selected, empty-string
    cells are replaced by NaN and, if the mapping table lists the variable
    under ``Params.Name``, ``Data`` is coerced to numeric and multiplied by the
    first matching ``Conv`` factor. The result is written without the index,
    with columns ``Variable, Date, Depth, Data, QC``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Variable``, ``Date``, ``Depth``, ``Data``
        and ``QC``. It is not modified.
    variable_name : str
        Value of ``Variable`` to keep.
    output_filename : str
        File name (not a full path) of the CSV to write inside ``directory``.
    directory : str, optional
        Output folder; created (including parents) if missing. Defaults to the
        location the original script always wrote to.
    mapping : pandas.DataFrame, optional
        Table with ``Params.Name`` and ``Conv`` columns. Defaults to the
        module-level ``mapping_keys_df``.

    Returns
    -------
    None
    """
    if mapping is None:
        mapping = mapping_keys_df

    # Take an explicit copy so the edits below act on an independent frame and
    # not on a slice of the caller's DataFrame.
    columns = ["Variable", "Date", "Depth", "Data", "QC"]
    filtered_data = df.loc[df['Variable'] == variable_name, columns].copy()

    # Replace empty cells with NaN
    filtered_data = filtered_data.replace("", np.nan)

    # Convert value of different units
    conv_factors = mapping.loc[mapping['Params.Name'] == variable_name, 'Conv']
    if not conv_factors.empty:
        # Non-numeric values become NaN before scaling.
        numeric = pd.to_numeric(filtered_data['Data'], errors='coerce')
        filtered_data['Data'] = numeric * conv_factors.iloc[0]

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)

    # Write the filtered DataFrame to a CSV file in the specified directory
    filtered_data.to_csv(os.path.join(directory, output_filename), index=False)

# First column of the mapping table. Kept as a module-level name; the loop
# below does not use it.
first_column = mapping_keys_df.iloc[:, 0]

# Convert every input file and write it out under the standardised key name
# that the mapping table assigns to its variable.
for file in files:
    # Process the data for the current file
    df = process_data(file)

    # Replace empty cells with NaN
    df.replace("", np.nan, inplace=True)

    variable = df['Variable'].values[0]
    # Filter mapping_keys_df to find the row corresponding to the variable
    key_row = mapping_keys_df[mapping_keys_df['Params.Name'] == variable]

    # Without a usable "Key Value" there is no output name to build. Skip the
    # file with a message instead of failing on None.replace(...) and aborting
    # the remaining files.
    if key_row.empty or pd.isna(key_row['Key Value'].iloc[0]):
        print(f'Skipping {file}: no "Key Value" mapping for variable {variable!r}')
        continue

    # str() guards against a non-text cell in the mapping table.
    key_value = str(key_row['Key Value'].iloc[0])

    # Construct the output filename
    output_filename = f'WoodsLakeMiddleSurfaceContinuous_{key_value.replace(" ", "")}_profile_Data.csv'
    print(output_filename)

    # Filter and save data
    filter_and_save_data(df, variable, output_filename)


WoodsLakeMiddleSurfaceContinuous_Chlorophyll-a_profile_Data.csv
WoodsLakeMiddleSurfaceContinuous_SpecificConductivity_profile_Data.csv
WoodsLakeMiddleSurfaceContinuous_O2Saturation_profile_Data.csv
WoodsLakeMiddleSurfaceContinuous_DissolvedOxygen_profile_Data.csv
WoodsLakeMiddleSurfaceContinuous_pH_profile_Data.csv
WoodsLakeMiddleSurfaceContinuous_Phycocyanin_profile_Data.csv
WoodsLakeMiddleSurfaceContinuous_Salinity_profile_Data.csv
WoodsLakeMiddleSurfaceContinuous_Temperature_profile_Data.csv
WoodsLakeMiddleSurfaceContinuous_Turbidity_profile_Data.csv
