In [13]:
import pandas as pd
import numpy as np
import os

# Directory of raw CyanoLake CSV files; it is handed to process_data() below,
# which lists its entries and reads them with pandas.
# NOTE(review): the name `dir` shadows the Python builtin of the same name.
dir = "../../../data-lake/HT/CyanoLake"

# Lookup table consulted by process_data(): rows are matched on 'Params.Name',
# 'Conv' supplies the multiplicative conversion factor and 'Key Value' the
# name used in output filenames. Loaded relative to the current working directory.
mapping_keys_df = pd.read_csv("mapping_keys.csv")

# (lake name as it appears in the 'name' column, output directory, site prefix
# used in output filenames). A tuple keeps the processing order explicit.
_LAKE_OUTPUTS = (
    ("Arthurs Lake", "../../../data-warehouse/csv/ht/alwq", "ArthursLake"),
    ("Woods Lake", "../../../data-warehouse/csv/ht/wlwq", "WoodsLake"),
)


def _read_cyanolake_file(path):
    """Read one raw CSV and fold its 'date' and 'time' columns into a datetime 'date'."""
    df = pd.read_csv(path, header=0, encoding='utf-8')

    # Parsing with a strict format and formatting back validates and
    # normalises the text before the two parts are joined.
    date_text = pd.to_datetime(df['date'], format='%Y-%m-%d').dt.strftime('%Y-%m-%d')
    time_text = pd.to_datetime(df['time'], format='%H:%M:%S').dt.strftime('%H:%M:%S')

    # The joined text has a known layout, so give the format explicitly
    # instead of relying on inference; missing parts still become NaT.
    df['date'] = pd.to_datetime(
        date_text + " " + time_text,
        format='%Y-%m-%d %H:%M:%S',
        errors='coerce',
    )
    return df.drop("time", axis=1)


def _build_variable_frame(lake_rows, variable, conv_factor):
    """Return the Variable/Date/Depth/Data/QC table for one variable of one lake."""
    frame = pd.DataFrame({
        "Variable": variable,
        "Date": lake_rows['date'],
        "Depth": 0,    # every record is written with depth 0
        "Data": lake_rows[variable],
        "QC": 'N',     # every record is written with QC flag 'N'
    })
    frame = frame.sort_values(by='Date')

    # Replace empty cells with NaN
    frame = frame.replace("", np.nan)

    # Convert value of different units; a factor of 1 leaves the column untouched.
    if conv_factor != 1:
        # Non-numeric values become NaN before scaling.
        frame['Data'] = pd.to_numeric(frame['Data'], errors='coerce') * conv_factor
    return frame


def process_data(dir):
    """Convert every raw CSV in *dir* into per-lake, per-variable CSV files.

    For each regular, non-hidden file in *dir* (processed in sorted name
    order), the 'date' and 'time' columns are merged into one datetime
    column. Then, for each lake in ``_LAKE_OUTPUTS`` and each variable
    column, a table with the columns Variable, Date, Depth, Data, QC is
    printed and written to that lake's output directory. The output
    directory is created if it does not exist.

    Conversion factors ('Conv') and output names ('Key Value') are looked
    up in the module-level ``mapping_keys_df`` by 'Params.Name'.

    Args:
        dir: Path of the directory holding the raw CSV files.

    Raises:
        IndexError: If a variable column has no matching 'Params.Name'
            row in ``mapping_keys_df``.
    """
    # Sorted so the processing order (and therefore which input wins when
    # two files produce the same output filename) is deterministic.
    for file in sorted(os.listdir(dir)):
        path = os.path.join(dir, file)
        # Sub-directories and hidden entries (e.g. notebook checkpoints)
        # are not data files and would make read_csv fail.
        if file.startswith(".") or not os.path.isfile(path):
            continue

        df = _read_cyanolake_file(path)

        # NOTE(review): after 'time' is dropped, the first three remaining
        # columns are treated as non-variable columns — confirm this
        # matches the input layout.
        variables = list(df)[3:]

        for lake, directory, site in _LAKE_OUTPUTS:
            print(lake)
            os.makedirs(directory, exist_ok=True)

            # Rows for this lake are the same for every variable.
            lake_rows = df.loc[df['name'] == lake]

            for variable in variables:
                mapping = mapping_keys_df.loc[mapping_keys_df['Params.Name'] == variable]
                conv_factor = mapping['Conv'].iloc[0]
                name_conv = mapping['Key Value'].iloc[0]

                df_filtered = _build_variable_frame(lake_rows, variable, conv_factor)

                output_filename = f'{site}_CyanoLake_{name_conv.replace(" ","")}_{variable.split("_")[-1]}_profile_Data.csv'
                print(output_filename)
                print(df_filtered)

                # Write the filtered DataFrame to a CSV file in the specified directory
                df_filtered.to_csv(os.path.join(directory, output_filename), index=False)

# Run the conversion for the input directory configured above.
process_data(dir)

Arthurs Lake
ArthursLake_CyanoLake_Chlorophyll-a_med_profile_Data.csv
     Variable                Date  Depth  Data QC
0    chla_med 2023-01-02 11:18:29      0   3.0  N
1    chla_med 2023-01-07 11:18:29      0   2.8  N
2    chla_med 2023-01-09 11:08:31      0   0.0  N
3    chla_med 2023-01-12 11:18:26      0   0.4  N
4    chla_med 2023-01-14 11:08:32      0   0.9  N
..        ...                 ...    ...   ... ..
129  chla_med 2024-07-22 10:08:38      0   0.0  N
130  chla_med 2024-08-04 10:18:32      0   3.3  N
131  chla_med 2024-08-09 10:18:31      0   0.8  N
132  chla_med 2024-08-14 10:18:34      0   0.9  N
133  chla_med 2024-08-19 10:18:30      0   0.3  N

[134 rows x 5 columns]
ArthursLake_CyanoLake_Chlorophyll-a_mean_profile_Data.csv
      Variable                Date  Depth  Data QC
0    chla_mean 2023-01-02 11:18:29      0   3.0  N
1    chla_mean 2023-01-07 11:18:29      0   2.8  N
2    chla_mean 2023-01-09 11:08:31      0   0.1  N
3    chla_mean 2023-01-12 11:18:26      0   