In [8]:
import pandas as pd
import os

def process_group(group_name, dir_name, mapping_keys_file):
    # Create an empty list for content
    df_list = []

    # Iterate through files in the directory
    for file in os.listdir(dir_name):
        # Check for 'Data' and the group name in the file name
        if 'Data' in file and group_name in file:
            df = pd.read_csv(os.path.join(dir_name, file))
            df_list.append(df)

    # Concatenate all dataframes in the list
    final_content = pd.concat(df_list, ignore_index=True)

    df = final_content
    df['Variable'] = group_name

    df = df.sort_values(by='Date')
    print(df)

    df = df.groupby('Date').agg({
        'Variable': 'first',
        'Depth': 'mean',
        'Data': 'sum',
        'QC': 'mean'
    }).reset_index()

    # Reorder columns to match the original order
    df = df[['Variable', 'Date', 'Depth', 'Data', 'QC']]

    print(df)

    # Read the mapping keys (if required for further processing)
    mapping_keys_df = pd.read_csv(mapping_keys_file)

    # Construct the output filename
    output_filename = f'WoodsLakeMiddle_{group_name}_profile_Data.csv'

    # Write the filtered DataFrame to a CSV file in the specified directory
    df.to_csv(os.path.join(dir_name, output_filename), index=False)

# Directory and mapping keys file
dir_name = "../../../data-warehouse/csv/ht/wlwq"
mapping_keys_file = "mapping_keys.csv"

# List of groups to process
groups = [
    "Bacillariophyta",
    "Charophyta",
    "Chlorophyta",
    "Chrysophyta",
    "Cryptophyta",
    "Cyanobacteria",
    "Dinophyta",
    "Euglenophyta",
    "Ochrophyta"
]

# Process each group
for group in groups:
    process_group(group, dir_name, mapping_keys_file)


      Variable                 Date  Depth  Data   QC
48  Ochrophyta  2014-11-21 11:30:00    0.2   1.0  4.0
5   Ochrophyta  2014-11-21 11:30:00    0.2   1.0  4.0
89  Ochrophyta  2015-01-28 11:20:00    0.2   1.0  4.0
6   Ochrophyta  2015-01-28 11:20:00    0.2   1.0  4.0
49  Ochrophyta  2015-02-23 12:39:00    0.2   2.0  4.0
..         ...                  ...    ...   ...  ...
86  Ochrophyta  2023-11-29 11:00:00    0.2  45.0  4.0
87  Ochrophyta  2023-12-07 10:45:00    0.2   4.0  4.0
45  Ochrophyta  2023-12-07 10:45:00    0.2   4.0  4.0
88  Ochrophyta  2023-12-14 12:20:00    0.2  38.0  4.0
46  Ochrophyta  2023-12-14 12:20:00    0.2  38.0  4.0

[98 rows x 5 columns]
      Variable                 Date  Depth     Data   QC
0   Ochrophyta  2014-11-21 11:30:00    0.2      2.0  4.0
1   Ochrophyta  2015-01-28 11:20:00    0.2      2.0  4.0
2   Ochrophyta  2015-02-23 12:39:00    0.2      6.0  4.0
3   Ochrophyta  2015-07-22 10:20:00    0.2    198.0  4.0
4   Ochrophyta  2015-08-13 13:30:00    0.2  

In [1]:
import pandas as pd 
import os 

dir_name = "../../../data-warehouse/csv/ht/wlwq"
  
# specifying an empty list for content 
df_list = [] 

# Iterate through files in the directory
for file in os.listdir(dir_name): 
    # Check for 'Data' and 'Bacillariophyta' in the file name
    if 'Data' in file and 'Bacillariophyta' in file:
        df = pd.read_csv(os.path.join(dir_name, file))
        df_list.append(df) 

# Concatenate all dataframes in the list
final_content = pd.concat(df_list, ignore_index=True)

print(final_content)

                               Variable                 Date  Depth  Data  QC
0         Bacillariophyta (Ulnaria sp.)  2022-12-14 14:55:00    0.2   1.0   4
1         Bacillariophyta (Ulnaria sp.)  2023-08-02 12:20:00    0.2   1.0   4
2         Bacillariophyta (Ulnaria sp.)  2023-08-23 12:00:00    0.2   1.0   4
3         Bacillariophyta (Ulnaria sp.)  2023-09-28 12:30:00    0.2   1.0   4
4      Bacillariophyta (Achnanthes sp.)  2015-01-28 11:20:00    0.2   1.0   4
..                                  ...                  ...    ...   ...  ..
265      Bacillariophyta (Navicula sp.)  2015-01-28 11:20:00    0.2   1.0   4
266      Bacillariophyta (Navicula sp.)  2015-02-23 12:39:00    0.2   1.0   4
267      Bacillariophyta (Navicula sp.)  2015-04-28 13:20:00    0.2   1.0   4
268      Bacillariophyta (Navicula sp.)  2015-05-14 09:56:00    0.2   1.0   4
269  Bacillariophyta (Cymbella hauckii)  2015-05-14 09:56:00    0.2   1.0   4

[270 rows x 5 columns]


In [2]:
df = final_content
df['Variable'] = 'Bacillariophyta'

df = df.sort_values(by='Date')
print(df)

            Variable                 Date  Depth    Data  QC
82   Bacillariophyta  2014-01-08 15:00:00    0.2    10.0   4
5    Bacillariophyta  2014-05-12 13:45:00    0.2     1.0   4
93   Bacillariophyta  2014-05-12 13:45:00    0.2    30.0   4
125  Bacillariophyta  2014-05-12 13:45:00    0.2    70.0   4
126  Bacillariophyta  2014-07-23 09:40:00    0.2  5700.0   4
..               ...                  ...    ...     ...  ..
77   Bacillariophyta  2024-01-25 11:20:00    0.2    76.0   4
262  Bacillariophyta  2024-02-08 15:05:00    0.2  1000.0   4
78   Bacillariophyta  2024-02-08 15:05:00    0.2  1300.0   4
79   Bacillariophyta  2024-02-16 10:20:00    0.2   730.0   4
263  Bacillariophyta  2024-02-16 10:20:00    0.2   950.0   4

[270 rows x 5 columns]


In [3]:
df = df.groupby('Date').agg({
    'Variable': 'first',
    'Depth': 'mean',
    'Data': 'sum',
    'QC': 'mean'
}).reset_index()

# Reorder columns to match the original order
df = df[['Variable', 'Date', 'Depth', 'Data', 'QC']]

print(df)

           Variable                 Date  Depth     Data   QC
0   Bacillariophyta  2014-01-08 15:00:00    0.2     10.0  4.0
1   Bacillariophyta  2014-05-12 13:45:00    0.2    101.0  4.0
2   Bacillariophyta  2014-07-23 09:40:00    0.2   5702.0  4.0
3   Bacillariophyta  2014-09-18 09:50:00    0.2  12000.0  4.0
4   Bacillariophyta  2014-11-21 11:30:00    0.2      2.0  4.0
..              ...                  ...    ...      ...  ...
91  Bacillariophyta  2024-01-08 14:55:00    0.2   2201.0  4.0
92  Bacillariophyta  2024-01-18 15:30:00    0.2   1401.0  4.0
93  Bacillariophyta  2024-01-25 11:20:00    0.2   1376.0  4.0
94  Bacillariophyta  2024-02-08 15:05:00    0.2   2300.0  4.0
95  Bacillariophyta  2024-02-16 10:20:00    0.2   1680.0  4.0

[96 rows x 5 columns]


In [6]:
mapping_keys_df = pd.read_csv("mapping_keys.csv")
# Specify the directory path
directory = '../../../data-warehouse/csv/ht/wlwq'
        
# Construct the output filename
output_filename = f'WoodsLakeMiddle_Bacillariophyta_profile_Data.csv'

# Write the filtered DataFrame to a CSV file in the specified directory
df.to_csv(os.path.join(directory, output_filename), index=False)