In [11]:
import pandas as pd
import re
import os
import numpy as np

# Input CSV files to convert; each entry is passed to process_data() by the
# loop at the bottom of this cell.
filepaths = [
    "../../data-lake/HT/Meteorology/Rainfall Woods Lake At Dam.csv"
]

# Mapping table, read relative to the current working directory. The code in
# this cell uses its 'Params.Name', 'Key Value' and 'Conv' columns.
mapping_keys_df = pd.read_csv("mapping_keys_wl.csv")

def process_data(filepath):
    """Load a single-column rainfall export and reshape it to the common layout.

    Only the first CSV column is read and the first row of the file is
    discarded. Each remaining cell is split on runs of whitespace: token 0 and
    token 2 are joined into the timestamp and token 3 is taken as the reading.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date`` (datetime, ``NaT`` where the text does not match
        ``%d/%m/%Y %H:%M:%S``), ``Data`` (numeric, ``NaN`` where not numeric),
        ``Depth`` (always 0), ``QC`` (always "N") and ``Variable`` (always
        "Rainfall"), sorted by ``Date``.
    """
    raw = pd.read_csv(filepath, usecols=[0], header=None, encoding='unicode_escape')

    # Drop the first row and renumber so the result index starts at 0.
    lines = raw.iloc[1:, 0].reset_index(drop=True)

    # Split every line on runs of whitespace into positional tokens.
    tokens = lines.str.split(r'\s+', expand=True)

    # Build a brand-new frame rather than assigning into a column subset of an
    # existing one; that pattern raises SettingWithCopyWarning on pandas < 3.
    # The timestamp is parsed in one vectorised call instead of per element.
    df = pd.DataFrame({
        'Date': pd.to_datetime(
            tokens[0] + ' ' + tokens[2],
            format='%d/%m/%Y %H:%M:%S',
            errors='coerce',
        ),
        'Data': pd.to_numeric(tokens[3], errors='coerce'),
    })

    # Fixed values for this source.
    df['Depth'] = 0
    df['QC'] = "N"

    df = df.sort_values(by='Date')

    # This cell handles a single variable, so the name is hard-coded.
    df['Variable'] = "Rainfall"

    return df

def filter_and_save_data(df, variable_name, output_filename,
                         directory='../../data-warehouse/csv/ht/met',
                         mapping_keys=None):
    """Write the rows of one variable to a CSV file, applying its unit factor.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Variable, Date, Depth, Data and QC.
    variable_name : str
        Value of the ``Variable`` column to keep; also looked up in the
        ``Params.Name`` column of the mapping table.
    output_filename : str
        File name (without directory) of the CSV to write.
    directory : str, optional
        Output directory; created if missing. Defaults to the location this
        notebook has always written to.
    mapping_keys : pandas.DataFrame, optional
        Table with ``Params.Name`` and ``Conv`` columns. When omitted, the
        notebook-level ``mapping_keys_df`` is used.

    Returns
    -------
    None
    """
    if mapping_keys is None:
        # Fall back to the table loaded at the top of the cell.
        mapping_keys = mapping_keys_df

    wanted_columns = ["Variable", "Date", "Depth", "Data", "QC"]

    # Select rows and columns in one .loc call, then blank -> NaN without
    # inplace=True so the result is an independent frame (no chained-assignment
    # warning when 'Data' is overwritten below).
    filtered_data = (
        df.loc[df['Variable'] == variable_name, wanted_columns]
        .replace("", np.nan)
    )

    # Apply the conversion factor of the first matching mapping row, if any.
    factors = mapping_keys.loc[mapping_keys['Params.Name'] == variable_name, 'Conv']
    if not factors.empty:
        numeric = pd.to_numeric(filtered_data['Data'], errors='coerce')  # non-numeric -> NaN
        filtered_data['Data'] = numeric * factors.iloc[0]

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(directory, exist_ok=True)

    filtered_data.to_csv(os.path.join(directory, output_filename), index=False)

# First column of the mapping table, kept available for inspection.
first_column = mapping_keys_df.iloc[:, 0]

# Convert every input file and write it to the warehouse directory.
for filepath in filepaths:
    # Parse the raw file into the common column layout.
    df = process_data(filepath)

    # Replace empty cells with NaN
    df = df.replace("", np.nan)

    variable = df['Variable'].values[0]

    # Find the mapping row for this variable.
    key_row = mapping_keys_df[mapping_keys_df['Params.Name'] == variable]
    if key_row.empty:
        # Previously this fell through with key_value = None and crashed on
        # None.replace(...); fail with a message that names the problem.
        raise ValueError(
            f"No row with Params.Name == {variable!r} in the mapping keys; "
            f"cannot build an output filename for {filepath!r}"
        )
    key_value = key_row['Key Value'].values[0]

    # Output name uses the key value with spaces removed.
    output_filename = f'WoodsLakeAtDam_{key_value.replace(" ", "")}_profile_Data.csv'
    print(output_filename)

    # Filter and save data
    filter_and_save_data(df, variable, output_filename)


WoodsLakeAtDam_Precipitation_profile_Data.csv


In [17]:
import pandas as pd
import re
import os
import numpy as np

# Input CSV files to convert; each entry is passed to process_data() by the
# loop at the bottom of this cell, which also derives the variable name from
# the file name.
filepaths = [
    "../../data-lake/BOM/HT/Hourly aggregate/AirTemp.csv",
    "../../data-lake/BOM/HT/Hourly aggregate/Humidity.csv",
    "../../data-lake/BOM/HT/Hourly aggregate/Rainfall.csv",
    "../../data-lake/BOM/HT/Hourly aggregate/WD.csv",
    "../../data-lake/BOM/HT/Hourly aggregate/WS.csv"
]

# Mapping table, read relative to the current working directory. The code in
# this cell uses its 'Params.Name', 'Key Value' and 'Conv' columns.
mapping_keys_df = pd.read_csv("mapping_keys_bom.csv")

def process_data(filepath):
    """Load a headerless hourly CSV and reshape it to the common layout.

    Column 0 and column 2 of the file are joined into the timestamp and
    column 3 is taken as the reading. The variable name is the file name
    without its ``.csv`` extension.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read. Both ``/`` and ``\\`` are accepted as
        directory separators, and a bare file name works too.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date`` (datetime, ``NaT`` where the text does not match
        ``%d/%m/%Y %H:%M:%S``), ``Data`` (numeric, ``NaN`` where not numeric),
        ``Depth`` (always 0), ``QC`` (always "N") and ``Variable`` (file stem,
        or "Unknown" if the path does not end in ``.csv``), sorted by ``Date``.
    """
    raw = pd.read_csv(filepath, header=None, encoding='unicode_escape')

    # Build a brand-new frame rather than assigning into a column subset of an
    # existing one; that pattern raises SettingWithCopyWarning on pandas < 3.
    # The timestamp is parsed in one vectorised call instead of per element.
    df = pd.DataFrame({
        'Date': pd.to_datetime(
            raw[0] + ' ' + raw[2],
            format='%d/%m/%Y %H:%M:%S',
            errors='coerce',
        ),
        'Data': pd.to_numeric(raw[3], errors='coerce'),
    })

    # Fixed values for this source.
    df['Depth'] = 0
    df['QC'] = "N"

    df = df.sort_values(by='Date')

    # Take the file stem as the variable name. The pattern no longer requires
    # a leading '/', so 'AirTemp.csv' and 'dir\\AirTemp.csv' resolve as well.
    variable_match = re.search(r'([^/\\]+)\.csv$', filepath)
    df['Variable'] = variable_match.group(1) if variable_match else 'Unknown'

    return df

def filter_and_save_data(df, variable_name, output_filename,
                         directory='../../data-warehouse/csv/ht/met',
                         mapping_keys=None):
    """Write the rows of one variable to a CSV file, applying its unit factor.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Variable, Date, Depth, Data and QC.
    variable_name : str
        Value of the ``Variable`` column to keep; also looked up in the
        ``Params.Name`` column of the mapping table.
    output_filename : str
        File name (without directory) of the CSV to write.
    directory : str, optional
        Output directory; created if missing. Defaults to the location this
        notebook has always written to.
    mapping_keys : pandas.DataFrame, optional
        Table with ``Params.Name`` and ``Conv`` columns. When omitted, the
        notebook-level ``mapping_keys_df`` is used.

    Returns
    -------
    None
    """
    if mapping_keys is None:
        # Fall back to the table loaded at the top of the cell.
        mapping_keys = mapping_keys_df

    wanted_columns = ["Variable", "Date", "Depth", "Data", "QC"]

    # Select rows and columns in one .loc call, then blank -> NaN without
    # inplace=True so the result is an independent frame (no chained-assignment
    # warning when 'Data' is overwritten below).
    filtered_data = (
        df.loc[df['Variable'] == variable_name, wanted_columns]
        .replace("", np.nan)
    )

    # Apply the conversion factor of the first matching mapping row, if any.
    factors = mapping_keys.loc[mapping_keys['Params.Name'] == variable_name, 'Conv']
    if not factors.empty:
        numeric = pd.to_numeric(filtered_data['Data'], errors='coerce')  # non-numeric -> NaN
        filtered_data['Data'] = numeric * factors.iloc[0]

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(directory, exist_ok=True)

    filtered_data.to_csv(os.path.join(directory, output_filename), index=False)

# First column of the mapping table, kept available for inspection.
first_column = mapping_keys_df.iloc[:, 0]

# Convert every input file and write it to the warehouse directory.
for filepath in filepaths:
    # Parse the raw file into the common column layout.
    df = process_data(filepath)

    # Replace empty cells with NaN
    df = df.replace("", np.nan)

    variable = df['Variable'].values[0]

    # Find the mapping row for this variable.
    key_row = mapping_keys_df[mapping_keys_df['Params.Name'] == variable]
    if key_row.empty:
        # Previously this fell through with key_value = None and crashed on
        # None.replace(...); fail with a message that names the problem.
        raise ValueError(
            f"No row with Params.Name == {variable!r} in the mapping keys; "
            f"cannot build an output filename for {filepath!r}"
        )
    key_value = key_row['Key Value'].values[0]

    # Output name uses the key value with spaces removed.
    output_filename = f'BOM_{key_value.replace(" ", "")}_profile_Data.csv'
    print(output_filename)

    # Filter and save data
    filter_and_save_data(df, variable, output_filename)


BOM_AirTemperature_profile_Data.csv
BOM_RelativeHumidity_profile_Data.csv
BOM_Precipitation_profile_Data.csv
BOM_WindDirection_profile_Data.csv
BOM_WindSpeed_profile_Data.csv


In [44]:
import pandas as pd
import re
import os
import numpy as np

# Input CSV files to convert; each entry is passed to process_data() by the
# loop at the bottom of this cell.
filepaths = [
    "../../data-lake/BOM/IDC/IDCJAC0009_096033_1800_Data.csv"
]

# Mapping table, read relative to the current working directory. The code in
# this cell uses its 'Params.Name', 'Key Value' and 'Conv' columns.
mapping_keys_df = pd.read_csv("mapping_keys_bomidc.csv")

def process_data(filepath):
    """Load a daily CSV (with header row) and reshape it to the common layout.

    Columns are addressed by position on the file as read: positions 4, 3 and
    2 are joined as day/month/year, position 5 is the reading and position 7
    is the quality flag.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date`` (string formatted ``%Y-%m-%d %H:%M:%S``), ``Data``
        (numeric, ``NaN`` where not numeric), ``QC`` (copied from the file),
        ``Depth`` (always 0) and ``Variable`` (always "IDCJAC0009"), sorted
        by ``Date``.

    Raises
    ------
    ValueError
        If a day/month/year combination does not parse as ``%d/%m/%Y``.
    IndexError
        If the file has fewer than eight columns.
    """
    raw = pd.read_csv(filepath, encoding='unicode_escape')

    # All positional lookups are done on the untouched input frame, so columns
    # appended later cannot shift what a position refers to.
    day_month_year = (
        raw.iloc[:, 4].astype(str) + '/'
        + raw.iloc[:, 3].astype(str) + '/'
        + raw.iloc[:, 2].astype(str)
    )
    # Strict parse (no errors='coerce'): a malformed date raises.
    timestamps = pd.to_datetime(day_month_year, format='%d/%m/%Y')

    # Build a brand-new frame rather than assigning into a column subset of an
    # existing one; that pattern raises SettingWithCopyWarning on pandas < 3.
    df = pd.DataFrame({
        # Stored as text; the time part is always 00:00:00 for these dates.
        'Date': timestamps.dt.strftime('%Y-%m-%d %H:%M:%S'),
        'Data': pd.to_numeric(raw.iloc[:, 5], errors='coerce'),
        'QC': raw.iloc[:, 7],
    })

    # Fixed value for this source.
    df['Depth'] = 0

    # Sorts the formatted strings; year-first zero-padded text keeps this in
    # date order.
    df = df.sort_values(by='Date')
    print(df.head())

    # This cell handles a single product, so the name is hard-coded.
    df['Variable'] = "IDCJAC0009"

    return df

def filter_and_save_data(df, variable_name, output_filename,
                         directory='../../data-warehouse/csv/ht/met',
                         mapping_keys=None):
    """Write the rows of one variable to a CSV file, applying its unit factor.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns Variable, Date, Depth, Data and QC.
    variable_name : str
        Value of the ``Variable`` column to keep; also looked up in the
        ``Params.Name`` column of the mapping table.
    output_filename : str
        File name (without directory) of the CSV to write.
    directory : str, optional
        Output directory; created if missing. Defaults to the location this
        notebook has always written to.
    mapping_keys : pandas.DataFrame, optional
        Table with ``Params.Name`` and ``Conv`` columns. When omitted, the
        notebook-level ``mapping_keys_df`` is used.

    Returns
    -------
    None
    """
    if mapping_keys is None:
        # Fall back to the table loaded at the top of the cell.
        mapping_keys = mapping_keys_df

    wanted_columns = ["Variable", "Date", "Depth", "Data", "QC"]

    # Select rows and columns in one .loc call, then blank -> NaN without
    # inplace=True so the result is an independent frame (no chained-assignment
    # warning when 'Data' is overwritten below).
    filtered_data = (
        df.loc[df['Variable'] == variable_name, wanted_columns]
        .replace("", np.nan)
    )

    # Apply the conversion factor of the first matching mapping row, if any.
    factors = mapping_keys.loc[mapping_keys['Params.Name'] == variable_name, 'Conv']
    if not factors.empty:
        numeric = pd.to_numeric(filtered_data['Data'], errors='coerce')  # non-numeric -> NaN
        filtered_data['Data'] = numeric * factors.iloc[0]

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(directory, exist_ok=True)

    filtered_data.to_csv(os.path.join(directory, output_filename), index=False)

# First column of the mapping table, kept available for inspection.
first_column = mapping_keys_df.iloc[:, 0]

# Convert every input file and write it to the warehouse directory.
for filepath in filepaths:
    # Parse the raw file into the common column layout.
    df = process_data(filepath)

    # Replace empty cells with NaN
    df = df.replace("", np.nan)

    variable = df['Variable'].values[0]

    # Find the mapping row for this variable.
    key_row = mapping_keys_df[mapping_keys_df['Params.Name'] == variable]
    if key_row.empty:
        # Previously this fell through with key_value = None and crashed on
        # None.replace(...); fail with a message that names the problem.
        raise ValueError(
            f"No row with Params.Name == {variable!r} in the mapping keys; "
            f"cannot build an output filename for {filepath!r}"
        )
    key_value = key_row['Key Value'].values[0]

    # Output name uses the key value with spaces removed.
    output_filename = f'BOMIDC_{key_value.replace(" ", "")}_profile_Data.csv'
    print(output_filename)

    # Filter and save data
    filter_and_save_data(df, variable, output_filename)


                  Date  Data   QC  Depth
0  2000-01-01 00:00:00   NaN  NaN      0
1  2000-01-02 00:00:00   NaN  NaN      0
2  2000-01-03 00:00:00   NaN  NaN      0
3  2000-01-04 00:00:00   NaN  NaN      0
4  2000-01-05 00:00:00   NaN  NaN      0
BOMIDC_Precipitation_profile_Data.csv
