In [1]:
import os
import pandas as pd
import yaml

In [2]:
# Load the configuration from the YAML file
def load_config(config_file):
    """Parse the YAML file at ``config_file`` and return the loaded object.

    Parameters:
        config_file (str): Path of the YAML configuration file.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.
    """
    with open(config_file, 'r') as stream:
        return yaml.safe_load(stream)

In [3]:
def load_csvs(main_folder):
    """Walk ``main_folder`` recursively and lazily load every ``.csv`` file.

    Each visited directory is printed before its files are examined.

    Parameters:
        main_folder (str): Root directory of the search.

    Yields:
        tuple: ``(DataFrame, path)`` for each file whose name ends in '.csv'.
    """
    for folder, _subfolders, names in os.walk(main_folder):
        print(folder)
        csv_names = (name for name in names if name.endswith('.csv'))
        for name in csv_names:
            csv_path = os.path.join(folder, name)
            # The path is handed back next to the data so callers can tell
            # which file a frame came from.
            yield pd.read_csv(csv_path), csv_path

In [4]:
def process_files(base_dir):
    """Yield ``(DataFrame, path)`` for every CSV file found under ``base_dir``.

    The directory tree is walked recursively and each matching file is read
    with ``pd.read_csv`` using default options.

    Parameters:
        base_dir (str): Root directory of the search.

    Yields:
        tuple: The loaded DataFrame and the full path of the file it came from.
    """
    for root, _dirs, files in os.walk(base_dir):
        for file in files:
            # Require the real '.csv' extension: a bare 'csv' suffix would
            # also accept names such as 'backupcsv' that are not CSV files.
            if not file.endswith('.csv'):
                continue
            file_path = os.path.join(root, file)
            yield pd.read_csv(file_path), file_path

In [5]:
def process_unchanged_files(config, main_folder, output_path):
    """Copy the CSV files listed under ``config['no_change_parameters']``.

    Every CSV found under ``main_folder`` (via ``process_files``) whose file
    name without extension appears in that list is written to
    ``<output_path>/<name>.csv``. Files not in the list are ignored.

    Parameters:
        config (dict): Loaded configuration; must contain the key
            'no_change_parameters'.
        main_folder (str): Root directory that is searched for CSV files.
        output_path (str): Directory receiving the copies; created if missing.

    Raises:
        KeyError: If 'no_change_parameters' is missing from ``config``.
    """
    unchanged_files = config['no_change_parameters']  # file names without extension
    os.makedirs(output_path, exist_ok=True)

    for df, file_path in process_files(main_folder):
        param_name = os.path.splitext(os.path.basename(file_path))[0]
        if param_name not in unchanged_files:
            continue
        # index=False: the frame only carries the positional index created by
        # read_csv; writing it would prepend an extra unnamed column, so the
        # "unchanged" file would no longer have the same columns as its source.
        df.to_csv(os.path.join(output_path, f'{param_name}.csv'), index=False)
            

In [54]:


def aggregate_data(df, method, index_count, regions_to_combine, new_region, output_csv="aggregated_data.csv"):
    """
    Append rows that aggregate several regions into one new region.

    After the frame's index has been moved back into columns, its first
    ``index_count`` columns are treated as identifier columns; they must
    include 'Region'. Rows whose region is in ``regions_to_combine`` are
    grouped by the remaining identifier columns and their numeric columns are
    aggregated with ``method``. One row per group, labelled ``new_region``,
    is appended. The original rows, including those of the combined regions,
    are kept in the result.

    Parameters:
        df (pd.DataFrame): The input DataFrame; it is not modified.
        method (str): Aggregation method, 'sum' or 'mean'.
        index_count (int): Number of leading columns that identify a row.
        regions_to_combine (list): Region names to combine.
        new_region (str): Name written to 'Region' for the aggregated rows.
        output_csv (str): Unused; kept so existing calls keep working. This
            function does not write any file.

    Returns:
        pd.DataFrame: A flat frame (identifier columns first) holding the
        original rows followed by the aggregated rows.

    Raises:
        ValueError: If ``method`` is neither 'sum' nor 'mean'.
        KeyError: If 'Region' is not among the first ``index_count`` columns.
    """
    if method not in ['sum', 'mean']:
        raise ValueError("Supported methods are 'sum' and 'mean'")

    # Move index levels back into columns. A default, unnamed RangeIndex holds
    # no data; keeping it would add a spurious 'index' column that would then
    # be counted as the first identifier column.
    has_default_index = isinstance(df.index, pd.RangeIndex) and df.index.name is None
    flat_df = df.reset_index(drop=has_default_index)

    index_columns = flat_df.columns[:index_count].tolist()
    if 'Region' not in index_columns:
        raise KeyError(
            f"'Region' must be one of the first {index_count} columns, found {index_columns}"
        )

    # Group by every identifier column except 'Region', wherever it sits.
    group_columns = [column for column in index_columns if column != 'Region']

    selected = flat_df[flat_df['Region'].isin(regions_to_combine)].drop(columns='Region')
    if selected.empty:
        # None of the requested regions is present: nothing to append.
        return flat_df

    if group_columns:
        grouped = selected.groupby(group_columns)
        if method == 'sum':
            aggregated = grouped.sum(numeric_only=True)
        else:
            aggregated = grouped.mean(numeric_only=True)
        aggregated_df = aggregated.reset_index()
    else:
        # 'Region' is the only identifier column, so all selected rows
        # collapse into a single aggregated row.
        if method == 'sum':
            totals = selected.sum(numeric_only=True)
        else:
            totals = selected.mean(numeric_only=True)
        aggregated_df = totals.to_frame().T

    aggregated_df['Region'] = new_region

    # concat aligns on column names, so the result keeps flat_df's column
    # order; non-numeric value columns are NaN in the aggregated rows.
    return pd.concat([flat_df, aggregated_df], ignore_index=True)
    

In [55]:
def process_changed_files(config, main_folder, output_path):
    """Aggregate regions in the CSV files listed under ``config['changed_files']``.

    Each CSV found under ``main_folder`` (via ``process_files``) is matched by
    its file name without extension against the 'file_name' of every entry in
    ``config['changed_files']``. Every matching entry is passed to
    ``aggregate_data``; when several entries name the same file they are
    applied one after another, so all new regions end up in the output. The
    result is written to ``<output_path>/<name>.csv`` without an index column.
    Files with no matching entry are skipped.

    Parameters:
        config (dict): Loaded configuration; 'changed_files' must be a list of
            dicts with the keys 'file_name', 'method', 'index_count',
            'regions_to_combine' and 'new_regions'.
        main_folder (str): Root directory that is searched for CSV files.
        output_path (str): Directory receiving the results; created if missing.

    Raises:
        KeyError: If 'changed_files' or one of the per-entry keys is missing.
    """
    changed_files = config['changed_files']  # list of per-file settings
    os.makedirs(output_path, exist_ok=True)

    for df, file_path in process_files(main_folder):
        param_name = os.path.splitext(os.path.basename(file_path))[0]
        entries = [entry for entry in changed_files if entry['file_name'] == param_name]
        if not entries:
            continue

        result = df
        for entry in entries:
            method = entry['method']
            index_count = entry['index_count']
            regions_to_combine = entry['regions_to_combine']
            new_regions = entry['new_regions']
            print(param_name, method, index_count, new_regions, regions_to_combine)

            # set_index returns a new frame. Mutating the loaded frame in
            # place made a second entry for the same file pick its index
            # columns from the value columns and lose 'Region'.
            indexed = result.set_index(result.columns[:index_count].tolist())

            # aggregate_data returns a flat frame with the identifier columns
            # first, so it can be fed straight into the next entry.
            result = aggregate_data(indexed, method, index_count, regions_to_combine, new_regions)

        result.to_csv(os.path.join(output_path, f'{param_name}.csv'), index=False)
                

In [None]:
# Show every row when a DataFrame is displayed in this notebook.
pd.set_option('display.max_rows', None)

# The locations default to the original author's checkout. Set the environment
# variables below to run the notebook elsewhere without editing this cell.
base_dir = os.environ.get(
    "GENESYS_PARAMETERS_DIR",
    "/Users/shwetat/Projects/Genesys-mod_data_repo/GENeSYS_MOD.data/Data/Parameters",
)
output_dir = os.environ.get(
    "GENESYS_OUTPUT_DIR",
    "/Users/shwetat/Projects/Genesys-mod_data_repo/GENeSYS_MOD.data/DataNew",
)
config = load_config(os.environ.get("GENESYS_CONFIG_FILE", "config.yaml"))

# Uncomment to also copy the files listed under 'no_change_parameters'.
#process_unchanged_files(config, base_dir, output_dir)
process_changed_files(config, base_dir, output_dir)