In [1]:
import os
import pandas as pd
import yaml

In [2]:
# Load the configuration from the YAML file
def load_config(config_file):
    with open(config_file, 'r') as file:
        config = yaml.safe_load(file)
    return config

In [3]:
def load_csvs(main_folder):
    for root, dirs, files in os.walk(main_folder):
        print(root)
        for file in files:
            if file.endswith('.csv'):
                file_path = os.path.join(root, file)
                df = pd.read_csv(file_path)
                # Yield the DataFrame and the file path (optional, for reference)
                yield df, file_path

In [4]:
# Function to process unchanged files
def process_files(base_dir):
    for root, dirs, files in os.walk(base_dir):
        for file in files:
            if file.endswith('csv'):
                file_path = os.path.join(root, file)
                df = pd.read_csv(file_path)
                yield df, file_path

In [9]:
def process_unchanged_files(config, main_folder, output_path):
    # Process unchanged files
    unchanged_files = config['no_change_parameters'] #list of files
    # Load each CSV file from the main folder
    for df, file_path in process_files(main_folder):
        param_name = os.path.splitext(os.path.basename(file_path))[0]
        if param_name in unchanged_files:
            #print(f"Skipping {param_name} as it is in the no_change_parameters list.")
            file_path = os.path.join(output_path, f'{param_name}.csv')
            df.to_csv(file_path)
            

In [11]:
def aggregate_data(df, method, index_count, regions_to_combine, new_region, output_csv="aggregated_data.csv"):
    """
    Aggregates data in a DataFrame based on the given regions and saves the result as a new CSV file.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        method (str): Aggregation method (e.g., 'sum', 'mean').
        index_count (str): The column to use as the index for aggregation.
        regions_to_combine (list): List of region names to combine.
        new_region (str): Name for the new aggregated region.
        output_csv (str): Path to save the resulting CSV file.

    Returns:
        pd.DataFrame: The aggregated DataFrame.
    """
    if method not in ['sum', 'mean']:
        raise ValueError("Supported methods are 'sum' and 'mean'")

    # Filter rows corresponding to the regions to combine
    filtered_df = df[df[index_count].isin(regions_to_combine)]

    # Perform the aggregation
    if method == 'sum':
        aggregated_row = filtered_df.sum(numeric_only=True)
    elif method == 'mean':
        aggregated_row = filtered_df.mean(numeric_only=True)

    # Add the new region name to the index column
    aggregated_row[index_count] = new_region

    # Remove the original regions and append the aggregated row
    result_df = df[~df[index_count].isin(regions_to_combine)]
    result_df = pd.concat([result_df, pd.DataFrame([aggregated_row])], ignore_index=True)

    # Save the result to a new CSV file
    result_df.to_csv(output_csv, index=False)

    return result_df

In [None]:
def process_changed_files(config, main_folder, output_path):
    changed_files = config['changed_files'] # list of file with parameters
    
    #Load each CSV file from the main folder
    for df, file_path in process_files(main_folder):
        #combined_new_regions = pd.DataFrame()
        param_name = os.path.splitext(os.path.basename(file_path))[0]
        # Construct the full file path
        file_path = os.path.join(output_path, f'{param_name}.csv')
        for file in changed_files:
            if param_name == file['file_name']:
                method = file['method']
                index_count = file['index_count']
                regions_to_combine = file['regions_to_combine']
                new_regions = file['new_regions']
                print(param_name, method, index_count, new_regions, regions_to_combine)
                df.set_index(df.columns[:index_count].tolist(), inplace=True)
                
                # Apply aggregation based on config
                aggregated_df = aggregate_data(df, method, index_count, regions_to_combine, new_regions) 
                
                print(aggregated_df.head())

In [15]:
pd.set_option('display.max_rows', None)
base_dir = "/Users/shwetat/Projects/Genesys-mod_data_repo/GENeSYS_MOD.data/Data/Parameters"
output_dir = "/Users/shwetat/Projects/Genesys-mod_data_repo/GENeSYS_MOD.data/DataNew"
config = load_config("config.yaml")
process_unchanged_files(config, base_dir, output_dir)
process_changed_files(config, base_dir, output_dir)

Par_RegionalModelPeriodEmissionLimit sum 2 ['SCANDINAVIA'] ['NO', 'SE']


KeyError: 2

In [6]:
config = load_config('config.yaml')

#regions_config = load_config('regions.yaml')

main_folder = '/Users/shwetat/Projects/Genesys-mod_data_repo/GENeSYS_MOD.data/Data/Parameters' 
output_path = '/Users/shwetat/Projects/Genesys-mod_data_repo/GENeSYS_MOD.data/DataNew/Parameters' 

process_files(config, main_folder, output_path)

/Users/shwetat/Projects/Genesys-mod_data_repo/GENeSYS_MOD.data/Data/Parameters
/Users/shwetat/Projects/Genesys-mod_data_repo/GENeSYS_MOD.data/Data/Parameters/Par_TotalAnnualMaxActivity
Index(['Region', 'Technology', 'Year', 'Value', 'Unnamed: 4', 'Unit', 'Source',
       'Updated at', 'Updated by'],
      dtype='object')
Extracted param_name: Par_TotalAnnualMaxActivity
None
No region configuration found for Par_TotalAnnualMaxActivity. Skipping.
/Users/shwetat/Projects/Genesys-mod_data_repo/GENeSYS_MOD.data/Data/Parameters/Par_TotalAnnualMaxActivity/Europe_openENTRANCE_technoFriendly
/Users/shwetat/Projects/Genesys-mod_data_repo/GENeSYS_MOD.data/Data/Parameters/Par_REMinProductionTarget
Index(['Region', 'Fuel', 'Year', 'Value', 'Unnamed: 4', 'Unit', 'Source',
       'Updated at', 'Updated by'],
      dtype='object')
Extracted param_name: Par_REMinProductionTarget
None
No region configuration found for Par_REMinProductionTarget. Skipping.
/Users/shwetat/Projects/Genesys-mod_data_repo/GEN