In [1]:
import os
import pandas as pd
import yaml

In [2]:
# Load the configuration from the YAML file
def load_config(config_file):
    with open(config_file, 'r') as file:
        config = yaml.safe_load(file)
    return config

In [3]:
def load_csvs(main_folder):
    for root, dirs, files in os.walk(main_folder):
        print(root)
        for file in files:
            if file.endswith('.csv'):
                file_path = os.path.join(root, file)
                df = pd.read_csv(file_path)
                # Yield the DataFrame and the file path (optional, for reference)
                yield df, file_path

In [4]:
def aggregate_data(df, method, index_count):
    group_by_columns = df.columns[:index_count].tolist()
    
    
    if method == 'sum':
        
        # Automatically detect numeric and non-numeric columns
        aggregation_functions = {}
        for col in df.columns:
            if col in group_by_columns:
                continue  

            if pd.api.types.is_numeric_dtype(df[col]):
                aggregation_functions[col] = 'sum'  
            else:
                aggregation_functions[col] = 'first'  # Keep the first value for non-numeric columns

        # Apply the aggregation dynamically
        df = df.groupby(group_by_columns).agg(aggregation_functions).reset_index()
        return df
    
    elif method == 'average':
        return df.groupby(group_by_columns).mean().reset_index()
    
    elif method == 'copy':
        return df.copy()
    
    else:
        raise ValueError(f"Aggregation method '{method}' is not supported.")

In [8]:
def process_files(config, main_folder, output_path):
    no_change_files = config.get('no_change_parameters', [])
    
    for df, file_path in load_csvs(main_folder):
        print(df.columns)
        param_name = os.path.splitext(os.path.basename(file_path))[0]
        print('Extracted param_name:', param_name)

        if param_name in no_change_files:
            print(f"Skipping {param_name} as it is in the no_change_parameters list.")
            continue
        
        # Apply the changes based on the config
        param_config = config.get(param_name, None)

        if not param_config:
            print(f"No configuration found for {param_name}.")  
            continue
       
        agg_method = param_config.get('aggregation_Method')
        index_count = param_config.get('index_count')
        
        # Fetch region-specific configuration for this parameter from regions.yaml
        region_config = config.get('regions', None)
        new_region = config.get('new_regions')
        print(new_region)
        
        if region_config:
            # Get regions to aggregate and new region
            regions_to_aggregate = region_config
            

            # Check if regions are defined properly
            if not regions_to_aggregate or not new_region:
                print(f"Invalid region configuration for {param_name}. Skipping.")
                continue
        else:
            print(f"No region configuration found for {param_name}. Skipping.")
            continue

        # Filter the data to include only rows with regions to aggregate
        if 'Region' in df.columns:  
            df_to_aggregate = df[df['Region'].isin(regions_to_aggregate)]
        else:
            print(f"No 'Region' column found in {param_name}. Skipping.")
            continue
                
        # Apply aggregation based on config
        aggregated_df = aggregate_data(df_to_aggregate, agg_method, index_count) # not creating new region. need to take a look in aggregate_data function
        print(aggregated_df.tail())
        aggregated_df.to_csv('agg_test.csv')

        #require fixing 
        aggregated_df['Region'] = new_region  
        df = pd.concat([df, aggregated_df], ignore_index=True)
        
        # Save the updated DataFrame to a new location
        df.to_csv(output_path, index=False)
        print(f"Updated {param_name} with aggregated data for new region {output_path}.")
        




In [9]:
config = load_config('config.yaml')

#regions_config = load_config('regions.yaml')

main_folder = '/Users/shwetat/Projects/Genesys-mod_data_repo/GENeSYS_MOD.data/Data/Parameters' 
output_path = '/Users/shwetat/Projects/Genesys-mod_data_repo/GENeSYS_MOD.data/DataNew/Parameters' 

process_files(config, main_folder, output_path)

/Users/shwetat/Projects/Genesys-mod_data_repo/GENeSYS_MOD.data/Data/Parameters
/Users/shwetat/Projects/Genesys-mod_data_repo/GENeSYS_MOD.data/Data/Parameters/Par_TotalAnnualMaxActivity
Index(['Region', 'Technology', 'Year', 'Value', 'Unnamed: 4', 'Unit', 'Source',
       'Updated at', 'Updated by'],
      dtype='object')
Extracted param_name: Par_TotalAnnualMaxActivity
None
No region configuration found for Par_TotalAnnualMaxActivity. Skipping.
/Users/shwetat/Projects/Genesys-mod_data_repo/GENeSYS_MOD.data/Data/Parameters/Par_TotalAnnualMaxActivity/Europe_openENTRANCE_technoFriendly
/Users/shwetat/Projects/Genesys-mod_data_repo/GENeSYS_MOD.data/Data/Parameters/Par_REMinProductionTarget
Index(['Region', 'Fuel', 'Year', 'Value', 'Unnamed: 4', 'Unit', 'Source',
       'Updated at', 'Updated by'],
      dtype='object')
Extracted param_name: Par_REMinProductionTarget
None
No region configuration found for Par_REMinProductionTarget. Skipping.
/Users/shwetat/Projects/Genesys-mod_data_repo/GEN