### New setup

In [1]:
import os
import pandas as pd
import yaml
                

In [2]:
# Load the configuration from the YAML file
def load_config(config_file):
    """Read a YAML configuration file and return its parsed content.

    Args:
        config_file: Path of the YAML file to open for reading.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's content.
    """
    with open(config_file, 'r') as handle:
        return yaml.safe_load(handle)

In [3]:
# Function to read csv files sequentially from the directory as dataframe
def process_files(base_dir):
    """Yield a dataframe and its path for every ``.csv`` file under ``base_dir``.

    The tree is walked recursively with ``os.walk``. Sub-directories and file
    names are visited in sorted order so repeated runs see the files in the
    same sequence.

    Args:
        base_dir: Root directory to search.

    Yields:
        tuple: ``(df, file_path)`` where ``df`` is ``pd.read_csv(file_path)``
        with default options and ``file_path`` is the path that was read.
    """
    for root, dirs, files in os.walk(base_dir):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        for file in sorted(files):
            # Require the dot so names that merely end in "csv" are skipped.
            if file.endswith('.csv'):
                file_path = os.path.join(root, file)
                yield pd.read_csv(file_path), file_path

In [4]:
def process_unchanged_files(config, main_folder, output_path):
    """Write the CSV files listed as unchanged to ``output_path``.

    Every CSV yielded by ``process_files(main_folder)`` whose base name
    (without extension) appears in
    ``config['parameters']['unchanged_files']`` is written to
    ``<output_path>/<name>.csv``.

    Args:
        config: Parsed configuration; ``config['parameters']['unchanged_files']``
            is used for membership tests against file base names.
        main_folder: Directory that ``process_files`` searches for CSV files.
        output_path: Directory that receives the written files.
    """
    unchanged_files = config['parameters']['unchanged_files']
    print(f"Unchanged files: {unchanged_files}")
    for df, file_path in process_files(main_folder):
        param_name = os.path.splitext(os.path.basename(file_path))[0]
        if param_name in unchanged_files:
            target_path = os.path.join(output_path, f'{param_name}.csv')
            # The frame only carries the default integer index created when
            # the file was read; writing it would add an unnamed extra column
            # to a file that is supposed to stay unchanged.
            df.to_csv(target_path, index=False)
            

In [5]:
def process_changed_files(config, main_folder, output_path):
    """Split configured regions in parameter CSVs and write the results.

    For each CSV yielded by ``process_files(main_folder)`` whose base name
    equals the ``file_name`` of an entry in
    ``config['parameters']['changed_files']``, the first ``index_count``
    columns become the index. Every region in ``regions_to_split`` that is
    present in the ``Region`` index level is copied once per ``split_ratio``
    item, with ``Value`` multiplied by the ratio and ``Region`` set to the
    new region name, and the copies are appended to the frame. The frame is
    written to ``<output_path>/<name>.csv`` after each region is handled,
    whether or not the region was found.

    Args:
        config: Parsed configuration holding the ``changed_files`` entries.
        main_folder: Directory that ``process_files`` searches for CSV files.
        output_path: Directory that receives the written files.
    """
    entries = config['parameters']['changed_files']  # per-file split settings

    for df, source_path in process_files(main_folder):
        param_name = os.path.splitext(os.path.basename(source_path))[0]
        target_path = os.path.join(output_path, f'{param_name}.csv')

        for entry in entries:
            if param_name != entry['file_name']:
                continue

            method = entry['method']
            index_count = entry['index_count']
            regions_to_split = entry['regions_to_split']
            new_regions = entry['new_regions']
            split_ratio = entry['split_ratio']
            print(param_name, method, index_count, regions_to_split, new_regions, split_ratio)

            single_index = index_count == 1
            df = df.set_index(df.columns[:index_count].tolist())

            for region in regions_to_split:
                if region not in df.index.get_level_values('Region'):
                    print("do not have region in dataset")
                    df.to_csv(target_path)
                    continue

                print("True")
                region_data = df.loc[region]

                pieces = []
                for new_region, ratio in split_ratio.items():
                    # With a single index column a unique region comes back
                    # as a Series and has to be turned into a one-row frame.
                    if single_index and isinstance(region_data, pd.Series):
                        piece = region_data.to_frame().T
                    else:
                        piece = region_data.copy()
                    piece['Value'] *= ratio
                    piece['Region'] = new_region
                    # Single index: the old region label is discarded.
                    # Otherwise the remaining index levels return as columns.
                    piece.reset_index(drop=single_index, inplace=True)
                    pieces.append(piece)

                appended_rows = pd.concat(pieces, ignore_index=True)
                df = pd.concat([df.reset_index(), appended_rows], ignore_index=True)
                df = df.set_index(df.columns[:index_count].tolist())
                df.to_csv(target_path)

                        

In [6]:
def _load_trade_connections(workbook_path, sheet_name):
    """Return the rows of ``sheet_name`` from the workbook, or an empty frame.

    The workbook is opened in a ``with`` block so its handle is closed as soon
    as the sheet has been read.
    """
    with pd.ExcelFile(workbook_path) as workbook:
        if sheet_name in workbook.sheet_names:
            return pd.read_excel(workbook, sheet_name=sheet_name)
    return pd.DataFrame()


def trade_files(config, main_folder, output_path, trade_connection):
    """Split regions in trade parameter CSVs and append new connections.

    For each CSV yielded by ``process_files(main_folder)`` whose base name
    equals the ``file_name`` of an entry in
    ``config['parameters']['trade_files']``, the first ``index_count`` columns
    become the index. Every region in ``regions_to_split`` that is present in
    the ``Region`` index level is copied once per ``split_ratio`` item, with
    ``Value`` multiplied by the ratio and ``Region`` set to the new region
    name. Those copies, followed by the rows of the workbook sheet named like
    the CSV (if such a sheet exists), are appended to the frame. The frame is
    written to ``<output_path>/<name>.csv`` after each region is handled,
    whether or not the region was found.

    Args:
        config: Parsed configuration holding the ``trade_files`` entries. Each
            entry provides ``file_name``, ``method``, ``index_count``,
            ``regions_to_split``, ``new_regions``, ``split_ratio`` and,
            optionally, ``new_connections`` (only printed here).
        main_folder: Directory that ``process_files`` searches for CSV files.
        output_path: Directory that receives the written files.
        trade_connection: Path of the Excel workbook with the extra
            connection rows; it is only opened when a region is found.
    """
    trade_changed_files = config['parameters']['trade_files']

    for df, source_path in process_files(main_folder):
        param_name = os.path.splitext(os.path.basename(source_path))[0]
        target_path = os.path.join(output_path, f'{param_name}.csv')

        for file in trade_changed_files:
            if param_name != file['file_name']:
                continue

            method = file['method']
            index_count = file['index_count']
            regions_to_split = file['regions_to_split']
            new_regions = file['new_regions']
            split_ratio = file['split_ratio']
            new_connections = file.get('new_connections', {})

            print(param_name, method, index_count, regions_to_split, new_regions, split_ratio, new_connections)

            df.set_index(df.columns[:index_count].tolist(), inplace=True)

            # Read lazily and at most once per entry: the sheet depends only
            # on the file name, not on the region being processed.
            df_connections = None

            for region in regions_to_split:
                if region not in df.index.get_level_values('Region'):
                    print(f"Region {region} not found in dataset")
                    df.to_csv(target_path)
                    continue

                print(f"Processing region: {region}")
                region_data = df.loc[region].copy()

                new_region_dfs = []
                for new_region, ratio in split_ratio.items():
                    if isinstance(region_data, pd.Series):
                        new_data = region_data.to_frame().T
                    else:
                        new_data = region_data.copy()
                    new_data['Value'] *= ratio
                    new_data.loc[:, "Region"] = new_region
                    # With a single index column the old index is the region
                    # label itself; restoring it would clash with the new
                    # "Region" column, so it is dropped instead.
                    new_data.reset_index(drop=(index_count == 1), inplace=True)
                    new_region_dfs.append(new_data)

                # Extra connections come from the workbook sheet named like
                # the parameter file.
                if df_connections is None:
                    df_connections = _load_trade_connections(trade_connection, param_name)

                combined_new_regions = pd.concat(new_region_dfs, ignore_index=True)
                if not df_connections.empty:
                    combined_new_regions = pd.concat(
                        [combined_new_regions, df_connections], ignore_index=True
                    )
                df.reset_index(inplace=True)
                df = pd.concat([df, combined_new_regions], ignore_index=True)
                df.set_index(df.columns[:index_count].tolist(), inplace=True)
                df.to_csv(target_path)

    print("Processing completed.")





In [7]:
def timeseries_files(config, main_folder, output_path):
    """Add split-region columns to time-series CSVs and write them out.

    For each CSV yielded by ``process_files(main_folder)``, the first data row
    is promoted to the column labels. If the file's base name equals the
    ``file_name`` of an entry in ``config['parameters']['time_series_files']``,
    one column per ``new_regions`` item is added, holding the row-wise sum of
    the ``regions_to_split`` columns multiplied by ``split_ratio[region]``.
    Every file is then written to ``<output_path>/<name>.csv``, matched or
    not, as long as at least one entry is configured.

    Args:
        config: Parsed configuration holding the ``time_series_files`` entries.
        main_folder: Directory that ``process_files`` searches for CSV files.
        output_path: Directory that receives the written files.
    """
    ts_files = config['parameters']['time_series_files']
    for df, source_path in process_files(main_folder):
        # Use the first data row as the column labels and drop it from the
        # data. The copy gives an independent frame, so the column assignments
        # below do not operate on a slice of the frame that was read.
        df.columns = df.iloc[0]
        df = df[1:].copy()
        # NOTE(review): these columns were parsed together with the label row,
        # so they may hold strings rather than numbers - confirm the sum below
        # is numeric for the real input files.
        param_name = os.path.splitext(os.path.basename(source_path))[0]
        target_path = os.path.join(output_path, f'{param_name}.csv')
        for file in ts_files:
            if param_name == file['file_name']:
                regions_to_split = file['regions_to_split']
                new_regions = file['new_regions']
                split_ratio = file['split_ratio']
                print(param_name, regions_to_split, new_regions, split_ratio)

                for region in new_regions:
                    df[region] = df[regions_to_split].sum(axis=1) * split_ratio[region]
        # One write after all entries were applied gives the same final file
        # as rewriting it once per entry; nothing is written when no entries
        # are configured.
        if ts_files:
            df.to_csv(target_path)
                

In [None]:
# Driver cell: configure pandas display, set the input/output locations and
# run the processing steps in order.

# Show every row when a dataframe is displayed.
pd.set_option('display.max_rows', None)

# Input directories (parameter CSVs and time-series CSVs) and the output
# directory. These are absolute paths on one machine; adjust before running
# elsewhere.
base_dir = "/Users/shwetat/Projects/Genesys-mod_data_repo/GENeSYS_MOD.data/Data/Parameters"
base_dir_t = "/Users/shwetat/Projects/Genesys-mod_data_repo/GENeSYS_MOD.data/Data/Timeseries"
output_dir = "/Users/shwetat/Projects/Genesys-mod_data_repo/GENeSYS_MOD.data/DataNew"

# The configuration file name is relative, so it is resolved against the
# current working directory.
config = load_config("config_disaggregation.yaml")

# Excel workbook passed to trade_files as the source of extra connection rows.
trade_connection = '/Users/shwetat/Projects/Genesys-mod_data_repo/GENeSYS_MOD.data/Aggregate Disaggregate Scripts/routeSettings.xlsx'

# Every step writes "<name>.csv" into output_dir, so a later step overwrites
# a file with the same name written by an earlier one.
process_unchanged_files(config, base_dir, output_dir)
process_changed_files(config, base_dir, output_dir)
trade_files(config, base_dir, output_dir, trade_connection)
timeseries_files(config, base_dir_t, output_dir)
# Disabled step; no special_files definition is visible in this part of the file.
#special_files(config, base_dir, output_dir)