In [1]:
import os
import pandas as pd
import openpyxl
import math

In [2]:
# --- Input locations ---------------------------------------------------------
# Settings workbook that is handed to read_settings_file.
excel_file_path = 'GENeSYS-MOD_User_Input_Settings_v07_kh_06-10-2023.xlsx'
# Folder holding the parameter CSV files, and the sets-and-tags folder that is
# looked up inside it.
parameter_directory = '00_Parameters'
sets_and_tags_directory = '00_Sets&Tags'
# Working directory at the moment this cell runs; used as the search root.
current_directory = os.getcwd()

# --- Output locations --------------------------------------------------------
output_csv_directory = 'output_csv'
output_excel_directory = 'output_excel'

In [3]:
def read_settings_file(file_path):
    """Collect the selected set elements from every sheet of the settings workbook.

    For each sheet, rows whose second column equals 1 are kept and the unique
    values of the first column become one column of the result, named after
    the sheet's first column header. Columns of different length are padded
    with NaN by the concatenation.

    Side effect: the table is written to ``Sets.csv`` inside
    ``output_csv_directory`` (the folder is created if necessary). The
    ``Region2`` column is added only afterwards, so it is not part of the CSV.

    Parameters
    ----------
    file_path : str
        Path of the Excel settings workbook (read with the openpyxl engine).

    Returns
    -------
    pandas.DataFrame
        One column per usable sheet, plus ``Region2`` (a copy of ``Region``)
        when a ``Region`` column exists.
    """
    column_list = []   # header of the first column of each usable sheet
    set_columns = []   # one single-column DataFrame of unique values per sheet

    # The context manager closes the workbook even if parsing a sheet fails.
    # The workbook opened is the one passed in, not the module-level default.
    with pd.ExcelFile(file_path, engine='openpyxl') as xls:
        for sheet_name in xls.sheet_names:
            sheet_df = xls.parse(sheet_name)

            # A sheet needs a value column and a selection-flag column;
            # anything narrower (e.g. an empty sheet) cannot be a set sheet.
            if sheet_df.shape[1] < 2:
                continue

            # Keep the rows flagged with 1 in the second column (0-based index 1)
            selected = sheet_df[sheet_df.iloc[:, 1] == 1]

            column_list.append(selected.columns[0])
            set_columns.append(pd.DataFrame(selected.iloc[:, 0].unique()))

    # Concatenate once, side by side, then apply the collected headers
    if set_columns:
        unique_values_concatenated = pd.concat(set_columns, axis=1)
        unique_values_concatenated.columns = column_list
    else:
        unique_values_concatenated = pd.DataFrame()

    # Create a CSV file containing unique values
    os.makedirs(output_csv_directory, exist_ok=True)
    unique_values_csv_file_path = os.path.join(output_csv_directory, 'Sets.csv')
    unique_values_concatenated.to_csv(unique_values_csv_file_path, index=False, decimal='.')

    # Region2 mirrors Region so that files with a second region column can be
    # filtered against the same selection.
    if "Region" in unique_values_concatenated.columns:
        unique_values_concatenated["Region2"] = unique_values_concatenated["Region"]

    return unique_values_concatenated

In [8]:
def read_regular_parameters(main_directory, file_pattern='Par_'):
    """Return the paths of all parameter CSV files below the parameter folder.

    A file qualifies when its name starts with ``file_pattern`` and ends with
    ``.csv``. Files in the sets-and-tags folder are listed first, followed by
    everything found while walking ``<main_directory>/<parameter_directory>``.
    Because the sets-and-tags folder lies inside the parameter folder, the walk
    meets its files a second time; each path is returned only once.

    Parameters
    ----------
    main_directory : str
        Directory that contains the parameter folder.
    file_pattern : str, optional
        Required file-name prefix (default ``'Par_'``).

    Returns
    -------
    list of str
        Unique file paths in first-seen order; empty if nothing matches or the
        folders do not exist.
    """
    filepaths = []
    seen = set()

    def _add(path):
        # Normalised key so the same file reached via two routes counts once
        key = os.path.normpath(path)
        if key not in seen:
            seen.add(key)
            filepaths.append(path)

    def _matches(name):
        return name.startswith(file_pattern) and name.endswith('.csv')

    # Sets-and-tags folder inside the parameter directory comes first
    sets_and_tags_directory_f = os.path.join(main_directory, parameter_directory, sets_and_tags_directory)
    if os.path.isdir(sets_and_tags_directory_f):
        for f in os.listdir(sets_and_tags_directory_f):
            if _matches(f):
                _add(os.path.join(sets_and_tags_directory_f, f))

    # Then every matching file anywhere below the parameter directory
    parameters_directory_f = os.path.join(main_directory, parameter_directory)
    for root, dirs, files in os.walk(parameters_directory_f):
        for file in files:
            if _matches(file):
                _add(os.path.join(root, file))

    return filepaths



def process_regular_parameter(csv_file_path, unique_values_concatenated):
    """Load one parameter CSV, restrict it to the selected set elements and pivot it.

    Steps:
      1. Read the CSV; rows without a numeric ``Year`` or without a
         ``Mode_of_operation`` are dropped (when those columns exist).
      2. Duplicate headers that pandas renamed to ``Name.1``, ``Name.2`` ...
         become ``Name2``, ``Name3`` ...
      3. Keep only columns that are also columns of
         ``unique_values_concatenated``, plus ``Value``; keep only rows whose
         value in each such column appears in the matching set column.
      4. Pivot ``Value`` into wide form on ``Region2`` (if both ``Region`` and
         ``Region2`` are present) or else on ``Year``.

    Parameters
    ----------
    csv_file_path : str
        Path of the parameter CSV file (comma separated).
    unique_values_concatenated : pandas.DataFrame
        One column per set holding the allowed elements; NaN padding is ignored.

    Returns
    -------
    tuple
        ``(df, df_pivot, worksheet_name)``: the filtered long table, the
        pivoted table (the long table itself when no pivot is possible) and
        the file stem cut to 31 characters.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.pivot`` when index/pivot-column combinations
        are duplicated.
    """
    # File stem, truncated so it does not exceed 31 characters
    worksheet_name = os.path.splitext(os.path.basename(csv_file_path))[0][:31]

    df = pd.read_csv(csv_file_path, delimiter=',')

    if 'Year' in df.columns:
        # Coerce first, drop the unusable rows, and only then downcast: while
        # NaNs are present the column cannot become an integer column, which
        # would leave headers such as '2020.0' after pivoting.
        df = df.assign(Year=pd.to_numeric(df['Year'], errors='coerce'))
        df = df.dropna(subset=['Year'])
        df = df.assign(Year=pd.to_numeric(df['Year'], downcast='integer'))

    if 'Mode_of_operation' in df.columns:
        df = df.assign(Mode_of_operation=df['Mode_of_operation'].astype('Int64', errors='ignore'))
        df = df.dropna(subset=['Mode_of_operation'])

    # Rename columns following the .1, .2, ... duplicate convention. Headers
    # whose text after the last dot is not a number are left alone.
    renamed = {}
    for col in df.columns:
        base_name, dot, counter = col.rpartition('.')
        if dot and base_name and counter.isdecimal():
            # +1 because the unsuffixed column is the first occurrence
            renamed[col] = f"{base_name}{int(counter) + 1}"
    if renamed:
        df = df.rename(columns=renamed)

    # Keep only set columns and the value column
    set_columns = unique_values_concatenated.columns
    df = df[[col for col in df.columns if col in set_columns or col == 'Value']]

    for header in set_columns:
        if header in df.columns:
            # Set columns are NaN-padded to a common length; without dropna
            # the padding would let rows with a missing set value through.
            allowed = unique_values_concatenated[header].dropna()
            df = df[df[header].isin(allowed)]

    # Determine the pivot column
    pivot_column = 'Region2' if 'Region2' in df.columns and 'Region' in df.columns else 'Year'
    index_columns = [col for col in df.columns if col not in (pivot_column, 'Value')]

    # Pivot only when there is something to pivot: the pivot column, a value
    # column and at least one remaining column to index by.
    if pivot_column in df.columns and 'Value' in df.columns and index_columns:
        df_pivot = df.pivot(index=index_columns, columns=pivot_column, values='Value').reset_index()

        # Flatten MultiIndex columns (if any) and make every header a string
        df_pivot.columns = ['_'.join(map(str, col)).strip() if isinstance(col, tuple) else str(col)
                            for col in df_pivot.columns.values]

        # Blank out literal 'nan' strings; real missing values are left as NaN
        df_pivot = df_pivot.replace('nan', '')
    else:
        df_pivot = df  # Nothing to pivot: hand back the filtered long table

    return df, df_pivot, worksheet_name


#    return df, worksheet_name


In [9]:
def output_regular_parameters(df, worksheet_name, output_directory, writer=None, output_format='excel'):
    """Write one parameter table either as a CSV file or as an Excel sheet.

    The output directory is created in every case. With ``output_format='csv'``
    the table goes to ``<output_directory>/<worksheet_name>.csv``. With
    ``output_format='excel'`` and a ``writer`` it is written as the sheet
    ``worksheet_name``, replacing a sheet of that name if the writer's workbook
    already has one. Any other combination (an unknown format, or 'excel'
    without a writer) writes nothing.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to write (without its index).
    worksheet_name : str
        CSV file stem or Excel sheet name.
    output_directory : str
        Folder that is created if missing and receives the CSV file.
    writer : optional
        Open Excel writer exposing ``book``; required for Excel output.
    output_format : str, optional
        ``'csv'`` or ``'excel'`` (default).
    """
    os.makedirs(output_directory, exist_ok=True)

    if output_format == 'csv':
        csv_target = os.path.join(output_directory, f"{worksheet_name}.csv")
        df.to_csv(csv_target, index=False)
        return

    # Everything below is the Excel route, which needs a writer
    if output_format != 'excel' or writer is None:
        return

    workbook = writer.book
    if worksheet_name in workbook.sheetnames:
        # Drop the stale sheet so the fresh one can take its name
        del workbook[worksheet_name]
    df.to_excel(writer, sheet_name=worksheet_name, index=False)



In [12]:
# The CSV folder must exist before the settings are read, because
# read_settings_file writes Sets.csv into it.
os.makedirs(output_csv_directory, exist_ok=True)

# Selected set elements from the settings workbook
unique_values_concatenated = read_settings_file(excel_file_path)

# Define the output format - can be 'csv' or 'excel'
output_format = 'csv'  # Change this to 'excel' if you want a single Excel workbook

# Collect the regular parameter files below the current directory
regular_parameter_paths = read_regular_parameters(current_directory)

# Store the worksheet names and corresponding dataframes
worksheets_data = {}

if output_format == 'excel':
    os.makedirs(output_excel_directory, exist_ok=True)
    output_excel_file_path = os.path.join(output_excel_directory, 'output.xlsx')

    # Process files and store the pivoted dataframes under their sheet names
    for path in regular_parameter_paths:
        df_original, df_pivot, worksheet_name = process_regular_parameter(path, unique_values_concatenated)
        worksheets_data[worksheet_name] = df_pivot

    # Sort worksheet names alphabetically, excluding 'Sets'
    sorted_worksheets = sorted([name for name in worksheets_data if name != 'Sets'])

    # Write to Excel file
    with pd.ExcelWriter(output_excel_file_path, engine='openpyxl') as excel_writer:
        # Write 'Sets' sheet first
        unique_values_concatenated.to_excel(excel_writer, sheet_name='Sets', index=False)

        # Write other sheets in alphabetical order
        for worksheet_name in sorted_worksheets:
            df_pivot = worksheets_data[worksheet_name]
            df_pivot.to_excel(excel_writer, sheet_name=worksheet_name, index=False)

else:
    # CSV output: one file per parameter, holding the filtered long table.
    # Processing happens inside the try so that a single bad input file is
    # reported and skipped instead of aborting the remaining files.
    for path in regular_parameter_paths:
        try:
            df_original, df_pivot, worksheet_name = process_regular_parameter(path, unique_values_concatenated)
            output_csv_file_path = os.path.join(output_csv_directory, f"{worksheet_name}.csv")
            df_original.to_csv(output_csv_file_path, index=False)
        except Exception as e:
            print(f"Error processing file {path}: {e}")
