In [1]:
import os
import pandas as pd
import openpyxl
import math

In [2]:
# Base location: the process working directory. The layout below assumes the
# notebook is started from the 'Conversion Script' folder, one level below the
# folder that holds 'Data' and 'Output'.
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)

# Input folders, all located under 'Data'
data_directory = os.path.join(parent_directory, 'Data')
sets_and_tags_directory, parameter_directory, timeseries_directory = (
    os.path.join(data_directory, folder)
    for folder in ('Sets&Tags', 'Parameters', 'Timeseries')
)

# Output folders and the two Excel workbooks written into 'output_excel'
output_directory = os.path.join(parent_directory, 'Output')
output_csv_directory = os.path.join(output_directory, 'output_csv')
output_excel_directory = os.path.join(output_directory, 'output_excel')
output_excel_file_path, output_excel_file_path_timeseries = (
    os.path.join(output_excel_directory, workbook)
    for workbook in ('output.xlsx', 'output_timeseries.xlsx')
)

# Excel workbook with the set filters; expected next to the notebook
excel_file_path = os.path.join(current_directory, 'Set_filter_file.xlsx')

# User options
output_file_format = 'csv'  # 'csv' or 'excel'
output_format = 'long'  # 'wide' (pivoted) or 'long' (one row per value)

In [3]:
def read_settings_file(file_path):
    """Build the table of selected set members from the Excel settings workbook.

    Every sheet of the workbook is read. In each sheet the rows whose second
    column equals 1 are kept, and the unique values of the first column of
    those rows become one column of the result, named after the sheet's
    first column header. Columns are placed side by side, so shorter columns
    are padded with NaN.

    Side effect: the table is written to ``Sets.csv`` inside the module-level
    ``output_csv_directory`` (created here if it does not exist yet). The
    ``Region2`` column described below is added after that file is written.

    Args:
        file_path: Path of the Excel settings workbook (read with openpyxl).

    Returns:
        pandas.DataFrame: one column per sheet with the selected values. If a
        ``Region`` column exists, a copy of it is added as ``Region2``.
    """
    per_sheet_values = []
    column_list = []

    # The context manager closes the workbook even if a sheet fails to parse.
    with pd.ExcelFile(file_path, engine='openpyxl') as xls:
        for sheet_name in xls.sheet_names:
            data_frame = xls.parse(sheet_name)

            # Second column (position 1) is the selection flag
            filtered_df = data_frame[data_frame.iloc[:, 1] == 1]

            # First column (position 0) holds the set members; its header
            # becomes the column name in the combined table
            column_list.append(filtered_df.columns[0])
            per_sheet_values.append(pd.DataFrame(filtered_df.iloc[:, 0].unique()))

    # Concatenate once instead of growing the frame inside the loop
    if per_sheet_values:
        unique_values_concatenated = pd.concat(per_sheet_values, axis=1)
    else:
        unique_values_concatenated = pd.DataFrame()
    unique_values_concatenated.columns = column_list

    # Make sure the target folder exists before writing Sets.csv
    os.makedirs(output_csv_directory, exist_ok=True)
    unique_values_csv_file_path = os.path.join(output_csv_directory, 'Sets.csv')
    unique_values_concatenated.to_csv(unique_values_csv_file_path, index=False, decimal='.')

    # Parameter files with two region columns expose the second one as 'Region2'
    if "Region" in unique_values_concatenated.columns:
        unique_values_concatenated["Region2"] = unique_values_concatenated["Region"]

    return unique_values_concatenated


In [4]:
def read_regular_parameters(main_directory, file_pattern='Par_'):
    """Collect the paths of the regular parameter CSV files.

    Two locations are searched, both derived from module-level settings:
    the ``sets_and_tags_directory`` folder itself (not its subfolders) and the
    whole tree below ``parameter_directory``. A file qualifies when its name
    starts with ``file_pattern`` and ends with ``.csv``.

    NOTE: a function with the same name is defined again in a later cell of
    this notebook; whichever cell ran last is the one in effect.

    Args:
        main_directory: Base directory the configured folders are joined to.
            ``os.path.join`` discards it whenever the configured folder is an
            absolute path.
        file_pattern: Required file-name prefix.

    Returns:
        list[str]: Matching paths without duplicates; the Sets&Tags files come
        first, and every directory listing is visited in sorted order so the
        result is reproducible.
    """
    def _matches(filename):
        # Name filter shared by both search locations
        return filename.startswith(file_pattern) and filename.endswith('.csv')

    filepaths = []
    seen = set()

    def _add(path):
        # A file can be reached twice when the Sets&Tags folder lies inside
        # the walked parameter tree; keep only the first occurrence.
        key = os.path.normpath(path)
        if key not in seen:
            seen.add(key)
            filepaths.append(path)

    sets_and_tags_directory_f = os.path.join(main_directory, parameter_directory, sets_and_tags_directory)

    # isdir (rather than exists) so a plain file at that path is skipped
    if os.path.isdir(sets_and_tags_directory_f):
        for f in sorted(os.listdir(sets_and_tags_directory_f)):
            if _matches(f):
                _add(os.path.join(sets_and_tags_directory_f, f))

    parameters_directory_f = os.path.join(main_directory, parameter_directory)
    for root, dirs, files in os.walk(parameters_directory_f):
        dirs.sort()  # in-place sort makes os.walk descend in a fixed order
        for file in sorted(files):
            if _matches(file):
                _add(os.path.join(root, file))

    return filepaths



def process_regular_parameter(csv_file_path, unique_values_concatenated):
    """Load one parameter CSV, filter it to the selected sets and pivot it wide.

    NOTE: a second definition of this function in a later cell of this
    notebook (returning two values instead of three) replaces this one once
    that cell has run.

    Steps:
      1. Read the CSV; rows whose ``Year`` or ``Mode_of_operation`` is missing
         or not numeric are dropped.
      2. Columns that pandas renamed because of duplicate headers
         (``Region.1``) are renamed to ``Region2`` and so on.
      3. Only ``Value`` and the columns that also exist in
         ``unique_values_concatenated`` are kept, and rows are restricted to
         the values listed there.
      4. The frame is pivoted on ``Region2`` (when both ``Region`` and
         ``Region2`` are present) or else on ``Year``, if that column exists.

    Args:
        csv_file_path: Path of the parameter CSV file.
        unique_values_concatenated: DataFrame whose columns list the allowed
            values per set (NaN entries are treated as padding).

    Returns:
        tuple: ``(df, df_pivot, worksheet_name)`` - the filtered long frame,
        the pivoted frame (the long frame itself when no pivot column is
        present) and the file's base name cut to 31 characters.

    Raises:
        ValueError: from ``DataFrame.pivot`` when the remaining columns do not
            identify each value uniquely.
    """
    # Base name without extension, capped at 31 characters (Excel sheet-name limit)
    worksheet_name = os.path.splitext(os.path.basename(csv_file_path))[0][:31]

    df = pd.read_csv(csv_file_path, delimiter=',')

    if 'Year' in df.columns:
        df['Year'] = pd.to_numeric(df['Year'], errors='coerce', downcast='integer')
        df = df.dropna(subset=['Year'])
        # A coerced NaN keeps the whole column float in the first pass; retry
        # the downcast now that those rows are gone so years stay integers.
        df = df.assign(Year=pd.to_numeric(df['Year'], downcast='integer'))

    if 'Mode_of_operation' in df.columns:
        df['Mode_of_operation'] = df['Mode_of_operation'].astype('Int64', errors='ignore')
        df = df.dropna(subset=['Mode_of_operation'])

    # Rename pandas' duplicate-header suffixes: 'Name.1' -> 'Name2'. Columns
    # whose text after the last dot is not a number are left untouched instead
    # of raising.
    renames = {}
    for col in df.columns:
        if not isinstance(col, str):
            continue
        base_name, dot, counter = col.rpartition('.')
        if dot and counter.isdecimal():
            renames[col] = f"{base_name}{int(counter) + 1}"
    df = df.rename(columns=renames)

    # Keep only the set columns known to the settings table, plus 'Value'
    columns_to_keep = [col for col in df.columns if col in unique_values_concatenated.columns or col == 'Value']
    df = df[columns_to_keep]

    for header in unique_values_concatenated.columns:
        if header in df.columns:
            # Drop the NaN padding first; otherwise rows with a missing value
            # in this column would match the padding and slip through.
            allowed_values = unique_values_concatenated[header].dropna()
            df = df[df[header].isin(allowed_values)]

    # Determine the pivot column
    pivot_column = 'Region2' if 'Region2' in df.columns and 'Region' in df.columns else 'Year'

    if pivot_column in df.columns:
        df_pivot = df.pivot(index=[col for col in df.columns if col not in [pivot_column, 'Value']],
                            columns=pivot_column, values='Value').reset_index()

        # Flatten MultiIndex columns (if any) and make every label a string
        df_pivot.columns = ['_'.join(map(str, col)).strip() if isinstance(col, tuple) else str(col) for col in df_pivot.columns.values]

        # Only cells holding the literal text 'nan' are blanked; missing
        # values created by the pivot remain NaN.
        df_pivot = df_pivot.replace('nan', '')
    else:
        df_pivot = df  # No pivot column available: keep the long layout

    return df, df_pivot, worksheet_name


In [5]:
def read_regular_parameters(main_directory, file_pattern='Par_'):
    """Collect the paths of the regular parameter CSV files.

    Two locations are searched, both derived from module-level settings:
    the ``sets_and_tags_directory`` folder itself (not its subfolders) and the
    whole tree below ``parameter_directory``. A file qualifies when its name
    starts with ``file_pattern`` and ends with ``.csv``.

    Args:
        main_directory: Base directory the configured folders are joined to.
            ``os.path.join`` discards it whenever the configured folder is an
            absolute path.
        file_pattern: Required file-name prefix.

    Returns:
        list[str]: Matching paths without duplicates; the Sets&Tags files come
        first, and every directory listing is visited in sorted order so the
        result is reproducible.
    """
    def _matches(filename):
        # Name filter shared by both search locations
        return filename.startswith(file_pattern) and filename.endswith('.csv')

    filepaths = []
    seen = set()

    def _add(path):
        # A file can be reached twice when the Sets&Tags folder lies inside
        # the walked parameter tree; keep only the first occurrence.
        key = os.path.normpath(path)
        if key not in seen:
            seen.add(key)
            filepaths.append(path)

    sets_and_tags_directory_f = os.path.join(main_directory, parameter_directory, sets_and_tags_directory)

    # isdir (rather than exists) so a plain file at that path is skipped
    if os.path.isdir(sets_and_tags_directory_f):
        for f in sorted(os.listdir(sets_and_tags_directory_f)):
            if _matches(f):
                _add(os.path.join(sets_and_tags_directory_f, f))

    parameters_directory_f = os.path.join(main_directory, parameter_directory)
    for root, dirs, files in os.walk(parameters_directory_f):
        dirs.sort()  # in-place sort makes os.walk descend in a fixed order
        for file in sorted(files):
            if _matches(file):
                _add(os.path.join(root, file))

    return filepaths



def process_regular_parameter(csv_file_path, unique_values_concatenated):
    """Load one parameter CSV, filter it to the selected sets, optionally pivot.

    Steps:
      1. Read the CSV; rows whose ``Year`` or ``Mode_of_operation`` is missing
         or not numeric are dropped.
      2. Columns that pandas renamed because of duplicate headers
         (``Region.1``) are renamed to ``Region2`` and so on.
      3. Only ``Value`` and the columns that also exist in
         ``unique_values_concatenated`` are kept, and rows are restricted to
         the values listed there.
      4. When the module-level ``output_format`` is ``'wide'``, the frame is
         pivoted on ``Region2`` (when both ``Region`` and ``Region2`` are
         present) or else on ``Year``, if that column exists. Otherwise the
         long layout is returned unchanged.

    Args:
        csv_file_path: Path of the parameter CSV file.
        unique_values_concatenated: DataFrame whose columns list the allowed
            values per set (NaN entries are treated as padding).

    Returns:
        tuple: ``(df_pivot, worksheet_name)`` - the resulting frame and the
        file's base name cut to 31 characters.

    Raises:
        ValueError: from ``DataFrame.pivot`` (wide format only) when the
            remaining columns do not identify each value uniquely.
    """
    # Base name without extension, capped at 31 characters (Excel sheet-name limit)
    worksheet_name = os.path.splitext(os.path.basename(csv_file_path))[0][:31]

    df = pd.read_csv(csv_file_path, delimiter=',')

    if 'Year' in df.columns:
        df['Year'] = pd.to_numeric(df['Year'], errors='coerce', downcast='integer')
        df = df.dropna(subset=['Year'])
        # A coerced NaN keeps the whole column float in the first pass; retry
        # the downcast now that those rows are gone so years stay integers.
        df = df.assign(Year=pd.to_numeric(df['Year'], downcast='integer'))

    if 'Mode_of_operation' in df.columns:
        df['Mode_of_operation'] = df['Mode_of_operation'].astype('Int64', errors='ignore')
        df = df.dropna(subset=['Mode_of_operation'])

    # Rename pandas' duplicate-header suffixes: 'Name.1' -> 'Name2'. Columns
    # whose text after the last dot is not a number are left untouched instead
    # of raising.
    renames = {}
    for col in df.columns:
        if not isinstance(col, str):
            continue
        base_name, dot, counter = col.rpartition('.')
        if dot and counter.isdecimal():
            renames[col] = f"{base_name}{int(counter) + 1}"
    df = df.rename(columns=renames)

    # Keep only the set columns known to the settings table, plus 'Value'
    columns_to_keep = [col for col in df.columns if col in unique_values_concatenated.columns or col == 'Value']
    df = df[columns_to_keep]

    for header in unique_values_concatenated.columns:
        if header in df.columns:
            # Drop the NaN padding first; otherwise rows with a missing value
            # in this column would match the padding and slip through.
            allowed_values = unique_values_concatenated[header].dropna()
            df = df[df[header].isin(allowed_values)]

    # Long format (or no usable pivot column): return the filtered frame as is
    df_pivot = df

    if output_format == 'wide':
        # Determine the pivot column
        pivot_column = 'Region2' if 'Region2' in df.columns and 'Region' in df.columns else 'Year'

        if pivot_column in df.columns:
            df_pivot = df.pivot(index=[col for col in df.columns if col not in [pivot_column, 'Value']],
                                columns=pivot_column, values='Value').reset_index()

            # Flatten MultiIndex columns (if any) and make every label a string
            df_pivot.columns = ['_'.join(map(str, col)).strip() if isinstance(col, tuple) else str(col) for col in df_pivot.columns.values]

            # Only cells holding the literal text 'nan' are blanked; missing
            # values created by the pivot remain NaN.
            df_pivot = df_pivot.replace('nan', '')

    return df_pivot, worksheet_name


In [6]:
def output_regular_parameters(dataframes_dict, output_directory, output_file_format='excel'):
    """Write the processed parameter tables to one Excel workbook or to CSV files.

    The destination is taken from module-level settings, not from
    ``output_directory``: Excel output goes to ``output_excel_file_path`` (one
    sheet per entry, 'Sets' first when present, the rest alphabetically), and
    any other format writes ``<name>.csv`` files into ``output_csv_directory``.
    ``output_directory`` is only created if missing; the folder that is
    actually written to is created as well.

    Args:
        dataframes_dict: Mapping of worksheet/file name to DataFrame.
        output_directory: Directory to create if it does not exist.
        output_file_format: 'excel' for a workbook; anything else means CSV.

    Returns:
        None
    """
    os.makedirs(output_directory, exist_ok=True)

    if output_file_format == 'excel':
        # The workbook path is configured independently of output_directory,
        # so make sure its own folder exists too.
        excel_parent = os.path.dirname(output_excel_file_path)
        if excel_parent:
            os.makedirs(excel_parent, exist_ok=True)

        # 'Sets' leads the workbook, but only if the caller supplied it
        leading = ['Sets'] if 'Sets' in dataframes_dict else []
        sorted_worksheet_names = leading + sorted(name for name in dataframes_dict if name != 'Sets')

        with pd.ExcelWriter(output_excel_file_path, engine='openpyxl', mode='w') as excel_writer:
            for worksheet_name in sorted_worksheet_names:
                dataframes_dict[worksheet_name].to_excel(excel_writer, sheet_name=worksheet_name, index=False)
    else:
        # CSV files always go to output_csv_directory; create it up front
        os.makedirs(output_csv_directory, exist_ok=True)
        for worksheet_name, df in dataframes_dict.items():
            output_file_path = os.path.join(output_csv_directory, f"{worksheet_name}.csv")
            df.to_csv(output_file_path, index=False)


In [7]:
# Load the selected set members from the settings workbook
# (read_settings_file also writes Sets.csv as a side effect)
unique_values_concatenated = read_settings_file(excel_file_path)

# Locate every regular parameter file
regular_parameter_paths = read_regular_parameters(current_directory)

# Worksheet/file name -> DataFrame; the set table is stored under 'Sets'
worksheets_data = {'Sets': unique_values_concatenated}

# Process each file. Names are cut to 31 characters, so two files sharing
# their first 31 characters end up under one key and the later one wins.
for path in regular_parameter_paths:
    df_pivot, worksheet_name = process_regular_parameter(path, unique_values_concatenated)
    worksheets_data[worksheet_name] = df_pivot

# Pass the directory that matches the chosen format, so the folder created by
# output_regular_parameters is the one the files are written to.
output_regular_parameters(
    worksheets_data,
    output_excel_directory if output_file_format == 'excel' else output_csv_directory,
    output_file_format,
)


In [8]:
def read_filter_timeseries(timeseries_dir, unique_values_concatenated):
    """Read one time-series CSV per subfolder, keeping only the selected regions.

    For every subdirectory of ``timeseries_dir`` the first ``.csv`` file in
    its directory listing is read. The first line of that file is skipped and
    the second line is used as the header. The first column is always kept;
    any other column is kept only if its header is one of the values in
    ``unique_values_concatenated['Region']``.

    Args:
        timeseries_dir: Directory containing one subfolder per time series.
        unique_values_concatenated: DataFrame with a 'Region' column.

    Returns:
        dict: subfolder name -> filtered DataFrame. Subfolders without a CSV
        file and plain files in ``timeseries_dir`` are ignored.
    """
    unique_regions = unique_values_concatenated['Region'].unique()
    filtered_data = {}

    for entry in os.listdir(timeseries_dir):
        folder = os.path.join(timeseries_dir, entry)
        if not os.path.isdir(folder):
            continue

        # Only the first CSV of the listing is used
        csv_names = [name for name in os.listdir(folder) if name.endswith('.csv')]
        if not csv_names:
            continue
        csv_path = os.path.join(folder, csv_names[0])

        # Header-only read (nrows=0) to learn the column names cheaply
        header_only = pd.read_csv(csv_path, skiprows=1, nrows=0)
        wanted = [header_only.columns[0]]
        wanted.extend(col for col in header_only.columns[1:] if col in unique_regions)

        # Full read restricted to the wanted columns
        filtered_data[entry] = pd.read_csv(csv_path, skiprows=1, usecols=wanted)

    return filtered_data

In [9]:
def output_timeseries_data(filtered_data, output_file_format='excel'):
    """Write the filtered time series to an Excel workbook or to CSV files.

    'excel' writes every entry as a sheet of the workbook at the module-level
    ``output_excel_file_path_timeseries``. An existing workbook is opened in
    append mode and sheets with the same name are removed before rewriting.
    'csv' writes ``<name>.csv`` files into ``output_csv_directory``. Any other
    value only creates ``output_csv_directory``.

    Args:
        filtered_data: Mapping of sheet/file name to DataFrame.
        output_file_format: 'excel' or 'csv'.

    Returns:
        None
    """
    wants_excel = output_file_format == 'excel'
    target_directory = output_excel_directory if wants_excel else output_csv_directory

    # The target folder is created for every format value
    os.makedirs(target_directory, exist_ok=True)

    if wants_excel:
        workbook_path = output_excel_file_path_timeseries

        # Append to an existing workbook, otherwise start a new one
        writer_mode = 'a' if os.path.exists(workbook_path) else 'w'
        appending = writer_mode == 'a'

        with pd.ExcelWriter(workbook_path, engine='openpyxl', mode=writer_mode) as writer:
            for sheet_name, frame in filtered_data.items():
                # Remove the old sheet so the fresh data takes its place
                if appending and sheet_name in writer.book.sheetnames:
                    del writer.book[sheet_name]
                frame.to_excel(writer, sheet_name=sheet_name, index=False)
        return

    if output_file_format == 'csv':
        for file_name, frame in filtered_data.items():
            frame.to_csv(os.path.join(target_directory, f"{file_name}.csv"), index=False)


In [10]:
# Read the time series and keep only the selected regions.
# Depends on `unique_values_concatenated`, which is created by the earlier
# cell that calls read_settings_file - run that cell first.
filtered_timeseries_data = read_filter_timeseries(timeseries_directory, unique_values_concatenated)

# Write the filtered time series in the format chosen in the configuration
# cell (`output_file_format`).
output_timeseries_data(filtered_timeseries_data, output_file_format)