In [12]:
import pandas as pd
import re
import os as os

# ---------------------------------------------------
# 1. Sector Mapping for Standardization
# ---------------------------------------------------

# Lookup of sector codes as written in the raw labels (keyword names) to the
# standardized codes that replace them (values). Read by apply_sector_mapping,
# which turns every key into an underscore-prefixed regex pattern.
sector_mapping = dict(
    C10T12="C10-C12",
    C13T15="C13-C15",
    C31_32="C31_C32",
    E37T39="E37-E39",
    J59_60="J59_J60",
    J62_63="J62_J63",
    M69_70="M69_M70",
    M74_75="M74_M75",
    L="L68",
)


# ---------------------------------------------------
# 2. Apply Sector Mapping
# ---------------------------------------------------

def apply_sector_mapping(df):
    """
    Standardize sector codes in the row labels and the column labels of ``df``.

    Every key of the module-level ``sector_mapping`` is searched for as
    ``_<key>`` followed by a word boundary and replaced by ``_<value>``.
    The word boundary keeps a short key such as ``L`` from matching inside a
    longer code (``_L68`` is left alone because ``6`` is a word character).

    The labels are rewritten on ``df`` itself; the same object is returned.
    """
    # Build the regex -> replacement table once and reuse it for both axes.
    replacements = {}
    for raw_code, standard_code in sector_mapping.items():
        replacements[rf'_{re.escape(raw_code)}\b'] = f"_{standard_code}"

    # Row labels first, then column labels.
    df.index = df.index.to_series().replace(replacements, regex=True)
    df.columns = df.columns.to_series().replace(replacements, regex=True)

    return df

# ---------------------------------------------------
# 1. Helper Functions for Prefixing Rows and Columns
# ---------------------------------------------------

def prefix_rows(df):
    """Prepend ``op_`` to the string form of every row label of ``df``.

    The index is replaced on ``df`` itself and the same object is returned.
    """
    labels_as_text = map(str, df.index)
    df.index = ['op_' + text for text in labels_as_text]
    return df

def prefix_columns(df):
    """Prepend ``ip_`` to the string form of every column label of ``df``.

    The columns are replaced on ``df`` itself and the same object is returned.
    """
    labels_as_text = map(str, df.columns)
    df.columns = ['ip_' + text for text in labels_as_text]
    return df

# ---------------------------------------------------
# 2. Function to Extract Country Codes from Columns
# ---------------------------------------------------

def extract_country_codes(df):
    """
    Collect the country codes that appear in the column labels of ``df``.

    A code is the run of 2-5 upper-case letters between a leading ``ip_`` and
    the next underscore (``ip_AT_A01`` -> ``AT``). ``FIGW1`` contains a digit
    and therefore never matches that pattern; it is added explicitly when any
    column label starts with ``ip_FIGW1``.

    Returns:
        list: the distinct codes in order of first appearance in
        ``df.columns`` (``FIGW1``, when present, comes last). The order is
        deterministic, so code that iterates over the result behaves the same
        on every run.
    """
    code_pattern = re.compile(r'ip_([A-Z]{2,5})_')

    # A dict is used as an insertion-ordered set: iterating a plain set of
    # strings yields a different order from one interpreter run to the next.
    seen = {}
    for col in df.columns:
        match = code_pattern.match(col)
        if match:
            seen.setdefault(match.group(1), None)

    if 'FIGW1' not in seen and any(col.startswith('ip_FIGW1') for col in df.columns):
        seen['FIGW1'] = None

    return list(seen)

# ---------------------------------------------------
# 3. Column and Row Aggregation Functions
# ---------------------------------------------------

def aggregate_columns(df, country_prefix):
    """
    Build the aggregated sector columns ``N``, ``Q`` and ``R_S`` for one country.

    For each group, the member columns ``ip_<country>_<code>`` that exist in
    ``df`` are summed row-wise. A group with no member column present is
    left out of the result. ``df`` is not modified.

    Returns:
        pandas.DataFrame: one column ``ip_<country>_<group>`` per group that
        had at least one member, in the order N, Q, R_S.
    """
    stem = f'ip_{country_prefix}_'
    groups = (
        ('N', ('N77', 'N78', 'N79', 'N80T82')),
        ('Q', ('Q86', 'Q87_88')),
        ('R_S', ('R90T92', 'R93', 'S94', 'S95', 'S96')),
    )

    aggregated = {}
    for group_code, member_codes in groups:
        candidates = [stem + code for code in member_codes]
        present = [label for label in candidates if label in df.columns]
        if not present:
            continue
        aggregated[stem + group_code] = df[present].sum(axis=1)

    return pd.DataFrame(aggregated)

def aggregate_rows(df, country_prefix):
    """
    Build the aggregated sector rows ``N``, ``Q`` and ``R_S`` for one country.

    For each group, the member rows ``op_<country>_<code>`` that exist in the
    index of ``df`` are summed column-wise. A group with no member row
    present is left out of the result. ``df`` is not modified.

    Returns:
        pandas.DataFrame: one row ``op_<country>_<group>`` per group that had
        at least one member, in the order N, Q, R_S.
    """
    stem = f'op_{country_prefix}_'
    groups = (
        ('N', ('N77', 'N78', 'N79', 'N80T82')),
        ('Q', ('Q86', 'Q87_88')),
        ('R_S', ('R90T92', 'R93', 'S94', 'S95', 'S96')),
    )

    aggregated = {}
    for group_code, member_codes in groups:
        candidates = [stem + code for code in member_codes]
        present = [label for label in candidates if label in df.index]
        if not present:
            continue
        aggregated[stem + group_code] = df.loc[present].sum()

    # The sums are collected as columns; transpose so each group is a row.
    return pd.DataFrame(aggregated).T

# ---------------------------------------------------
# 4. Functions to Remove Old Rows and Columns
# ---------------------------------------------------

def remove_old_columns(df, country_prefix):
    """
    Return ``df`` without the detailed N/Q/R/S sector columns of one country.

    The columns ``ip_<country>_<code>`` for the codes listed below are
    dropped where they exist; labels that are absent are skipped. ``df``
    itself is left unchanged.
    """
    detailed_codes = (
        'N77', 'N78', 'N79', 'N80T82',
        'Q86', 'Q87_88',
        'R90T92', 'R93', 'S94', 'S95', 'S96',
    )
    obsolete = [f'ip_{country_prefix}_{code}' for code in detailed_codes]
    # errors='ignore' drops only the labels that are actually present.
    return df.drop(columns=obsolete, errors='ignore')

def remove_old_rows(df, country_prefix):
    """
    Return ``df`` without the detailed N/Q/R/S sector rows of one country.

    The rows ``op_<country>_<code>`` for the codes listed below are dropped
    where they exist; labels that are absent are skipped. ``df`` itself is
    left unchanged.
    """
    detailed_codes = (
        'N77', 'N78', 'N79', 'N80T82',
        'Q86', 'Q87_88',
        'R90T92', 'R93', 'S94', 'S95', 'S96',
    )
    obsolete = [f'op_{country_prefix}_{code}' for code in detailed_codes]
    # errors='ignore' drops only the labels that are actually present.
    return df.drop(index=obsolete, errors='ignore')

# ---------------------------------------------------
# 5. Insertion Functions for Aggregated Rows and Columns
# ---------------------------------------------------

def insert_aggregated_columns(df, new_columns, country_prefix):
    """
    Place each column of ``new_columns`` into ``df`` next to its anchor column.

    The anchor is chosen from the suffix of the new column's name:
    ``_N`` goes after ``ip_<country>_M74_M75``, ``_Q`` after
    ``ip_<country>_P85`` and ``_R_S`` after ``ip_<country>_Q``. When the
    anchor is missing (or the name has none of these suffixes) the column is
    appended at the end by direct assignment, which writes into whatever
    frame ``df`` refers to at that moment.

    Returns:
        pandas.DataFrame: the frame with the new columns included.
    """
    # Checked in this order; the first matching suffix wins.
    anchor_codes = (
        ('_N', 'M74_M75'),
        ('_Q', 'P85'),
        ('_R_S', 'Q'),
    )

    for col_name in new_columns.columns:
        anchor = None
        for suffix, anchor_code in anchor_codes:
            if col_name.endswith(suffix):
                anchor = f'ip_{country_prefix}_{anchor_code}'
                break

        if anchor not in df.columns:
            # No anchor available: append as the last column.
            df[col_name] = new_columns[col_name]
            continue

        # Split the frame right after the anchor and splice the column in.
        split_at = df.columns.get_loc(anchor) + 1
        addition = pd.DataFrame({col_name: new_columns[col_name]})
        left, right = df.iloc[:, :split_at], df.iloc[:, split_at:]
        df = pd.concat([left, addition, right], axis=1)

    return df

def insert_aggregated_rows(df, new_rows, country_prefix):
    """
    Place each row of ``new_rows`` into ``df`` below its anchor row.

    The anchor is chosen from the suffix of the new row's label:
    ``_N`` goes after ``op_<country>_M74_M75``, ``_Q`` after
    ``op_<country>_P85`` and ``_R_S`` after ``op_<country>_Q``. When the
    anchor is missing (or the label has none of these suffixes) the row is
    appended at the bottom with ``df.loc[...] = ...``, which writes into
    whatever frame ``df`` refers to at that moment.

    Returns:
        pandas.DataFrame: the frame with the new rows included.
    """
    # Checked in this order; the first matching suffix wins.
    anchor_codes = (
        ('_N', 'M74_M75'),
        ('_Q', 'P85'),
        ('_R_S', 'Q'),
    )

    for row_name, series in new_rows.iterrows():
        anchor = None
        for suffix, anchor_code in anchor_codes:
            if row_name.endswith(suffix):
                anchor = f'op_{country_prefix}_{anchor_code}'
                break

        if anchor not in df.index:
            # No anchor available: append as the last row.
            df.loc[row_name] = series
            continue

        # Split the frame right below the anchor and splice the row in.
        split_at = df.index.get_loc(anchor) + 1
        addition = series.to_frame().T
        addition.index = [row_name]
        upper, lower = df.iloc[:split_at, :], df.iloc[split_at:, :]
        df = pd.concat([upper, addition, lower])

    return df

# ---------------------------------------------------
# 6. Function to Add Gross Output Row
# ---------------------------------------------------

def add_gross_output_row(df):
    """
    Append a row labelled ``op_GO`` holding the column totals of ``df``.

    Only columns whose label starts with ``ip_`` are summed (over all rows).
    ``df`` itself is left unchanged; a new frame is returned.

    Raises:
        ValueError: if ``df`` has columns that do not start with ``ip_``, so
            that the totals row would not cover every column.
    """
    column_totals = df.filter(regex='^ip_').sum(axis=0)

    # Turn the Series of totals into a one-row frame with the target label.
    totals_row = column_totals.to_frame().T
    totals_row.index = ['op_GO']

    n_totals = totals_row.shape[1]
    n_columns = df.shape[1]
    if n_totals != n_columns:
        raise ValueError(f"Gross output row has {n_totals} values, but the DataFrame has {n_columns} columns.")

    return pd.concat([df, totals_row], axis=0)

# ---------------------------------------------------
# 5. Function to Calculate and Insert CPI Weight
# ---------------------------------------------------

def add_country_cpi_weight(df):
    """
    Add one ``<country>_cpi_weight`` column per country to ``df``.

    For every country code found by ``extract_country_codes``, the household
    consumption column ``ip_<country>_P3_S14`` is divided by its own total,
    where the total leaves out the rows listed in ``excluded_rows``. The new
    column is inserted directly after ``ip_<country>_P5M``, or appended at
    the end when that column does not exist. Countries without a household
    consumption column are reported on stdout and skipped.

    ``df`` is modified in place (``DataFrame.insert``) and also returned.

    NOTE(review): the ratio is computed for every row, including the rows
    that are excluded from the total -- confirm that consumers of this
    column only read the sector rows.
    """
    # Rows left out of the household consumption total.
    excluded_rows = ['op_W2_D21X31', 'op_W2_OP_RES', 'op_W2_OP_NRES', 
        'op_W2_D1', 'op_W2_D29X39', 'op_W2_B2A3G', 'op_GO']

    # The index does not change while columns are inserted, so the row mask
    # can be computed once for all countries.
    counted_rows = ~df.index.isin(excluded_rows)

    for country in extract_country_codes(df):
        household_col = f'ip_{country}_P3_S14'
        if household_col not in df.columns:
            print(f"Household consumption column {household_col} not found for country {country}. Skipping...")
            continue

        # Each row's share of the country's total household consumption.
        total_consumption = df.loc[counted_rows, household_col].sum()
        weights = df[household_col] / total_consumption

        # Target position: immediately right of the P5M column if present,
        # otherwise after the current last column.
        anchor = f'ip_{country}_P5M'
        if anchor in df.columns:
            position = df.columns.get_loc(anchor) + 1
        else:
            position = len(df.columns)

        df.insert(position, f'{country}_cpi_weight', weights)

    return df


# ---------------------------------------------------
# 7. Main Function to Aggregate and Insert Rows/Columns and Add Gross Output
# ---------------------------------------------------

def aggregate_and_insert_with_gross_output(df):
    """
    Run the per-country aggregation on ``df`` and append the gross output row.

    For every country code found in the columns, the detailed N/Q/R/S
    columns are summed, removed and replaced by their aggregates; then the
    same is done for the rows. The row aggregation runs on the frame that
    already contains the aggregated columns. Finally the ``op_GO`` totals
    row is appended.

    Returns:
        pandas.DataFrame: the aggregated frame including the ``op_GO`` row.
    """
    for country in extract_country_codes(df):
        # Columns: compute the sums before the source columns are dropped.
        column_sums = aggregate_columns(df, country)
        trimmed = remove_old_columns(df, country)
        df = insert_aggregated_columns(trimmed, column_sums, country)

        # Rows: same three steps on the column-aggregated frame.
        row_sums = aggregate_rows(df, country)
        trimmed = remove_old_rows(df, country)
        df = insert_aggregated_rows(trimmed, row_sums, country)

    return add_gross_output_row(df)

# ---------------------------------------------------
# 8. Split Tables into Quadrants
# ---------------------------------------------------

def extract_tables_manually(df, num_industries=56, num_countries=46, num_output_rows=7):
    """
    Split ``df`` by position into inter-industry, final demand and output blocks.

    With ``n = num_industries * num_countries`` the blocks are:

    * inter-industry: rows ``0..n-1``, columns ``0..n-1``
    * final demand:   rows ``0..n-1``, columns ``n..end``
    * output:         rows ``n..n+num_output_rows-1``, columns ``0..n-1``

    The row labels are expected to live in the index of ``df`` (they are not
    a data column), so no column is skipped. The defaults reproduce the
    layout that was previously hard-coded in this function.

    Args:
        df: the full table, split purely by position.
        num_industries: number of industries per country.
        num_countries: number of countries.
        num_output_rows: number of rows taken for the output block, directly
            below the inter-industry rows.

    Returns:
        tuple: ``(interindustry_df, final_demand_df, output_df)``, each an
        independent copy.

    Raises:
        IndexError: if ``df`` has fewer than ``n`` columns.
    """
    n_flows = num_industries * num_countries

    # Slices would silently truncate; keep the hard failure for a table that
    # is too narrow to contain the inter-industry block.
    if df.shape[1] < n_flows:
        raise IndexError(
            f"Expected at least {n_flows} columns for the inter-industry block, "
            f"but the DataFrame has {df.shape[1]}."
        )

    output_row_end = n_flows + num_output_rows

    interindustry_df = df.iloc[:n_flows, :n_flows].copy()
    final_demand_df = df.iloc[:n_flows, n_flows:].copy()
    output_df = df.iloc[n_flows:output_row_end, :n_flows].copy()

    return interindustry_df, final_demand_df, output_df

# ---------------------------------------------------
# 9. Save the Split Tables
# ---------------------------------------------------

def save_tables(interindustry_df, final_demand_df, output_df, full_df, year, output_dir):
    """
    Write the three split tables and the full table as CSV files for one year.

    The files go into ``<output_dir>/<year>``, which is created if needed.
    Every table is written with its index. A confirmation line is printed
    once all four files have been written.
    """
    year_dir = os.path.join(output_dir, str(year))
    os.makedirs(year_dir, exist_ok=True)

    # (table, file name) pairs, written in this order; the last entry is the
    # full processed table including the Gross Output row.
    exports = (
        (interindustry_df, f'interindustry_table_{year}.csv'),
        (final_demand_df, f'final_demand_table_{year}.csv'),
        (output_df, f'output_table_{year}.csv'),
        (full_df, f'full_table_{year}_aggregated_with_GO.csv'),
    )
    for table, file_name in exports:
        table.to_csv(os.path.join(year_dir, file_name), index=True)

    print(f"Processed and saved tables for the year {year} in directory: {year_dir}")
 

# ---------------------------------------------------
# 10. Main Function to Process All Files and Split Tables
# ---------------------------------------------------

def process_all_files_in_folder(input_dir, output_dir):
    """
    Run the full pipeline on every ``.csv`` file found in ``input_dir``.

    Each file is read with its first column as the index, its sector codes
    are standardized, rows and columns are prefixed, the N/Q/R/S sectors are
    aggregated, the Gross Output row and the CPI weight columns are added,
    and the result is split into blocks and saved under ``output_dir``.
    The year is taken from the part of the file name before the first
    underscore.
    """
    os.makedirs(output_dir, exist_ok=True)

    for filename in os.listdir(input_dir):
        if not filename.endswith(".csv"):
            continue

        print(f"Processing {filename}...")
        raw = pd.read_csv(os.path.join(input_dir, filename), index_col=0)

        # Label clean-up: standardized sector codes, then op_/ip_ prefixes.
        labelled = prefix_columns(prefix_rows(apply_sector_mapping(raw)))

        # Aggregation plus Gross Output row, then the CPI weight columns.
        full_df = add_country_cpi_weight(
            aggregate_and_insert_with_gross_output(labelled)
        )

        year = filename.partition('_')[0]
        interindustry_df, final_demand_df, output_df = extract_tables_manually(full_df)

        save_tables(interindustry_df, final_demand_df, output_df, full_df, year, output_dir)

# Example usage
# NOTE: both paths are absolute and specific to one machine; adjust them
# before running elsewhere. input_dir is scanned for .csv files, output_dir
# receives one sub-directory per year.
input_dir = 'C:/Users/danie/Nextcloud/Coding/Masterthesis/data/raw/figaro_tables'
output_dir = 'C:/Users/danie/Nextcloud/Coding/Masterthesis/data/processed/processed_tables'

# Process all files in the folder
# This call sits at the top level, so it runs every time this code is executed.
process_all_files_in_folder(input_dir, output_dir)

Processing 2010_figaro_64.csv...
Processed and saved tables for the year 2010 in directory: C:/Users/danie/Nextcloud/Coding/Masterthesis/data/processed/processed_tables\2010
Processing 2011_figaro_64.csv...
Processed and saved tables for the year 2011 in directory: C:/Users/danie/Nextcloud/Coding/Masterthesis/data/processed/processed_tables\2011
Processing 2012_figaro_64.csv...
Processed and saved tables for the year 2012 in directory: C:/Users/danie/Nextcloud/Coding/Masterthesis/data/processed/processed_tables\2012
Processing 2013_figaro_64.csv...
Processed and saved tables for the year 2013 in directory: C:/Users/danie/Nextcloud/Coding/Masterthesis/data/processed/processed_tables\2013
Processing 2014_figaro_64.csv...
Processed and saved tables for the year 2014 in directory: C:/Users/danie/Nextcloud/Coding/Masterthesis/data/processed/processed_tables\2014
Processing 2015_figaro_64.csv...
Processed and saved tables for the year 2015 in directory: C:/Users/danie/Nextcloud/Coding/Master