In [1]:
import pandas as pd
import numpy as np

In [2]:
A_df = pd.read_csv("data/A.csv", header=None)
B_df = pd.read_csv("data/B.csv", header=None)
C_df = pd.read_csv("data/C.csv", header=None)

# Convert all string-looking numbers to floats
A = A_df.apply(pd.to_numeric, errors='coerce').values
B = B_df.apply(pd.to_numeric, errors='coerce').values
C = C_df.apply(pd.to_numeric, errors='coerce').values

In [3]:
A_index_df = pd.read_csv("data/index_A.csv")
B_index_df = pd.read_csv("data/index_B.csv")
C_index_df = pd.read_csv("data/index_C.csv")

In [4]:
A_foreground_df = pd.read_csv("data/Foreground_A.csv")

# Removing Transportation (deregionalization)

In [5]:
A_transport_df = pd.read_csv("data/Transportation_A.csv")

In [6]:
# create a dict mapping each provider name to all its indices in A_index_df
mapping = A_index_df.groupby('provider name')['index'].apply(list)

# build a single flat list of all matching indices for the foreground processes
matched_indices_transport = [
    idx
    for name in A_transport_df['provider name']
    if name in mapping
    for idx in mapping[name]
]

In [7]:
import numpy as np

# matched_indices_transport is the list of indices to remove
to_drop = np.array(sorted(set(matched_indices_transport), key=int))

# 1) Remove from A_index_df
mask_keep = ~A_index_df['index'].isin(to_drop)
A_index_df = A_index_df.loc[mask_keep].copy()

# 2) Remove corresponding rows and columns from A
A = np.delete(A, to_drop, axis=0)  # remove rows
A = np.delete(A, to_drop, axis=1)  # remove columns

# 3) Remove the same columns from B (keep rows)
B = np.delete(B, to_drop, axis=1)

# 4) Reset the index column in A_index_df
A_index_df['index'] = np.arange(len(A_index_df), dtype=int)

## Aggregating electricity

In [8]:
A_elec_df = pd.read_csv("data/Electricity_A.csv")

In [9]:
# Inputs assumed:
# A : numeric numpy array (rows x cols)
# A_index_df : DataFrame with columns ["index", "provider name", "flow name", ...]
# A_elec_df : DataFrame with column ["provider name"] listing all electricity providers
# The indices in A_index_df["index"] align with both row and column positions of A.

# 0) Build the set of electricity provider names
elec_names = set(A_elec_df['provider name'].dropna().astype(str).unique())

# 1) Find their indices in A_index_df
elec_idx = A_index_df.loc[A_index_df['provider name'].astype(str).isin(elec_names), 'index'].astype(int).unique()

# 2) Locate the mix row index (must exist)
mix_name = "Electricity Mix (Global)"
mix_rows = A_index_df.loc[A_index_df['provider name'] == mix_name, 'index'].astype(int).unique()
if len(mix_rows) == 0:
    raise ValueError("Electricity Mix (Global) not found in A_index_df['provider name'].")
mix_idx = int(mix_rows[0])

# Ensure the mix row is not purged
elec_idx_set = set(map(int, elec_idx))
elec_idx_wo_mix = sorted(elec_idx_set - {mix_idx})

# 3) Aggregate: add all electricity rows (except the mix row) into the mix row, column-wise
if len(elec_idx_wo_mix) > 0:
    # in case of NaNs
    add_block = np.nansum(A[elec_idx_wo_mix, :], axis=0)
    A[mix_idx, :] = np.nan_to_num(A[mix_idx, :]) + np.nan_to_num(add_block)

# 4) Decide what to drop
rows_to_drop = np.array(elec_idx_wo_mix, dtype=int)            # drop electricity rows except the mix row
cols_to_drop = np.array(elec_idx_wo_mix, dtype=int)            # drop electricity columns except the mix column

# (Optionally also drop the mix COLUMN; keep it if you want to retain that process as a column)
# To ALSO drop the mix column, uncomment the next line:
# cols_to_drop = np.array(sorted(elec_idx_set), dtype=int)

# 5) Remove rows/columns from A
if rows_to_drop.size > 0:
    A = np.delete(A, rows_to_drop, axis=0)
if cols_to_drop.size > 0:
    A = np.delete(A, cols_to_drop, axis=1)

# 6) Remove the same rows from A_index_df (only rows; columns in A_index_df are metadata)
if len(elec_idx_wo_mix) > 0:
    keep_mask = ~A_index_df['index'].astype(int).isin(elec_idx_wo_mix)
    A_index_df = A_index_df.loc[keep_mask].copy()

# 7) Reset the "index" column in A_index_df to reflect 0..n-1 after deletions
A_index_df['index'] = np.arange(len(A_index_df), dtype=int)

## Identifying background flows for cost calculation

In [10]:
# create a dict mapping each provider name to all its indices in A_index_df
mapping = A_index_df.groupby('provider name')['index'].apply(list)

# build a single flat list of all matching indices for the foreground processes
matched_indices = [
    idx
    for name in A_foreground_df['provider name']
    if name in mapping
    for idx in mapping[name]
]

In [11]:
# A is your numeric numpy array (rows × columns)
# matched_indices is the list of column indices of interest

# dictionary: column index → list of non-zero row indices
nonzero_rows = {col: list(np.nonzero(A[:, col])[0]) for col in matched_indices}

In [12]:
# foreground indices to exclude
foreground_set = set(matched_indices)

# collect all non-zero row indices from the matched columns
all_nonzero = set()
for col in matched_indices:
    all_nonzero.update(np.nonzero(A[:, col])[0])

# remove overlaps with foreground indices
filtered_nonzero_rows = list(all_nonzero - foreground_set)

print(filtered_nonzero_rows)

[1, 3, 4, 5, 6, 7, 8, 9, 10, 12, 18, 19, 20, 21, 535, 27, 540, 29, 32, 35, 43, 46, 47, 48, 563, 567, 59, 63, 64, 65, 66, 67, 70, 71, 77, 79, 80, 81, 594, 596, 101, 104, 105, 110, 111, 115, 116, 117, 118, 119, 120, 122, 126, 127, 146, 171, 172, 173, 174, 175, 176, 179, 182, 184, 196, 207, 210, 215, 219, 222, 223, 224, 225, 236, 237, 264, 273, 288, 318, 319, 358, 366, 367, 374, 378, 380, 382, 383, 384, 387, 419, 420, 426, 429]


In [15]:
wanted = pd.Series(filtered_nonzero_rows, dtype=int)
result_df = (
    A_index_df.loc[A_index_df['index'].astype(int).isin(wanted),
                   ['index', 'provider name', 'flow name']]
    .reset_index(drop=True)
)

In [16]:
result_df

Unnamed: 0,index,provider name,flow name
0,1,"Diesel, combusted in industrial equipment","Diesel, combusted in industrial equipment"
1,3,"Steam, purchased by containerboard mills","Steam, purchased by containerboard mills"
2,4,"Natural gas, processed, for material use, at p...","Natural gas, production mixture, to material use"
3,5,"Natural gas, combusted in industrial boiler","Natural gas, combusted in industrial boiler"
4,6,"LPG, combusted in industrial boiler, at pulp a...","LPG, combusted in industrial boiler, at pulp a..."
...,...,...,...
89,540,LDPE Non-food Packaging Films,LDPE non-food packaging films
90,563,PP Non-food Packaging Films,PP non-food packaging films
91,567,"Acetic acid, at plant","Acetic acid, at plant"
92,594,"Natural gas, combusted in industrial boiler, a...","Natural gas, combusted in industrial boiler, a..."


In [17]:
result_df.to_csv('filtered_nonzero_rows_with_names.csv', index=False)