In [45]:
import os
import pandas as pd
import re
from rapidfuzz import fuzz, process


In [159]:
# Country spellings accepted by the `isin` filters applied to the source tables.
# Several variants of the two Congos are listed on purpose: the filters run on
# both raw and normalised labels.
countries = [
    'Angola',
    'Burundi',
    'Cameroon',
    'Central African Republic',
    'Chad',
    'Congo',
    'Republic of the Congo',
    'DRC',
    'Drc',
    'DR Congo',
    'Equatorial Guinea',
    'Gabon',
]

# Title-cased long names -> short label, applied with `Series.replace`
# after the country column has been stripped and title-cased.
rename_countries = {
    'Republic Of The Congo': 'Congo',
    'Democratic Republic Of The Congo': 'Drc',
    'Dr Congo': 'Drc',
}

## 1. Load the African Hydropower Atlas

In [116]:
# Atlas workbook, looked up in the `input` folder under the current working directory.
file_atlas = os.path.join(
    os.getcwd(),
    'input',
    'African_Hydropower_Atlas_v2-0.xlsx',
)

In [131]:
def remove_parentheses(text):
    """Return ``text`` lower-cased with every ``(...)`` group removed.

    Parameters
    ----------
    text : scalar
        Usually a string. Values that ``pd.isna`` reports as missing give ``""``.
        Any other non-string scalar (for example a unit name that was read as a
        number) is converted with ``str`` first, instead of raising ``TypeError``
        inside ``re.sub``.

    Returns
    -------
    str
        The cleaned, lower-cased name with whitespace runs collapsed to one
        space and leading/trailing whitespace stripped.
    """
    if pd.isna(text):
        return ""
    text = str(text)
    # Drop each parenthesised group together with the whitespace in front of it.
    cleaned = re.sub(r"\s*\([^)]*\)", "", text)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    return cleaned.lower()

In [137]:
# --- Sheet '1 - Spatial and technical data': one row per hydropower unit ---
data_atlas = pd.read_excel(file_atlas, sheet_name='1 - Spatial and technical data', index_col=None, skiprows=1, header=1)

# Normalise country labels: trim, title-case, then map long names to short labels.
data_atlas['Country'] = data_atlas['Country'].str.strip().str.title().replace(rename_countries)

# .copy() detaches the filtered rows from the full sheet, so the column
# assignments below act on an independent frame (no SettingWithCopyWarning).
data_atlas = data_atlas[data_atlas['Country'].isin(countries)].copy()
atlas_countries = data_atlas['Country'].unique()
print(f'Number of countries in the atlas: {len(atlas_countries)}. Countries: {atlas_countries}')

# Unit names are lower-cased and stripped of "(...)" qualifiers for later matching.
data_atlas['Unit Name'] = data_atlas['Unit Name'].apply(remove_parentheses)

print(f'Number of hydropower plants in the atlas: {len(data_atlas)}')


# --- Sheet '4a - HydrofleetAll': values per plant, relabelled below as scenario x month ---
data_flow = pd.read_excel(file_atlas, sheet_name='4a - HydrofleetAll', index_col=None, skiprows=None, header=0)

# This sheet is filtered against the upper-cased entries of `countries`.
# Chained (non-inplace) calls avoid mutating a filtered slice.
data_flow = (
    data_flow[data_flow['Country'].isin([c.upper() for c in countries])]
    .rename(columns={'Name': 'Unit Name'})
    .set_index(['Country', 'Unit Name'])
)

# Relabel the remaining columns as (Scenario, Month): 3 scenarios x 12 months.
cols = pd.MultiIndex.from_product([['baseline', 'dry', 'wet'], range(1, 13)], names=['Scenario', 'Month'])
if data_flow.shape[1] != len(cols):
    # set_axis would raise a generic length-mismatch ValueError; fail with context instead.
    raise ValueError(
        f"Sheet '4a - HydrofleetAll' has {data_flow.shape[1]} data columns, expected {len(cols)}."
    )
data_flow = data_flow.set_axis(cols, axis=1)

Number of countries in the atlas: 8. Countries: ['Angola' 'Burundi' 'Cameroon' 'Central African Republic' 'Congo' 'Drc'
 'Equatorial Guinea' 'Gabon']
Number of hydropower plants in the atlas: 133


## 2. Load the Global Hydropower Tracker

In [103]:
# Tracker workbook, looked up in `generation/input` under the current working directory.
file_db = os.path.join(
    os.getcwd(),
    'generation',
    'input',
    'Global-Integrated-Power-April-2025.xlsx',
)

In [125]:
# Generic descriptors stripped from plant names before fuzzy matching.
_PLANT_NAME_PHRASES = [
    "hydroelectric plant",
    "hydroelectric dam",
    "wind farm",
    "solar farm",
    "solar project",
    "solar plant",
    "solar farms",
    "power station",
    "power plant",
    "hybrid solar farm",
    "thermal power station",
    "thermal plant",
    "central",  # often means "station" in French/Portuguese
    "centrale",
    "centrales d’energie renouvelable",
    "energy project",
    "solar",   # sometimes standalone e.g. "Lubango Solar"
    "dam",
]

# One alternation, longest phrase first, anchored on word boundaries:
# - longest-first so "thermal power station" wins over "power station",
#   "solar farms" over "solar farm", "centrale" over "central";
# - \b so that "dam" is not cut out of the middle of a word such as "adamawa".
_PLANT_NAME_PHRASE_RE = re.compile(
    r"\b(?:"
    + "|".join(re.escape(p) for p in sorted(_PLANT_NAME_PHRASES, key=len, reverse=True))
    + r")\b"
)


def clean_plant_name(name: str) -> str:
    """Reduce a plant / project name to its distinctive part for matching.

    Steps, in order: lower-case; remove the generic descriptors listed in
    ``_PLANT_NAME_PHRASES`` (whole words / phrases only); remove anything in
    parentheses; delete punctuation; collapse whitespace.

    Parameters
    ----------
    name : str
        Raw name. Missing values (per ``pd.isna``) give ``""``; other
        non-string scalars are converted with ``str``.

    Returns
    -------
    str
        The cleaned, lower-cased name, possibly empty.
    """
    if pd.isna(name):
        return ""

    name = str(name).lower()

    # Remove known phrases (whole words only, longest phrase first)
    name = _PLANT_NAME_PHRASE_RE.sub("", name)

    # Remove anything in parentheses
    name = re.sub(r"\([^)]*\)", "", name)

    # Punctuation is deleted rather than replaced by a space, so
    # "Tséngue-Leledi" becomes "tséngueleledi".
    name = re.sub(r"[^\w\s]", "", name)
    name = re.sub(r"\s+", " ", name)  # Normalize whitespace

    return name.strip()


In [148]:
# Fail early with an explicit message when the tracker workbook is missing.
if not os.path.exists(file_db):
    raise FileNotFoundError(f"File {file_db} does not exist.")

# Full 'Power facilities' sheet; the first row is used as the header.
data_raw_tracker = pd.read_excel(file_db, sheet_name='Power facilities', header=[0], index_col=None)

  for idx, row in parser.parse():


In [162]:


# Keep only facilities in the study countries. The filter runs on the raw
# 'Country/area' labels, which is why `countries` lists several spellings.
# .copy() detaches the result from data_raw_tracker so the column assignments
# below act on an independent frame (no SettingWithCopyWarning).
data_tracker = (
    data_raw_tracker[data_raw_tracker['Country/area'].isin(countries)]
    .rename(columns={'Country/area': 'Country'})
    .copy()
)

# Same normalisation as for the atlas: trim, title-case, map to short labels.
data_tracker['Country'] = data_tracker['Country'].str.strip().str.title().replace(rename_countries)

tracker_countries = data_tracker['Country'].unique()
print(f'Number of countries in the tracker: {len(tracker_countries)}. Countries: {tracker_countries}')

# Clean the plant names
data_tracker['Unit Name'] = data_tracker['Plant / Project name'].apply(clean_plant_name)

# One mask for both subsets, so hydro and non-hydro always partition data_tracker.
# Rows with a missing 'Type' count as non-hydro.
is_hydro = data_tracker['Type'].str.lower().str.contains('hydropower', na=False)

data_hydro_tracker = data_tracker[is_hydro].copy()
print(f'Number of hydropower plants in the tracker: {len(data_hydro_tracker)}')

data_non_hydro_tracker = data_tracker[~is_hydro].copy()
print(f'Number of non-hydropower plants in the tracker: {len(data_non_hydro_tracker)}')

Number of countries in the atlas: 9. Countries: ['Angola' 'Drc' 'Cameroon' 'Central African Republic' 'Chad'
 'Equatorial Guinea' 'Gabon' 'Congo' 'Burundi']
Number of hydropower plants in the tracker: 58
Number of non-hydropower plants in the tracker: 165


## 3. Merge the datasets

In [166]:
# Ensure consistent format
data_atlas.loc[:, 'Name_clean'] = data_atlas.loc[:, 'Unit Name'].str.lower().str.strip()
data_hydro_tracker.loc[:, 'Name_clean'] = data_hydro_tracker.loc[:, 'Unit Name'].str.lower().str.strip()

unified = []
diffs = []

# Index labels of tracker rows that were paired with an atlas plant.
# Tracking the index (not the cleaned name) keeps same-named plants in
# different countries apart.
matched_tracker_idx = set()

atlas_cols = list(data_atlas.columns)
# Use the hydro frame's columns so matched rows also carry 'Name_clean_tracker',
# exactly like the tracker-only rows appended further down.
tracker_cols = list(data_hydro_tracker.columns)

# Loop by country
for country in data_atlas['Country'].unique():
    print(f'Processing country: {country}')
    df_atlas_c = data_atlas[data_atlas['Country'] == country].copy()
    df_tracker_c = data_hydro_tracker[data_hydro_tracker['Country'] == country].copy()

    has_tracker = not df_tracker_c.empty
    if not has_tracker:
        # Atlas plants of this country are still written out (atlas-only rows),
        # consistent with unmatched atlas plants in the other countries.
        print(f'No tracker data for {country}, keeping atlas-only rows')

    for _, atlas_row in df_atlas_c.iterrows():
        # Find best match in tracker for this plant. With a Series as choices,
        # extractOne returns (matched value, score, index label) or None when
        # nothing reaches score_cutoff.
        result = None
        if has_tracker:
            result = process.extractOne(
                atlas_row['Name_clean'],
                df_tracker_c['Name_clean'],
                scorer=fuzz.token_sort_ratio,
                score_cutoff=60
            )

        if result is not None:
            match_name, score, tracker_idx = result
            tracker_row = df_tracker_c.loc[tracker_idx]
            matched_tracker_idx.add(tracker_idx)

            # Combine atlas and tracker row into one unified row
            merged_row = {'Both datasets': True}
            for col in atlas_cols:
                merged_row[f"{col}_atlas"] = atlas_row[col]
            for col in tracker_cols:
                merged_row[f"{col}_tracker"] = tracker_row[col]
        else:
            match_name, score, tracker_idx = None, None, None
            merged_row = {f"{col}_atlas": atlas_row[col] for col in atlas_cols}

        unified.append(merged_row)

# Tracker plants that no atlas plant was matched to
unmatched_tracker = data_hydro_tracker[
    ~data_hydro_tracker.index.isin(matched_tracker_idx)
]

# Add each unmatched tracker row to the unified list (columns suffixed '_tracker')
for _, tracker_row in unmatched_tracker.iterrows():
    unified.append({f"{col}_tracker": tracker_row[col] for col in tracker_cols})

# Every tracker hydro plant must appear exactly once: matched or tracker-only.
assert len(matched_tracker_idx) + len(unmatched_tracker) == len(data_hydro_tracker)

unified_df = pd.DataFrame(unified)
output_dir = os.path.join(os.getcwd(), 'hydro', 'output')
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing folders
unified_df.to_csv(os.path.join(output_dir, 'hydro_atlas_tracker_unified.csv'), index=False)

Processing country: Angola
Processing country: Burundi
No tracker data for Burundi, skipping...
Processing country: Cameroon
Processing country: Central African Republic
Processing country: Congo
Processing country: Drc
Processing country: Equatorial Guinea
Processing country: Gabon


In [145]:
# NOTE(review): `df_unified` is not assigned in any cell shown in this part of
# the notebook; this display depends on a definition made elsewhere or on
# leftover kernel state. Confirm before relying on Restart & Run All.
df_unified

Unnamed: 0.1,Unnamed: 0,Country,Unit Name,Status,Latitude,Longitude,River Name,River Basin,Spill From,River Channel ID,...,Captive Industry Use,Captive Non Industry Use,Location accuracy,City,"Local area (taluk, county)","Major area (prefecture, district)","Subnational unit (state, province)",GEM location ID,GEM unit/phase ID,GEM.Wiki URL
0,,Angola,gove,Existing,-13.451700,15.872200,Kunene,Kunene Basin,,4655.0,...,,,exact,,,,,L100001025775,G100001030660,https://www.gem.wiki/Gove_Dam_hydroelectric_plant
1,,Angola,capanda,Existing,-9.795300,15.466900,Kwanza,Kwanza Basin,,4323.0,...,,,exact,,Pungo Andongo,,Malanje Province,L100000600015,G100000600015,https://www.gem.wiki/Capanda_hydroelectric_plant
2,,Angola,baynes,Candidate,-17.188056,12.650556,Kunene,Kunene Basin,,4255.0,...,,,,,,,,,,
3,,Angola,lauca,Committed,-9.741136,15.130165,Kwanza,Kwanza Basin,Capanda,4323.0,...,,,exact,,,,Cuanza Sul Province,L100000600017,G100000600017,https://www.gem.wiki/Laúca_hydroelectric_plant
4,,Angola,lauca ecologica,Committed,-9.741136,15.130165,Kwanza,Kwanza Basin,,4323.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
111,,Gabon,booué,pre-construction,-0.088000,11.933900,,,,,...,,,approximate,Booué,Booué,Ogooue-Ivindo,Ogooue-Ivindo,L100000601693,G100000601712,https://www.gem.wiki/Booué_hydroelectric_plant
112,,Gabon,tséngueleledi,pre-construction,-0.137900,12.196900,,,,,...,,,approximate,Makokou,Makokou,Ogooue-Ivindo,Ogooue-Ivindo,L100000601696,G100000601715,https://www.gem.wiki/Tséngue-Leledi_hydroelect...
113,,Congo,nyanga,shelved - inferred 2 y,-3.133400,12.933300,,,,,...,,,approximate,Nyanga,Nyanga,,Niari,L100000604298,G100000604680,https://www.gem.wiki/Nyanga_hydroelectric_plant
114,,Congo,sounda,pre-construction,-4.081700,12.156200,,,,,...,,,approximate,,,,,L100001025649,G100001030435,https://www.gem.wiki/Sounda_hydroelectric_plant


In [127]:
# Scratch check: repeats the fuzzy match with whatever `atlas_row` and
# `df_tracker_c` currently hold in the kernel, using a looser cutoff (50)
# than the 60 used in the merge loop.
process.extractOne(
            atlas_row['Name_clean'],
            df_tracker_c['Name_clean'],
            scorer=fuzz.token_sort_ratio,
            score_cutoff=50
        )

('gove dam', 66.66666666666667, 138218)

In [141]:
# Scratch inspection: cleaned tracker names for the country subset currently held in `df_tracker_c`.
df_tracker_c['Name_clean']

138214    caculo cabaça
138215       cambambe i
138216      cambambe ii
138217          capanda
138218             gove
138219    jamba ia mina
138220     jamba ia oma
138221            laúca
138222           lomaúm
Name: Name_clean, dtype: object

In [142]:
# Scratch inspection: cleaned name of the atlas row currently held in `atlas_row`.
atlas_row['Name_clean']

'baynes'

In [113]:
# Scratch inspection (same expression as an earlier cell): cleaned tracker names in `df_tracker_c`.
df_tracker_c['Name_clean']

1708                          biocom
2698                          luanda
3512                         cazenga
3513      caminho de ferro de luanda
3514                      de quileva
                     ...            
138218                      gove dam
138219                 jamba ia mina
138220                  jamba ia oma
138221                         laúca
138222                        lomaúm
Name: Name_clean, Length: 78, dtype: object

In [111]:
# Scratch inspection (same expression as an earlier cell): cleaned name in `atlas_row`.
atlas_row['Name_clean']

'baynes (angola)'

In [112]:
# Scratch inspection: the tracker subset currently held in `df_tracker_c`, all columns.
df_tracker_c

Unnamed: 0,Type,Country,Subregion,Region,Plant / Project name,Unit / Phase name,Plant / Project name (local),Plant / Project name (other),Capacity (MW),Status,...,Location accuracy,City,"Local area (taluk, county)","Major area (prefecture, district)","Subnational unit (state, province)",GEM location ID,GEM unit/phase ID,GEM.Wiki URL,Unit Name,Name_clean
1708,bioenergy,Angola,Sub-Saharan Africa,Africa,Biocom power station,1,,,100.0,operating,...,exact,,Pungo Andongo,,Malanje Province,L100000201400,G100000201683,https://www.gem.wiki/Biocom_power_station,biocom,biocom
2698,bioenergy,Angola,Sub-Saharan Africa,Africa,Luanda power station,1,,Luanda Municipal Solid Waste,30.0,cancelled - inferred 4 y,...,approximate,,Cazenga Municipality,,Luanda Province,L100000201401,G100000201684,https://www.gem.wiki/Luanda_power_station,luanda,luanda
3512,oil/gas,Angola,Sub-Saharan Africa,Africa,Cazenga power station,1,,,100.0,pre-construction,...,exact,Cazenga,,,Luanda,L100000406733,G100000407057,https://www.gem.wiki/Cazenga_power_station,cazenga,cazenga
3513,oil/gas,Angola,Sub-Saharan Africa,Africa,Central Caminho de Ferro de Luanda (CFL) power...,1,,,125.0,operating,...,exact,Luanda,,,,L100000409040,G100000413546,https://www.gem.wiki/Central_Caminho_de_Ferro_...,caminho de ferro de luanda,caminho de ferro de luanda
3514,oil/gas,Angola,Sub-Saharan Africa,Africa,Central de Quileva power plant,1,,,84.0,operating,...,exact,Lobito,,,,L100000409055,G100000413531,https://www.gem.wiki/Central_de_Quileva_power_...,de quileva,de quileva
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138218,hydropower,Angola,Sub-Saharan Africa,Africa,Gove Dam hydroelectric plant,,,,60.0,operating,...,exact,,,,,L100001025775,G100001030660,https://www.gem.wiki/Gove_Dam_hydroelectric_plant,gove dam,gove dam
138219,hydropower,Angola,Sub-Saharan Africa,Africa,Jamba ia Mina hydroelectric plant,,,Jamba Ya Mina,224.0,announced,...,approximate,Jamba,Jamba,Huíla,Huíla,L100000600016,G100000604456,https://www.gem.wiki/Jamba_ia_Mina_hydroelectr...,jamba ia mina,jamba ia mina
138220,hydropower,Angola,Sub-Saharan Africa,Africa,Jamba ia Oma hydroelectric plant,,,Jamba Ya Oma,79.0,announced,...,exact,,,,Huíla Province,L100000604243,G100000604457,https://www.gem.wiki/Jamba_ia_Oma_hydroelectri...,jamba ia oma,jamba ia oma
138221,hydropower,Angola,Sub-Saharan Africa,Africa,Laúca hydroelectric plant,,,,2070.0,operating,...,exact,,,,Cuanza Sul Province,L100000600017,G100000600017,https://www.gem.wiki/Laúca_hydroelectric_plant,laúca,laúca
