# Imports

In [98]:
import pandas as pd 
from bs4 import BeautifulSoup
import requests
from rapidfuzz import fuzz, process
import unicodedata

# Constants and Funcs

In [99]:
fre_mulitplier = 78.5

countries = ['bangladesh', 'brazil', 'cambodia','india', 'china', 'indonesia', 'mexico']

city_country = ['brazil', 'cambodia','rwanda']

div_admin_countries = ['bangladesh']

def convert_to_flt(str):
    try:
        return float(str.replace(",", ""))  
    except ValueError:
        return None

In [100]:
def normalize(text):
    if not isinstance(text, str):
        return ""
    text = text.lower().strip().replace(" ", "")
    text = unicodedata.normalize('NFKD', text)
    return ''.join(c for c in text if not unicodedata.combining(c))

def build_normalized_mapping(df_source, df_target, source_col='L1', target_col='L1', threshold=85):
    source_names = df_source[source_col].dropna().unique()
    target_names = df_target[target_col].dropna().unique()
    
    target_lookup = {normalize(name): name for name in target_names}
    mapping = {}

    for src in source_names:
        norm_src = normalize(src)
        match, score, _ = process.extractOne(norm_src, list(target_lookup.keys()), scorer=fuzz.ratio)
        if score >= threshold:
            mapping[src] = target_lookup[match]
        else:
            mapping[src] = None

    return mapping

In [101]:
def make_df_admin(page):
    soup = BeautifulSoup(page.content, 'html.parser')
    # print(soup.prettify())

    rows = soup.find_all('tr', class_=['rname', 'admin2'])
    # rows = soup.find_all('tr')

    data = []

    header_cells = soup.find_all('th', class_='rpop')
    # import pdb; pdb.set_trace()
    years = []
    for header in header_cells:
        # Extract the year from the 'span' element inside the 'th' (e.g., "2005-06-30")
        span = header.find('span', class_='unit')
        # import pdb; pdb.set_trace()
        if span:
            # Extract year from the text, assuming it's in the form "Census\nYYYY-MM-DD"
            year = span.text.strip().split('-')[0][-4:]  # Get the year part
            years.append(year)


        
    for row in rows:
        try:
            # Get the name, status, and population for each census year
            name = row.find('td', class_='rname').text.strip()
            status = row.find('td', class_='rstatus').text.strip()

            # Extract area and population data for different census years
            # import pdb; pdb.set_trace()
            area = convert_to_flt(row.find('td', class_='rname')['data-area'])  # Get the 'data-area' attribute

            # Extract all population data (there could be multiple columns for different years)
            population_cells = row.find_all('td', class_='rpop')

            # Create a dictionary to store population data for each year
            population_data = {}
            for i, cell in enumerate(population_cells):
                population = cell.text.strip()
                population_data[f'Population {years[i]}'] = convert_to_flt(population)

            # Append the extracted data into the list
            row_data = [name, status, area] + list(population_data.values())
            data.append(row_data)
            
        except:
            continue
    # Create a DataFrame
    columns = ['Name', 'Status', 'Area'] + [year for year in years]  
    df = pd.DataFrame(data, columns=columns)

    latest_population_column = years[-1]

    return df, latest_population_column

In [102]:
def make_df_city(page, section_id):
    soup = BeautifulSoup(page.content, 'html.parser')
    soup = soup.find(id=section_id)
    # print(soup.prettify())

    # rows = soup.find_all('tr', class_=['rname', 'admin2'])
    rows = soup.find_all('tr')

    data = []

    header_cells = soup.find_all('th', class_='rpop')

    years = []

    for header in header_cells:
        # Extract the year from the 'span' element inside the 'th' (e.g., "2005-06-30")
        span = header.find('span', class_='unit')
        if span:
            # Extract year from the text, assuming it's in the form "Census\nYYYY-MM-DD"
            year = span.text.strip().split('-')[0][-4:]  # Get the year part
            if year not in years:
                years.append(year)

    # import pdb; pdb.set_trace()
        
    for row in rows:
        if section_id == 'adminareas':
            try:
                # Get the name, status, and population for each census year
                name = row.find('td', class_='rname').text.strip()
                status = row.find('td', class_='rstatus').text.strip()

                # Extract area and population data for different census years
                # import pdb; pdb.set_trace()
                area = row.find('td', class_='rarea').text.strip()  # Get the 'data-area' attribute

                # Extract all population data (there could be multiple columns for different years)
                population_cells = row.find_all('td', class_='rpop')

                # Create a dictionary to store population data for each year
                population_data = {}
                for i, cell in enumerate(population_cells):
                    population = cell.text.strip()
                    population_data[f'Population {years[i]}'] = convert_to_flt(population)

                # Append the extracted data into the list
                row_data = [name, status, area] + list(population_data.values())
                data.append(row_data)
                
            except Exception as e:
                # import pdb; pdb.set_trace()
                continue
            # Create a DataFrame
          

        elif section_id == "largecities":
            # import pdb; pdb.set_trace()
            try:
                # Get the name, status, and population for each census year
                name = row.find('td', {'class': None}).text.strip()
                # Extract all population data (there could be multiple columns for different years)
                population_tag = row.find('td', {'class': 'rpop'}).text.strip()

                # Append the extracted data into the list
                row_data = [name, population_tag]
                data.append(row_data)
                
            except Exception as e:
                # import pdb; pdb.set_trace()
                continue
            # Create a DataFrame

        elif section_id == "citysection":
            # import pdb; pdb.set_trace()
            try:
                # Get the name, status, and population for each census year
                name = row.find('td', class_='rname').text.strip()
                # status = row.find('td', class_='rstatus').text.strip()

                # Extract area and population data for different census years
                # import pdb; pdb.set_trace()
                area = row.find('td', class_='noviz').text.strip()  # Get the 'data-area' attribute

                # Extract all population data (there could be multiple columns for different years)
                population_cells = row.find_all('td', class_='rpop')

                # Create a dictionary to store population data for each year
                population_data = {}
                for i, cell in enumerate(population_cells):
                    population = cell.text.strip()
                    population_data[f'Population {years[i]}'] = convert_to_flt(population)

                # Append the extracted data into the list
                row_data = [name, area] + list(population_data.values())
                data.append(row_data)
                
            except Exception as e:
                # import pdb; pdb.set_trace()
                continue
            # Create a DataFrame

        
    
    if section_id == 'adminareas':
        columns = ['Name', 'Status', 'Area'] + [year for year in years]  
    elif section_id == 'largecities':
        columns = ['Name'] + [year for year in years] 
    elif section_id == 'citysection':
        columns = ['Name', 'Area'] + [year for year in years] 

    df = pd.DataFrame(data, columns=columns)

    latest_population_column = years[-1]

    return df, latest_population_column



In [103]:
def make_df_blank(page):
    soup = BeautifulSoup(page.content, 'html.parser')
    # print(soup.prettify())

    # rows = soup.find_all('tr', class_=['rname', 'admin2'])
    admin_section = soup.find('section', id='adminareas')
    rows = admin_section.find_all('tr', attrs={'itemscope': True})
    # rows = soup.find_all('tr')

    data = []

    header_cells = admin_section.find_all('th', class_='rpop')
    # import pdb; pdb.set_trace()
    years = []
    for header in header_cells:
        # Extract the year from the 'span' element inside the 'th' (e.g., "2005-06-30")
        span = header.find('span', class_='unit')
        # import pdb; pdb.set_trace()
        if span:
            # Extract year from the text, assuming it's in the form "Census\nYYYY-MM-DD"
            year = span.text.strip().split('-')[0][-4:]  # Get the year part
            years.append(year)

    years = list(dict.fromkeys(years))  # Remove duplicates while preserving order

    for row in rows:
        # import pdb; pdb.set_trace()
        try:
            # Get the name, status, and population for each census year
            name = row.find('td', class_='rname').find('span', itemprop='name').text.strip()

            status = row.find('td', class_='rname')['data-status'].strip()

            # Extract area and population data for different census years
            # import pdb; pdb.set_trace()
            area_text = row.find('td', class_='rarea').text.strip()  # Get the 'data-area' attribute
            area = convert_to_flt(area_text) 
            # Extract all population data (there could be multiple columns for different years)
            population_cells = row.find_all('td', class_='rpop')
            population_data = {}
            for i, cell in enumerate(population_cells):
                population = cell.text.strip()
                population_data[f'Population {years[i]}'] = convert_to_flt(population)

            # Append the full row data
            row_data = [name, status, area] + list(population_data.values())
            data.append(row_data)
            
        except:
            continue

    # import pdb; pdb.set_trace()
    # Create a DataFrame
    columns = ['Name', 'Status', 'Area'] + [year for year in years]  
    df = pd.DataFrame(data, columns=columns)

    latest_population_column = years[-1]

    return df, latest_population_column

In [104]:
def make_df_l2(page):
    soup = BeautifulSoup(page.content, 'html.parser')
    table = soup.find('table')
    if table is None:
        raise ValueError("No table found on the page.")

    # Extract table headers
    headers = []
    for th in table.find_all('th'):
        headers.append(th.get_text(strip=True))

    # Extract table rows
    rows = []
    for tr in table.find_all('tr')[1:]:  # Skip the header row
        cells = tr.find_all(['td', 'th'])
        row = [cell.get_text(strip=True) for cell in cells]
        if row:
            rows.append(row)

    # header = rows[0][1:]
    new_header = ['L1', 'L2'] + headers[1:]

    data = []

    state = rows[0][0]

    if len(rows) == 1:
        data.append([state, state] + rows[0][1:])
    else:
        for row in rows[1:]:
            if len(row) >= 2:  # make sure there's something to split
                new_row = [state] + [row[0]] + row[1:]
                data.append(new_row)

            
    
    

    # Create a DataFrame from the extracted data
    df = pd.DataFrame(data, columns=new_header)
    df = df.iloc[:-1]

    return df

In [105]:
def make_df_brazil_problem(page):
    soup = BeautifulSoup(page.content, 'html.parser')
    table = soup.find('table')
    if table is None:
        raise ValueError("No table found on the page.")

    # Get header
    header_cells = table.find('tr').find_all('th')
    headers = [cell.get_text(strip=True) for cell in header_cells if cell.get_text(strip=True)]

    rows = []
    for tr in table.find_all('tr')[1:]:  # Skip the header
        cells = tr.find_all(['td'])
        row = [cell.get_text(strip=True) for cell in cells]
        if row:
            row = [row[0]] + row
            rows.append(row)

    
    headers = ['L1', 'L2'] + headers[1:]

    
    # Auto-adjust column count
    max_len = max(len(row) for row in rows)
    if len(headers) < max_len:
        # Fill in missing column names as "Extra_1", "Extra_2", etc.
        headers += [f"Extra_{i+1}" for i in range(max_len - len(headers))]
    elif len(headers) > max_len:
        # Trim headers if too many
        headers = headers[:max_len]

    df = pd.DataFrame(rows, columns=headers)
    df = df.iloc[:-1]
    return df

In [106]:
def make_indo_l1l2(page):
    soup = BeautifulSoup(page.content, 'html.parser')

    rows = []
    current_l1 = None

    # Get dynamic headers from the first table
    table = soup.find('table')
    header_cells = table.find('tr').find_all('th')
    headers = ['Province'] + [cell.get_text(strip=True) for cell in header_cells if cell.get_text(strip=True)]

    # Loop through all tbody sections
    for tbody in soup.find_all('tbody'):
        if tbody.get('class') == ['admin1']:
            # This tbody is for Province (L1)
            tr = tbody.find('tr')
            td = tr.find('td', class_='rname')
            current_l1 = td.get_text(strip=True)
        elif tbody.get('class') == ['admin2']:
            # This tbody is for Districts (L2)
            for tr in tbody.find_all('tr'):
                cells = tr.find_all('td')
                if not cells:
                    continue
                row_data = [current_l1]  # start row with Province

                for cell in cells:
                    row_data.append(cell.get_text(strip=True))
                
                rows.append(row_data)

    # Adjust header if rows have more columns
    max_len = max(len(r) for r in rows)
    if len(headers) < max_len:
        headers += [f"Extra_{i+1}" for i in range(max_len - len(headers))]

    df = pd.DataFrame(rows, columns=headers)

    return df

# Webscraper

## For Asia and LATAM

In [107]:
for country in countries:
    print(country)
    if country in city_country:
        page = requests.get(f"https://www.citypopulation.de/en/{country}/cities/")
        admin_df, year = make_df_city(page, 'adminareas')
        # city_df, year = make_df_city(page, 'largecities')
        # city_section_df, year = make_df_city(page, 'citysection')

        # df = pd.concat([admin_df, city_df, city_section_df], axis=0)
        df = admin_df

    elif country in div_admin_countries:
        page = requests.get(f"https://www.citypopulation.de/en/{country}/div/admin/")
        df, year = make_df_blank(page)
    
    else:
        page = requests.get(f"https://www.citypopulation.de/en/{country}/admin/")
        df, year = make_df_admin(page)

        # country_admin_df_name = f"{country}_admin_df"
        # country_city_df_name = f"{country}_city_df"
        # country_city_section_df_name = f"{country}_city_section_df"
        # globals()[country_admin_df_name] = admin_df
        # globals()[country_city_df_name] = city_df
        # globals()[country_city_section_df_name] = city_section_df

# import pdb; pdb.set_trace()
    country_df_name = f"{country}_df"  
    country_year_name = f"{country}_year"
    globals()[country_df_name] = df
    globals()[country_year_name] = year

    


bangladesh
brazil
cambodia
india
china
indonesia
mexico


## For Africa

In [108]:
page = requests.get(f"https://www.citypopulation.de/en/kenya/sub/admin/")
kenya_df, year = make_df_admin(page)
kenya_l2 = make_indo_l1l2(page)
kenya_l2['L0'] = 'Kenya'
kenya_l2

Unnamed: 0,Province,Name,Status,PopulationCensus2009-08-24,PopulationCensus2019-08-24,Extra_1,L0
0,Baringo,Baringo Central,Subcounty,89174,96951,→,Kenya
1,Baringo,Baringo North,Subcounty,93789,104871,→,Kenya
2,Baringo,East Pokot,Subcounty,57740,79923,→,Kenya
3,Baringo,Koibatek,Subcounty,105273,129535,→,Kenya
4,Baringo,Marigat(incl. Lake Baringo),Subcounty,74413,90958,→,Kenya
...,...,...,...,...,...,...,...
338,West Pokot,Kipkomo,Subcounty,66985,102633,→,Kenya
339,West Pokot,Pokot Central,Subcounty,101022,119016,→,Kenya
340,West Pokot,Pokot North,Subcounty,156011,134485,→,Kenya
341,West Pokot,Pokot South,Subcounty,49172,80661,→,Kenya


In [109]:
page = requests.get(f"https://www.citypopulation.de/en/liberia/")
liberia_df, year = make_df_blank(page)
# liberia_l2 = make_indo_l1l2(page)
liberia_df

Unnamed: 0,Name,Status,Area,1984,2008,2022
0,Bomi,Cnty,1942.0,66420.0,84119.0,133705.0
1,Bong,Cnty,8769.0,255813.0,333481.0,467561.0
2,Gbarpolu,Cnty,9685.0,48399.0,83388.0,95995.0
3,Grand Bassa,Cnty,7932.0,159648.0,221693.0,293689.0
4,Grand Cape Mount,Cnty,5160.0,79322.0,127076.0,178867.0
5,Grand Gedeh,Cnty,10480.0,63028.0,125258.0,216692.0
6,Grand Kru,Cnty,3894.0,62791.0,57913.0,109342.0
7,Lofa,Cnty,9978.0,199242.0,276863.0,367376.0
8,Margibi,Cnty,2615.0,151792.0,209923.0,304946.0
9,Maryland,Cnty,2296.0,69267.0,135938.0,172587.0


In [110]:
page = requests.get(f"https://www.citypopulation.de/en/rwanda/admin/")
rwanda_df, year = make_df_admin(page)
rwanda_l2 = make_indo_l1l2(page)
rwanda_l2['L0'] = 'Rwanda'
rwanda_df

Unnamed: 0,Name,Status,Area,2002,2012,2022
0,Est (Intara y'Iburasirazuba) [Eastern],Province,9458.0,1700137.0,2595703.0,3563145.0
1,Kigali,City,730.0,765325.0,1132686.0,1745555.0
2,Nord (Intara y'Amajyaruguru) [Northern],Province,3276.0,1560862.0,1726370.0,2038511.0
3,Ouest (Intara y'Iburengerazuba) [Western],Province,5882.0,2043555.0,2471239.0,2896484.0
4,Sud (Intara y'Amajyepfo) [Southern],Province,5963.0,2058674.0,2589975.0,3002699.0


In [111]:
page = requests.get(f"https://www.citypopulation.de/en/southafrica/admin/")
southafrica_df, year = make_df_admin(page)
southafrica_l2 = make_indo_l1l2(page)
southafrica_l2['L0'] = 'South Africa'
southafrica_df

Unnamed: 0,Name,Status,Area,1996,2001,2011,2022
0,Eastern Cape,Province,168966.0,6147244.0,6278651.0,6562053.0,7230204.0
1,Free State (Oranje Free State),Province,129825.0,2633504.0,2706775.0,2745590.0,2964412.0
2,Gauteng,Province,18178.0,7834620.0,9390528.0,12272263.0,15099422.0
3,KwaZulu-Natal,Province,94361.0,8572302.0,9584129.0,10267300.0,12423907.0
4,Limpopo (Northern Transvaal),Province,125754.0,4576133.0,4995462.0,5404868.0,6572721.0
5,Mpumalanga (Eastern Transvaal),Province,76495.0,3124203.0,3365957.0,4039939.0,5143324.0
6,Northern Cape,Province,372889.0,1011864.0,991876.0,1145861.0,1355945.0
7,North West,Province,104882.0,2726828.0,2982064.0,3509953.0,3804548.0
8,Western Cape,Province,129462.0,3956875.0,4524335.0,5822734.0,7433020.0


In [112]:
page = requests.get(f"https://www.citypopulation.de/en/gambia/admin/")
gambia_df, year = make_df_admin(page)
gambia_l2 = make_indo_l1l2(page)
gambia_l2['L0'] = 'Gambia'
gambia_df

Unnamed: 0,Name,Status,Area,1993,2003,2013
0,Banjul,Local Government Area,12.0,42326.0,35061.0,31054.0
1,Basse (Upper River),Local Government Area,2070.0,155059.0,182586.0,237220.0
2,Brikama (Western),Local Government Area,1764.0,234917.0,389594.0,688744.0
3,Janjanbureh,Local Government Area,1428.0,88247.0,107212.0,125204.0
4,Kanifeng,Local Government Area,76.0,228214.0,322735.0,377134.0
5,Kerewan (North Bank),Local Government Area,2255.0,156462.0,172835.0,220080.0
6,Kuntaur,Local Government Area,1467.0,67774.0,78491.0,96703.0
7,Mansakonko (Lower River),Local Government Area,1618.0,65146.0,72167.0,81042.0


In [113]:
page = requests.get(f"https://www.citypopulation.de/en/zambia/admin/")
zambia_df, year = make_df_admin(page)
zambia_l2 = make_indo_l1l2(page)
zambia_l2['L0'] = 'Zambia'
zambia_df

Unnamed: 0,Name,Status,Area,2000,2010,2022
0,Central,Province,94394.0,1012257.0,1307111.0,2261336.0
1,Copperbelt,Province,31328.0,1581221.0,1972317.0,2768192.0
2,Eastern,Province,68948.8,1306173.0,1696555.0,2462682.0
3,Luapula,Province,50567.0,775353.0,991927.0,1519478.0
4,Lusaka,Province,21986.0,1391329.0,2191225.0,3093617.0
5,Muchinga,Province,70243.2,449296.0,607763.0,922213.0
6,Northern,Province,77650.0,809400.0,1105824.0,1623853.0
7,North-Western,Province,125826.0,583350.0,727044.0,1278357.0
8,Southern,Province,85283.0,1212124.0,1589926.0,2388091.0
9,Western,Province,126386.0,765088.0,902974.0,1375604.0


In [114]:
page = requests.get(f"https://www.citypopulation.de/en/malawi/admin/")
malawi_df, year = make_df_admin(page)
malawi_l2 = make_indo_l1l2(page)
malawi_l2['L0'] = 'Malawi'
malawi_df

Unnamed: 0,Name,Status,Area,1977,1987,1998,2008,2018,2023
0,Central,Region,35641.0,2143716.0,3110986.0,4066340.0,5497252.0,7523340.0,8533879.0
1,Northern,Region,27131.0,648853.0,911787.0,1233560.0,1679491.0,2289780.0,2532270.0
2,Southern,Region,31780.0,2754891.0,3965734.0,4633968.0,5852755.0,7750629.0,8743362.0


In [115]:
page = requests.get(f"https://www.citypopulation.de/en/ghana/admin/")
ghana_df, year = make_df_admin(page)
ghana_l2 = make_indo_l1l2(page)
ghana_l2['L0'] = 'Ghana'
ghana_df

Unnamed: 0,Name,Status,Area,2010,2021
0,Ahafo (← Brong Ahafo),Region,5196.0,484210.0,564668.0
1,Ashanti,Region,24389.0,4780380.0,5440463.0
2,Bono (← Brong Ahafo),Region,11113.0,922617.0,1208649.0
3,Bono East (← Brong Ahafo),Region,23248.0,904156.0,1203400.0
4,Central,Region,9826.0,2201863.0,2859821.0
5,Eastern,Region,19323.0,2633154.0,2925653.0
6,Greater Accra,Region,3245.0,4010054.0,5455692.0
7,North East (← Northern),Region,9070.0,465005.0,658946.0
8,Northern,Region,26524.0,1544946.0,2310939.0
9,Oti (← Volta),Region,11066.0,636889.0,747248.0


In [116]:
page = requests.get(f"https://www.citypopulation.de/en/eswatini/")
eswatini_df, year = make_df_blank(page)
# eswatini_l2 = make_indo_l1l2(page)
eswatini_df

Unnamed: 0,Name,Status,Area,1976,1986,1997,2007,2017,2024
0,Hhohho,Reg,3569.0,95759.0,178936.0,255455.0,282734.0,320651.0,355700.0
1,Lubombo,Reg,5947.0,81800.0,153958.0,194323.0,207731.0,212531.0,237500.0
2,Manzini,Reg,4068.0,101277.0,192596.0,280972.0,319530.0,355945.0,397000.0
3,Shiselweni,Reg,3779.0,95735.0,155569.0,198978.0,208454.0,204111.0,212200.0


In [117]:
page = requests.get(f"https://www.citypopulation.de/en/drcongo/cities/")
congo_df, year = make_df_city(page, 'adminareas')
# eswatini_l2 = make_indo_l1l2(page)
congo_df

Unnamed: 0,Name,Status,Area,1984,2010,2015,2020
0,Bas-Uele,Prov,148331,545458.0,1004000.0,1138000.0,1369600.0
1,Équateur,Prov,103902,635298.0,1315000.0,1528000.0,1856000.0
2,Haut-Katanga,Prov,132425,1391617.0,3788000.0,4617000.0,5718800.0
3,Haut-Lomami,Prov,108204,891021.0,2426000.0,2957000.0,3662800.0
4,Haut-Uele,Prov,89683,893111.0,1643000.0,1864000.0,2242500.0
5,Ituri,Prov,65658,1749256.0,3219000.0,3650000.0,4392200.0
6,Kasaï,Prov,95631,1096783.0,2388000.0,2801000.0,3417000.0
7,Kasaï-Central,Prov,59500,1298463.0,2827000.0,3317000.0,4045300.0
8,Kasaï-Oriental,Prov,9545,1080610.0,2634000.0,3145000.0,3864300.0
9,Kinshasa,Cap,9965,2664309.0,8683000.0,11575000.0,14565700.0


In [118]:
page = requests.get(f"https://www.citypopulation.de/en/ethiopia/admin/")
# congo_df = make_df_city(page, 'adminareas')
ethiopia_l2 = make_indo_l1l2(page)
ethiopia_l2['L0'] = 'Ethiopia'
ethiopia_l2

Unnamed: 0,Province,Name,Status,PopulationCensus2007-05-28,PopulationProjection2022-07-01,Extra_1,L0
0,Adis Abeba[Addis Ababa],Addis Ketema,Sub City,255372,359735,→,Ethiopia
1,Adis Abeba[Addis Ababa],Akaki Kaliti,Sub City,181270,255348,→,Ethiopia
2,Adis Abeba[Addis Ababa],Arada,Sub City,211501,298044,→,Ethiopia
3,Adis Abeba[Addis Ababa],Bole,Sub City,308995,435421,→,Ethiopia
4,Adis Abeba[Addis Ababa],Gulele,Sub City,267624,377032,→,Ethiopia
...,...,...,...,...,...,...,...
738,Tigray,Tsegede,District,103852,127171,→,Ethiopia
739,Tigray,Tselemti,District,138858,166395,→,Ethiopia
740,Tigray,Welkayit,District,138926,168600,→,Ethiopia
741,Tigray,Were Lehe,District,146104,182387,→,Ethiopia


In [119]:
page = requests.get(f"https://www.citypopulation.de/en/lesotho/")
lesotho_df, year = make_df_blank(page)
# eswatini_l2 = make_indo_l1l2(page)

lesotho_df

Unnamed: 0,Name,Status,Area,1986,1996,2006,2016,2023
0,Berea,Dist,2222.0,149289.0,241946.0,250006.0,262616.0,271700.0
1,Butha-Buthe,Dist,1767.0,106880.0,109905.0,110320.0,118242.0,124100.0
2,Leribe,Dist,2828.0,274935.0,302664.0,293369.0,337521.0,371100.0
3,Mafeteng,Dist,2119.0,206423.0,213455.0,192621.0,178222.0,166400.0
4,Maseru,Dist,4279.0,311829.0,393154.0,431998.0,519186.0,586000.0
5,Mohale's Hoek,Dist,3530.0,174998.0,185459.0,176928.0,165590.0,156200.0
6,Mokhotlong,Dist,4075.0,80514.0,86468.0,97713.0,100442.0,102300.0
7,Qacha's Nek,Dist,2349.0,69517.0,72886.0,69749.0,74566.0,78100.0
8,Quthing,Dist,2916.0,120264.0,127560.0,124048.0,115469.0,108400.0
9,Thaba-Tseka,Dist,4270.0,110528.0,128778.0,129881.0,135347.0,139200.0


In [120]:
page = requests.get(f"https://www.citypopulation.de/en/namibia/cities/")
namibia_df, year = make_df_blank(page)
# eswatini_l2 = make_indo_l1l2(page)
namibia_df

Unnamed: 0,Name,Status,Area,1991,2001,2011,2023
0,Erongo,Reg,63639.0,55470.0,107663.0,150809.0,240206.0
1,Hardap,Reg,109713.0,66495.0,68249.0,79507.0,106680.0
2,ǁKaras,Reg,161395.0,61162.0,69329.0,77421.0,109893.0
3,Kavango East,Reg,23988.0,,,136823.0,218421.0
4,Kavango West,Reg,24591.0,,,86529.0,123266.0
5,Khomas,Reg,36950.0,167071.0,250262.0,342141.0,494605.0
6,Kunene,Reg,115616.0,64017.0,68735.0,86856.0,120762.0
7,Ohangwena,Reg,10709.0,179634.0,228384.0,245446.0,337729.0
8,Omaheke,Reg,84745.0,52735.0,68039.0,71233.0,102881.0
9,Omusati,Reg,26600.0,189919.0,228842.0,243166.0,316671.0


In [121]:
page = requests.get(f"https://www.citypopulation.de/en/nigeria/admin/")
# nigeria_df = make_df_blank(page)
nigeria_l2 = make_indo_l1l2(page)
nigeria_l2['L0'] = 'Nigeria'
nigeria_l2

Unnamed: 0,Province,Name,Status,PopulationCensus1991-11-26,PopulationCensus2006-03-21,PopulationProjection2022-03-21,Extra_1,L0
0,Abia,Aba North,Local Government Area,86331,106844,155600,→,Nigeria
1,Abia,Aba South,Local Government Area,413852,427421,622400,→,Nigeria
2,Abia,Arochukwu,Local Government Area,97800,169339,246600,→,Nigeria
3,Abia,Bende,Local Government Area,132271,192621,280500,→,Nigeria
4,Abia,Ikwuano,Local Government Area,52214,137897,200800,→,Nigeria
...,...,...,...,...,...,...,...,...
770,Zamfara,Maru,Local Government Area,...,293141,521500,→,Nigeria
771,Zamfara,Shinkafi,Local Government Area,...,135964,241900,→,Nigeria
772,Zamfara,Talata Mafara,Local Government Area,138844,215650,383700,→,Nigeria
773,Zamfara,Tsafe,Local Government Area,163512,266929,474900,→,Nigeria


In [122]:
page = requests.get(f"https://www.citypopulation.de/en/rwanda/admin/")
# nigeria_df = make_df_blank(page)
rwanda_l2 = make_indo_l1l2(page)
rwanda_l2['L0'] = 'Rwanda'
rwanda_l2

Unnamed: 0,Province,Name,Status,PopulationCensus2002-08-16,PopulationCensus2012-08-15,PopulationCensus2022-08-15,Extra_1,L0
0,Est(Intara y'Iburasirazuba) [Eastern],Bugesera,District,266775,361914,551103,→,Rwanda
1,Est(Intara y'Iburasirazuba) [Eastern],Gatsibo,District,283456,433020,551164,→,Rwanda
2,Est(Intara y'Iburasirazuba) [Eastern],Kayonza,District,209723,344157,457156,→,Rwanda
3,Est(Intara y'Iburasirazuba) [Eastern],Kirehe,District,229468,340368,460860,→,Rwanda
4,Est(Intara y'Iburasirazuba) [Eastern],Ngoma,District,235109,336928,404048,→,Rwanda
5,Est(Intara y'Iburasirazuba) [Eastern],Nyagatare,District,255104,465855,653861,→,Rwanda
6,Est(Intara y'Iburasirazuba) [Eastern],Rwamagana,District,220502,313461,484953,→,Rwanda
7,Kigali,Gasabo,District,320516,529561,879505,→,Rwanda
8,Kigali,Kicukiro,District,207819,318564,491731,→,Rwanda
9,Kigali,Nyarugenge,District,236990,284561,374319,→,Rwanda


In [123]:
page = requests.get(f"https://www.citypopulation.de/en/senegal/admin/")
# nigeria_df = make_df_blank(page)
senegal_l2 = make_indo_l1l2(page)
senegal_l2['L0'] = 'Senegal'
senegal_l2

Unnamed: 0,Province,Name,Status,PopulationCensus2013-11-19,PopulationCensus2023-08-18,Extra_1,L0
0,Dakar,Dakar,Department,1146052,1278469,→,Senegal
1,Dakar,Guédiawaye,Department,329658,372708,→,Senegal
2,Dakar,Keur Massar(← Pikine (plus Jaxaay)),Department,524060,770314,→,Senegal
3,Dakar,Pikine,Department,671673,764597,→,Senegal
4,Dakar,Rufisque,Department,465753,818337,→,Senegal
5,Diourbel,Bambey,Department,299476,376467,→,Senegal
6,Diourbel,Diourbel,Department,268215,344109,→,Senegal
7,Diourbel,Mbacké(M'Backé),Department,929764,1359756,→,Senegal
8,Fatick,Fatick,Department,339238,408566,→,Senegal
9,Fatick,Foundiougne,Department,279436,375388,→,Senegal


## Brazil l2

In [None]:
page = requests.get(f"https://www.citypopulation.de/en/brazil/regiaonorte/admin/12__acre/")
arce_df = make_df_l2(page)

arce_df

Unnamed: 0,L1,L2,Status,PopulationCensus1991-09-01,PopulationCensus2000-08-01,PopulationCensus2010-08-01,PopulationCensus2022-08-01,PopulationEstimate2024-07-01,Unnamed: 9
0,Acre,Acrelândia,Municipality,5467,7935,12538,14021,14657,→
1,Acre,Assis Brasil,Municipality,2917,3490,6072,8100,8573,→
2,Acre,Brasiléia,Municipality,13992,17013,21398,26000,27841,→
3,Acre,Bujari,Municipality,3182,5826,8471,12917,13766,→
4,Acre,Capixaba,Municipality,2316,5206,8798,10392,10922,→
5,Acre,Cruzeiro do Sul,Municipality,47812,67441,78507,91888,98382,→
6,Acre,Epitaciolândia,Municipality,7429,11028,15100,18757,19739,→
7,Acre,Feijó,Municipality,17769,26722,32412,35426,37644,→
8,Acre,Jordão,Municipality,4210,4454,6577,9222,9787,→
9,Acre,Mâncio Lima,Municipality,7842,11095,15206,19294,20329,→


In [21]:
page = requests.get(f"https://www.citypopulation.de/en/brazil/regiaonordeste/admin/27__alagoas/")
alagoas_df = make_df_l2(page)
alagoas_df

Unnamed: 0,L1,L2,Status,PopulationCensus1991-09-01,PopulationCensus2000-08-01,PopulationCensus2010-08-01,PopulationCensus2022-08-01,PopulationEstimate2024-07-01,Unnamed: 9
0,Alagoas,Água Branca,Municipality,18157,18660,19389,19008,19550,→
1,Alagoas,Anadia,Municipality,16337,17849,17318,13966,14193,→
2,Alagoas,Arapiraca,Municipality,164921,186466,213671,234696,243661,→
3,Alagoas,Atalaia,Municipality,38563,40552,44326,37512,38530,→
4,Alagoas,Barra de Santo Antônio,Municipality,7423,11351,14230,16365,16735,→
...,...,...,...,...,...,...,...,...,...
97,Alagoas,Taquarana,Municipality,16809,17046,18615,19032,19422,→
98,Alagoas,Teotônio Vilela,Municipality,29664,36881,41152,38053,39161,→
99,Alagoas,Traipu,Municipality,22680,23439,25992,23565,24124,→
100,Alagoas,União dos Palmares,Municipality,57425,58620,62326,59280,60874,→


In [22]:
page = requests.get(f"https://www.citypopulation.de/en/brazil/regiaonorte/admin/16__amap%C3%A1/")
amapá_df = make_df_l2(page)
amapá_df

Unnamed: 0,L1,L2,Status,PopulationCensus1991-09-01,PopulationCensus2000-08-01,PopulationCensus2010-08-01,PopulationCensus2022-08-01,PopulationEstimate2024-07-01,Unnamed: 9
0,Amapá,Amapá,Municipality,6655,7121,8069,7943,8434,→
1,Amapá,Calçoene,Municipality,5177,6730,9000,10612,11391,→
2,Amapá,Cutias,Municipality,1676,3280,4696,4461,4725,→
3,Amapá,Ferreira Gomes,Municipality,2350,3562,5802,6666,7145,→
4,Amapá,Itaubal,Municipality,1314,2894,4265,5599,6043,→
5,Amapá,Laranjal do Jari,Municipality,16772,28515,39942,35114,37969,→
6,Amapá,Macapá,Municipality,168225,283308,398204,442933,487200,→
7,Amapá,Mazagão,Municipality,8911,11986,17032,21924,23575,→
8,Amapá,Oiapoque,Municipality,7555,12886,20509,27482,30481,→
9,Amapá,Pedra Branca do Amapari,Municipality,1460,4009,10772,12847,13798,→


In [23]:

page = requests.get(f"https://www.citypopulation.de/en/brazil/regiaocentrooeste/admin/distrito_federal/5300108__bras%C3%ADlia/")
test_df = make_df_brazil_problem(page)
test_df

Unnamed: 0,L1,L2,Status,PopulationCensus1991-09-01,PopulationCensus2000-08-01,PopulationCensus2010-08-01,PopulationCensus2022-08-01,PopulationEstimate2024-07-01,Extra_1
0,Brasília,Brasília,Municipality,1601094,2051146,2572159,2817381,2982818,


In [24]:
brazil_sites = ['https://www.citypopulation.de/en/brazil/regiaonorte/admin/12__acre/', 
                'https://www.citypopulation.de/en/brazil/regiaonordeste/admin/27__alagoas/',
                'https://www.citypopulation.de/en/brazil/regiaonorte/admin/16__amap%C3%A1/',
                'https://www.citypopulation.de/en/brazil/regiaonorte/admin/13__amazonas/',
                'https://www.citypopulation.de/en/brazil/regiaonordeste/admin/29__bahia/',
                'https://www.citypopulation.de/en/brazil/regiaonordeste/admin/23__cear%C3%A1/',
                'https://www.citypopulation.de/en/brazil/regiaosudeste/admin/32__esp%C3%ADrito_santo/',
                'https://www.citypopulation.de/en/brazil/regiaocentrooeste/admin/52__goi%C3%A1s/',
                'https://www.citypopulation.de/en/brazil/regiaonordeste/admin/21__maranh%C3%A3o/',
                'https://www.citypopulation.de/en/brazil/regiaocentrooeste/admin/51__mato_grosso/',
                'https://www.citypopulation.de/en/brazil/regiaocentrooeste/admin/50__mato_grosso_do_sul/',
                'https://www.citypopulation.de/en/brazil/regiaosudeste/admin/31__minas_gerais/',
                'https://www.citypopulation.de/en/brazil/regiaonorte/admin/15__par%C3%A1/',
                'https://www.citypopulation.de/en/brazil/regiaonordeste/admin/25__para%C3%ADba/',
                'https://www.citypopulation.de/en/brazil/regiaosud/admin/41__paran%C3%A1/',
                'https://www.citypopulation.de/en/brazil/regiaonordeste/admin/26__pernambuco/',
                'https://www.citypopulation.de/en/brazil/regiaonordeste/admin/22__piau%C3%AD/',
                'https://www.citypopulation.de/en/brazil/regiaosudeste/admin/33__rio_de_janeiro/',
                'https://www.citypopulation.de/en/brazil/regiaonordeste/admin/24__rio_grande_do_norte/',
                'https://www.citypopulation.de/en/brazil/regiaosud/admin/43__rio_grande_do_sul/',
                'https://www.citypopulation.de/en/brazil/regiaonorte/admin/11__rond%C3%B4nia/',
                'https://www.citypopulation.de/en/brazil/regiaonorte/admin/14__roraima/',
                'https://www.citypopulation.de/en/brazil/regiaosud/admin/42__santa_catarina/',
                'https://www.citypopulation.de/en/brazil/regiaosudeste/admin/35__s%C3%A3o%20paulo/',
                'https://www.citypopulation.de/en/brazil/regiaonordeste/admin/28__sergipe/',
                'https://www.citypopulation.de/en/brazil/regiaonorte/admin/17__tocantins/']

brazil_problem_sites =['https://www.citypopulation.de/en/brazil/regiaocentrooeste/admin/distrito_federal/5300108__bras%C3%ADlia/',
                       ]

brazil_df = pd.DataFrame()

for site in brazil_sites:
    page = requests.get(site)
    df = make_df_l2(page)
    df = df[['L1', 'L2']]
    brazil_df = pd.concat([brazil_df, df], axis=0)

for site in brazil_problem_sites:
    page = requests.get(site)
    df = make_df_brazil_problem(page)
    df = df[['L1', 'L2']]
    brazil_df = pd.concat([brazil_df, df], axis=0)

brazil_df

Unnamed: 0,L1,L2
0,Acre,Acrelândia
1,Acre,Assis Brasil
2,Acre,Brasiléia
3,Acre,Bujari
4,Acre,Capixaba
...,...,...
135,Tocantins,Tupirama
136,Tocantins,Tupiratins
137,Tocantins,Wanderlândia
138,Tocantins,Xambioá


## Indo L1-L3


In [26]:
page = requests.get(f"https://www.citypopulation.de/en/indonesia/admin/")
l1l2_test_df = make_indo_l1l2(page)
l1l2_test_df

Unnamed: 0,Province,Name,Status,PopulationCensus2005-06-30,PopulationCensus2010-05-01,PopulationCensus2015-05-01,PopulationCensus2020-09-15,Extra_1
0,Aceh(Nanggroe Aceh Darussala),Aceh Barat[West Aceh],Regency,150450,173558,193484,198736,→
1,Aceh(Nanggroe Aceh Darussala),Aceh Barat Daya[Southwest Aceh],Regency,115676,126036,140366,150775,→
2,Aceh(Nanggroe Aceh Darussala),Aceh Besar,Regency,296541,351418,391870,405535,→
3,Aceh(Nanggroe Aceh Darussala),Aceh Jaya,Regency,60660,76782,86368,93159,→
4,Aceh(Nanggroe Aceh Darussala),Aceh Selatan[South Aceh],Regency,191539,202251,224588,232414,→
...,...,...,...,...,...,...,...,...
509,Yogyakarta(DI Yogyakarta),Bantul,Regency,859968,911503,970565,985770,→
510,Yogyakarta(DI Yogyakarta),Gunung Kidul,Regency,681554,675382,714656,747161,→
511,Yogyakarta(DI Yogyakarta),Kota Yogyakarta,City,433539,388627,412347,373589,→
512,Yogyakarta(DI Yogyakarta),Kulon Progo,Regency,373757,388869,411832,436395,→


In [27]:
page = requests.get(f"https://www.citypopulation.de/en/indonesia/papua/admin/")
l2l3_test_df = make_indo_l1l2(page)
l2l3_test_df

Unnamed: 0,Province,Name,Status,PopulationCensus2010-05-01,Extra_1
0,Asmat,Agats,District,12905,→
1,Asmat,Akat,District,5375,→
2,Asmat,Atsy,District,13838,→
3,Asmat,Fayit,District,7025,→
4,Asmat,Pantai Kasuari,District,16026,→
...,...,...,...,...,...
380,Yalimo,Abenaho,District,24577,→
381,Yalimo,Apalapsili,District,5840,→
382,Yalimo,Benawa,District,5713,→
383,Yalimo,Elelim,District,5946,→


In [28]:
page = requests.get(f"https://www.citypopulation.de/en/indonesia/admin/")
indo_l1l2 = make_indo_l1l2(page)
indo_l1l2.rename(columns={'Province': 'L1', 'Name': 'L2'}, inplace=True)
indo_l1l2 = indo_l1l2[['L1', 'L2']]
indo_l1l2

Unnamed: 0,L1,L2
0,Aceh(Nanggroe Aceh Darussala),Aceh Barat[West Aceh]
1,Aceh(Nanggroe Aceh Darussala),Aceh Barat Daya[Southwest Aceh]
2,Aceh(Nanggroe Aceh Darussala),Aceh Besar
3,Aceh(Nanggroe Aceh Darussala),Aceh Jaya
4,Aceh(Nanggroe Aceh Darussala),Aceh Selatan[South Aceh]
...,...,...
509,Yogyakarta(DI Yogyakarta),Bantul
510,Yogyakarta(DI Yogyakarta),Gunung Kidul
511,Yogyakarta(DI Yogyakarta),Kota Yogyakarta
512,Yogyakarta(DI Yogyakarta),Kulon Progo


In [29]:
indo_l2l3 = pd.DataFrame()

indo_l2_sites = ['https://www.citypopulation.de/en/indonesia/aceh/admin/',
                 'https://www.citypopulation.de/en/indonesia/bali/admin/',
                 'https://www.citypopulation.de/en/indonesia/banten/admin/',
                 'https://www.citypopulation.de/en/indonesia/bengkulu/admin/',
                 'https://www.citypopulation.de/en/indonesia/gorontalo/admin/',
                 'https://www.citypopulation.de/en/indonesia/jakarta/admin/',
                 'https://www.citypopulation.de/en/indonesia/jambi/admin/',
                 'https://www.citypopulation.de/en/indonesia/jawabarat/admin/',
                 'https://www.citypopulation.de/en/indonesia/jawatengah/admin/',
                 'https://www.citypopulation.de/en/indonesia/jawatimur/admin/',
                 'https://www.citypopulation.de/en/indonesia/kalimantanbarat/admin/',
                 'https://www.citypopulation.de/en/indonesia/kalimantanselatan/admin/',
                 'https://www.citypopulation.de/en/indonesia/kalimantantengah/admin/',
                 'https://www.citypopulation.de/en/indonesia/kalimantantimur/admin/',
                 'https://www.citypopulation.de/en/indonesia/kalimantanutara/admin/',
                 'https://www.citypopulation.de/en/indonesia/kepulauanbangkabelitung/admin/',
                 'https://www.citypopulation.de/en/indonesia/kepulauanriau/admin/',
                 'https://www.citypopulation.de/en/indonesia/lampung/admin/',
                 'https://www.citypopulation.de/en/indonesia/maluku/admin/',
                 'https://www.citypopulation.de/en/indonesia/malukuutara/admin/',
                 'https://www.citypopulation.de/en/indonesia/nusatenggarabarat/admin/',
                 'https://www.citypopulation.de/en/indonesia/nusatenggaratimur/admin/',
                 'https://www.citypopulation.de/en/indonesia/papua/admin/',
                 'https://www.citypopulation.de/en/indonesia/papuabarat/admin/',
                 'https://www.citypopulation.de/en/indonesia/riau/admin/',
                 'https://www.citypopulation.de/en/indonesia/sulawesibarat/admin/',
                 'https://www.citypopulation.de/en/indonesia/sulawesiselatan/admin/',
                 'https://www.citypopulation.de/en/indonesia/sulawesitengah/admin/',
                 'https://www.citypopulation.de/en/indonesia/sulawesitenggara/admin/',
                 'https://www.citypopulation.de/en/indonesia/sulawesiutara/admin/',
                 'https://www.citypopulation.de/en/indonesia/sumaterabarat/admin/',
                 'https://www.citypopulation.de/en/indonesia/sumateraselatan/admin/',
                 'https://www.citypopulation.de/en/indonesia/sumaterautara/admin/',
                 'https://www.citypopulation.de/en/indonesia/yogyakarta/admin/',
                 ] #kill me


for site in indo_l2_sites:
    try:
        page = requests.get(site)
        df = make_indo_l1l2(page)
        df.rename(columns={'Province': 'L2', 'Name': 'L3'}, inplace=True)
        df = df[['L2', 'L3']]
        indo_l2l3 = pd.concat([indo_l2l3, df], axis=0)
    except Exception as e:
        print(f"Error processing {site}: {e}")

indo_l2l3

Unnamed: 0,L2,L3
0,Aceh Barat[West Aceh],Arongan Lambalek
1,Aceh Barat[West Aceh],Bubon
2,Aceh Barat[West Aceh],Johan Pahlawan
3,Aceh Barat[West Aceh],Kaway Xvi
4,Aceh Barat[West Aceh],Meureubo
...,...,...
73,Sleman,Prambanan
74,Sleman,Seyegan
75,Sleman,Sleman
76,Sleman,Tempel


In [30]:
indo = indo_l2l3.merge(indo_l1l2, on='L2', how='left')
indo = indo[['L1', 'L2', 'L3']]
indo

Unnamed: 0,L1,L2,L3
0,Aceh(Nanggroe Aceh Darussala),Aceh Barat[West Aceh],Arongan Lambalek
1,Aceh(Nanggroe Aceh Darussala),Aceh Barat[West Aceh],Bubon
2,Aceh(Nanggroe Aceh Darussala),Aceh Barat[West Aceh],Johan Pahlawan
3,Aceh(Nanggroe Aceh Darussala),Aceh Barat[West Aceh],Kaway Xvi
4,Aceh(Nanggroe Aceh Darussala),Aceh Barat[West Aceh],Meureubo
...,...,...,...
6648,Yogyakarta(DI Yogyakarta),Sleman,Prambanan
6649,Yogyakarta(DI Yogyakarta),Sleman,Seyegan
6650,Yogyakarta(DI Yogyakarta),Sleman,Sleman
6651,Yogyakarta(DI Yogyakarta),Sleman,Tempel


## Vietnam l1-l3

### l1-l2

In [31]:
page = requests.get(f"https://www.citypopulation.de/en/vietnam/admin/")
viet_l1l2 = make_indo_l1l2(page)
viet_l1l2.rename(columns={'Province': 'L1', 'Name': 'L2'}, inplace=True)
viet_l1l2 = viet_l1l2[['L1', 'L2']]
viet_l1l2['L1'].unique()

array(['An Giang', 'Bắc Giang', 'Bắc Kạn', 'Bạc Liêu', 'Bắc Ninh',
       'Bà Rịa - Vũng Tàu', 'Bến Tre', 'Bình Định', 'Bình Dương',
       'Bình Phước', 'Bình Thuận', 'Cà Mau', 'Cần Thơ', 'Cao Bằng',
       'Đắk Lắk', 'Đắk Nông', 'Đà Nẵng', 'Điện Biên', 'Đồng Nai',
       'Đồng Tháp', 'Gia Lai', 'Hà Giang', 'Hải Dương', 'Hải Phòng',
       'Hà Nam', 'Hà Nội[Hanoi]', 'Hà Tĩnh', 'Hậu Giang', 'Hòa Bình',
       'Hưng Yên', 'Khánh Hòa', 'Kiên Giang', 'Kon Tum', 'Lai Châu',
       'Lâm Đồng', 'Lạng Sơn', 'Lào Cai', 'Long An', 'Nam Định',
       'Nghệ An', 'Ninh Bình', 'Ninh Thuận', 'Phú Thọ', 'Phú Yên',
       'Quảng Bình', 'Quảng Nam', 'Quảng Ngãi', 'Quảng Ninh', 'Quảng Trị',
       'Sóc Trăng', 'Sơn La', 'Tây Ninh', 'Thái Bình', 'Thái Nguyên',
       'Thanh Hóa', 'Thành Phố Hồ Chí Minh[Ho Chi Minh City]',
       'Thừa Thiên - Huế', 'Tiền Giang', 'Trà Vinh', 'Tuyên Quang',
       'Vĩnh Long', 'Vĩnh Phúc', 'Yên Bái'], dtype=object)

### l2-l3

In [32]:
viet_l2l3 = pd.DataFrame()

viet_sites = [
              'https://www.citypopulation.de/en/vietnam/cantho/admin/',
              'https://www.citypopulation.de/en/vietnam/danang/admin/',
              'https://www.citypopulation.de/en/vietnam/haiphong/admin/',
              'https://www.citypopulation.de/en/vietnam/hanoi/admin/',
              'https://www.citypopulation.de/en/vietnam/thanhphohochiminh/admin/',
              'https://www.citypopulation.de/en/vietnam/angiang/admin/',
              'https://www.citypopulation.de/en/vietnam/bariavungtau/admin/',
              'https://www.citypopulation.de/en/vietnam/bacgiang/admin/',
              'https://www.citypopulation.de/en/vietnam/backan/admin/',
              'https://www.citypopulation.de/en/vietnam/baclieu/admin/',
              'https://www.citypopulation.de/en/vietnam/bacninh/admin/',
              'https://www.citypopulation.de/en/vietnam/bentre/admin/',
              'https://www.citypopulation.de/en/vietnam/binhdinh/admin/',
              'https://www.citypopulation.de/en/vietnam/binhduong/admin/',
              'https://www.citypopulation.de/en/vietnam/binhphuoc/admin/',
              'https://www.citypopulation.de/en/vietnam/binhthuan/admin/',
              'https://www.citypopulation.de/en/vietnam/camau/admin/',
              'https://www.citypopulation.de/en/vietnam/caobang/admin/',
              'https://www.citypopulation.de/en/vietnam/daklak/admin/',
              'https://www.citypopulation.de/en/vietnam/daknong/admin/',
              'https://www.citypopulation.de/en/vietnam/dienbien/admin/',
              'https://www.citypopulation.de/en/vietnam/dongnai/admin/',
              'https://www.citypopulation.de/en/vietnam/dongthap/admin/',
              'https://www.citypopulation.de/en/vietnam/gialai/admin/',
              'https://www.citypopulation.de/en/vietnam/hagiang/admin/',
              'https://www.citypopulation.de/en/vietnam/hanam/admin/',
              'https://www.citypopulation.de/en/vietnam/hatinh/admin/',
              'https://www.citypopulation.de/en/vietnam/haiduong/admin/',
              'https://www.citypopulation.de/en/vietnam/haugiang/admin/',
              'https://www.citypopulation.de/en/vietnam/hoabinh/admin/',
              'https://www.citypopulation.de/en/vietnam/hungyen/admin/',
              'https://www.citypopulation.de/en/vietnam/khanhhoa/admin/',
              'https://www.citypopulation.de/en/vietnam/kiengiang/admin/',
              'https://www.citypopulation.de/en/vietnam/kontum/admin/',
              'https://www.citypopulation.de/en/vietnam/laichau/admin/',
              'https://www.citypopulation.de/en/vietnam/lamdong/admin/',
              'https://www.citypopulation.de/en/vietnam/langson/admin/',
              'https://www.citypopulation.de/en/vietnam/laocai/admin/',
              'https://www.citypopulation.de/en/vietnam/longan/admin/',
              'https://www.citypopulation.de/en/vietnam/namdinh/admin/',
              'https://www.citypopulation.de/en/vietnam/nghean/admin/',
              'https://www.citypopulation.de/en/vietnam/ninhbinh/admin/',
              'https://www.citypopulation.de/en/vietnam/ninhthuan/admin/',
              'https://www.citypopulation.de/en/vietnam/phutho/admin/',
              'https://www.citypopulation.de/en/vietnam/phuyen/admin/',
              'https://www.citypopulation.de/en/vietnam/quangbinh/admin/',
              'https://www.citypopulation.de/en/vietnam/quangnam/admin/',
              'https://www.citypopulation.de/en/vietnam/quangngai/admin/',
              'https://www.citypopulation.de/en/vietnam/quangninh/admin/',
              'https://www.citypopulation.de/en/vietnam/quangtri/admin/',
              'https://www.citypopulation.de/en/vietnam/soctrang/admin/',
              'https://www.citypopulation.de/en/vietnam/sonla/admin/',
              'https://www.citypopulation.de/en/vietnam/tayninh/admin/',
              'https://www.citypopulation.de/en/vietnam/thaibinh/admin/',
              'https://www.citypopulation.de/en/vietnam/thainguyen/admin/',
              'https://www.citypopulation.de/en/vietnam/thanhhoa/admin/',
              'https://www.citypopulation.de/en/vietnam/thuathienhue/admin/',
              'https://www.citypopulation.de/en/vietnam/tiengiang/admin/',
              'https://www.citypopulation.de/en/vietnam/travinh/admin/',
              'https://www.citypopulation.de/en/vietnam/tuyenquang/admin/',
              'https://www.citypopulation.de/en/vietnam/vinhlong/admin/',
              'https://www.citypopulation.de/en/vietnam/vinhphuc/admin/',
              'https://www.citypopulation.de/en/vietnam/yenbai/admin/',
              ]

print(len(viet_sites))
for site in viet_sites:
    try:
        curr_l1 = site.split('/')[-3]
        page = requests.get(site)
        df = make_indo_l1l2(page)
        df['L1'] = curr_l1
        df.rename(columns={'Province': 'L2', 'Name': 'L3'}, inplace=True)
        df = df[['L1', 'L2', 'L3']]
        viet_l2l3 = pd.concat([viet_l2l3, df], axis=0)
    except Exception as e:
        print(f"Error processing {site}: {e}")

viet_l2l3

63


Unnamed: 0,L1,L2,L3
0,cantho,Bình Thủy,An Thới
1,cantho,Bình Thủy,Bình Thủy
2,cantho,Bình Thủy,Bùi Hữu Nghĩa
3,cantho,Bình Thủy,Long Hòa
4,cantho,Bình Thủy,Long Tuyền
...,...,...,...
175,yenbai,Yên Bình,Xuân Lai
176,yenbai,Yên Bình,Xuân Long
177,yenbai,Yên Bình,Yên Bình
178,yenbai,Yên Bình,Yên Bình


In [33]:
viet = viet_l2l3.merge(viet_l1l2, on='L2', how='left')
viet = viet[['L1', 'L2', 'L3']]
viet 

KeyError: "['L1'] not in index"

In [None]:
viet_l1_dict = build_normalized_mapping(viet_l2l3, viet_l1l2, threshold=60)

viet_l1_dict['hanoi'] = 'Hà Nội[Hanoi]'
viet_l1_dict

viet_l2l3["L1"] = viet_l2l3['L1'].map(viet_l1_dict)
viet_l2l3

Unnamed: 0,L1,L2,L3
0,Cần Thơ,Bình Thủy,An Thới
1,Cần Thơ,Bình Thủy,Bình Thủy
2,Cần Thơ,Bình Thủy,Bùi Hữu Nghĩa
3,Cần Thơ,Bình Thủy,Long Hòa
4,Cần Thơ,Bình Thủy,Long Tuyền
...,...,...,...
175,Yên Bái,Yên Bình,Xuân Lai
176,Yên Bái,Yên Bình,Xuân Long
177,Yên Bái,Yên Bình,Yên Bình
178,Yên Bái,Yên Bình,Yên Bình


In [None]:
viet_l2l3 = viet_l2l3.rename(columns={"L3": "solo L3", 'L2' : 'Solo L2'})
viet_l2l3['L2'] = viet_l2l3['L1'] + ', ' + viet_l2l3['Solo L2']
viet_l2l3['L3'] = viet_l2l3['L1'] + ', ' + viet_l2l3['Solo L2'] + ', ' + viet_l2l3['solo L3']
viet_l2l3 = viet_l2l3[['L1', 'L2', 'L3']]
viet_l2l3

Unnamed: 0,L1,L2,L3
0,Cần Thơ,"Cần Thơ, Bình Thủy","Cần Thơ, Bình Thủy, An Thới"
1,Cần Thơ,"Cần Thơ, Bình Thủy","Cần Thơ, Bình Thủy, Bình Thủy"
2,Cần Thơ,"Cần Thơ, Bình Thủy","Cần Thơ, Bình Thủy, Bùi Hữu Nghĩa"
3,Cần Thơ,"Cần Thơ, Bình Thủy","Cần Thơ, Bình Thủy, Long Hòa"
4,Cần Thơ,"Cần Thơ, Bình Thủy","Cần Thơ, Bình Thủy, Long Tuyền"
...,...,...,...
175,Yên Bái,"Yên Bái, Yên Bình","Yên Bái, Yên Bình, Xuân Lai"
176,Yên Bái,"Yên Bái, Yên Bình","Yên Bái, Yên Bình, Xuân Long"
177,Yên Bái,"Yên Bái, Yên Bình","Yên Bái, Yên Bình, Yên Bình"
178,Yên Bái,"Yên Bái, Yên Bình","Yên Bái, Yên Bình, Yên Bình"


In [None]:
test = viet_l2l3[viet_l2l3.duplicated()]
test

Unnamed: 0,L1,L2,L3
75,Cần Thơ,"Cần Thơ, Vĩnh Thạnh","Cần Thơ, Vĩnh Thạnh, Thạnh An"
176,Hà Nội[Hanoi],"Hà Nội[Hanoi], Gia Lâm","Hà Nội[Hanoi], Gia Lâm, Yên Viên"
124,Bắc Giang,"Bắc Giang, Sơn Động","Bắc Giang, Sơn Động, An Châu"
147,Bắc Giang,"Bắc Giang, Tân Yên","Bắc Giang, Tân Yên, Cao Thượng"
160,Bắc Giang,"Bắc Giang, Tân Yên","Bắc Giang, Tân Yên, Nhã Nam"
211,Bắc Giang,"Bắc Giang, Yên Thế","Bắc Giang, Yên Thế, Bố Hạ"
52,Bạc Liêu,"Bạc Liêu, Phước Long","Bạc Liêu, Phước Long, Phước Long"
74,Cà Mau,"Cà Mau, Thới Bình","Cà Mau, Thới Bình, Thới Bình"
10,Đồng Tháp,"Đồng Tháp, Cao Lãnh","Đồng Tháp, Cao Lãnh, Mỹ Thọ"
135,Đồng Tháp,"Đồng Tháp, Tháp Mười","Đồng Tháp, Tháp Mười, Mỹ An"


## Thailand l1-l2

In [35]:
page = requests.get(f"https://www.citypopulation.de/en/thailand/admin/")
thai_l1l2 = make_indo_l1l2(page)
thai_l1l2.rename(columns={'Province': 'L1', 'Name': 'L2'}, inplace=True)
thai_l1l2 = thai_l1l2[['L1', 'L2']]
thai_l1l2

Unnamed: 0,L1,L2
0,Amnat Charoen,Chanuman
1,Amnat Charoen,Hua Taphan
2,Amnat Charoen,Lue Amnat
3,Amnat Charoen,Mueang Amnat Charoen
4,Amnat Charoen,Pathum Ratchawongsa
...,...,...
923,Yasothon,Maha Chana Chai
924,Yasothon,Mueang Yasothon
925,Yasothon,Pa Tio
926,Yasothon,Sai Mun


# Excel Writer

## for l1 asia + africa. mostly for multiplier

In [None]:
# with pd.ExcelWriter('data/all_citypopulation_data.xlsx', engine='openpyxl') as writer:
#     bangladesh_df.to_excel(writer, sheet_name='Bangladesh', index=False)
#     brazil_df.to_excel(writer, sheet_name='Brazil', index=False)
#     cambodia_df.to_excel(writer, sheet_name='Cambodia', index=False)
#     china_df.to_excel(writer, sheet_name='China', index=False)
#     india_df.to_excel(writer, sheet_name='India', index=False)
#     indonesia_df.to_excel(writer, sheet_name='Indonesia', index=False)
#     mexico_df.to_excel(writer, sheet_name='Mexico', index=False)
#     kenya_df.to_excel(writer, sheet_name='Kenya', index=False)
#     liberia_df.to_excel(writer, sheet_name='Liberia', index=False)
#     rwanda_df.to_excel(writer, sheet_name='Rwanda', index=False)
#     southafrica_df.to_excel(writer, sheet_name='South Africa', index=False)
#     gambia_df.to_excel(writer, sheet_name='Gambia', index=False)
#     zambia_df.to_excel(writer, sheet_name='Zambia', index=False)
#     malawi_df.to_excel(writer, sheet_name='Malawi', index=False)
#     ghana_df.to_excel(writer, sheet_name='Ghana', index=False)
#     eswatini_df.to_excel(writer, sheet_name='Eswatini', index=False)
#     workbook = writer.book



## for brazil l1-l2. for contactous set-up

In [25]:
brazil_df.to_excel('data/brazil_df.xlsx', index=False)

In [None]:
# with pd.ExcelWriter('data/brazil_l2_population.xlsx', engine='openpyxl') as writer:
#     for site in brazil_sites:
#         page = requests.get(site)
#         df = make_df_l2(page)
#         state = df['L1'][0]
#         df.to_excel(writer, sheet_name=state, index=False)
#     workbook = writer.book

## for indo l1-l3, for contacous cleaning

In [None]:
# indo.to_excel('data/indo_l1l3.xlsx', index=False)

## for vietnam setup

In [64]:
viet_l2l3.drop_duplicates(inplace=True)
viet_l2l3.to_excel('data/vietam_setup.xlsx', index=False)

### thailand l1-l2

In [36]:
thai_l1l2.to_excel('data/thailand_setup.xlsx', index=False)

# Africa L0 to L2 setup


In [124]:
africa_l2l3 = pd.concat([kenya_l2, rwanda_l2, southafrica_l2, gambia_l2, zambia_l2, malawi_l2, ghana_l2, ethiopia_l2, nigeria_l2, rwanda_l2, senegal_l2], axis=0)
africa_l0l2 = africa_l2l3[['L0', 'Province', 'Name']]
africa_l0l2.rename(columns={'Province': 'L1', 'Name': 'L2'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  africa_l0l2.rename(columns={'Province': 'L1', 'Name': 'L2'}, inplace=True)


In [125]:
congo_df

Unnamed: 0,Name,Status,Area,1984,2010,2015,2020
0,Bas-Uele,Prov,148331,545458.0,1004000.0,1138000.0,1369600.0
1,Équateur,Prov,103902,635298.0,1315000.0,1528000.0,1856000.0
2,Haut-Katanga,Prov,132425,1391617.0,3788000.0,4617000.0,5718800.0
3,Haut-Lomami,Prov,108204,891021.0,2426000.0,2957000.0,3662800.0
4,Haut-Uele,Prov,89683,893111.0,1643000.0,1864000.0,2242500.0
5,Ituri,Prov,65658,1749256.0,3219000.0,3650000.0,4392200.0
6,Kasaï,Prov,95631,1096783.0,2388000.0,2801000.0,3417000.0
7,Kasaï-Central,Prov,59500,1298463.0,2827000.0,3317000.0,4045300.0
8,Kasaï-Oriental,Prov,9545,1080610.0,2634000.0,3145000.0,3864300.0
9,Kinshasa,Cap,9965,2664309.0,8683000.0,11575000.0,14565700.0


In [126]:
liberia_df['L0'] = 'Liberia'
eswatini_df['L0'] = 'Eswatini'
congo_df['L0'] = 'Democratic Republic of the Congo'
lesotho_df['L0'] = 'Lesotho'
namibia_df['L0'] = 'Namibia'

In [130]:
africa_problems = pd.concat([liberia_df, eswatini_df, congo_df, lesotho_df, namibia_df], axis=0)
africa_problems = africa_problems[['L0', 'Name']]
africa_problems.rename(columns={'Name': 'L1'}, inplace=True)

In [132]:

africa_l0l2.to_excel('data/africa_l0l2.xlsx', index=False)
africa_problems.to_excel('data/africa_problems.xlsx', index=False)