### Imports

In [1]:
import pandas as pd
import numpy as np
import json
import time 
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.action_chains import ActionChains
import warnings
from PIL import Image
import cv2
import matplotlib.pyplot as plt
import re
import requests as req
from bs4 import BeautifulSoup as bs
import random
import http.client, urllib.parse
import csv
import seaborn as sns
import os
#from sample_building_functions import *


warnings.simplefilter('ignore')

### Defining some useful functions

In [2]:
def outliers_border_tukey(data):
    """
    Detecta outliers utilizando el método de Tukey.

    Parámetros:
    data (list or np.array): un arreglo unidimensional de valores numéricos.
    
    - Written by chatGPT, modified by DiSanchz
    """
    # Calculamos el rango intercuartil (IQR)
    q1, q3 = np.percentile(data, [25, 75])
    iqr = q3 - q1

    # Establecemos los límites inferior y superior
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)
    
    print(f'The upper and lower bounds for outliers are: {(lower_bound, upper_bound)}')

    return (lower_bound, upper_bound)

## 1. Loading data on all cities _vanilla_ sample DataFrame "all_cities_filter"

In [3]:
all_cities_filter = pd.read_csv("../data/all_cities_filtered.csv")

## 2. complete data with GDP and HDI figures

### Loading reference indicators' data

In [4]:
GDP_raw = pd.read_csv('../data/ignore/GDP_per_capita_2019.csv', encoding= 'unicode_escape', sep=';')
HDI_raw = pd.read_csv('../data/ignore/HDI_index_historical.csv', encoding= 'unicode_escape')

### 2.1 GDP

In [5]:
GDP_raw['country'] =  [i.lower() for i in GDP_raw['GDP per capita, current prices\n (U.S. dollars per capita)']]
GDP_raw.drop(['GDP per capita, current prices\n (U.S. dollars per capita)'], inplace=True, axis=1)

"Cleaning" country names. A dictionary of alternative country names is created to map each contry name as referred in "all_cities_filter" compared to GDP data source for those cases where it may diverge.

In [6]:
fixed_names = {}
fixed_names ['burkinafaso'] = 'burkina faso'
fixed_names ['centralafrica'] = 'central african republic'
fixed_names ['drcongo'] = 'congo, dem. rep. of the'
fixed_names ['equatorialguinea'] = 'equatorial guinea'
fixed_names ['guineabissau'] = 'guinea-bissau'
fixed_names ['ivorycoast'] = "cote divoire"
fixed_names ['sierraleone'] = "sierra leone"
fixed_names ['ivorycoast'] = 'south sudan'
fixed_names['costarica'] = 'costa rica'
fixed_names['southsudan'] = 'south sudan'
fixed_names['turkey'] = 'türkiye, republic of'
fixed_names['domrep'] = 'dominican republic'
fixed_names['elsalvador'] = 'el salvador'
fixed_names['puertorico'] = 'puerto rico'  
fixed_names['usa'] = 'united states'  
fixed_names['Azerbaijan.html'] = 'azerbaijan'
fixed_names['kyrgyzstan'] = 'kyrgyz republic'
fixed_names['laos'] = 'lao p.d.r.'
fixed_names['saudiarabia'] = 'saudi arabia'
fixed_names['srilanka'] = 'sri lanka'
fixed_names['taiwan'] = 'taiwan province of china'
fixed_names['bosnia'] = 'bosnia and herzegovina'
fixed_names['czechrep'] = 'czech republic'
fixed_names['uk'] = 'united kingdom'
fixed_names['northmacedonia'] = 'north macedonia '
fixed_names['russia'] = 'russian federation'
fixed_names['slovakia'] = 'slovak republic'
fixed_names['newzealand'] = 'new zealand'
fixed_names['papuanewguinea'] = 'papua new guinea'

Retrieving GDP data and inserting it into the cities DF

In [7]:
GDP_column = []

for i in all_cities_filter['country']:
    try:
        try:
            GDP_column.append(GDP_raw[GDP_raw.eq(i).any(1)].iloc[0][0])
        except:
            GDP_column.append(GDP_raw[GDP_raw.eq(fixed_names.get(i)).any(1)].iloc[0][0])
    except:
        GDP_column.append(np.nan)
        pass
        
all_cities_GDP = all_cities_filter.copy()
all_cities_GDP['GDP_19'] = GDP_column
all_cities_GDP_valid = all_cities_GDP[all_cities_GDP.GDP_19.notnull()]

GDP figures are not available for 'cuba' & 'northkorea' thus in principle these should be discarded from the sample until alternative estimates can be found

In [8]:
all_cities_GDP_null = all_cities_GDP[all_cities_GDP.GDP_19.isnull()]
all_cities_GDP_null.country.unique()

array(['cuba', 'northkorea'], dtype=object)

In [9]:
all_cities_GDP_valid

Unnamed: 0,cityname,country,latest_figure,figure_description,town_code,GDP_19
0,Adrar,algeria,68276,(C)2008-04-14Area,0,3953576
1,Aflou,algeria,93585,(C)2008-04-14Area,1,3953576
2,Aïn Béïda,algeria,116064,(C)2008-04-14Area,2,3953576
3,Aïn Defla,algeria,55259,(C)2008-04-14Area,3,3953576
4,Aïn M'lila,algeria,65371,(C)2008-04-14Area,4,3953576
...,...,...,...,...,...,...
5488,Tauranga,newzealand,158300,(Ep)2022-06-30Area,5488,42342994
5489,Wellington,newzealand,212000,(Ep)2022-06-30Area,5489,42342994
5490,Whangārei,newzealand,54900,(Ep)2022-06-30Area,5490,42342994
5491,Lae,papuanewguinea,148934,(Cf)2011-07-10Area,5491,2877577


### 2.2 HDI

In [10]:
HDI_raw['country_format']= [i.lower().replace(" ", "") for i in HDI_raw['country']]
HDI_short=HDI_raw[['country', 'country_format', 'hdi_2019']]

"Cleaning" country names. A dictionary of alternative country names is created to map each contry name as referred in "all_cities_filter" compared to GDP data source for those cases where it may diverges.

In [11]:
fixed_names = {}
fixed_names ['burkinafaso'] = 'burkina faso'
fixed_names ['centralafrica'] = 'centralafricanrepublic'
fixed_names ['drcongo'] = 'congo(democraticrepublicofthe)'
fixed_names ['equatorialguinea'] = 'equatorial guinea'
fixed_names ['guineabissau'] = 'guinea-bissau'
fixed_names ['ivorycoast'] = "cote"
fixed_names ['sierraleone'] = "sierra leone"
fixed_names ['ivorycoast'] = 'cote'
fixed_names['costarica'] = 'costa rica'
fixed_names['southsudan'] = 'south sudan'
fixed_names['turkey'] = 'türkiye, republic of'
fixed_names['domrep'] = 'dominican republic'
fixed_names['elsalvador'] = 'el salvador'
fixed_names['puertorico'] = 'puerto rico'  
fixed_names['usa'] = 'unitedstates'  
fixed_names['Azerbaijan.html'] = 'azerbaijan'
fixed_names['kyrgyzstan'] = 'kyrgyz republic'
fixed_names['laos'] = 'lao p.d.r.'
fixed_names['saudiarabia'] = 'saudi arabia'
fixed_names['srilanka'] = 'sri lanka'
fixed_names['taiwan'] = 'taiwan province of china'
fixed_names['bosnia'] = 'bosniaandherzegovina'
fixed_names['czechrep'] = 'czechia'
fixed_names['uk'] = 'unitedkingdom'
fixed_names['northmacedonia'] = 'north macedonia '
fixed_names['russia'] = 'russianfederation'
fixed_names['slovakia'] = 'slovak republic'
fixed_names['newzealand'] = 'new zealand'
fixed_names['papuanewguinea'] = 'papua new guinea'

In [12]:
HDI_column = []

for i in all_cities_GDP_valid['country']:
    try:
        try:
            HDI_column.append(HDI_short[HDI_short.eq(i).any(1)].iloc[0][2])
        except:
            try:
                #print(f"{i} not found in first attempt, moving to look in alternative dictionary")
                #print(f"{i} found as {fixed_names.get(i)} in alternative source, proceeding with second attempt")
                #HDI_column.append(HDI_short[HDI_short.eq(fixed_names.get(i)).any(1)].iloc[0][2])
                HDI_column.append(HDI_short[HDI_short['country_format']== fixed_names.get(i)].iloc[0][2])
            except:
                HDI_column.append(np.nan)
                #print(f"{i} not found in second attempt")
    except:
        
        pass
        
all_cities_GDP_HDI = all_cities_GDP_valid.copy()
all_cities_GDP_HDI['HDI_19'] = HDI_column
all_cities_GDP_HDI_valid = all_cities_GDP_HDI[all_cities_GDP_HDI.HDI_19.notnull()]

HDI are not available for 'puertorico' & 'taiwan' thus in principle these should be  discarded from the sample until alternative estimates can be found

In [13]:
all_cities_GDP_HDI_null = all_cities_GDP_HDI[all_cities_GDP_HDI.HDI_19.isnull()]
all_cities_GDP_HDI_null.country.unique()

array(['puertorico', 'taiwan'], dtype=object)

### 2.3 Notes

A total of 78 instances had to be discarded due to the inexistence of oficial figures for the selected reference indicators (HDI and GDP per capita). These 78 instances corresponds to all dose associated to the values 'puertorico', 'taiwan', 'cuba' and 'northkorea' of the feature 'country' in the dataframe 'all_cities_filter'.

In [14]:
all_cities_GDP_HDI_null.shape[0] + all_cities_GDP_null.shape[0]

78

## 3. Population figures adjustment

Extracting the year in which the population figure for a given city was made/estimated

In [15]:
# Creating new column with year of estimate for latest_figure
all_cities_GDP_HDI_valid['year_latest_figure'] = [int(re.findall(r"\)(\d{4})", i)[0]) for i in all_cities_GDP_HDI_valid['figure_description']]

In [16]:
pop_raw = pd.read_csv('../data/ignore/WPP2022.csv', sep=',', encoding='utf8', quotechar='"')
pop_raw['location_fixed'] = [i.lower().replace(" ", "") for i in pop_raw['Location']]

"Cleaning" country names. A dictionary of alternative country names is created to map each contry name as referred in "all_cities_filter" compared to population data source for those cases where it may diverges.

In [17]:
fixed_names = {}
fixed_names ['burkinafaso'] = 'burkina faso'
fixed_names ['centralafrica'] = 'centralafricanrepublic'
fixed_names ['drcongo'] = 'democraticrepublicofthecongo'
fixed_names ['equatorialguinea'] = 'equatorial guinea'
fixed_names ['guineabissau'] = 'guinea-bissau'
fixed_names ['ivorycoast'] = "côted'ivoire"
fixed_names ['laos'] = "laopeople'sdemocraticrepublic"
fixed_names['tanzania'] = 'unitedrepublicoftanzania'
fixed_names['bolivia'] = 'bolivia(plurinationalstateof)'
fixed_names['turkey'] = 'türkiye'
fixed_names['domrep'] = 'dominicanrepublic'
fixed_names['elsalvador'] = 'el salvador'
fixed_names['puertorico'] = 'puerto rico'  
fixed_names['usa'] = 'unitedstatesofamerica'  
fixed_names['Azerbaijan.html'] = 'azerbaijan'
fixed_names['kyrgyzstan'] = 'kyrgyz republic'
fixed_names['saudiarabia'] = 'saudi arabia'
fixed_names['srilanka'] = 'sri lanka'
fixed_names['taiwan'] = 'taiwan province of china'
fixed_names['bosnia'] = 'bosniaandherzegovina'
fixed_names['czechrep'] = 'czechia'
fixed_names['uk'] = 'unitedkingdom'
fixed_names['iran'] = 'iran(islamicrepublicof)'
fixed_names['russia'] = 'russianfederation'
fixed_names['newzealand'] = 'new zealand'
fixed_names['venezuela'] = 'venezuela(bolivarianrepublicof)'

Defining auxiliary functions for population estimation alignment to baseline year 2019

In [18]:
def lookfor_pop(year, country):

    tmp = pop_raw[pop_raw['location_fixed'] == country] # Locate population data for a given country
    return int(str(tmp[tmp['Time'] == year].iloc[0][-3]).replace(".","")) # Find row corresponding to year 'year', retrieve corresponding popn and return it as 'int'
    
    
def pop_2019(country):

    return lookfor_pop(2019, country)

Computing rates of population change for each country for the year of last estimate compared to 2019

In [19]:
pop_adjust = {}

for i in list(all_cities_GDP_HDI_valid['country'].unique()):
    
    try:
        a = pop_2019(i) # pop of country i in 2019
        b = lookfor_pop(all_cities_GDP_HDI_valid[all_cities_GDP_HDI_valid['country'] == i ].iloc[0][-1], i) # pop of country i in estimates year
        year = all_cities_GDP_HDI_valid[all_cities_GDP_HDI_valid['country'] == i ].iloc[0][-1]
        c = a/b 
        pop_adjust[i] = (a, b, c, year)
        
        # if figure c cannot be compute using "original name" the corrected name is retrieved from "fixed_names" and the process repated
    except:
        a = pop_2019(fixed_names.get(i))
        b = lookfor_pop(all_cities_GDP_HDI_valid[all_cities_GDP_HDI_valid['country'] == i ].iloc[0][-1], fixed_names.get(i))
        year = all_cities_GDP_HDI_valid[all_cities_GDP_HDI_valid['country'] == i ].iloc[0][-1]
        c = a/b
        pop_adjust[i] = (a, b, c, year)  
        
        # a = population in 2019
        # b = population year of last estimates
        # c = ratio a/b
        # year = year of last population estimate for settlements

Visual inspection of results as stored in "pop_adjust"

In [20]:
pop_adjust

{'algeria': (42705368, 34569592, 1.2353448660892499, 2008),
 'benin': (12290444, 1030873, 11.92236483058534, 2013),
 'botswana': (2499702, 2630296, 0.9503500746684024, 2022),
 'burkinafaso': (20951639, 20951639, 1.0, 2019),
 'burundi': (11874838, 8278109, 1.434486789193039, 2008),
 'cameroon': (25782341, 17275171, 1.4924506970148081, 2005),
 'centralafrica': (5209324, 5457154, 0.9545862183841615, 2021),
 'chad': (16126866, 11496128, 1.4028084934336151, 2009),
 'drcongo': (8990689, 54815607, 0.16401695597387073, 2004),
 'djibouti': (1073994, 901103, 1.19186596870724, 2009),
 'egypt': (105618671, 109262178, 0.9666535386105886, 2021),
 'equatorialguinea': (1553031, 71927, 21.591766652300247, 2001),
 'eritrea': (3498818, 2291561, 1.5268273460754482, 1997),
 'ethiopia': (114120594, 123379924, 0.924952701381142, 2022),
 'gabon': (2242785, 1902226, 1.1790318290255732, 2013),
 'ghana': (3152229, 25574719, 0.12325566509645718, 2010),
 'guinea': (12877539, 11333365, 1.13625026635955, 2014),
 'gu

Manually correcting wrong data for some countries for which population variation figure were abnormal

In [21]:
pop_adjust['benin'] = (12290444, 10308730, c, 2013)
pop_adjust['drcongo'] = (89906890, 54815607, c, 2004)
pop_adjust['equatorialguinea'] = (1553031, 719270, c, 2001)
pop_adjust['ghana'] = (31522290, 25574719, c, 2010)
pop_adjust['namibia'] = (2446644, 2132340, c, 2011)
pop_adjust['nigeria'] = (203304492, 97685360, c, 1991)
pop_adjust['togo'] = (8243094, 8442580, c, 2020)
pop_adjust['ecuador'] = (17343740, 14989585, c, 2010)
pop_adjust['nicaragua'] = (6663924, 6850540, c, 2021)
pop_adjust['afghanistan'] = (37769499, 38972230, c, 2020)
pop_adjust['Azerbaijan.html'] = (10232753, 9863480, c, 2015)
pop_adjust['india'] = (1383112050, 1257621191, c, 2011)
pop_adjust['malaysia'] = (32804020, 28717731, c, 2010)
pop_adjust['mongolia'] = (3232430, 3347782, c, 2021)
pop_adjust['belgium'] = (11510568, 11655930,c, 2022)
pop_adjust['northmacedonia'] = (2114176, 2103330, c, 2021)
pop_adjust['romania'] = (19524211, 19328560, c, 2021)
pop_adjust['uganda']=(42949080, 44404611, c, 2020)

Recomputing incorrect rates of change

In [22]:
for i in pop_adjust.keys():
    pop_adjust[i] = (pop_adjust.get(i)[0], pop_adjust.get(i)[1], pop_adjust.get(i)[0]/pop_adjust.get(i)[1], pop_adjust.get(i)[3])

Computing estimates for population figures in baseline year (2019) for each instance and adding it to the dataset as a new column.

In [23]:
all_cities_GDP_HDI_valid.reset_index(inplace=True, drop=True)

figure_est_2019 = []
adjust_factor_col = []

for i in range(all_cities_GDP_HDI_valid.shape[0]):

    country = all_cities_GDP_HDI_valid.iloc[i][1]
    adjust_factor = pop_adjust.get(country)[2]
    figure_est = all_cities_GDP_HDI_valid.iloc[i][2] * adjust_factor
    
    adjust_factor_col.append(adjust_factor)
    figure_est_2019.append(int(figure_est))
    
all_cities_GDP_HDI_valid['2019_figure_est'] = figure_est_2019

In [24]:
all_cities_GDP_HDI_valid

Unnamed: 0,cityname,country,latest_figure,figure_description,town_code,GDP_19,HDI_19,year_latest_figure,2019_figure_est
0,Adrar,algeria,68276,(C)2008-04-14Area,0,3953576,0.748,2008,84344
1,Aflou,algeria,93585,(C)2008-04-14Area,1,3953576,0.748,2008,115609
2,Aïn Béïda,algeria,116064,(C)2008-04-14Area,2,3953576,0.748,2008,143379
3,Aïn Defla,algeria,55259,(C)2008-04-14Area,3,3953576,0.748,2008,68263
4,Aïn M'lila,algeria,65371,(C)2008-04-14Area,4,3953576,0.748,2008,80755
...,...,...,...,...,...,...,...,...,...
5410,Tauranga,newzealand,158300,(Ep)2022-06-30Area,5488,42342994,0.937,2022,151392
5411,Wellington,newzealand,212000,(Ep)2022-06-30Area,5489,42342994,0.937,2022,202749
5412,Whangārei,newzealand,54900,(Ep)2022-06-30Area,5490,42342994,0.937,2022,52504
5413,Lae,papuanewguinea,148934,(Cf)2011-07-10Area,5491,2877577,0.560,2011,182050


Re-filtering data to eliminate discard any settlement whose population estimate for 2019 is out of the defined population bounds (50k < x < 500k)

In [25]:
all_cities_50 = all_cities_GDP_HDI_valid[all_cities_GDP_HDI_valid['2019_figure_est']>50000]
all_cities_filter = all_cities_50[all_cities_50['2019_figure_est']<500000]
all_cities_filter.reset_index(inplace=True, drop=True)
siei_data_1 = all_cities_filter.copy()

199 instances were discarded after re-applying the population range filter

In [26]:
siei_data_1.shape[0] - all_cities_GDP_HDI_valid.shape[0]

-199

In [27]:
siei_data_1 

Unnamed: 0,cityname,country,latest_figure,figure_description,town_code,GDP_19,HDI_19,year_latest_figure,2019_figure_est
0,Adrar,algeria,68276,(C)2008-04-14Area,0,3953576,0.748,2008,84344
1,Aflou,algeria,93585,(C)2008-04-14Area,1,3953576,0.748,2008,115609
2,Aïn Béïda,algeria,116064,(C)2008-04-14Area,2,3953576,0.748,2008,143379
3,Aïn Defla,algeria,55259,(C)2008-04-14Area,3,3953576,0.748,2008,68263
4,Aïn M'lila,algeria,65371,(C)2008-04-14Area,4,3953576,0.748,2008,80755
...,...,...,...,...,...,...,...,...,...
5211,Tauranga,newzealand,158300,(Ep)2022-06-30Area,5488,42342994,0.937,2022,151392
5212,Wellington,newzealand,212000,(Ep)2022-06-30Area,5489,42342994,0.937,2022,202749
5213,Whangārei,newzealand,54900,(Ep)2022-06-30Area,5490,42342994,0.937,2022,52504
5214,Lae,papuanewguinea,148934,(Cf)2011-07-10Area,5491,2877577,0.560,2011,182050


## 4. Instance-Image check

Checking that there is one image in the specified directory for each instance in the dataframe.

In [28]:
generic_image_dir = "../data/ignore/images_750_800_12/"

images_check = []

for i in range(siei_data_1.shape[0]):

    town_code = siei_data_1.iloc[i][4]
    
    image_path = os.path.join(generic_image_dir, str(town_code) + ".png")
    
    if os.path.isfile(image_path):
        pass
    else:
        images_check.append(town_code)

A total of 90 Images could not be retrieved from the initial round of API calls. The town codes of these are listed below. A second round of API call will be carried out in a separate notebook to retrieve these or at least check what might have gone wrong in these particular cases.

In [29]:
len(images_check)

90

In [31]:
resultados = siei_data_1[siei_data_1['town_code'].isin(images_check)]
resultados.country.value_counts()

kazakhstan     13
brazil          7
colombia        7
sudan           6
argentina       6
india           6
germany         4
nigeria         3
chile           3
vietnam         3
ukraine         2
egypt           2
srilanka        2
japan           2
iran            2
belarus         2
honduras        2
costarica       2
philippines     2
tajikistan      1
thailand        1
nepal           1
russia          1
drcongo         1
malaysia        1
kyrgyzstan      1
peru            1
paraguay        1
mexico          1
elsalvador      1
sierraleone     1
ethiopia        1
bangladesh      1
Name: country, dtype: int64

In [32]:
siei_data_1.drop(siei_data_1[siei_data_1['town_code'].isin(images_check)].index, inplace=True)
siei_data_1.reset_index(inplace=True, drop=True)

Saving resulted enriched dataframe as csv for later use.

In [33]:
siei_data_1.to_csv("../data/siei_data_v1.csv")