# Nairobi Multiple Deprivation Indicators - Preprocessing

This code was developed by Eqi Luo, for his MSc Research. In this code, we would prepare and standardize the deprivation incicators for the PCA.  

In [1]:
# Firstly, import the necessary packages for this analysis

import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from skimage import exposure
import os
from osgeo import gdal



In [2]:
# define dataset directory
BASE_PATH = r"C:\Users\EQiLu\MSc Thesis\100m Input Mid-term"
# set the working path
os.chdir(BASE_PATH)

## Part1: loading, checking, structring the input dataset

In [3]:
# Define a function to read the first band of geotiff file as a numpy array
# Here we only read the first band because each indicator layer has only one single band! 

def gtiff_to_array(file_path):
    "read the first band and return a 2-D array of width * height"
    data = gdal.Open(file_path)
    band = data.GetRasterBand(1).ReadAsArray()
    return band

In [4]:
# Read all the geotiff files and convert them into np arrays
# Each tiff layer is one indicator representing multiple deprivation

skilled_birth = gtiff_to_array('nai_skilledbirth_atten_2014.tif')
poverty = gtiff_to_array('nai_poverty_2008.tif')
women_literacy = gtiff_to_array('nai_women_literacy_2014.tif')
men_literacy = gtiff_to_array('nai_men_literacy_2014.tif')
vaccination = gtiff_to_array('nai_dt3_vaccination_2014.tif')
access_itn = gtiff_to_array('nai_access_itn_2014.tif')
stunted_child = gtiff_to_array('nai_stuned_children_2014.tif')
family_planning = gtiff_to_array('nai_unmet_family_plan_2014.tif')
impr_housing = gtiff_to_array('nai_imprvd_housing_2015.tif')
water_source = gtiff_to_array('nai_water_source_2014.tif')
open_defecation = gtiff_to_array('nai_open_defecation_2014.tif')
pit_latrines = gtiff_to_array('nai_density_pit_latrines.tif')
armed_conflicts = gtiff_to_array('nai_density_crime_2019.tif')
pm_25 = gtiff_to_array('nai_pm25_2016.tif')
waterways = gtiff_to_array('nai_water_density_2020.tif')
dump_sites = gtiff_to_array('nai_density_dumpsites_2017.tif')
pop_density = gtiff_to_array('nai_pop_density_2020.tif')
bld_density = gtiff_to_array('nai_bld_density_2020.tif')
ndvi = gtiff_to_array('nai_ndvi_2019.tif')
night_light = gtiff_to_array('nai_nightlight_2016.tif')
bus_stations = gtiff_to_array('nai_den_bus_stop_2019.tif')
dis_roads = gtiff_to_array('nai_OSM_dis_major_roads_2020.tif')
poor_roads = gtiff_to_array('nai_poor_road_density_2020.tif')
dis_education = gtiff_to_array('nai_dis_education_2020.tif')
dis_health = gtiff_to_array('nai_dis_health_2020.tif')
dis_financial = gtiff_to_array('nai_dis_financial_2020.tif')
built_up = gtiff_to_array('nai_grid3_builtup_2020.tif')
slum = gtiff_to_array('slum_boundary_nairobi.tif')

In [5]:
# Check the sahpe of each geotiff file. 
# The standarized size of each raster layer should be 434*253(height * width)

print(skilled_birth.shape)
print(poverty.shape)
print(women_literacy.shape)
print(men_literacy.shape)
print(vaccination.shape)
print(access_itn.shape)
print(stunted_child.shape)
print(family_planning.shape)
print(impr_housing.shape)
print(water_source.shape)
print(open_defecation.shape)
print(pit_latrines.shape)
print(armed_conflicts.shape)
print(pm_25.shape)
print(waterways.shape)
print(dump_sites.shape)
print(pop_density.shape)
print(bld_density.shape)
print(ndvi.shape)
print(night_light.shape)
print(bus_stations.shape)
print(dis_roads.shape)
print(poor_roads.shape)
print(dis_education.shape)
print(dis_health.shape)
print(dis_financial.shape)


print(built_up.shape)
print(slum.shape)



(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)
(253, 434)


In [6]:
# Stack the indicator layers into a 3-d array, where the shape is NUM_INDICATORS * HEIGHT * WIDTH

# Note that we could also use np.concatenate function to compile the layers into a stack, but with some modification 

dataset = np.stack((skilled_birth, poverty, women_literacy, men_literacy, vaccination, access_itn, stunted_child, family_planning, 
                    impr_housing, water_source, open_defecation, pit_latrines, armed_conflicts, pm_25, waterways, dump_sites,
                    pop_density, bld_density, ndvi, night_light, bus_stations, dis_roads, poor_roads, dis_education, dis_health,
                    dis_financial, built_up, slum),axis = 0)

In [7]:
# Now we check the shape of the dataset
print(dataset.shape)
print(dataset)

(28, 253, 434)
[[[-99999. -99999. -99999. ... -99999. -99999. -99999.]
  [-99999. -99999. -99999. ... -99999. -99999. -99999.]
  [-99999. -99999. -99999. ... -99999. -99999. -99999.]
  ...
  [-99999. -99999. -99999. ... -99999. -99999. -99999.]
  [-99999. -99999. -99999. ... -99999. -99999. -99999.]
  [-99999. -99999. -99999. ... -99999. -99999. -99999.]]

 [[-99999. -99999. -99999. ... -99999. -99999. -99999.]
  [-99999. -99999. -99999. ... -99999. -99999. -99999.]
  [-99999. -99999. -99999. ... -99999. -99999. -99999.]
  ...
  [-99999. -99999. -99999. ... -99999. -99999. -99999.]
  [-99999. -99999. -99999. ... -99999. -99999. -99999.]
  [-99999. -99999. -99999. ... -99999. -99999. -99999.]]

 [[-99999. -99999. -99999. ... -99999. -99999. -99999.]
  [-99999. -99999. -99999. ... -99999. -99999. -99999.]
  [-99999. -99999. -99999. ... -99999. -99999. -99999.]
  ...
  [-99999. -99999. -99999. ... -99999. -99999. -99999.]
  [-99999. -99999. -99999. ... -99999. -99999. -99999.]
  [-99999. 

In [8]:
## Create an array which stores the indices (x,y coordinate information) of each pixel and concatenate it into the dataset

index = np.indices((253,434))
dataset = np.concatenate((dataset,index),axis=0)
print(dataset.shape)
print(dataset)

# After this step, the new array shape should be 30+2(with two more layers storing the coordinates) * height * width 

(30, 253, 434)
[[[-9.9999e+04 -9.9999e+04 -9.9999e+04 ... -9.9999e+04 -9.9999e+04
   -9.9999e+04]
  [-9.9999e+04 -9.9999e+04 -9.9999e+04 ... -9.9999e+04 -9.9999e+04
   -9.9999e+04]
  [-9.9999e+04 -9.9999e+04 -9.9999e+04 ... -9.9999e+04 -9.9999e+04
   -9.9999e+04]
  ...
  [-9.9999e+04 -9.9999e+04 -9.9999e+04 ... -9.9999e+04 -9.9999e+04
   -9.9999e+04]
  [-9.9999e+04 -9.9999e+04 -9.9999e+04 ... -9.9999e+04 -9.9999e+04
   -9.9999e+04]
  [-9.9999e+04 -9.9999e+04 -9.9999e+04 ... -9.9999e+04 -9.9999e+04
   -9.9999e+04]]

 [[-9.9999e+04 -9.9999e+04 -9.9999e+04 ... -9.9999e+04 -9.9999e+04
   -9.9999e+04]
  [-9.9999e+04 -9.9999e+04 -9.9999e+04 ... -9.9999e+04 -9.9999e+04
   -9.9999e+04]
  [-9.9999e+04 -9.9999e+04 -9.9999e+04 ... -9.9999e+04 -9.9999e+04
   -9.9999e+04]
  ...
  [-9.9999e+04 -9.9999e+04 -9.9999e+04 ... -9.9999e+04 -9.9999e+04
   -9.9999e+04]
  [-9.9999e+04 -9.9999e+04 -9.9999e+04 ... -9.9999e+04 -9.9999e+04
   -9.9999e+04]
  [-9.9999e+04 -9.9999e+04 -9.9999e+04 ... -9.9999e+04 -9.

In [9]:
# Mask allthe non-data value. Note that in this case, the nondata value of all indicators is -99999

dataset = np.ma.masked_equal(dataset,-99999)
print(dataset) 

[[[-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  ...
  [-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]]

 [[-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  ...
  [-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]]

 [[-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  ...
  [-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]]

 ...

 [[-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  ...
  [-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]
  [-- -- -- ... -- -- --]]

 [[0.0 0.0 0.0 ... 0.0 0.0 0.0]
  [1.0 1.0 1.0 ... 1.0 1.0 1.0]
  [2.0 2.0 2.0 ... 2.0 2.0 2.0]
  ...
  [250.0 250.0 250.0 ... 250.0 250.0 250.0]
  [251.0 251.0 251.0 ... 251.0 251.0 251.0]
  [252.0 252.0 252.0 ... 252.0 252.0 252.0]]

 [[0.0 1.0 2.0 ... 431.0 432.0 433.0]
  [0.0 1.0 2.0 ... 431.0 432.0 433.0]
  [0.0 1.0 2.0 ... 431.0 4

In [10]:
# We need to transform the 3-d array into 2-d array, since later it will be the input of clustering analysis. 
# Now the shape should be (number of indicators+2, width*height)

dataset = dataset.reshape(30,-1)

print(dataset)
print(dataset.shape)

[[-- -- -- ... -- -- --]
 [-- -- -- ... -- -- --]
 [-- -- -- ... -- -- --]
 ...
 [-- -- -- ... -- -- --]
 [0.0 0.0 0.0 ... 252.0 252.0 252.0]
 [0.0 1.0 2.0 ... 431.0 432.0 433.0]]
(30, 109802)


In [11]:
# Transpose the array - as now the column should be the indicators, whereas the rows represent the individual pixel

training_data = np.transpose(dataset)
print(training_data.shape)
print(training_data)

(109802, 30)
[[-- -- -- ... -- 0.0 0.0]
 [-- -- -- ... -- 0.0 1.0]
 [-- -- -- ... -- 0.0 2.0]
 ...
 [-- -- -- ... -- 252.0 431.0]
 [-- -- -- ... -- 252.0 432.0]
 [-- -- -- ... -- 252.0 433.0]]


In [12]:
# Convert the data from numpy array into dataframe
# Add the lable to each corresponding column(indicator) 

labels = ['skilled_birth', 'poverty', 'women_literacy', 'men_literacy', 'vaccination', 'access_itn', 'stunted_child', 'family_planning',
          'impr_housing', 'water_source', 'open_defecation', 'pit_latrines', 'armed_conflicts', 'pm_25', 'waterways', 'dump_sites',
          'pop_density', 'bld_density', 'ndvi', 'night_light', 'bus_stations', 'dis_roads', 'poor_roads', 'dis_education', 'dis_health',
          'dis_financial', 'built_up', 'slum', 'rows', 'columns']

df = pd.DataFrame(training_data, columns = labels) 
print(df)

        skilled_birth  poverty  women_literacy  men_literacy  vaccination  \
0                 NaN      NaN             NaN           NaN          NaN   
1                 NaN      NaN             NaN           NaN          NaN   
2                 NaN      NaN             NaN           NaN          NaN   
3                 NaN      NaN             NaN           NaN          NaN   
4                 NaN      NaN             NaN           NaN          NaN   
...               ...      ...             ...           ...          ...   
109797            NaN      NaN             NaN           NaN          NaN   
109798            NaN      NaN             NaN           NaN          NaN   
109799            NaN      NaN             NaN           NaN          NaN   
109800            NaN      NaN             NaN           NaN          NaN   
109801            NaN      NaN             NaN           NaN          NaN   

        access_itn  stunted_child  family_planning  impr_housing  \
0      

In [13]:
# Drop the rows which contians any 'Nondata'

df = df.dropna()
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 62888 entries, 275 to 109498
Data columns (total 30 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   skilled_birth    62888 non-null  float64
 1   poverty          62888 non-null  float64
 2   women_literacy   62888 non-null  float64
 3   men_literacy     62888 non-null  float64
 4   vaccination      62888 non-null  float64
 5   access_itn       62888 non-null  float64
 6   stunted_child    62888 non-null  float64
 7   family_planning  62888 non-null  float64
 8   impr_housing     62888 non-null  float64
 9   water_source     62888 non-null  float64
 10  open_defecation  62888 non-null  float64
 11  pit_latrines     62888 non-null  float64
 12  armed_conflicts  62888 non-null  float64
 13  pm_25            62888 non-null  float64
 14  waterways        62888 non-null  float64
 15  dump_sites       62888 non-null  float64
 16  pop_density      62888 non-null  float64
 17  bld_densi

In [14]:
df.describe()

Unnamed: 0,skilled_birth,poverty,women_literacy,men_literacy,vaccination,access_itn,stunted_child,family_planning,impr_housing,water_source,...,bus_stations,dis_roads,poor_roads,dis_education,dis_health,dis_financial,built_up,slum,rows,columns
count,62888.0,62888.0,62888.0,62888.0,62888.0,62888.0,62888.0,62888.0,62888.0,62888.0,...,62888.0,62888.0,62888.0,62888.0,62888.0,62888.0,62888.0,62888.0,62888.0,62888.0
mean,0.733944,0.087087,0.945702,0.930647,0.758267,0.390458,0.14408,0.069822,0.617168,0.980663,...,5.639918,1024.676028,1.088303,1339.037163,2228.350538,1625.973784,1.272898,1.970837,123.807865,209.419889
std,0.006172,0.047273,0.017933,0.010149,0.034434,0.100261,0.018497,0.007009,0.080535,0.012195,...,10.341506,913.695894,1.460026,1230.105571,1994.888937,1299.752568,0.445452,0.168264,51.244319,111.698579
min,0.702342,0.002084,0.907093,0.891019,0.663898,0.210798,0.11047,0.05426,0.347661,0.893894,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
25%,0.730828,0.050088,0.929869,0.92343,0.733857,0.297587,0.130703,0.064729,0.562751,0.974618,...,0.0,360.555115,0.144366,500.0,806.225769,632.455505,1.0,2.0,85.0,111.0
50%,0.735716,0.088813,0.946521,0.932282,0.751328,0.404965,0.143776,0.070187,0.62413,0.984406,...,0.0,800.0,0.572383,921.954468,1697.056274,1280.624878,1.0,2.0,122.0,216.0
75%,0.738669,0.109609,0.963856,0.938222,0.785709,0.464139,0.153868,0.074143,0.677595,0.988734,...,7.417899,1442.220459,1.386329,1772.004517,3000.0,2262.741699,2.0,2.0,164.0,298.0
max,0.74147,0.259697,0.97396,0.949055,0.84647,0.596174,0.21372,0.090319,0.783054,0.995236,...,98.722023,5830.95166,12.452886,7006.425781,11412.711914,8420.213867,2.0,2.0,252.0,433.0


In [15]:
print(df)

        skilled_birth   poverty  women_literacy  men_literacy  vaccination  \
275          0.732475  0.139231        0.939562      0.915470     0.794153   
276          0.732762  0.138914        0.939609      0.915438     0.794153   
277          0.732935  0.138598        0.939609      0.915438     0.794150   
278          0.733172  0.138281        0.939609      0.915500     0.794150   
279          0.733441  0.137965        0.939632      0.915500     0.794150   
...               ...       ...             ...           ...          ...   
109494       0.737453  0.103516        0.973103      0.926475     0.762950   
109495       0.738017  0.105053        0.973103      0.926475     0.764222   
109496       0.738769  0.106591        0.972991      0.926967     0.764222   
109497       0.739287  0.108128        0.972991      0.926967     0.765465   
109498       0.739779  0.109666        0.972991      0.927393     0.765465   

        access_itn  stunted_child  family_planning  impr_housin

## Split the dataset into built-up and non-builtup

Built up dataset

In [16]:
df_builtup = df[df['built_up'] == 1]
print(df_builtup.shape)

(45726, 30)


In [17]:
# Standardize and save the indicators before running PCA 

df_builtup.iloc[:,0:26] = StandardScaler().fit_transform(df_builtup.iloc[:,0:26])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_builtup.iloc[:,0:26] = StandardScaler().fit_transform(df_builtup.iloc[:,0:26])


In [18]:
df_builtup.describe()

Unnamed: 0,skilled_birth,poverty,women_literacy,men_literacy,vaccination,access_itn,stunted_child,family_planning,impr_housing,water_source,...,bus_stations,dis_roads,poor_roads,dis_education,dis_health,dis_financial,built_up,slum,rows,columns
count,45726.0,45726.0,45726.0,45726.0,45726.0,45726.0,45726.0,45726.0,45726.0,45726.0,...,45726.0,45726.0,45726.0,45726.0,45726.0,45726.0,45726.0,45726.0,45726.0,45726.0
mean,5.456863e-15,1.449333e-15,-4.793902e-13,-1.189399e-12,1.286598e-15,7.710599e-14,-2.857028e-14,-4.1124e-14,6.812553e-16,3.40885e-15,...,4.438022e-14,3.938881e-15,-4.346031e-16,4.156604e-15,-1.950308e-15,3.133958e-15,1.0,1.960066,121.850129,195.647115
std,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,...,1.000011,1.000011,1.000011,1.000011,1.000011,1.000011,0.0,0.195805,48.931926,107.971445
min,-5.067489,-1.715603,-2.178689,-3.414483,-2.655319,-1.70825,-2.070906,-2.206159,-3.33264,-6.567502,...,-0.6363746,-1.216246,-0.6967286,-1.069795,-1.057688,-1.205358,1.0,1.0,1.0,0.0
25%,-0.4741504,-0.8443559,-0.9344946,-0.7977365,-0.7214668,-0.8794731,-0.6647242,-0.716489,-0.6806262,-0.6034111,...,-0.6363746,-0.7870521,-0.5968144,-0.6663477,-0.6889071,-0.7637079,1.0,2.0,86.0,104.0
50%,0.3248833,0.04895283,0.2831324,0.2029637,-0.211547,0.05046951,-0.0141399,0.05077904,0.1387328,0.29355,...,-0.5562085,-0.2147933,-0.3364583,-0.301654,-0.2939564,-0.2846311,1.0,2.0,120.0,193.0
75%,0.7370598,0.5570529,0.9501543,0.7631349,0.8023417,0.7924609,0.5316022,0.635165,0.7554268,0.722224,...,0.322512,0.5763495,0.1571132,0.3020748,0.3727083,0.4607091,1.0,2.0,155.0,278.0
max,1.159465,3.288444,1.483544,1.87776,2.446225,2.099215,3.17961,2.936538,1.972611,1.310318,...,7.993462,5.65086,7.940236,5.778423,4.51756,4.33866,1.0,2.0,252.0,433.0


In [19]:
df_builtup.to_csv('0501_builtup_indicators.csv')

In [20]:
# Visualize the correlation matrix of each indicator 

correlation1 = df_builtup.iloc[:,0:26].corr()
correlation1.style.background_gradient(cmap='coolwarm').set_precision(4)

Unnamed: 0,skilled_birth,poverty,women_literacy,men_literacy,vaccination,access_itn,stunted_child,family_planning,impr_housing,water_source,open_defecation,pit_latrines,armed_conflicts,pm_25,waterways,dump_sites,pop_density,bld_density,ndvi,night_light,bus_stations,dis_roads,poor_roads,dis_education,dis_health,dis_financial
skilled_birth,1.0,-0.6508,0.0,0.4263,0.045,-0.052,-0.0439,-0.0749,0.2729,0.5779,-0.5441,0.1526,0.1051,0.3668,0.2305,0.2618,0.3147,0.2507,-0.3133,0.4935,0.3568,-0.3174,0.1379,-0.4128,-0.699,-0.5567
poverty,-0.6508,1.0,-0.152,-0.7149,-0.0038,0.2425,-0.0679,0.156,-0.315,-0.5695,0.4153,-0.2273,-0.1441,-0.2334,-0.2774,-0.2877,-0.36,-0.2171,0.3192,-0.7073,-0.4342,0.3127,-0.1507,0.642,0.7616,0.5953
women_literacy,0.0,-0.152,1.0,-0.0851,-0.7626,-0.7091,0.2584,-0.1227,-0.1839,0.1453,0.2128,-0.0368,-0.0209,-0.4357,-0.1941,-0.1952,-0.2675,-0.1867,0.3814,-0.2591,-0.0879,-0.014,-0.0682,-0.1773,-0.115,0.0544
men_literacy,0.4263,-0.7149,-0.0851,1.0,0.0819,-0.0798,0.0648,-0.3143,0.3059,0.2546,-0.3593,0.1759,0.0764,0.2598,0.2047,0.2466,0.2634,0.1391,-0.3759,0.6669,0.331,-0.3265,0.0978,-0.3881,-0.601,-0.5845
vaccination,0.045,-0.0038,-0.7626,0.0819,1.0,0.57,-0.1412,0.2055,0.0841,0.0353,-0.1649,0.071,0.0595,0.3865,0.2626,0.2107,0.3053,0.1772,-0.3606,0.3442,0.1992,-0.0353,0.0176,0.1167,0.116,-0.0783
access_itn,-0.052,0.2425,-0.7091,-0.0798,0.57,1.0,-0.4475,0.0489,0.0413,-0.0884,-0.257,-0.0071,-0.0084,0.5945,0.0914,0.0883,0.1072,0.1102,-0.3503,0.1087,-0.016,0.1055,0.0871,0.2623,0.1284,-0.1138
stunted_child,-0.0439,-0.0679,0.2584,0.0648,-0.1412,-0.4475,1.0,0.3978,-0.17,-0.0094,0.4909,0.1697,0.0084,-0.2519,-0.0437,0.0677,0.0412,-0.0696,0.0867,0.0807,0.0535,-0.1473,0.0481,-0.0558,0.0722,-0.003
family_planning,-0.0749,0.156,-0.1227,-0.3143,0.2055,0.0489,0.3978,1.0,0.1404,0.0541,0.3779,0.1451,0.1246,0.0289,0.0647,0.0862,0.0506,-0.052,0.084,0.033,0.003,0.0551,0.0936,0.0522,0.1853,-0.0097
impr_housing,0.2729,-0.315,-0.1839,0.3059,0.0841,0.0413,-0.17,0.1404,1.0,0.14,-0.2379,-0.0819,0.1098,0.2985,0.2367,0.0555,0.0845,0.0051,-0.0659,0.3163,0.1087,-0.1762,-0.0215,-0.3285,-0.4607,-0.4977
water_source,0.5779,-0.5695,0.1453,0.2546,0.0353,-0.0884,-0.0094,0.0541,0.14,1.0,-0.4378,0.1729,0.0923,0.0961,0.2123,0.2465,0.2592,0.2154,-0.2095,0.3918,0.248,-0.2206,0.1351,-0.4515,-0.5994,-0.4363


Non built-up dataset

In [21]:
df_non_builtup = df[df['built_up'] == 2]

In [22]:
print(df_non_builtup.shape)

(17162, 30)


In [23]:
df_non_builtup.iloc[:,0:27] = StandardScaler().fit_transform(df_non_builtup.iloc[:,0:27])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_non_builtup.iloc[:,0:27] = StandardScaler().fit_transform(df_non_builtup.iloc[:,0:27])


In [24]:
df_non_builtup.to_csv('0501_non_builtup_indicators.csv')

In [25]:
df_non_builtup.describe()

Unnamed: 0,skilled_birth,poverty,women_literacy,men_literacy,vaccination,access_itn,stunted_child,family_planning,impr_housing,water_source,...,bus_stations,dis_roads,poor_roads,dis_education,dis_health,dis_financial,built_up,slum,rows,columns
count,17162.0,17162.0,17162.0,17162.0,17162.0,17162.0,17162.0,17162.0,17162.0,17162.0,...,17162.0,17162.0,17162.0,17162.0,17162.0,17162.0,17162.0,17162.0,17162.0,17162.0
mean,2.225621e-16,-2.135143e-16,2.901672e-13,-2.408274e-13,-2.660731e-16,-3.527756e-14,2.02028e-14,-4.103504e-14,-6.278805e-16,2.240541e-15,...,3.13354e-16,7.332052e-16,1.892428e-15,-5.665877e-16,2.116811e-16,1.1113870000000001e-17,0.0,1.999534,129.024007,246.115721
std,1.000029,1.000029,1.000029,1.000029,1.000029,1.000029,1.000029,1.000029,1.000029,1.000029,...,1.000029,1.000029,1.000029,1.000029,1.000029,1.000029,0.0,0.021586,56.620912,113.166749
min,-4.267417,-2.306817,-1.969181,-3.339261,-3.029937,-2.08052,-1.340131,-2.243776,-3.439215,-5.014653,...,-0.3085297,-1.232923,-0.8833975,-1.412823,-1.378424,-1.551899,0.0,1.0,0.0,0.0
25%,-0.5424146,-0.6056461,-0.7420749,-0.5411376,-0.6666068,-0.8304516,-0.8585777,-0.7569047,-0.6831994,-0.3351817,...,-0.3085297,-0.7503392,-0.7852978,-0.7868204,-0.6981392,-0.7476635,0.0,2.0,83.0,151.0
50%,0.2239732,-0.1046601,-0.2471506,0.1950436,-0.1721361,0.2026926,-0.1128579,0.04849249,-0.001444189,0.2188391,...,-0.3085297,-0.2677553,-0.3605847,-0.3264306,-0.2446157,-0.2271702,0.0,2.0,132.0,282.0
75%,0.7554537,0.4429155,1.113199,0.6436959,0.725164,0.5735127,0.5355824,0.5651955,0.671969,0.7187936,...,-0.3085297,0.4712601,0.4565428,0.6931573,0.3720528,0.5352888,0.0,2.0,179.0,332.0
max,1.474515,3.194609,1.88471,1.70994,2.953001,1.925209,3.297747,2.886071,2.259517,1.260185,...,13.61115,3.592916,4.573736,3.344507,3.797508,4.254902,0.0,2.0,252.0,433.0


Whole Nairobi 

In [26]:
df1 = df

In [27]:
df1.iloc[:,0:26] = StandardScaler().fit_transform(df1.iloc[:,0:26])

In [28]:
df1.describe()

Unnamed: 0,skilled_birth,poverty,women_literacy,men_literacy,vaccination,access_itn,stunted_child,family_planning,impr_housing,water_source,...,bus_stations,dis_roads,poor_roads,dis_education,dis_health,dis_financial,built_up,slum,rows,columns
count,62888.0,62888.0,62888.0,62888.0,62888.0,62888.0,62888.0,62888.0,62888.0,62888.0,...,62888.0,62888.0,62888.0,62888.0,62888.0,62888.0,62888.0,62888.0,62888.0,62888.0
mean,-8.331226e-15,8.659468e-16,1.609159e-12,5.12124e-13,1.923891e-15,3.440806e-14,-1.132994e-13,-1.36924e-13,-1.484245e-15,1.465315e-16,...,2.151058e-14,-1.659763e-15,2.09663e-15,-6.009693e-15,-2.322154e-15,-1.16965e-14,1.272898,1.970837,123.807865,209.419889
std,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,...,1.000008,1.000008,1.000008,1.000008,1.000008,1.000008,0.445452,0.168264,51.244319,111.698579
min,-5.120573,-1.798154,-2.153008,-3.90461,-2.740627,-1.791943,-1.81707,-2.220281,-3.346499,-7.115267,...,-0.5453715,-1.121472,-0.7454058,-1.088563,-1.117039,-1.250997,1.0,1.0,0.0,0.0
25%,-0.5049035,-0.7826765,-0.882928,-0.7110804,-0.708918,-0.9263003,-0.7231584,-0.7266153,-0.6757071,-0.4957095,...,-0.5453715,-0.7268569,-0.6465262,-0.6820909,-0.7128899,-0.7643964,1.0,2.0,85.0,111.0
50%,0.2870783,0.03650574,0.04566171,0.1610857,-0.2015317,0.1447001,-0.01638907,0.05201586,0.08644706,0.3069411,...,-0.5453715,-0.2459,-0.3533665,-0.3390652,-0.2663299,-0.2657057,1.0,2.0,122.0,216.0
75%,0.7654805,0.4764374,1.012306,0.7463783,0.7969729,0.7348973,0.5291846,0.616474,0.7503298,0.6618575,...,0.1719281,0.4569877,0.2041253,0.3519786,0.3868163,0.4899186,2.0,2.0,164.0,298.0
max,1.219347,3.651382,1.57575,1.813804,2.561574,2.051824,3.765038,2.924372,2.059826,1.194985,...,9.000898,5.260299,7.783884,4.607274,4.603983,5.227375,2.0,2.0,252.0,433.0


In [29]:
df1.to_csv('0510_whole_nairobi_indicators.csv')

In [31]:
# Visualize the correlation matrix of each indicator 

correlation = df_builtup.iloc[:,0:26].corr()
correlation.style.background_gradient(cmap='coolwarm').set_precision(2)

Unnamed: 0,skilled_birth,poverty,women_literacy,men_literacy,vaccination,access_itn,stunted_child,family_planning,impr_housing,water_source,open_defecation,pit_latrines,armed_conflicts,pm_25,waterways,dump_sites,pop_density,bld_density,ndvi,night_light,bus_stations,dis_roads,poor_roads,dis_education,dis_health,dis_financial
skilled_birth,1.0,-0.65,0.0,0.43,0.05,-0.05,-0.04,-0.07,0.27,0.58,-0.54,0.15,0.11,0.37,0.23,0.26,0.31,0.25,-0.31,0.49,0.36,-0.32,0.14,-0.41,-0.7,-0.56
poverty,-0.65,1.0,-0.15,-0.71,-0.0,0.24,-0.07,0.16,-0.32,-0.57,0.42,-0.23,-0.14,-0.23,-0.28,-0.29,-0.36,-0.22,0.32,-0.71,-0.43,0.31,-0.15,0.64,0.76,0.6
women_literacy,0.0,-0.15,1.0,-0.09,-0.76,-0.71,0.26,-0.12,-0.18,0.15,0.21,-0.04,-0.02,-0.44,-0.19,-0.2,-0.27,-0.19,0.38,-0.26,-0.09,-0.01,-0.07,-0.18,-0.12,0.05
men_literacy,0.43,-0.71,-0.09,1.0,0.08,-0.08,0.06,-0.31,0.31,0.25,-0.36,0.18,0.08,0.26,0.2,0.25,0.26,0.14,-0.38,0.67,0.33,-0.33,0.1,-0.39,-0.6,-0.58
vaccination,0.05,-0.0,-0.76,0.08,1.0,0.57,-0.14,0.21,0.08,0.04,-0.16,0.07,0.06,0.39,0.26,0.21,0.31,0.18,-0.36,0.34,0.2,-0.04,0.02,0.12,0.12,-0.08
access_itn,-0.05,0.24,-0.71,-0.08,0.57,1.0,-0.45,0.05,0.04,-0.09,-0.26,-0.01,-0.01,0.59,0.09,0.09,0.11,0.11,-0.35,0.11,-0.02,0.11,0.09,0.26,0.13,-0.11
stunted_child,-0.04,-0.07,0.26,0.06,-0.14,-0.45,1.0,0.4,-0.17,-0.01,0.49,0.17,0.01,-0.25,-0.04,0.07,0.04,-0.07,0.09,0.08,0.05,-0.15,0.05,-0.06,0.07,-0.0
family_planning,-0.07,0.16,-0.12,-0.31,0.21,0.05,0.4,1.0,0.14,0.05,0.38,0.15,0.12,0.03,0.06,0.09,0.05,-0.05,0.08,0.03,0.0,0.06,0.09,0.05,0.19,-0.01
impr_housing,0.27,-0.32,-0.18,0.31,0.08,0.04,-0.17,0.14,1.0,0.14,-0.24,-0.08,0.11,0.3,0.24,0.06,0.08,0.01,-0.07,0.32,0.11,-0.18,-0.02,-0.33,-0.46,-0.5
water_source,0.58,-0.57,0.15,0.25,0.04,-0.09,-0.01,0.05,0.14,1.0,-0.44,0.17,0.09,0.1,0.21,0.25,0.26,0.22,-0.21,0.39,0.25,-0.22,0.14,-0.45,-0.6,-0.44
