Understanding shapefiles: https://en.wikipedia.org/wiki/Shapefile
This notebook explores the ward shapefile for Gauteng and compares its ward codes with those in the GCRO survey data.

In [21]:
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
from numpy import sort, where

In [22]:
# Read the ward shapefile.
# When the companion files (.dbf, .prj, .shx) sit in the same directory as the .shp, the frame that's read in
# includes all associated info (e.g. ward id, municipality, etc).
# https://stackoverflow.com/questions/60617819/how-to-read-shapefile-in-geopandas-when-having-dbf-prj-shp-and-shx-files
shapefile = gpd.read_file('../data/shapefiles/za-wards-2020/SA_Wards2020.shp')


In [23]:
# Display the loaded frame to inspect its columns and rows
shapefile

In [24]:
# Quick visual check: draw everything contained in the shapefile
shapefile.plot()

In [25]:
# Keep only the rows whose Province column equals 'Gauteng'
gauteng = shapefile.loc[shapefile['Province'] == 'Gauteng']
gauteng

In [26]:
# Plot only the Gauteng subset
gauteng.plot()

In [27]:
# Gauteng WardID values as an ascending integer array
gauteng_ward_codes = sort(gauteng['WardID'].to_numpy().astype(int))
gauteng_ward_codes

In [28]:
# Compare to GCRO data
# Stata file for each GCRO survey round
GCRO_2021 = '../data/surveys/gcro-2021.dta'
GCRO_2018 = '../data/surveys/gcro-2018.dta'
GCRO_2015 = '../data/surveys/gcro-2015.dta'

# Load all three rounds the same way; convert_categoricals=False leaves
# value-labelled columns as their stored codes instead of Categoricals.
df_gcro_2021, df_gcro_2018, df_gcro_2015 = [
    pd.read_stata(survey_path, convert_categoricals=False)
    for survey_path in (GCRO_2021, GCRO_2018, GCRO_2015)
]

In [29]:
# Distinct ward codes present in the 2021 survey, ascending
gcro_ward_codes_2021 = sort(df_gcro_2021['ward_code'].unique())
gcro_ward_codes_2021

In [30]:
# Distinct ward codes present in the 2018 survey (column `ward`), cast to int, ascending
gcro_ward_codes_2018 = sort(df_gcro_2018['ward'].astype(int).unique())
gcro_ward_codes_2018

In [31]:
# Distinct ward codes present in the 2015 survey (column `WardNumber`), cast to int, ascending
gcro_ward_codes_2015 = sort(df_gcro_2015['WardNumber'].astype(int).unique())
gcro_ward_codes_2015

In [32]:
from typing import Optional

from numpy import ndarray, array, append, asarray, isin


def compare_to_gauteng_ward_codes(codes: ndarray, name: str,
                                  reference_codes: Optional[ndarray] = None) -> list:
    """Report which ward codes appear in one code set but not in the other.

    Prints the size of each difference and returns the differing codes.

    Parameters
    ----------
    codes : ndarray
        Ward codes taken from a survey data set.
    name : str
        Label for that data set, used in the printed summary.
    reference_codes : ndarray, optional
        Codes to compare against. When omitted, the notebook-level
        ``gauteng_ward_codes`` array is used.

    Returns
    -------
    list
        ``[mdb_codes, data_codes]`` where ``mdb_codes`` are the entries of
        ``codes`` not found in the reference set and ``data_codes`` are the
        reference entries not found in ``codes``. Each array keeps the order
        and dtype of the array it was taken from.
    """
    if reference_codes is None:
        # Resolved at call time so the function picks up the current notebook value.
        reference_codes = gauteng_ward_codes

    codes = asarray(codes)
    reference_codes = asarray(reference_codes)

    print('-----------------------------------------------')
    # Boolean masks instead of growing a float array with append():
    # the integer dtype of the codes is kept and there is no per-element scan.
    mdb_codes = codes[~isin(codes, reference_codes)]
    print(f'Missing codes in MDB data {len(mdb_codes)}')

    print('-----------------------------------------------')
    data_codes = reference_codes[~isin(reference_codes, codes)]
    print(f'Missing codes in {name} {len(data_codes)}')

    return [mdb_codes, data_codes]


In [33]:
# Cross-check the 2021 survey ward codes against the Gauteng shapefile ward codes.
# The displayed result is the returned pair: [survey codes not in the shapefile, shapefile codes not in the survey].
compare_to_gauteng_ward_codes(gcro_ward_codes_2021, 'GCRO 2021')

In [34]:
# Cross-check the 2018 survey ward codes against the Gauteng shapefile ward codes.
# The displayed result is the returned pair: [survey codes not in the shapefile, shapefile codes not in the survey].
compare_to_gauteng_ward_codes(gcro_ward_codes_2018, 'GCRO 2018')


In [35]:
# Cross-check the 2015 survey ward codes against the Gauteng shapefile ward codes.
# The displayed result is the returned pair: [survey codes not in the shapefile, shapefile codes not in the survey].
compare_to_gauteng_ward_codes(gcro_ward_codes_2015, 'GCRO 2015')


In [36]:
# Example of problematic code from 2021
# Shapefile side: WardID is compared as a string here.
# NOTE(review): this literal is 748…, while the survey lookup in the next cell uses 742…;
# presumably that difference is the mismatch being illustrated — confirm.
gauteng[gauteng.WardID == '74805001']

In [37]:
# Survey side: rows of the 2021 data whose ward_code equals 74205001 (compared as a number)
df_gcro_2021[df_gcro_2021.ward_code == 74205001]

# New DataFrame: GCRO data clustered by ward code, with each ward's polygon

In [38]:
# Restrict the Gauteng wards to the City of Johannesburg.
# `Municipali` is the column name as it appears in the frame
# (presumably truncated by the shapefile's 10-character field-name limit).
jhb = gauteng.loc[gauteng['Municipali'] == 'City of Johannesburg Metropolitan Municipality']
jhb

In [39]:
# Do just service index
# Mean of the F1servic column for each distinct `ward` value in the 2018 survey.
# NOTE(review): the result is indexed by the raw `ward` values. The 2018 ward-code cell casts this
# column with astype(int), so the values are presumably not stored as int — confirm the key dtype
# before looking entries up with shapefile WardID strings.
service_index_gcro_2018 = df_gcro_2018.groupby('ward')['F1servic'].mean()
service_index_gcro_2018

In [40]:
import csv

path = '../outputs/clustered-data.csv'

# Key the per-ward means by integer ward code. The shapefile WardID values are
# strings, and the membership test below already works on int(ward_id), so the
# lookup must use the same integer key rather than the raw string.
service_index_by_ward = service_index_gcro_2018.copy()
service_index_by_ward.index = service_index_by_ward.index.astype(int)

# newline='' is required by the csv module for files handed to csv.writer;
# without it, platforms that translate '\n' write an extra blank line per row.
with open(path, mode='w', newline='') as clustered_file:
    cluster_writer = csv.writer(clustered_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    # Header row
    cluster_writer.writerow(['ward_id', 'municipality', 'district', 'geometry', 'service_index'])

    # Do only Joburg 2018
    for ward_id in jhb.WardID:
        ward_code = int(ward_id)

        # Skip wards that have no rows in the 2018 GCRO data
        if ward_code not in gcro_ward_codes_2018:
            continue

        # Otherwise write the ward's attributes together with its mean service index
        service_index = service_index_by_ward.loc[ward_code]
        geo_data = jhb[jhb.WardID == ward_id]
        # .item() raises unless exactly one shapefile row matches this ward id
        cluster_writer.writerow([ward_id, geo_data.Municipali.item(), geo_data.District.item(), geo_data.geometry.item(), service_index])

print("ALL DONE")