In [195]:
"""
General Data Science Packages
"""
import numpy as np
import pandas as pd
import geopandas as gpd
# import fiona
# import shapely
# from shapely.geometry import shape

"""
Data Managment Packages
"""
# import time
# import os
import ast

"""
Geocoding Packages
"""
# import geopy as gp
# from geopy.geocoders import Nominatim
# from geopy.extra.rate_limiter import RateLimiter
# from functools import partial

"""
Distance Calculations
"""
from geopy.distance import geodesic
from geopy.distance import great_circle

"""
Check Python Version
"""
!python --version

Python 3.8.3


# 4. Generate Trajectories

## 4.0. Load Data

In [6]:
# function to map over the string elements of the dataframe
def str_eval(e):
    """
    A function to evaluate string element in a dataframe literally, such as the multilocation tuples.
    
    RETURNS: The Python literal. 
    """
    # 01 | Ensure the element is read by ast.literal_eval() as a string to avoid errors.
    e = str(e)
    
    # 02 | Filter out any null values
    if ((e != 'nan') | (e != 'None')):
        
        # 03 | Evaluate the string literally
        try:
            ml = ast.literal_eval(e)
        
        # If unable to read the string, print it so it can be traced back to address issues
        except:
            ml = None
    
    # Return null values as null values
    else:
        ml = None
        
    return ml

In [7]:
survey_ = pd.read_csv('data_raw/raw_survey/210615_raw_survey.csv')
survey = survey_.copy()

# check for any potential row-wise errors
print('\nDuplicate rows?:')
print(survey.duplicated().value_counts())

print('\nUnique SITE_CODE column?:')
print(survey['SITE_CODE'].is_unique)

# duplicate the respondent ID column and set it as the index
survey['ID'] = survey['SITE_CODE']
survey = survey.set_index('ID')

# check if the uniqueness was maintained
print('\nUnique ID index?:')
print(survey.index.is_unique)

survey


Duplicate rows?:
False    1644
dtype: int64

Unique SITE_CODE column?:
True

Unique ID index?:
True


Unnamed: 0_level_0,SITE_CODE,TARGET_LATITUDE,TARGET_LONGITUDE,SITE_LATITUDE,SITE_LONGITUDE,SURVEY DURATION IN MINUTES,DATE_UPLOADED,3 Migrant,4 Gender,4 Other Gender,...,236 Current Residence,Migrant Quota Category,237 Returned,238 Places Returned To,Unnamed: 326,Unnamed: 327,Unnamed: 328,Unnamed: 329,Unnamed: 330,Unnamed: 331
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
276785,276785,-1.21003,36.78920,-1.28892,36.80449,139.96,3/4/21,Yes,Male,-1,...,More than 2 years,Medium-Term Resident,No,-1,,,,,,
276788,276788,-1.21010,36.78923,-1.28872,36.80430,74.59,3/4/21,Yes,Male,-1,...,-1,,-1,-1,,,,,,
276802,276802,-1.21010,36.78923,-1.28884,36.80433,244.90,3/5/21,Yes,Male,-1,...,One year exactly or/More than a year,New Resident,No,-1,,,,,,
276814,276814,-1.20987,36.78899,-1.28885,36.80435,125.87,3/4/21,Yes,Male,-1,...,More than 2 years,Medium-Term Resident,Yes& I Have,Nakuru^1^~Kisumu^2^,,,,,,
276822,276822,-1.20976,36.78888,-1.28873,36.80428,80.18,3/4/21,Yes,Female,-1,...,One year exactly or/More than a year,New Resident,No,-1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297661,297661,-26.18491,28.05547,-26.18380,28.05302,62.78,4/16/21,Yes,Female,-1,...,-1,,-1,-1,,,,,,
297663,297663,-26.18395,28.05459,-26.18328,28.05690,73.30,4/16/21,No,Male,-1,...,More than 2 years,Medium-Term Resident,No,-1,,,,,,
297666,297666,-26.18565,28.05295,-26.18387,28.05308,72.04,4/16/21,No,Male,-1,...,More than 2 years,Medium-Term Resident,Yes& I Have,Yeoville^4^~Bloemfontein^2^,,,,,,
297667,297667,-26.18630,28.05320,-26.18374,28.05300,54.73,4/16/21,No,Male,-1,...,One year exactly or/More than a year,New Resident,Yes& I Have,Sandton^10^,,,,,,


In [51]:
mls_ = pd.read_csv('data_gen/survey_geocoded/survey_multilocations_geocoded.csv')
mls = mls_.copy()

mls = mls_.copy()
mls = mls.set_index('ID')
mls = mls.applymap(str_eval)

print('\nRows, Columns: {}'.format(mls.shape))
mls.head(3)


Rows, Columns: (1644, 132)


Unnamed: 0_level_0,109,109_q,109_ll,127,127_q,127_ll,134,134_q,134_ll,142,...,229y_ll,229z,229z_q,229z_ll,231,231_q,231_ll,238,238_q,238_ll
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
276785,,,,,,,"((Kawango, Kisumu, Kenya), (Kawango, Kisumu, K...","((Kawango, Kisumu, Kenya), (Kawango, Kisumu, K...",,,...,,"((-1, Nairobi, Kenya),)","((nan, Nairobi, Kenya),)",,"((Kawangware, Nairobi, Kenya),)","((Kawangware, Nairobi, Kenya),)","((-1.2784631000000002, 36.751643),)","((-1, Nairobi, Kenya),)","((nan, Nairobi, Kenya),)",
276788,,,,"((N/A, N/A, N/A),)",,,"((N/A, N/A, N/A),)",,,,...,,"((-1, -1, -1),)",,,"((-1, -1, -1),)",,,"((-1, -1, -1),)",,
276802,,,,,,,"((Awendo, Migori, Kenya),)","((Awendo, Migori, Kenya),)","((-0.9079812, 34.53172),)",,...,,"((-1, Nairobi, Kenya),)","((nan, Nairobi, Kenya),)",,"((Roysambu, Nairobi, Kenya),)","((Roysambu, Nairobi, Kenya),)","((-1.2188396000000001, 36.88673920000001),)","((-1, Nairobi, Kenya),)","((nan, Nairobi, Kenya),)",


## 4.1. Generate Trajectories

In [12]:
# function to split
def delimit(string):
    """
    A function that splits a string by its delimiters
    
    RETURNS: Separated strings in an array
    """
    # 01 | split the string into discrete substring entries using delimiter '~'
    sub_s = str(string).split('~')
    
    # 02 | split each substring entry into subarrays with discrete component using delimiter '^'
    a = [str(item).split('^') for item in sub_s]
    
    return a

In [183]:
# for every input id
def gen_trajectories(rid):
    
    # New indices
    inds = []
    for i in range(0, 11):
        n = int(rid) + (i * 0.01)
        inds.append(n)

    # New columns for the trajectory tables
    newcols = \
    ['respondentID',
     'birthyear',
    'age',
    'duration',
    'interval',
    'location_name',
    'location_type',
    'location_latlon',
    'distance_from_origin_location_km',
    'distance_from_previous_location_km',
    'distance_from_properties_owned_km',
    'mother_tongue',
    'n_moves',
    'work_type',
    'employment_status',
    'work_aspiration',
    'financial_support',
    'children',
    'children_since_leaving']

    # Create the blank dataframe
    df = pd.DataFrame(index=inds, columns=newcols)

    # RESPONDENT-WIDE ATTRIBUTES ==========

    # Information across all columns (doesn't change based on location)
    df.loc[:, 'respondentID'] = rid
    df.loc[:, 'birthyear'] = survey.loc[rid, '104 Year of Birth'].item()
    df.loc[:, 'age'] = 2021 - survey.loc[rid, '104 Year of Birth'].item()
    df.loc[:, 'mother_tongue'] = survey.loc[rid, '115 Mother Tongue']
    df.loc[:, 'n_moves'] = survey.loc[rid, '200 Number']
    df.loc[:, 'work_type'] = survey.loc[rid, '307 Kind of Work']
    df.loc[:, 'employment_status'] = survey.loc[rid, '303 Current Employment Status']
    df.loc[:, 'work_aspiration'] = survey.loc[rid, '309 Work Aspirations']
    df.loc[:, 'children'] = survey.loc[rid, '125 Children']

    # Extract and convert the column for 
    try:
        fin_raw = delimit(survey.loc[rid, '311 Frequency and Amount'])[0][4]
        fin_filter = filter(str.isdigit, fin_raw)
        fin_string = "".join(fin_filter)
        fin_num = float(fin_string)
        df.loc[:, 'financial_support'] = fin_num
    except:
        df.loc[:, 'financial_support'] = None

    # Children if not invalid
    try:
        df.loc[:, 'children_since_leaving'] = delimit(survey.loc[rid, '127 Born Since Leaving'])[0][0]
    except:
        df.loc[:, 'children_since_leaving'] = None

    # Start the duration at 0
    dur = 0

    # PREVIOUS LOCATIONS ==========

    # prepare lists to pull from for the first loop
    df.loc[inds[0], 'location_type'] = 'origin'
    df.loc[inds[1], 'location_type'] = '1st migration location'
    df.loc[inds[2], 'location_type'] = '2nd migration location'
    df.loc[inds[3], 'location_type'] = '3rd migration location'
    df.loc[inds[4], 'location_type'] = '4th migration location'

    # prepare lists to pull from for the second loop
    s_cols = ['201 Place and Duration', '202 Departure and Duration', '206 Migration ', '211 Migrated where', '214b Migration']
    ml_cols = ['201', '202', '206', '211', '214b']
    ll_cols = ['201_ll', '202_ll', '206_ll', '211_ll', '214b_ll']

    mls_all = ()

    for ml_col in ml_cols:
        if mls.loc[rid, ml_col] is not None:
            mls_all = mls_all + mls.loc[rid, ml_col]
        else:
            pass

    lls_all = ()

    for ll_col in ll_cols:
        if mls.loc[rid, ll_col] is not None:
            lls_all = lls_all + mls.loc[rid, ll_col]
        else:
            pass

    dur_all = ()
    dur = 0

    for s_col in s_cols:
        if survey.loc[rid, s_col] is not None:
            entries = delimit(survey.loc[rid, s_col])
            durs = [(int(entry[3])*12 + int(entry[4])) for entry in entries if len(entry) >= 5]
            dur_all = dur_all + tuple(durs)
        else:
            pass 

    # for whatever is left to reach five
    for n in range(0, 5): 
        # try to take the data available in the following columns
        try:
            # look up based on lists and place into dataframe
            df.loc[inds[n], 'location_name'] = mls_all[n]
            df.loc[inds[n], 'location_latlon'] = lls_all[n]

            # duration calculations
            ivl = dur_all[n]
            df.loc[inds[n], 'duration'] = dur
            df.loc[inds[n], 'interval'] = ivl
            dur = dur + ivl
        except:
            pass

    # CURRENT LOCATION ==========

    # switch the column based on the identified city
    if survey.loc[rid, '8 Identify City '] == 'Nairobi ':
        N_s = '9 Nairobi Neighbourhoods'
        N_ml = '9N'
        N_ll = '9N_ll'
    elif survey.loc[rid, '8 Identify City '] == 'Johannesburg':
        N_s = '9 Johannesburg Neighbourhoods'
        N_ml = '9J'
        N_ll = '9J_ll'
    elif survey.loc[rid, '8 Identify City '] == 'Accra ':
        N_s = '9 Accra Neighbourhoods '
        N_ml = '9A'
        N_ll = '9A_ll'
    else:
        N_s = 0

    # duration calculation
    list_231 = delimit(survey.loc[rid, '231 Neighbourhoods Lived In'])
    ivls_231 = []

    for l in list_231:
        ivls_231.append(int(l[1]))

    ivl = sum(ivls_231)

    # fill in at locations
    col = '236 Current Residence'
    for n in [5, 6]:
        df.loc[inds[n], 'location_type'] = 'current location'
        if (N_s != 0):
            df.loc[inds[n], 'location_name'] = mls.loc[rid, N_ml][0]
            df.loc[inds[n], 'location_latlon'] = mls.loc[rid, N_ll][0]
        else:
            df.loc[inds[n], 'location_name'] = None
            df.loc[inds[n], 'location_latlon'] = None

    df.loc[inds[5], 'interval'] = 0
    df.loc[inds[5], 'duration'] = dur

    df.loc[inds[6], 'interval'] = ivl
    dur = dur + ivl
    df.loc[inds[6], 'duration'] = dur


    # FUTURE LOCATIONS ==========
    
    df.loc[inds[7], 'location_type'] = 'aspiration 2-6 months'
    df.loc[inds[8], 'location_type'] = 'planned move 2 yrs'
    df.loc[inds[9], 'location_type'] = 'planned move 10 yrs'
    df.loc[inds[10], 'location_type'] = 'planned move retirement'

    ml_fut = ['707', '712', '713', '715']
    ll_fut = ['707_ll', '712_ll', '713_ll', '715_ll']
    ivl_fut = [6, 18, 96, 120]

    for n in range(4):
        ni = 7+n
        try:
            if ((mls.loc[rid, col] != None) & (mls.loc[rid, ll] != None)):
                df.loc[inds[ni], 'location_name'] = mls.loc[rid, ml_fut[n]][0]
                df.loc[inds[ni], 'location_latlon'] = mls.loc[rid, ll_fut[n]][0]
            else:
                pass
        except:
            pass

        df.loc[inds[ni], 'interval'] = ivl_fut[n]
        dur = dur + ivl_fut[n]
        df.loc[inds[ni], 'duration'] = dur

    return df

In [184]:
rids = survey['SITE_CODE'].to_list()

trj_list = []
trj_error = []

for rid in rids:
    try:
        trj_rid = gen_trajectories(rid)
        trj_list.append(trj_rid)
    except:
        trj_error.append(rid)

trj = pd.concat(trj_list, axis=0)

print('Number of error respondentIDs:')
print(len(trj_error))

display(trj)

Number of error respondentIDs:
576


Unnamed: 0,respondentID,birthyear,age,duration,interval,location_name,location_type,location_latlon,distance_from_origin_location_km,distance_from_previous_location_km,distance_from_properties_owned_km,mother_tongue,n_moves,work_type,employment_status,work_aspiration,financial_support,children,children_since_leaving
276785.00,276785,1996,25,0,204,"(Kawango, Kisumu, Kenya)",origin,"(0.0, 31.983333000000002)",,,,Dhluo,2,-3,-3,-3,,No,-1
276785.01,276785,1996,25,204,48,"(Manyatta, Kisumu, Kenya)",1st migration location,"(0.33330859999999995, 34.4813935)",,,,Dhluo,2,-3,-3,-3,,No,-1
276785.02,276785,1996,25,252,51,"(Kawangware, Nairobi, Kenya)",2nd migration location,"(-1.2784631000000002, 36.751643)",,,,Dhluo,2,-3,-3,-3,,No,-1
276785.03,276785,1996,25,,,,3rd migration location,,,,,Dhluo,2,-3,-3,-3,,No,-1
276785.04,276785,1996,25,,,,4th migration location,,,,,Dhluo,2,-3,-3,-3,,No,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297668.06,297668,1991,30,380,16,"(Berea, Johannesburg, South Africa)",current location,"(-26.1822222, 28.0533333)",,,,Xhosa,2,Working full-time (formal/informal sector),Working full-time (formal/informal sector),Hr Specialist,2000,Yes,0
297668.07,297668,1991,30,386,6,,aspiration 2-6 months,,,,,Xhosa,2,Working full-time (formal/informal sector),Working full-time (formal/informal sector),Hr Specialist,2000,Yes,0
297668.08,297668,1991,30,404,18,,planned move 2 yrs,,,,,Xhosa,2,Working full-time (formal/informal sector),Working full-time (formal/informal sector),Hr Specialist,2000,Yes,0
297668.09,297668,1991,30,500,96,,planned move 10 yrs,,,,,Xhosa,2,Working full-time (formal/informal sector),Working full-time (formal/informal sector),Hr Specialist,2000,Yes,0


## 4.2. Calculate Distances

In [221]:
for i in list(trj.index):
    try:
        trj.loc[i, 'lat'] = trj.loc[i, 'location_latlon'][0]
        trj.loc[i, 'lon'] = trj.loc[i, 'location_latlon'][1]
    except:
        pass

print('\nlocation_latlon\n==========')
print(trj['location_latlon'].isna().value_counts())
print('\nlat\n==========')
print(trj['lat'].isna().value_counts())
print('\nlon\n==========')
print(trj['lon'].isna().value_counts())


location_latlon
True     7612
False    4136
Name: location_latlon, dtype: int64

lat
True     7612
False    4136
Name: lat, dtype: int64

lon
True     7612
False    4136
Name: lon, dtype: int64


In [219]:
trj['p_kmGeo'] = None
trj['p_kmGrC'] = None
trj['O_kmGeo'] = None
trj['O_kmGrC'] = None

for r in rids:
    
    lats = trj.loc[trj.respondentID.eq(r), 'lat'].dropna()
    lons = trj.loc[trj.respondentID.eq(r), 'lon'].dropna()
    
    valid = list(lats.index)

    if len(valid) > 0:
        for i in range(len(valid) - 1):
            o = tuple([lats[valid[0]], lons[valid[0]]])
            t1 = tuple([lats[valid[i]], lons[valid[i]]])
            t2 = tuple([lats[valid[i+1]], lons[valid[i+1]]])

            trj.loc[valid[i+1], 'p_kmGeo'] = geodesic(t1, t2).km
            trj.loc[valid[i+1], 'p_kmGrC'] = great_circle(t1, t2).km
            trj.loc[valid[i+1], 'O_kmGeo'] = geodesic(o, t2).km
            trj.loc[valid[i+1], 'O_kmGrC'] = great_circle(o, t2).km

            trj.loc[valid[i+1], 'distance_from_origin_location_km'] = geodesic(o, t2).km
            trj.loc[valid[i+1], 'distance_from_previous_location_km'] = geodesic(t1, t2).km

        trj.loc[valid[0], 'p_kmGeo'] = 0
        trj.loc[valid[0], 'p_kmGrC'] = 0
        trj.loc[valid[0], 'O_kmGeo'] = 0
        trj.loc[valid[0], 'O_kmGrC'] = 0

        trj.loc[valid[0], 'distance_from_origin_location_km'] = 0
        trj.loc[valid[0], 'distance_from_previous_location_km'] = 0
    else:
        pass
    
trj[['distance_from_origin_location_km', 'distance_from_previous_location_km']]

trj.to_csv('data_gen/trajectories/trajectories.csv')

trj

Unnamed: 0,respondentID,birthyear,age,duration,interval,location_name,location_type,location_latlon,distance_from_origin_location_km,distance_from_previous_location_km,...,work_aspiration,financial_support,children,children_since_leaving,p_kmGeo,p_kmGrC,O_kmGeo,O_kmGrC,lat,lon
276785.00,276785,1996,25,0,204,"(Kawango, Kisumu, Kenya)",origin,"(0.0, 31.983333000000002)",0,0,...,-3,,No,-1,0,0,0,0,0.000000,31.983333
276785.01,276785,1996,25,204,48,"(Manyatta, Kisumu, Kenya)",1st migration location,"(0.33330859999999995, 34.4813935)",280.513,280.513,...,-3,,No,-1,280.513,280.232,280.513,280.232,0.333309,34.481394
276785.02,276785,1996,25,252,51,"(Kawangware, Nairobi, Kenya)",2nd migration location,"(-1.2784631000000002, 36.751643)",549.265,309.23,...,-3,,No,-1,309.23,309.577,549.265,548.897,-1.278463,36.751643
276785.03,276785,1996,25,,,,3rd migration location,,,,...,-3,,No,-1,,,,,,
276785.04,276785,1996,25,,,,4th migration location,,,,...,-3,,No,-1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297668.06,297668,1991,30,380,16,"(Berea, Johannesburg, South Africa)",current location,"(-26.1822222, 28.0533333)",1254.65,0,...,Hr Specialist,2000,Yes,0,0,0,1254.65,1255.23,-26.182222,28.053333
297668.07,297668,1991,30,386,6,,aspiration 2-6 months,,,,...,Hr Specialist,2000,Yes,0,,,,,,
297668.08,297668,1991,30,404,18,,planned move 2 yrs,,,,...,Hr Specialist,2000,Yes,0,,,,,,
297668.09,297668,1991,30,500,96,,planned move 10 yrs,,,,...,Hr Specialist,2000,Yes,0,,,,,,
