In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import ast # for converting strings ito tuples

# import seaborn as sns
import matplotlib.pyplot as plt
# import matplotlib.patches as mpatches
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore')

import os

In [2]:
from datetime import datetime

# Creating a datetime object
d = datetime.now()

# Converting a to string using strftime
datenow = d.strftime('%Y%m%d')
timenow = d.strftime("%H%M%S")

print(datenow)
print(timenow)

20240604
124827


# Data

In [3]:
survey_ = pd.read_csv('data_raw/raw_survey/210615_raw_survey.csv')
survey = survey_.copy()

# check for any potential row-wise errors
print('\nDuplicate rows?:')
print(survey.duplicated().value_counts())

print('\nUnique SITE_CODE column?:')
print(survey['SITE_CODE'].is_unique)

# duplicate the respondent ID column and set it as the index
survey['ID'] = survey['SITE_CODE']
survey = survey.set_index('ID')

# check if the uniqueness was maintained
print('\nUnique ID index?:')
print(survey.index.is_unique)

survey


Duplicate rows?:
False    1644
Name: count, dtype: int64

Unique SITE_CODE column?:
True

Unique ID index?:
True


Unnamed: 0_level_0,SITE_CODE,TARGET_LATITUDE,TARGET_LONGITUDE,SITE_LATITUDE,SITE_LONGITUDE,SURVEY DURATION IN MINUTES,DATE_UPLOADED,3 Migrant,4 Gender,4 Other Gender,...,236 Current Residence,Migrant Quota Category,237 Returned,238 Places Returned To,Unnamed: 326,Unnamed: 327,Unnamed: 328,Unnamed: 329,Unnamed: 330,Unnamed: 331
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
276785,276785,-1.21003,36.78920,-1.28892,36.80449,139.96,3/4/21,Yes,Male,-1,...,More than 2 years,Medium-Term Resident,No,-1,,,,,,
276788,276788,-1.21010,36.78923,-1.28872,36.80430,74.59,3/4/21,Yes,Male,-1,...,-1,,-1,-1,,,,,,
276802,276802,-1.21010,36.78923,-1.28884,36.80433,244.90,3/5/21,Yes,Male,-1,...,One year exactly or/More than a year,New Resident,No,-1,,,,,,
276814,276814,-1.20987,36.78899,-1.28885,36.80435,125.87,3/4/21,Yes,Male,-1,...,More than 2 years,Medium-Term Resident,Yes& I Have,Nakuru^1^~Kisumu^2^,,,,,,
276822,276822,-1.20976,36.78888,-1.28873,36.80428,80.18,3/4/21,Yes,Female,-1,...,One year exactly or/More than a year,New Resident,No,-1,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297661,297661,-26.18491,28.05547,-26.18380,28.05302,62.78,4/16/21,Yes,Female,-1,...,-1,,-1,-1,,,,,,
297663,297663,-26.18395,28.05459,-26.18328,28.05690,73.30,4/16/21,No,Male,-1,...,More than 2 years,Medium-Term Resident,No,-1,,,,,,
297666,297666,-26.18565,28.05295,-26.18387,28.05308,72.04,4/16/21,No,Male,-1,...,More than 2 years,Medium-Term Resident,Yes& I Have,Yeoville^4^~Bloemfontein^2^,,,,,,
297667,297667,-26.18630,28.05320,-26.18374,28.05300,54.73,4/16/21,No,Male,-1,...,One year exactly or/More than a year,New Resident,Yes& I Have,Sandton^10^,,,,,,


In [4]:
## COLUMN NAMES
for i,c in enumerate(survey.columns):
    print("{}\t{}".format(i, c))

0	SITE_CODE
1	TARGET_LATITUDE
2	TARGET_LONGITUDE
3	SITE_LATITUDE
4	SITE_LONGITUDE
5	SURVEY DURATION IN MINUTES
6	DATE_UPLOADED
7	3 Migrant
8	4 Gender
9	4 Other Gender
10	5 Quota Check 
11	7 Date
12	8 Identify City 
13	9 Johannesburg Neighbourhoods
14	9 Accra Neighbourhoods 
15	9 Nairobi Neighbourhoods
16	10 Housing Type
17	10 Other Housing
18	100: Village, City or Town
19	100: Village, City or Town Name
20	100b: Neighbourhood
21	101 Nearest Town
22	102 Time to City by Bus
23	103 Country of Origin
24	104 Year of Birth
25	106 Ethnic Group
26	106 Language
27	107 Marital Status
28	108 Number of Partners
29	109 Partner Location
30	111 Education 
31	111 Other
32	112 Additional Training
33	113 Additional Training
34	113 Other Additional Training
35	115 Mother Tongue
36	116 Languages
37	121 Learnt Languages 
38	121 Which Languages
39	123 Languages in Neighbourhood
40	123 Which Languages
41	125 Children
42	125 Number of Children 
43	126 Children by Birth
44	127 Born Since Leaving
45	132 People 

# A. Relations

In [5]:
relation_cols = ['155 Other How Are You Related to Others Staying With You', 
                 '204  Who', 
                 '208b Who',
                 '212 Who']

display( survey[relation_cols] )

relation_s = pd.Series()

for i,c in enumerate(relation_cols):
    relation_s = pd.concat([relation_s, survey[c]])

display(relation_s)

relation_c = pd.DataFrame(relation_s.value_counts(), columns=['count'])

display(relation_c)
relation_c.to_csv('data_cat/to_categorize/To_Categorize_Relations_{}.csv'.format(datenow))

Unnamed: 0_level_0,155 Other How Are You Related to Others Staying With You,204 Who,208b Who,212 Who
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
276785,Sibilings And Cousin,Friend,-1,-1
276788,,-3,-3,-3
276802,-1,-1,-1,-1
276814,I Stay With My Household,Sister And Family Were Living There,-1,-1
276822,,-1,-1,-1
...,...,...,...,...
297661,Family Members,-1,-1,-1
297663,-1,-1,-1,-1
297666,-1,Brother,-1,Friend
297667,-1,Mother~ Father,-1,-1


276785        Sibilings And Cousin
276788                         NaN
276802                          -1
276814    I Stay With My Household
276822                         NaN
                    ...           
297661                          -1
297663                          -1
297666                      Friend
297667                          -1
297668                          -1
Length: 6576, dtype: object

Unnamed: 0,count
-1,5306
Friends,96
Na,72
Brother,61
Friend,49
...,...
Auncle,1
Parents Came Before For Economic Reasons,1
Uncle And Friends,1
Parents Were Staying Here.,1


# B. Reasons

## Prep table for manual categorization

In [6]:
# selected columns from original survey
reason_cols = ['203b Specify Other',
               '205c Specify Other',
               '207b Specify other',
               '210b Specify other',
               '215 Specify other',
               '217 Reason for leaving']

reason_allcols = ['203a Reason for Location', '203b Specify Other',
               '205b Reason for leaving', '205c Specify Other',
               '207a Reason for Choosing Location', '207b Specify other',
               '210a Reason for Leaving', '210b Specify other',
               '215 Reason for choosing location', '215 Specify other',
               '216 Relations', '217 Reason for leaving']

reason_addcols = ['203a Reason for Location',
               '205b Reason for leaving', 
               '207a Reason for Choosing Location',
               '210a Reason for Leaving',
               '215 Reason for choosing location', 
               '216 Relations']

print('\nCOLUMNS USED TO CATEGORIZE REASONS:')
display( survey[reason_allcols] )

print('\nMATCH NUMBER OF ROWS:')
print( "Anticipated rows: \t{}".format(survey[reason_allcols].shape[0] * len(reason_cols)) )

# concatenate oll data together
reason_s = pd.Series()
for i,c in enumerate(reason_cols):
    reason_s = pd.concat([reason_s, survey[c]])

print( "Rows to categorize: \t{}".format(reason_s.shape[0]) )

# unique entries in column to categorize
reasonadd_s = pd.Series()
for i,c in enumerate(reason_addcols):
    reasonadd_s = pd.concat([reasonadd_s, survey[c]])

print( "Rows of existing categories: \t{}".format(reasonadd_s.shape[0]) )
display( pd.DataFrame( reasonadd_s.value_counts() ))

# unique existing categories
print('\nEXISTING CATEGORIES IN SURVEY:')
reasonadd_u = reasonadd_s.unique()
for i,c in enumerate(reasonadd_u):
    print('{}\t{}'.format(i,c))

# dataframe and CSV to make manual categorizations
reason_c = pd.DataFrame(reason_s.value_counts(), columns=['count'])

print('\nGROUP TABLE TO CATEGORIZE REASONS:')
display(reason_c)
reason_c.to_csv('data_cat/to_categorize/To_Categorize_Reasons_{}.csv'.format(datenow))


COLUMNS USED TO CATEGORIZE REASONS:


Unnamed: 0_level_0,203a Reason for Location,203b Specify Other,205b Reason for leaving,205c Specify Other,207a Reason for Choosing Location,207b Specify other,210a Reason for Leaving,210b Specify other,215 Reason for choosing location,215 Specify other,216 Relations,217 Reason for leaving
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
276785,Access to work,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
276788,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3,-3
276802,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
276814,Family reasons (unification / escape),-1,Access to work,-1,Access to work,-1,Education (for self or others),-1,-1,-1,-1,-1
276822,Access to work,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
297661,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
297663,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
297666,Other,So I could familiarise myself with South Africa.,Access to work,-1,Access to work,-1,Needed more space,-1,-1,-1,-1,-1
297667,Family reasons (unification / escape),-1,Family reasons (unification / escape),-1,Education (for self or others),-1,-1,-1,-1,-1,-1,-1



MATCH NUMBER OF ROWS:
Anticipated rows: 	9864
Rows to categorize: 	9864
Rows of existing categories: 	9864


Unnamed: 0,count
-1,8318
Access to work,835
Family reasons (unification / escape),257
Education (for self or others),124
Cost of rent (too high& somewhere else much cheaper),68
Other (specify),65
Other,54
Needed more space,37
Evicted,16
Crime or insecurity,14



EXISTING CATEGORIES IN SURVEY:
0	Access to work
1	-3
2	-1
3	Family reasons (unification / escape)
4	Education (for self or others)
5	Cost of rent (too high& somewhere else much cheaper)
6	Evicted
7	Other
8	Political reason
9	Place no longer available
10	Crime or insecurity
11	Needed more space
12	Cultural reasons (language& to be close to other countrymen)
13	DK/RA
14	Other (specify)
15	Political reasons
16	Legal or Immigration Difficulties (need to escape officialsâ€™ attention)
17	Education (for self or others) 
18	Other 
19	 Family reasons (unification / escape)
20	Yes
21	No

GROUP TABLE TO CATEGORIZE REASONS:


Unnamed: 0,count
-1,9704
-3,6
marriage,5
Business,4
Marriage,3
...,...
moved with the father of my children,1
Went to learn a trade or mastercraft there. That is hairdressing,1
The economy of my country was not favouring me.,1
FOR APPRENTICESHIP,1


In [7]:
pd.DataFrame( survey[reason_allcols].replace(to_replace=['-1', '-3', -1, -3, "DK/RA", "Other (specify)", "Other"], value=np.nan).count() )

Unnamed: 0,0
203a Reason for Location,772
203b Specify Other,51
205b Reason for leaving,246
205c Specify Other,38
207a Reason for Choosing Location,260
207b Specify other,27
210a Reason for Leaving,89
210b Specify other,10
215 Reason for choosing location,22
215 Specify other,3


## Manual edits to data

See Google Drive:
[**02_data > 2023_data_cleaning > 2311_openended_to_categorical**](https://drive.google.com/drive/folders/1P_k3eSk6kZPg1Wjo4deoDWwhzUtF_5xF?usp=drive_link)

## Reintergrate manual edits to data

In [8]:
to_join = pd.read_csv("data_cat/categorized/Categorized_Reasons.csv")
display(to_join)

Unnamed: 0,original_text,edited_text,count,reason_category,reason_theme
0,Education,Education,2,Access to education or training,Occupation
1,FOR APPRENTICESHIP,for apprenticeship,1,Access to education or training,Occupation
2,for education,for education,1,Access to education or training,Occupation
3,For training on carpentry,For training on carpentry,1,Access to education or training,Occupation
4,My brother stayed here and i came for training...,My brother stayed here and i came for training...,1,Access to education or training,Occupation
...,...,...,...,...,...
137,To Visit Someone,To Visit Someone,1,Visitation,Social connections
138,visiting,visiting,1,Visitation,Social connections
139,visiting a friend,visiting a friend,1,Visitation,Social connections
140,-3,-3,6,,


In [9]:
all_cats = to_join['reason_category'].unique()
all_thms = to_join['reason_theme'].unique()

print("\nCATEGORIES")
for i, c in enumerate(all_cats):
    print("{}\t{}".format(i,c))

print("\nTHEMES") 
for i, c in enumerate(all_thms):
    print("{}\t{}".format(i,c))


CATEGORIES
0	Access to education or training
1	Access to work or economic opportunities
2	Asylum, war, crime, or violence
3	Climate
4	Completion of education
5	Divorce or death in family
6	Eviction, rehabilitation, or inadequate housing
7	Exploration, restart, or independence
8	Exploration, restart, or independence 
9	Familial ties, reunification, or inheritance
10	Friends in the area
11	Job change or loss
12	Marriage or moving in with partner
13	New business or expansion
14	New or improved home or property
15	Remain in area
16	Retirement
17	Visitation
18	nan

THEMES
0	Occupation
1	Flight
2	Family
3	Housing
4	New life stage
5	Social connections
6	Tenure
7	nan


In [10]:
d = \
{
    'Work':{'Access to work', 'Access to work or economic opportunities', 'New business or expansion', 'Job change or loss'},
    'Kin':{'Family reasons (unification / escape)',  ' Family reasons (unification / escape)', 'Familial ties, reunification, or inheritance', 'Marriage or moving in with partner', 'Divorce or death in family', 'Friends in the area', 'Cultural reasons (language& to be close to other countrymen)'},
    'Education':{'Education (for self or others)', 'Education (for self or others) ', 'Access to education or training', 'Completion of education'},
    'Housing':{'Cost of rent (too high& somewhere else much cheaper)', 'Needed more space', 'Evicted', 'Place no longer available', 'Eviction, rehabilitation, or inadequate housing', 'New or improved home or property'},
    'Restart':{'Exploration, restart, or independence', 'Visitation', 'Retirement', 'Exploration, restart, or independence '},
    'Safety':{'Crime or insecurity', 'Asylum, war, crime, or violence', 'Political reason', 'Political reasons', 'Climate', 'Legal or Immigration Difficulties (need to escape officialsâ€™ attention)'},
    'Remain':{'Remain in area'},
    '(No Answer)':{'-1', 'DK/RA', '-3'}
}
newdict = {i: k for k, v in d.items() for i in v}
# display(reason_counts)


# New dictionary split out with flipped keys:
# REF: https://stackoverflow.com/questions/55190428/categorize-a-column-using-a-dictionary-key-multiple-values-pair
# REF: https://www.geeksforgeeks.org/using-dictionary-to-remap-values-in-pandas-dataframe-columns/
df = to_join

# 1 - Categories included in theme

df['reason_theme_old'] = df['reason_theme']
df['reason_theme'] = df['reason_category']
df = df.replace({"reason_theme": newdict})

display(df)

Unnamed: 0,original_text,edited_text,count,reason_category,reason_theme,reason_theme_old
0,Education,Education,2,Access to education or training,Education,Occupation
1,FOR APPRENTICESHIP,for apprenticeship,1,Access to education or training,Education,Occupation
2,for education,for education,1,Access to education or training,Education,Occupation
3,For training on carpentry,For training on carpentry,1,Access to education or training,Education,Occupation
4,My brother stayed here and i came for training...,My brother stayed here and i came for training...,1,Access to education or training,Education,Occupation
...,...,...,...,...,...,...
137,To Visit Someone,To Visit Someone,1,Visitation,Restart,Social connections
138,visiting,visiting,1,Visitation,Restart,Social connections
139,visiting a friend,visiting a friend,1,Visitation,Restart,Social connections
140,-3,-3,6,,,


## Join editted aspirations back to respondents

In [11]:
df = survey[(['SITE_CODE', '8 Identify City ']+reason_allcols)]

# d = \
# {
#     'Work':{'Access to work', 'Access to work or economic opportunities', 'New business or expansion', 'Job change or loss'},
#     'Kin':{'Family reasons (unification / escape)',  ' Family reasons (unification / escape)', 'Familial ties, reunification, or inheritance', 'Marriage or moving in with partner', 'Divorce or death in family', 'Friends in the area', 'Cultural reasons (language& to be close to other countrymen)'},
#     'Education':{'Education (for self or others)', 'Education (for self or others) ', 'Access to education or training', 'Completion of education'},
#     'Housing':{'Cost of rent (too high& somewhere else much cheaper)', 'Needed more space', 'Evicted', 'Place no longer available', 'Eviction, rehabilitation, or inadequate housing', 'New or improved home or property'},
#     'Restart':{'Exploration, restart, or independence', 'Visitation', 'Retirement', 'Exploration, restart, or independence '},
#     'Asylum':{'Crime or insecurity', 'Asylum, war, crime, or violence', 'Political reason', 'Political reasons', 'Climate', 'Legal or Immigration Difficulties (need to escape officialsâ€™ attention)'},
#     'Remain':{'Remain in area'},
#     '(No Answer)':{'-1', 'DK/RA', '-3'}
# }

d = \
{
    'Work':{'Access to work', 'Access to work or economic opportunities', 'New business or expansion', 'Job change or loss'},
    'Kin':{'Family reasons (unification / escape)',  ' Family reasons (unification / escape)', 'Familial ties, reunification, or inheritance', 'Marriage or moving in with partner', 'Divorce or death in family', 'Friends in the area', 'Cultural reasons (language& to be close to other countrymen)'},
    'Education':{'Education (for self or others)', 'Education (for self or others) ', 'Access to education or training', 'Completion of education'},
    'Housing':{'Cost of rent (too high& somewhere else much cheaper)', 'Needed more space', 'Evicted', 'Place no longer available', 'Eviction, rehabilitation, or inadequate housing', 'New or improved home or property'},
    'Restart':{'Exploration, restart, or independence', 'Visitation', 'Retirement', 'Exploration, restart, or independence '},
    'Safety':{'Crime or insecurity', 'Asylum, war, crime, or violence', 'Political reason', 'Political reasons', 'Climate', 'Legal or Immigration Difficulties (need to escape officialsâ€™ attention)'},
    'Remain':{'Remain in area'},
    '(No Answer)':{'-1', 'DK/RA', '-3'}
}

newdict = {i: k for k, v in d.items() for i in v}

# empty lists
col_list = []
edt_list = []
cat_list = []
gen_list = []
thm_list = []
fin_list = []
add_list = []
short_list = []

for i,c in enumerate(reason_cols):
    # merge new data onto old data
    df =  pd.merge( df, to_join[['original_text', 'edited_text', 'reason_category', 'reason_theme']], how='left', right_on='original_text', left_on=c)
    # preserve index
    df['index'] = df['SITE_CODE']
    df = df.set_index('index')

    # get number from column name
    n = c.split(" ")[0]
    # append new columns names together
    c1 = '{}_orig'.format(n)
    c2 = '{}_edt_reason'.format(n)
    c3 = '{}_cat_reason'.format(n)
    c4 = '{}_gen_reason'.format(n)
    # clean up merged column names
    df = df.drop(columns='original_text')
    df = df.rename(columns={'edited_text':c2, 
                            'reason_category':c3, 
                            'reason_theme':c4})
    
    # new column replacing 'Other' category in old column
    ac = reason_addcols[i]
    df[n] = df.apply(lambda x: x[c3] if (x[ac]=='Other'
                                         or x[ac]=='Other ' 
                                         or x[ac]=='Other (specify)' 
                                         or x[ac]=='Yes' 
                                         or x[ac]=='No')
                     else x[ac], axis=1)
    
    # add broad themes
    ct = '{}_theme'.format(n)
    df[ct] = df[n]
    df = df.replace({ct: newdict})
    
    # add new column names to list
    col_list = col_list + [ac, c, c2, c3, c4, ct, n]
    add_list = add_list + [ac]
    edt_list = edt_list + [c2]
    cat_list = cat_list + [c3]
    gen_list = gen_list + [c4]
    thm_list = thm_list + [ct]
    fin_list = fin_list + [n]
    short_list = short_list + [n, ct]
    
    # print diagnostic
    print("{}\t{}\t{}\t{}\t{}".format(c, c2, c3, c4, ct))


reason_df = df[(['SITE_CODE', '8 Identify City ']+short_list)]
display(reason_df)
reason_df.to_csv( "data_gen/reasons/Reasons_JoinedtoRespondents_Small.csv".format(datenow) )    
    
reason_df = df[(['SITE_CODE', '8 Identify City ']+col_list)]
display(reason_df)
reason_df.to_csv( "data_gen/reasons/Reasons_JoinedtoRespondents.csv".format(datenow) )

203b Specify Other	203b_edt_reason	203b_cat_reason	203b_gen_reason	203b_theme
205c Specify Other	205c_edt_reason	205c_cat_reason	205c_gen_reason	205c_theme
207b Specify other	207b_edt_reason	207b_cat_reason	207b_gen_reason	207b_theme
210b Specify other	210b_edt_reason	210b_cat_reason	210b_gen_reason	210b_theme
215 Specify other	215_edt_reason	215_cat_reason	215_gen_reason	215_theme
217 Reason for leaving	217_edt_reason	217_cat_reason	217_gen_reason	217_theme


Unnamed: 0_level_0,SITE_CODE,8 Identify City,203b,203b_theme,205c,205c_theme,207b,207b_theme,210b,210b_theme,215,215_theme,217,217_theme
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
276785,276785,Nairobi,Access to work,Work,-1,(No Answer),-1,(No Answer),-1,(No Answer),-1,(No Answer),-1,(No Answer)
276788,276788,Nairobi,-3,(No Answer),-3,(No Answer),-3,(No Answer),-3,(No Answer),-3,(No Answer),-3,(No Answer)
276802,276802,Nairobi,-1,(No Answer),-1,(No Answer),-1,(No Answer),-1,(No Answer),-1,(No Answer),-1,(No Answer)
276814,276814,Nairobi,Family reasons (unification / escape),Kin,Access to work,Work,Access to work,Work,Education (for self or others),Education,-1,(No Answer),-1,(No Answer)
276822,276822,Nairobi,Access to work,Work,-1,(No Answer),-1,(No Answer),-1,(No Answer),-1,(No Answer),-1,(No Answer)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297661,297661,Johannesburg,-1,(No Answer),-1,(No Answer),-1,(No Answer),-1,(No Answer),-1,(No Answer),-1,(No Answer)
297663,297663,Johannesburg,-1,(No Answer),-1,(No Answer),-1,(No Answer),-1,(No Answer),-1,(No Answer),-1,(No Answer)
297666,297666,Johannesburg,"Exploration, restart, or independence",Restart,Access to work,Work,Access to work,Work,Needed more space,Housing,-1,(No Answer),-1,(No Answer)
297667,297667,Johannesburg,Family reasons (unification / escape),Kin,Family reasons (unification / escape),Kin,Education (for self or others),Education,-1,(No Answer),-1,(No Answer),-1,(No Answer)


Unnamed: 0_level_0,SITE_CODE,8 Identify City,203a Reason for Location,203b Specify Other,203b_edt_reason,203b_cat_reason,203b_gen_reason,203b_theme,203b,205b Reason for leaving,...,215_gen_reason,215_theme,215,216 Relations,217 Reason for leaving,217_edt_reason,217_cat_reason,217_gen_reason,217_theme,217
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
276785,276785,Nairobi,Access to work,-1,-1,,,Work,Access to work,-1,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
276788,276788,Nairobi,-3,-3,-3,,,(No Answer),-3,-3,...,,(No Answer),-3,-3,-3,-3,,,(No Answer),-3
276802,276802,Nairobi,-1,-1,-1,,,(No Answer),-1,-1,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
276814,276814,Nairobi,Family reasons (unification / escape),-1,-1,,,Kin,Family reasons (unification / escape),Access to work,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
276822,276822,Nairobi,Access to work,-1,-1,,,Work,Access to work,-1,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297661,297661,Johannesburg,-1,-1,-1,,,(No Answer),-1,-1,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
297663,297663,Johannesburg,-1,-1,-1,,,(No Answer),-1,-1,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
297666,297666,Johannesburg,Other,So I could familiarise myself with South Africa.,So I could familiarise myself with South Africa.,"Exploration, restart, or independence","Exploration, restart, or independence",Restart,"Exploration, restart, or independence",Access to work,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
297667,297667,Johannesburg,Family reasons (unification / escape),-1,-1,,,Kin,Family reasons (unification / escape),Family reasons (unification / escape),...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1


## Reasons Subcategories Counts

In [12]:
# 1 - Set-up dataframe structure

reason_s = pd.Series()

# subset dataframe with just the cities
newcols = ['city', 'reason']
reason_cities = pd.DataFrame(columns=newcols)

# 2 - Concatenate All Reasons and Reasons Per City
for i,c in enumerate(fin_list):
    reason_s = pd.concat([reason_s, reason_df[c]])
    reason_cities = pd.concat([reason_cities, reason_df[['8 Identify City ', c]].rename(columns={'8 Identify City ':newcols[0], c:newcols[1]})])
print("\nCONCATENATED:")
display( reason_s.value_counts() )
print("\nCONCATENATED WITH CITY")
display( reason_cities )

# 3 - Value Counts for All Reasons and Reasons per City
reason_counts = pd.DataFrame( reason_s.value_counts() ).rename(columns={'count':'total'})
display( reason_counts )

reason_city_counts = reason_cities.reset_index().rename(columns={'index':'total'}).groupby(newcols).count().reset_index()
display( reason_city_counts )


CONCATENATED:


-1                                                                           8318
Access to work                                                                835
Family reasons (unification / escape)                                         257
Education (for self or others)                                                124
Cost of rent (too high& somewhere else much cheaper)                           68
Needed more space                                                              37
Access to work or economic opportunities                                       19
New business or expansion                                                      17
Evicted                                                                        16
Exploration, restart, or independence                                          15
Marriage or moving in with partner                                             14
Crime or insecurity                                                            14
Job change or lo


CONCATENATED WITH CITY


Unnamed: 0,city,reason
276785,Nairobi,Access to work
276788,Nairobi,-3
276802,Nairobi,-1
276814,Nairobi,Family reasons (unification / escape)
276822,Nairobi,Access to work
...,...,...
297661,Johannesburg,-1
297663,Johannesburg,-1
297666,Johannesburg,-1
297667,Johannesburg,-1


Unnamed: 0,total
-1,8318
Access to work,835
Family reasons (unification / escape),257
Education (for self or others),124
Cost of rent (too high& somewhere else much cheaper),68
Needed more space,37
Access to work or economic opportunities,19
New business or expansion,17
Evicted,16
"Exploration, restart, or independence",15


Unnamed: 0,city,reason,total
0,Accra,Family reasons (unification / escape),2
1,Accra,-1,2849
2,Accra,Access to education or training,6
3,Accra,Access to work,369
4,Accra,Access to work or economic opportunities,6
...,...,...,...
80,Nairobi,Place no longer available,1
81,Nairobi,Political reason,2
82,Nairobi,Political reasons,3
83,Nairobi,Remain in area,2


## Reasons Subcategories Tables

In [13]:
# 1 - Set-up dataframe structure

reason_s = pd.Series()

# subset dataframe with just the cities
newcols = ['city', 'reason']
reason_cities = pd.DataFrame(columns=newcols)

# 2 - Concatenate All Reasons and Reasons Per City
for i,c in enumerate(fin_list):
    reason_s = pd.concat([reason_s, reason_df[c]])
    reason_cities = pd.concat([reason_cities, reason_df[['8 Identify City ', c]].rename(columns={'8 Identify City ':newcols[0], c:newcols[1]})])
# display( reason_s.value_counts() )
# display( reason_cities )

# 3 - Value Counts for All Reasons and Reasons per Cityu
reason_counts = pd.DataFrame( reason_s.value_counts() ).rename(columns={'count':'total'})
# display( reason_counts )

reason_city_counts = reason_cities.reset_index().rename(columns={'index':'total'}).groupby(newcols).count().reset_index()
# display( reason_city_counts )

# each column
for i, n in enumerate(fin_list):
    # merge value counts on index
    ctcol = reason_df.rename(columns={n:'reason'})['reason']
    ct = pd.DataFrame(ctcol.value_counts()).rename(columns={'count':str(n)})
    reason_counts = pd.merge(reason_counts, ct, how='left', left_index=True, right_index=True)
    
    # merge groupings per column
    city_ct = reason_df[['8 Identify City ', n, 'SITE_CODE']].groupby(['8 Identify City ', n]).count().reset_index().rename(columns={'8 Identify City ':newcols[0], n:newcols[1], 'SITE_CODE':n})
    reason_city_counts = pd.merge(reason_city_counts, city_ct, how='left', on=newcols)
# reset index
reason_counts = reason_counts.reset_index().rename(columns={'index':'reason'})

# add source (ACMS/CSR)
reasonadd_u = reasonadd_s.unique()
reason_counts['source'] = np.where(reason_counts['reason'].isin(reasonadd_u), 'ACMS', 'CSR')
reason_city_counts['source'] = np.where(reason_city_counts['reason'].isin(reasonadd_u), 'ACMS', 'CSR')

display(reason_counts)
display(reason_city_counts)
display(reason_df)

display(ct.rename(columns={str(n):'reason'}))

Unnamed: 0,reason,total,203b,205c,207b,210b,215,217,source
0,-1,8318,819.0,1355.0,1355.0,1553.0,1618.0,1618.0,ACMS
1,Access to work,835,449.0,146.0,184.0,40.0,16.0,,ACMS
2,Family reasons (unification / escape),257,161.0,45.0,38.0,13.0,,,ACMS
3,Education (for self or others),124,84.0,20.0,18.0,,2.0,,ACMS
4,Cost of rent (too high& somewhere else much ch...,68,32.0,14.0,12.0,9.0,1.0,,ACMS
5,Needed more space,37,20.0,9.0,4.0,4.0,,,ACMS
6,Access to work or economic opportunities,19,7.0,2.0,2.0,2.0,,6.0,CSR
7,New business or expansion,17,6.0,3.0,4.0,2.0,1.0,1.0,CSR
8,Evicted,16,8.0,3.0,2.0,3.0,,,ACMS
9,"Exploration, restart, or independence",15,3.0,6.0,4.0,1.0,,1.0,CSR


Unnamed: 0,city,reason,total,203b,205c,207b,210b,215,217,source
0,Accra,Family reasons (unification / escape),2,,,,,2.0,,ACMS
1,Accra,-1,2849,293.0,442.0,442.0,536.0,568.0,568.0,ACMS
2,Accra,Access to education or training,6,3.0,1.0,1.0,,,1.0,CSR
3,Accra,Access to work,369,162.0,73.0,102.0,23.0,9.0,,ACMS
4,Accra,Access to work or economic opportunities,6,1.0,1.0,1.0,1.0,,2.0,CSR
...,...,...,...,...,...,...,...,...,...,...
80,Nairobi,Place no longer available,1,1.0,,,,,,ACMS
81,Nairobi,Political reason,2,2.0,,,,,,ACMS
82,Nairobi,Political reasons,3,,1.0,1.0,1.0,,,ACMS
83,Nairobi,Remain in area,2,,,,,,2.0,CSR


Unnamed: 0_level_0,SITE_CODE,8 Identify City,203a Reason for Location,203b Specify Other,203b_edt_reason,203b_cat_reason,203b_gen_reason,203b_theme,203b,205b Reason for leaving,...,215_gen_reason,215_theme,215,216 Relations,217 Reason for leaving,217_edt_reason,217_cat_reason,217_gen_reason,217_theme,217
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
276785,276785,Nairobi,Access to work,-1,-1,,,Work,Access to work,-1,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
276788,276788,Nairobi,-3,-3,-3,,,(No Answer),-3,-3,...,,(No Answer),-3,-3,-3,-3,,,(No Answer),-3
276802,276802,Nairobi,-1,-1,-1,,,(No Answer),-1,-1,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
276814,276814,Nairobi,Family reasons (unification / escape),-1,-1,,,Kin,Family reasons (unification / escape),Access to work,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
276822,276822,Nairobi,Access to work,-1,-1,,,Work,Access to work,-1,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297661,297661,Johannesburg,-1,-1,-1,,,(No Answer),-1,-1,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
297663,297663,Johannesburg,-1,-1,-1,,,(No Answer),-1,-1,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
297666,297666,Johannesburg,Other,So I could familiarise myself with South Africa.,So I could familiarise myself with South Africa.,"Exploration, restart, or independence","Exploration, restart, or independence",Restart,"Exploration, restart, or independence",Access to work,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
297667,297667,Johannesburg,Family reasons (unification / escape),-1,-1,,,Kin,Family reasons (unification / escape),Family reasons (unification / escape),...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1


Unnamed: 0_level_0,reason
reason,Unnamed: 1_level_1
-1,1618
Access to work or economic opportunities,6
Remain in area,4
Job change or loss,3
Access to education or training,2
-3,1
New or improved home or property,1
"Eviction, rehabilitation, or inadequate housing",1
New business or expansion,1
Climate,1


## Reasons Theme Counts

In [14]:
d = \
{
    'Work':{'Access to work', 'Access to work or economic opportunities', 'New business or expansion', 'Job change or loss'},
    'Kin':{'Family reasons (unification / escape)',  ' Family reasons (unification / escape)', 'Familial ties, reunification, or inheritance', 'Marriage or moving in with partner', 'Divorce or death in family', 'Friends in the area', 'Cultural reasons (language& to be close to other countrymen)'},
    'Education':{'Education (for self or others)', 'Education (for self or others) ', 'Access to education or training', 'Completion of education'},
    'Housing':{'Cost of rent (too high& somewhere else much cheaper)', 'Needed more space', 'Evicted', 'Place no longer available', 'Eviction, rehabilitation, or inadequate housing', 'New or improved home or property'},
    'Restart':{'Exploration, restart, or independence', 'Visitation', 'Retirement', 'Exploration, restart, or independence '},
    'Safety':{'Crime or insecurity', 'Asylum, war, crime, or violence', 'Political reason', 'Political reasons', 'Climate', 'Legal or Immigration Difficulties (need to escape officialsâ€™ attention)'},
    'Remain':{'Remain in area'},
    '(No Answer)':{'-1', 'DK/RA', '-3'}
}
# display(reason_counts)


# New dictionary split out with flipped keys:
# REF: https://stackoverflow.com/questions/55190428/categorize-a-column-using-a-dictionary-key-multiple-values-pair
# REF: https://www.geeksforgeeks.org/using-dictionary-to-remap-values-in-pandas-dataframe-columns/
df = reason_counts
df_city = reason_city_counts
newdict = {i: k for k, v in d.items() for i in v}

folder = "data_gen/reasons/"

"""
1
CATEGORIES
"""

df['theme'] = df['reason']
df = df.replace({"theme": newdict})
df_city['theme'] = df_city['reason']
df_city = df_city.replace({"theme": newdict})

display(df)
df.to_csv( os.path.join(folder, "ReasonSubcategories_Counts.csv".format(datenow)) )
display(df_city)
df_city.to_csv( os.path.join(folder, "ReasonSubcategories_Counts_ByCity.csv".format(datenow)) )

"""
2
THEME COUNTS
"""

reason_themes = df.drop(columns=['reason', 'source']).groupby('theme').sum()
reason_city_themes = df_city.drop(columns=['reason', 'source']).groupby([newcols[0], 'theme']).sum()

display(reason_themes)
reason_themes.to_csv( os.path.join(folder, "ReasonsTheme_Counts.csv".format(datenow)) )
display(reason_city_themes)
reason_city_themes.to_csv( os.path.join(folder, "ReasonsTheme_Counts_ByCity.csv".format(datenow)) )

"""
3
THEME PERCENTS
"""

reason_themepcts = reason_themes.drop(index=['(No Answer)'])
for i,c in enumerate(reason_themepcts.columns):
    reason_themepcts[c] = reason_themepcts[c] / reason_themepcts[c].sum()

reason_city_themepcts = reason_city_themes.copy().drop(index=['(No Answer)'], level=1)
for i,mi in enumerate(reason_city_themepcts.index):
    for i,c in enumerate(reason_city_themepcts.columns):
        mi0 = mi[0]
        reason_city_themepcts.loc[mi, c] = reason_city_themepcts.loc[mi, c] / reason_city_themepcts.query("city == @mi0")[c].sum()

display(reason_themepcts)
reason_themepcts.to_csv( os.path.join(folder, "ReasonTheme_Percents.csv".format(datenow)) )

display(reason_city_themepcts)
reason_themepcts.to_csv(  os.path.join(folder, "ReasonsTheme_Percents_ByCity.csv".format(datenow)) )

Unnamed: 0,reason,total,203b,205c,207b,210b,215,217,source,theme
0,-1,8318,819.0,1355.0,1355.0,1553.0,1618.0,1618.0,ACMS,(No Answer)
1,Access to work,835,449.0,146.0,184.0,40.0,16.0,,ACMS,Work
2,Family reasons (unification / escape),257,161.0,45.0,38.0,13.0,,,ACMS,Kin
3,Education (for self or others),124,84.0,20.0,18.0,,2.0,,ACMS,Education
4,Cost of rent (too high& somewhere else much ch...,68,32.0,14.0,12.0,9.0,1.0,,ACMS,Housing
5,Needed more space,37,20.0,9.0,4.0,4.0,,,ACMS,Housing
6,Access to work or economic opportunities,19,7.0,2.0,2.0,2.0,,6.0,CSR,Work
7,New business or expansion,17,6.0,3.0,4.0,2.0,1.0,1.0,CSR,Work
8,Evicted,16,8.0,3.0,2.0,3.0,,,ACMS,Housing
9,"Exploration, restart, or independence",15,3.0,6.0,4.0,1.0,,1.0,CSR,Restart


Unnamed: 0,city,reason,total,203b,205c,207b,210b,215,217,source,theme
0,Accra,Family reasons (unification / escape),2,,,,,2.0,,ACMS,Kin
1,Accra,-1,2849,293.0,442.0,442.0,536.0,568.0,568.0,ACMS,(No Answer)
2,Accra,Access to education or training,6,3.0,1.0,1.0,,,1.0,CSR,Education
3,Accra,Access to work,369,162.0,73.0,102.0,23.0,9.0,,ACMS,Work
4,Accra,Access to work or economic opportunities,6,1.0,1.0,1.0,1.0,,2.0,CSR,Work
...,...,...,...,...,...,...,...,...,...,...,...
80,Nairobi,Place no longer available,1,1.0,,,,,,ACMS,Housing
81,Nairobi,Political reason,2,2.0,,,,,,ACMS,Safety
82,Nairobi,Political reasons,3,,1.0,1.0,1.0,,,ACMS,Safety
83,Nairobi,Remain in area,2,,,,,,2.0,CSR,Remain


Unnamed: 0_level_0,total,203b,205c,207b,210b,215,217
theme,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
(No Answer),8331,821.0,1360.0,1357.0,1555.0,1619.0,1619.0
Education,148,90.0,27.0,19.0,8.0,2.0,2.0
Housing,139,66.0,32.0,21.0,16.0,2.0,2.0
Kin,298,176.0,55.0,48.0,14.0,4.0,1.0
Remain,4,0.0,0.0,0.0,0.0,0.0,4.0
Restart,26,8.0,9.0,6.0,1.0,0.0,2.0
Safety,32,19.0,7.0,3.0,2.0,0.0,1.0
Work,883,464.0,154.0,190.0,48.0,17.0,10.0


Unnamed: 0_level_0,Unnamed: 1_level_0,total,203b,205c,207b,210b,215,217
city,theme,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Accra,(No Answer),2854,294.0,445.0,442.0,537.0,568.0,568.0
Accra,Education,61,40.0,11.0,6.0,2.0,1.0,1.0
Accra,Housing,38,6.0,17.0,5.0,8.0,0.0,2.0
Accra,Kin,138,74.0,28.0,23.0,9.0,3.0,1.0
Accra,Remain,1,0.0,0.0,0.0,0.0,0.0,1.0
Accra,Restart,7,2.0,2.0,2.0,0.0,0.0,1.0
Accra,Safety,7,2.0,3.0,0.0,1.0,0.0,1.0
Accra,Work,377,163.0,75.0,103.0,24.0,9.0,3.0
Johannesburg,(No Answer),2721,263.0,469.0,468.0,499.0,511.0,511.0
Johannesburg,Education,15,9.0,2.0,3.0,1.0,0.0,0.0


Unnamed: 0_level_0,total,203b,205c,207b,210b,215,217
theme,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Education,0.096732,0.109356,0.09507,0.066202,0.089888,0.08,0.090909
Housing,0.09085,0.080194,0.112676,0.073171,0.179775,0.08,0.090909
Kin,0.194771,0.213852,0.193662,0.167247,0.157303,0.16,0.045455
Remain,0.002614,0.0,0.0,0.0,0.0,0.0,0.181818
Restart,0.016993,0.009721,0.03169,0.020906,0.011236,0.0,0.090909
Safety,0.020915,0.023086,0.024648,0.010453,0.022472,0.0,0.045455
Work,0.577124,0.563791,0.542254,0.662021,0.539326,0.68,0.454545


Unnamed: 0_level_0,Unnamed: 1_level_0,total,203b,205c,207b,210b,215,217
city,theme,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Accra,Education,0.096979,0.139373,0.080882,0.043165,0.045455,0.076923,0.1
Accra,Housing,0.06689,0.024278,0.135912,0.037582,0.19027,0.0,0.21978
Accra,Kin,0.260297,0.306846,0.25874,0.179574,0.262883,0.248408,0.136616
Accra,Remain,0.002548,0.0,0.0,0.0,0.0,0.0,0.154885
Accra,Restart,0.017883,0.011942,0.024852,0.019001,0.0,0.0,0.178212
Accra,Safety,0.018208,0.012086,0.038216,0.0,0.039218,0.0,0.20879
Accra,Work,0.998774,0.996975,0.99287,0.997295,0.978082,0.965113,0.750322
Johannesburg,Education,0.04065,0.035714,0.043478,0.06383,0.0625,0.0,0.0
Johannesburg,Housing,0.121455,0.10698,0.158934,0.113472,0.26556,0.25,0.0
Johannesburg,Kin,0.292452,0.280921,0.43008,0.25525,0.264829,0.307692,0.0


## Reasons Theme Tables

In [15]:
print(cat_list)

print("\nPREVIEW:")
df = reason_df.copy()
display(df[fin_list])

print("\nORIGINAL:")
display(reason_df)

['203b_cat_reason', '205c_cat_reason', '207b_cat_reason', '210b_cat_reason', '215_cat_reason', '217_cat_reason']

PREVIEW:


Unnamed: 0_level_0,203b,205c,207b,210b,215,217
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
276785,Access to work,-1,-1,-1,-1,-1
276788,-3,-3,-3,-3,-3,-3
276802,-1,-1,-1,-1,-1,-1
276814,Family reasons (unification / escape),Access to work,Access to work,Education (for self or others),-1,-1
276822,Access to work,-1,-1,-1,-1,-1
...,...,...,...,...,...,...
297661,-1,-1,-1,-1,-1,-1
297663,-1,-1,-1,-1,-1,-1
297666,"Exploration, restart, or independence",Access to work,Access to work,Needed more space,-1,-1
297667,Family reasons (unification / escape),Family reasons (unification / escape),Education (for self or others),-1,-1,-1



ORIGINAL:


Unnamed: 0_level_0,SITE_CODE,8 Identify City,203a Reason for Location,203b Specify Other,203b_edt_reason,203b_cat_reason,203b_gen_reason,203b_theme,203b,205b Reason for leaving,...,215_gen_reason,215_theme,215,216 Relations,217 Reason for leaving,217_edt_reason,217_cat_reason,217_gen_reason,217_theme,217
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
276785,276785,Nairobi,Access to work,-1,-1,,,Work,Access to work,-1,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
276788,276788,Nairobi,-3,-3,-3,,,(No Answer),-3,-3,...,,(No Answer),-3,-3,-3,-3,,,(No Answer),-3
276802,276802,Nairobi,-1,-1,-1,,,(No Answer),-1,-1,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
276814,276814,Nairobi,Family reasons (unification / escape),-1,-1,,,Kin,Family reasons (unification / escape),Access to work,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
276822,276822,Nairobi,Access to work,-1,-1,,,Work,Access to work,-1,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297661,297661,Johannesburg,-1,-1,-1,,,(No Answer),-1,-1,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
297663,297663,Johannesburg,-1,-1,-1,,,(No Answer),-1,-1,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
297666,297666,Johannesburg,Other,So I could familiarise myself with South Africa.,So I could familiarise myself with South Africa.,"Exploration, restart, or independence","Exploration, restart, or independence",Restart,"Exploration, restart, or independence",Access to work,...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1
297667,297667,Johannesburg,Family reasons (unification / escape),-1,-1,,,Kin,Family reasons (unification / escape),Family reasons (unification / escape),...,,(No Answer),-1,-1,-1,-1,,,(No Answer),-1


In [16]:
df = reason_df.copy()

themecolnames = []

for c in fin_list:
    colname = str(c) + '_theme'
    themecolnames.append(colname)
    df[colname] = df[c]
    df = df.replace({colname: newdict})


df = df[themecolnames].dropna(how='all').fillna('(No Answer)')
df['count'] = 1
df

df.to_csv(  os.path.join(folder, "ReasonsTheme_ByRespondent.csv".format(datenow)) )

## Compare Reasons to Leave and Locate

In [17]:
df[['203b_theme', '205c_theme', 'count']].groupby(['203b_theme', '205c_theme']).count().reset_index()

Unnamed: 0,203b_theme,205c_theme,count
0,(No Answer),(No Answer),820
1,(No Answer),Kin,1
2,Education,(No Answer),35
3,Education,Education,22
4,Education,Housing,2
5,Education,Kin,7
6,Education,Restart,1
7,Education,Safety,1
8,Education,Work,22
9,Housing,(No Answer),54


## Reasons Theme Counts By City

In [18]:
## ORIGINAL TABLE
display(reason_city_themes)

## 1 - TEST SELECTING WITH MULTI-INDEX
# REF: https://stackoverflow.com/questions/18835077/selecting-from-multi-index-pandas
print(reason_city_themes.index)
mi = (      'Accra ',   'Education')
c = '203b'
reason_city_themes.loc[mi, c]

## 2 - TABLES BY CITY
mi0 = 'Accra '
display(reason_city_themes.query("city == @mi0")[c])
display(reason_city_themes.query("city == 'Accra '")[c])
print(reason_city_themes.query("city == 'Accra '")[c].sum())
display(reason_city_themes.query("city == 'Nairobi '")[c])
print(reason_city_themes.query("city == 'Nairobi '")[c].sum())
display(reason_city_themes.query("city == 'Johannesburg'")[c])
print(reason_city_themes.query("city == 'Johannesburg'")[c].sum())

Unnamed: 0_level_0,Unnamed: 1_level_0,total,203b,205c,207b,210b,215,217
city,theme,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Accra,(No Answer),2854,294.0,445.0,442.0,537.0,568.0,568.0
Accra,Education,61,40.0,11.0,6.0,2.0,1.0,1.0
Accra,Housing,38,6.0,17.0,5.0,8.0,0.0,2.0
Accra,Kin,138,74.0,28.0,23.0,9.0,3.0,1.0
Accra,Remain,1,0.0,0.0,0.0,0.0,0.0,1.0
Accra,Restart,7,2.0,2.0,2.0,0.0,0.0,1.0
Accra,Safety,7,2.0,3.0,0.0,1.0,0.0,1.0
Accra,Work,377,163.0,75.0,103.0,24.0,9.0,3.0
Johannesburg,(No Answer),2721,263.0,469.0,468.0,499.0,511.0,511.0
Johannesburg,Education,15,9.0,2.0,3.0,1.0,0.0,0.0


MultiIndex([(      'Accra ', '(No Answer)'),
            (      'Accra ',   'Education'),
            (      'Accra ',     'Housing'),
            (      'Accra ',         'Kin'),
            (      'Accra ',      'Remain'),
            (      'Accra ',     'Restart'),
            (      'Accra ',      'Safety'),
            (      'Accra ',        'Work'),
            ('Johannesburg', '(No Answer)'),
            ('Johannesburg',   'Education'),
            ('Johannesburg',     'Housing'),
            ('Johannesburg',         'Kin'),
            ('Johannesburg',      'Remain'),
            ('Johannesburg',     'Restart'),
            ('Johannesburg',      'Safety'),
            ('Johannesburg',        'Work'),
            (    'Nairobi ', '(No Answer)'),
            (    'Nairobi ',   'Education'),
            (    'Nairobi ',     'Housing'),
            (    'Nairobi ',         'Kin'),
            (    'Nairobi ',      'Remain'),
            (    'Nairobi ',     'Restart'),
          

city    theme      
Accra   (No Answer)    294.0
        Education       40.0
        Housing          6.0
        Kin             74.0
        Remain           0.0
        Restart          2.0
        Safety           2.0
        Work           163.0
Name: 203b, dtype: float64

city    theme      
Accra   (No Answer)    294.0
        Education       40.0
        Housing          6.0
        Kin             74.0
        Remain           0.0
        Restart          2.0
        Safety           2.0
        Work           163.0
Name: 203b, dtype: float64

581.0


city      theme      
Nairobi   (No Answer)    264.0
          Education       41.0
          Housing         34.0
          Kin             41.0
          Remain           0.0
          Restart          3.0
          Safety          11.0
          Work           154.0
Name: 203b, dtype: float64

548.0


city          theme      
Johannesburg  (No Answer)    263.0
              Education        9.0
              Housing         26.0
              Kin             61.0
              Remain           0.0
              Restart          3.0
              Safety           6.0
              Work           147.0
Name: 203b, dtype: float64

515.0


# C. Individual Columns to be Categorized

In [19]:
individual_cols = ['113 Additional Training',
                   '157 How',
                   '336 How Have You Changed',
                   '418 Motivation To Change  ',
                   '437 Reasons why',
                   '716 Motivation to choose destination']


display( survey[individual_cols] )

## Loop and save the individual columns
for i,c in enumerate(individual_cols):
    individual_c = pd.DataFrame(pd.Series(survey[c].value_counts())).rename(columns={c:'count'})
    individual_c.to_csv('data_cat/to_categorize/To_Categorize_{}.csv'.format(c))

Unnamed: 0_level_0,113 Additional Training,157 How,336 How Have You Changed,418 Motivation To Change,437 Reasons why,716 Motivation to choose destination
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
276785,-1,increase the experience,-3,-1,-1,-1
276788,Computer programming or skills,Increased budget allocation,-1,-1,-1,-1
276802,-1,-1,-1,-1,-1,It Has A Good Climatic Condition And Also Quite ^
276814,Computer programming or skills,-1,-3,-1,-1,-1
276822,-1,-1,-1,-1,Sometimes They Lie To Her ^,-1
...,...,...,...,...,...,...
297661,-1,-1,-1,-1,-1,Just Like The Place When I See It Ontv^
297663,Shoe repair,-1,you have to adapt and adopt new ways of living^,-1,-1,There Are Many Opportunities Here In South Afr...
297666,-1,-1,I had to get used to the way of life of South ...,-1,-1,I Love My Home But The Problem Is That Now The...
297667,-1,-1,-1,-1,-1,I Love The Atmosphere And The Neighborhood^


## Aspiration

In [20]:
## Load categorized table to join
to_join = pd.read_csv("data_cat/categorized/Categorized_716 Motivation to choose destination.csv")
print("\nNEW CATEGORIES TO JOIN:")
display(to_join)

## Preview Previous
print("\nVALUE COUNTS FROM SURVEY:")
display(pd.DataFrame(pd.Series(survey['716 Motivation to choose destination'].value_counts())).rename(columns={c:'count'}))

## Merge Back
c = '716 Motivation to choose destination'
df = survey[(['SITE_CODE', '8 Identify City ']+[c])]
df = pd.merge( df, to_join[['original_text', 'edited_text', 'asp_category_1', 'asp_category_2', 'asp_category_3']], how='left', right_on='original_text', left_on=c)

# get number from column name
n = c.split(" ")[0]
# clean up merged column names
df = df.drop(columns='original_text')
df = df.rename(columns={'edited_text':'{}_edt_asp'.format(n), 
                        'asp_category_1':'{}_cat1_asp'.format(n), 
                        'asp_category_2':'{}_cat2_asp'.format(n), 
                        'asp_category_3':'{}_cat3_asp'.format(n)})

df_obj = df[['{}_edt_asp'.format(n), '{}_cat1_asp'.format(n), '{}_cat2_asp'.format(n), '{}_cat3_asp'.format(n)]].select_dtypes(['object'])
df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())

asp_df = df.copy()
print("\nJOINED ASPIRATIONS:")
display(asp_df)
asp_df.to_csv( "data_gen/aspirations/Aspirations_JoinedtoRespondents.csv".format(datenow) )


NEW CATEGORIES TO JOIN:


Unnamed: 0,original_text,edited_text,count,asp_category_1,asp_category_2,asp_category_3
0,To Guard My House And Also Plant And My Kids W...,I want to guard my house. I want to plant. My ...,1,Available land or improved living conditions,"Environment, geography, or natural landscape",Improved outcomes or education for children
1,Access To Land Is Easier For Me^,,1,Available land or improved living conditions,"Environment, geography, or natural landscape",
2,Access To Land^,,1,Available land or improved living conditions,"Environment, geography, or natural landscape",
3,I Can Easly Do Farming There Because Of Availa...,,1,Available land or improved living conditions,"Environment, geography, or natural landscape",
4,I Want To Own A Farm There^,,1,Available land or improved living conditions,"Environment, geography, or natural landscape",
...,...,...,...,...,...,...
1325,No^,,1,,,
1326,None ^,,12,,,
1327,None^,,12,,,
1328,Still Young Not Sure Where He Woul Retire To^,I am still young and not sure where I will ret...,1,,,



VALUE COUNTS FROM SURVEY:


Unnamed: 0_level_0,count
716 Motivation to choose destination,Unnamed: 1_level_1
-3,137
-1,31
None^,12
None ^,12
Na^,10
...,...
It'S Home To Me~ A Person Will Always Go To Their Home Town^,1
Is My Home Country And Most Of My Family Members Stays There^,1
I Don'T Want To Die In A Foreign Land^,1
Looks Good From The Books ^,1



JOINED ASPIRATIONS:


Unnamed: 0,SITE_CODE,8 Identify City,716 Motivation to choose destination,716_edt_asp,716_cat1_asp,716_cat2_asp,716_cat3_asp
0,276785,Nairobi,-1,,,,
1,276788,Nairobi,-1,,,,
2,276802,Nairobi,It Has A Good Climatic Condition And Also Quite ^,It has a good climatic condition. It is also q...,"Environment, geography, or natural landscape","Peacefulness, safety, or cleanliness",
3,276814,Nairobi,-1,,,,
4,276822,Nairobi,-1,,,,
...,...,...,...,...,...,...,...
1639,297661,Johannesburg,Just Like The Place When I See It Ontv^,I just like the place when I see it on TV.,Personal preference or change,,
1640,297663,Johannesburg,There Are Many Opportunities Here In South Afr...,There many opportunities here in South Africa....,Improved outcomes or education for children,Familiarity or to remain in area,
1641,297666,Johannesburg,I Love My Home But The Problem Is That Now The...,,Reunify with family or return to homeland,Reunify with family or return to homeland,
1642,297667,Johannesburg,I Love The Atmosphere And The Neighborhood^,,"Peacefulness, safety, or cleanliness",,


### Weighted Score for Multiple Aspiration Themes

In [21]:
"""
WEIGHTED SCORE FOR ASPIRATIONS
To maintain counts and equality between respondents, 
if respondents aspirations touched upon multiple themes,
the contibute 1/x where x is the number of themes touched upon
(Up to 3)
"""

asp_df['weighted'] = np.nan

for idx in asp_df.index:
    x = len(asp_df.loc[idx, ['716_cat1_asp', '716_cat2_asp', '716_cat3_asp']].dropna())
    if x != 0:
        asp_df.loc[idx, 'weighted'] = 1/x
    else:
        asp_df.loc[idx, 'weighted'] = 0

asp_df

Unnamed: 0,SITE_CODE,8 Identify City,716 Motivation to choose destination,716_edt_asp,716_cat1_asp,716_cat2_asp,716_cat3_asp,weighted
0,276785,Nairobi,-1,,,,,0.0
1,276788,Nairobi,-1,,,,,0.0
2,276802,Nairobi,It Has A Good Climatic Condition And Also Quite ^,It has a good climatic condition. It is also q...,"Environment, geography, or natural landscape","Peacefulness, safety, or cleanliness",,0.5
3,276814,Nairobi,-1,,,,,0.0
4,276822,Nairobi,-1,,,,,0.0
...,...,...,...,...,...,...,...,...
1639,297661,Johannesburg,Just Like The Place When I See It Ontv^,I just like the place when I see it on TV.,Personal preference or change,,,1.0
1640,297663,Johannesburg,There Are Many Opportunities Here In South Afr...,There many opportunities here in South Africa....,Improved outcomes or education for children,Familiarity or to remain in area,,0.5
1641,297666,Johannesburg,I Love My Home But The Problem Is That Now The...,,Reunify with family or return to homeland,Reunify with family or return to homeland,,0.5
1642,297667,Johannesburg,I Love The Atmosphere And The Neighborhood^,,"Peacefulness, safety, or cleanliness",,,1.0


In [22]:
"""
TOTAL WEIGHTED SCORE
"""
newcols = ['city', 'aspiration']
asp_cities = pd.DataFrame(columns=newcols)
display(asp_cities)

col_list = ["716_cat1_asp", "716_cat2_asp", "716_cat3_asp"]
for i,c in enumerate(col_list):
    asp_cities = pd.concat([asp_cities, asp_df[['8 Identify City ', c, 'weighted']].rename(columns={'8 Identify City ':newcols[0], c:newcols[1]})], axis=0)
# display( asp_cities )

asp_cities['count'] = 1
asp_city_counts = asp_cities.groupby(newcols).sum().reset_index()
display( asp_city_counts )

asp_city_counts.to_csv('data_gen/aspirations/Aspirations_ByCurrentLocation_Weighted.csv'.format(datenow))

asp_counts = asp_city_counts[['aspiration', 'weighted', 'count']].groupby('aspiration').sum()
display(asp_counts)

asp_counts.to_csv('data_gen/aspirations/Aspirations_BySubcategory_Weighted.csv'.format(datenow))

Unnamed: 0,city,aspiration


Unnamed: 0,city,aspiration,weighted,count
0,Accra,Available land or improved living conditions,60.166667,76
1,Accra,Care or improved health,11.666667,21
2,Accra,"Environment, geography, or natural landscape",11.5,16
3,Accra,"Exposure to activity, amenities, and culture",13.833333,21
4,Accra,Familiarity or to remain in area,26.166667,32
5,Accra,"Freedom, governance, or other political factors",3.0,4
6,Accra,Friendliness or other attributes of population,4.833333,7
7,Accra,Improved outcomes or education for children,9.333333,14
8,Accra,Improved training or education opportunities,1.5,2
9,Accra,Improved work or economic opportunities,17.5,26


Unnamed: 0_level_0,weighted,count
aspiration,Unnamed: 1_level_1,Unnamed: 2_level_1
Available land or improved living conditions,124.0,159
Care or improved health,16.166667,27
"Environment, geography, or natural landscape",41.5,63
"Exposure to activity, amenities, and culture",51.0,70
Familiarity or to remain in area,107.833333,125
"Freedom, governance, or other political factors",13.5,17
Friendliness or other attributes of population,17.5,25
Improved outcomes or education for children,21.166667,33
Improved training or education opportunities,2.833333,4
Improved work or economic opportunities,90.333333,125


### One Aspiration Only

In [23]:
## FIRST ONLY (UNUSED)
asp_s = pd.Series()

c = '716_cat1_asp'

newcols = ['city', 'aspiration']
asp_cities = asp_df[['8 Identify City ', c]].rename(columns={'8 Identify City ':newcols[0], c:newcols[1]})

asp_city_counts = asp_cities.reset_index().rename(columns={'index':'total'}).groupby(newcols).count().reset_index()
display( asp_city_counts )

asp_counts = asp_city_counts[['aspiration', 'total']].groupby('reason').sum()
display( asp_counts )

Unnamed: 0,city,aspiration,total
0,Accra,Available land or improved living conditions,62
1,Accra,Care or improved health,6
2,Accra,"Environment, geography, or natural landscape",8
3,Accra,"Exposure to activity, amenities, and culture",16
4,Accra,Familiarity or to remain in area,25
5,Accra,"Freedom, governance, or other political factors",2
6,Accra,Friendliness or other attributes of population,4
7,Accra,Improved outcomes or education for children,8
8,Accra,Improved training or education opportunities,2
9,Accra,Improved work or economic opportunities,15


KeyError: 'reason'