-
Notifications
You must be signed in to change notification settings - Fork 0
/
read_amostra.py
254 lines (228 loc) · 10.2 KB
/
read_amostra.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
import ftplib
import os
import shutil
from zipfile import ZipFile
from zipfile import BadZipFile
import pandas as pd
import numpy as np
from sectors_of_interest import list_muns, states_link, aps
def convert_to_decimals(x):
""" Auxiliary function for the text reading of weights on text files from IBGE
"""
return float(x[:3] + '.' + x[3:])
columns_interest = [i for i in range(36, 136)] + [0, 23]
names_columns_weighted_areas = ['AREAP', 'weight', 'V6400']
converters = {'AREAP': str, 'weight': convert_to_decimals, 'V6400': str}
columns_weighted_areas = [(7, 20), (28, 44), (157, 158)]
def download_from_ibge(path, directory, flag='setores'):
""" Download all zip files of census tract data """
if not os.path.exists(os.path.join('data', flag)):
os.makedirs(os.path.join('data', flag))
ftp = ftplib.FTP(path)
ftp.login("anonymous", "censo2010")
ftp.cwd(directory)
files = ftp.nlst()
print(files)
for file in files:
if file.endswith('.zip'):
with open(os.path.join('data', flag, file), 'wb') as f:
ftp.retrbinary('RETR ' + file, f.write)
def unzip_files_temp(file, flag=r'data/setores'):
""" Automatic unzip files. If encounters new directory, go down one level and try again.
After unzipping, returns list of files
"""
if not os.path.exists(flag):
os.mkdir(flag)
# importing required modules
# opening the zip file in READ mode
try:
with ZipFile(os.path.join(flag, file), 'r') as zip_ref:
# Extracting all the files
path = zip_ref.namelist()
zip_ref.extractall(flag)
except BadZipFile:
return
except IsADirectoryError as e:
print(e)
file = os.path.join(flag, file)
file = os.listdir(file)
with ZipFile(file[0], 'r') as zip_ref:
# Extracting all the files
zip_ref.printdir()
path = zip_ref.namelist()
zip_ref.extractall(flag)
return path
def extract_txt(file_location):
""" Auxiliary function to read fixed_weighted_files from txt from IBGE
"""
return pd.read_fwf(file_location, colspecs=columns_weighted_areas, names=names_columns_weighted_areas,
header=None, converters=converters)
def get_weighted_areas():
""" Download txt files of weighted areas from IBGE and process attained level of education
"""
site = r"ftp.ibge.gov.br"
folder = r'Censos/Censo_Demografico_2010/Resultados_Gerais_da_Amostra/Microdados'
download_from_ibge(site, folder, 'amostra')
areas = r'data/amostra'
files = os.listdir(areas)
output = pd.DataFrame()
for file in files:
unzipped_path = unzip_files_temp(file, areas)
if not unzipped_path:
continue
data = extract_txt(os.path.join(areas, unzipped_path[4]))
# Weighted
data = data.groupby('AREAP').apply(lambda x: np.bincount(x['V6400'], weights=x['weight'])).reset_index()
for each in [1, 2, 3, 4]:
data[each] = data[0].apply(lambda x: x[each] / x.sum())
# Much slower, but allows to get exception only for lines with no value for level '5'
for i in data[0].index:
try:
data.loc[i, 5] = data.loc[i, 0][5] / data.loc[i, 0].sum()
except IndexError:
data.loc[i, 5] = 0
data = data.drop(0, axis=1)
data = data.melt(id_vars=['AREAP'], var_name=['qual'])
output = pd.concat([output, data])
shutil.rmtree(os.path.join(areas, unzipped_path[0]))
output.to_csv('input/quali_AP.csv', sep=';', index=False)
# shutil.rmtree(areas)
return output
def get_color(files, sectors):
output = pd.DataFrame()
for file in files:
unzipped_path = unzip_files_temp(file, sectors)
if not unzipped_path:
continue
for each in unzipped_path:
if 'Pessoa03_' in each and '.csv' in each and each.split('_')[-1][:2] in states_link:
print(each)
data = pd.read_csv(os.path.join(sectors, each), sep=';')
data = data[['Cod_setor', 'V002', 'V003', 'V004', 'V005', 'V006']]
data = data.replace('X', 0)
data = pd.merge(data, aps, on='Cod_setor', how='inner')
data = data.astype(int)
data = data.groupby('AREAP').agg(sum)
data = data.drop('Cod_setor', axis=1)
data = data.reset_index()
names = ['branca', 'preta', 'amarela', 'parda', 'indigena']
new = pd.DataFrame()
new['AREAP'] = data['AREAP']
for each in [1, 2, 3, 4, 5]:
new[names[each - 1]] = data.apply(lambda x: x[each] / x[1:].sum(), axis=1)
new = new.melt(id_vars=['AREAP'], var_name=['cor'])
output = pd.concat([output, new])
output.to_csv('input/etnia_AP.csv', sep=';', index=False)
return output
def extract_age_gender(male, female):
""" Gather number of people by gender and age
"""
result = list()
for each in [male, female]:
each['mun'] = each['Cod_setor'].astype(str).str[:7]
each = each[each['mun'].isin(list_muns)]
if len(each) == 0:
return
if 'Cod_se' in each.columns:
# A file from SP has an extra column. I know. Go figure.
each = each.drop('Cod_se', axis=1)
each = each.iloc[:, columns_interest]
new_columns = [str(int(x[2:]) - 34) if x.startswith('V0') and x != 'V022' else x for x in each.columns]
new_columns = [str(int(x[1:]) - 34) if x.startswith('V') and x != 'V022' else x for x in new_columns]
new_columns = ['0' if x == 'V022' else x for x in new_columns]
each.columns = new_columns
each = pd.merge(each, aps, on='Cod_setor', how='inner')
# Some sectors with less than five residences, have omitted information, included as 'X'. Replace them
each = each.replace('X', 0)
each = each.astype(int)
each = each.groupby('AREAP').agg(sum)
each = each.drop('Cod_setor', axis=1)
each = each.reset_index()
result.append(each)
male = result[0]
female = result[1]
male['gender'] = 2
female['gender'] = 1
return pd.concat([male, female])
def read_age_gender(files, sectors):
""" Auxiliary function to read age and gender from sectors tables
"""
results = dict()
output = pd.DataFrame()
for file in files:
unzipped_path = unzip_files_temp(file, sectors)
if not unzipped_path:
continue
keys = list()
for each in unzipped_path:
if 'Pessoa11_' in each and '.csv' in each and each.split('_')[-1][:2] in states_link:
keys.append(each)
elif 'Pessoa12_' in each and '.csv' in each and each.split('_')[-1][:2] in states_link:
keys.append(each)
if len(keys) == 2:
print(keys[0])
print(keys[1])
try:
results['male'] = pd.read_csv(os.path.join(sectors, keys[0]), sep=';')
results['female'] = pd.read_csv(os.path.join(sectors, keys[1]), sep=';')
except FileNotFoundError:
continue
data = extract_age_gender(results['male'], results['female'])
data = data.melt(id_vars=['AREAP', 'gender'], var_name=['age'])
data = data.rename(columns={'value': 'num_people'})
output = pd.concat([output, data])
output.to_csv('input/num_people_age_gender_AP.csv', sep=';', index=False)
return output
def get_wage_num_family(files, sectors):
output = pd.DataFrame()
for file in files:
unzipped_path = unzip_files_temp(file, sectors)
if not unzipped_path:
continue
for each in unzipped_path:
if 'Basico_' in each and '.csv' in each and each.split('_')[-1][:2] in states_link:
print(each)
data = pd.read_csv(os.path.join(sectors, each), sep=';', encoding='latin-1', decimal=',')
# Variables of interest (mu, sigma for number of people per family and nominal wage)
try:
data = data[['Cod_setor', 'V003', 'V004', 'V009', 'V010']]
except KeyError:
data = data[['Cod_setor', 'V003', 'V004', 'V009', 'V010']]
data = data.rename(columns={'Cod_setor': 'Cod_setor'})
except KeyError:
print('Problems with column name, using column index instead. Yes. Go figure IBGE')
print(data.columns)
data['Cod_setor'] = data.iloc[:, 0]
data = data[['Cod_setor', 'V003', 'V004', 'V009', 'V010']]
data = pd.merge(data, aps, on='Cod_setor', how='inner')
# Average of averages and variances of sectors by weighted areas
data = data.groupby('AREAP').agg(np.mean)
data = data.drop('Cod_setor', axis=1)
data = data.reset_index()
output = pd.concat([output, data])
output = output.rename(columns={'V003': 'avg_num_people', 'V004': 'var_num_people',
'V009': 'avg_wage', 'V010': 'var_wage'})
output.to_csv('input/average_variance_family_wages.csv', sep=';', index=False)
return output
def get_sectors():
""" General download and send list of sectors tables for picking up of details of age, gender, color, wage and
average number of people in the family, by census sectors.
"""
site = r"ftp.ibge.gov.br"
folder = r'Censos/Censo_Demografico_2010/Resultados_do_Universo/Agregados_por_Setores_Censitarios'
# download_from_ibge(site, folder, 'setores')
# After downloading, unzip one by one, extract data and delete, except original zipfile
# Get list of files
sectors = r'data/setores'
files = os.listdir(sectors)
output1 = read_age_gender(files, sectors)
output2 = get_color(files, sectors)
output3 = get_wage_num_family(files, sectors)
shutil.rmtree(sectors)
output2, output3 = None, None
return output1, output2, output3
if __name__ == '__main__':
if not os.path.exists('input'):
os.mkdir('input')
n, m, o = get_sectors()
p = get_weighted_areas()