# Notebook de tratamento dos dados acerca da frequência de notificações da tuberculose no Brasil

## Importação das bibliotecas

In [1]:
import pandas as pd

---

## Configuração de variáveis

In [2]:
# Dataset identifiers: `info` is the topic and `place` the geographic scope.
# Both are used to build the file name below.
info, place = 'tuberculose', 'brasil'

# Relative path of the raw extract: <info>/<info>_<place>.csv under ../data/extracted.
path_to_csv = '../data/extracted/{}/{}_{}.csv'.format(info, info, place)

---

## Recuperação e visualização superficial dos dados

In [3]:
# Load the raw extract into `df`.
# - The file is semicolon-separated and read as UTF-8.
# - The last 18 lines are skipped (presumably a non-tabular footer in the
#   extract -- TODO confirm against the raw file).
# - skipfooter is not supported by the C parser, hence engine='python'.
df = pd.read_csv(
    path_to_csv,
    sep=';',
    encoding='utf8',
    engine='python',
    skipfooter=18,
)

In [4]:
# Show the last 8 rows to check that the footer was cut at the right place
# (the final row should be the aggregate "Total" line).
df.tail(n=8)

Unnamed: 0,UF de notificacão,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,Total
20,41 Paraná,3224,3436,3454,3294,3418,2832,3059,2991,2808,...,2534,2650,2554,2466,2436,2501,2860,2883,2814,57706
21,42 Santa Catarina,1695,1873,1903,1870,1837,1820,1866,2022,2002,...,2229,2268,2248,2180,2199,2228,2282,2265,1509,40584
22,43 Rio Grande do Sul,5231,5509,5786,5906,5661,5155,5496,5687,6100,...,6405,6348,6372,6380,6043,6599,6908,7642,6327,122095
23,50 Mato Grosso do Sul,980,914,1022,1034,1053,882,963,1019,1044,...,1091,1098,998,1003,1130,1140,1455,1406,1171,21393
24,51 Mato Grosso,1425,1248,1267,1148,1344,1310,1170,1280,1158,...,1605,1886,1794,1442,1456,1430,1414,1532,1188,27836
25,52 Goiás,1148,1149,1242,1089,1081,976,959,953,1012,...,1058,1072,991,1095,1027,1163,1212,1173,952,21357
26,53 Distrito Federal,638,606,614,625,621,583,634,518,449,...,554,473,512,478,463,413,471,483,382,10436
27,Total,87265,92859,93773,92980,92056,85031,84600,86768,86318,...,86183,86208,85213,85452,86207,91301,95539,97629,83678,1772254


---

## Tratamento das informações

### Inclusão da região nos dados

In [5]:
# Maps the first character of each "UF de notificacão" label to a region name.
# State labels start with a numeric code (e.g. "11 Rondônia"), so the leading
# digit selects the region; "T" matches the aggregate "Total" row, which gets
# the "-" placeholder.
region_map = {
    "1": "Norte", "2": "Nordeste", "3": "Sudeste",
    "4": "Sul", "5": "Centro-Oeste",
    "T": "-",
}

# Plain dict indexing is kept on purpose: a label with an unmapped first
# character raises KeyError instead of silently producing a missing value.
df['Região'] = df['UF de notificacão'].map(lambda uf_label: region_map[uf_label[0]])

In [6]:
# Show the first 8 rows to confirm the new "Região" column was filled in.
df.head(n=8)

Unnamed: 0,UF de notificacão,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2013,2014,2015,2016,2017,2018,2019,2020,Total,Região
0,11 Rondônia,674,639,651,628,630,519,548,564,680,...,711,686,665,775,780,678,740,573,13090,Norte
1,12 Acre,364,368,368,356,329,391,314,314,364,...,406,462,361,450,484,483,582,537,8081,Norte
2,13 Amazonas,2334,2286,2375,2559,2477,2432,2461,2688,2606,...,3060,3143,3404,3259,3693,3803,3955,3500,58327,Norte
3,14 Roraima,150,172,206,218,155,147,137,155,145,...,167,148,174,160,205,265,324,286,3674,Norte
4,15 Pará,3517,3785,3940,4089,3975,3789,3763,3790,4106,...,4119,3940,3991,4224,4511,4678,5542,4437,82682,Norte
5,16 Amapá,223,296,273,275,274,245,273,266,253,...,220,211,225,287,298,284,357,274,5267,Norte
6,17 Tocantins,320,333,284,260,253,256,231,207,223,...,194,186,193,210,187,233,234,200,4620,Norte
7,21 Maranhão,2901,3043,2980,2974,3179,2883,2780,2464,2462,...,2278,2111,2238,2442,2469,2647,2666,2332,51953,Nordeste


### Criação de um novo dataframe com os dados numéricos, a UF e a região

In [7]:
# Collect the numeric columns (the yearly counts and "Total") by dtype.
# The previous approach, df.mean().index, relied on pandas silently dropping
# the string columns during the reduction; pandas >= 2.0 raises TypeError there,
# and it computed means only to read the column names.
useful_columns = df.select_dtypes(include='number').columns.tolist()

# The state label goes first.
useful_columns.insert(0, 'UF de notificacão')

# insert(-1, ...) places the item *before* the current last element, so
# "Região" ends up right before "Total".
useful_columns.insert(-1, 'Região')
print(useful_columns)

['UF de notificacão', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', 'Região', 'Total']


In [8]:
# Take an explicit copy of the selected columns. `useful_data` is modified in
# the following cell, and writing into a bare selection of `df` triggers
# SettingWithCopyWarning; the copy also guarantees `df` itself is left untouched.
useful_data = df[useful_columns].copy()

In [9]:
# Drop the leading numeric code from each state label ("11 Rondônia" -> "Rondônia").
# An anchored regex replaces the former `.loc[:26]` + `x[3:]` slicing, so the cell:
#   - no longer hard-codes the number of state rows;
#   - leaves the "Total" row untouched (it has no numeric prefix);
#   - is idempotent: re-running it does not chop already-cleaned names.
# `assign` builds a new frame, which also avoids writing into a possible copy of `df`.
useful_data = useful_data.assign(**{
    'UF de notificacão': useful_data['UF de notificacão'].str.replace(
        r'^\d+\s+', '', regex=True
    )
})

# NOTE(review): this cell used to call useful_data.set_index('UF de notificacão')
# and discard the result. set_index returns a new frame by default, so the call
# had no effect and the frame kept its integer index (which is also what gets
# exported). The no-op call was removed to keep that behaviour; assign the result
# back if the state label is really meant to be the index.
useful_data

Unnamed: 0,UF de notificacão,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2013,2014,2015,2016,2017,2018,2019,2020,Região,Total
0,Rondônia,674,639,651,628,630,519,548,564,680,...,711,686,665,775,780,678,740,573,Norte,13090
1,Acre,364,368,368,356,329,391,314,314,364,...,406,462,361,450,484,483,582,537,Norte,8081
2,Amazonas,2334,2286,2375,2559,2477,2432,2461,2688,2606,...,3060,3143,3404,3259,3693,3803,3955,3500,Norte,58327
3,Roraima,150,172,206,218,155,147,137,155,145,...,167,148,174,160,205,265,324,286,Norte,3674
4,Pará,3517,3785,3940,4089,3975,3789,3763,3790,4106,...,4119,3940,3991,4224,4511,4678,5542,4437,Norte,82682
5,Amapá,223,296,273,275,274,245,273,266,253,...,220,211,225,287,298,284,357,274,Norte,5267
6,Tocantins,320,333,284,260,253,256,231,207,223,...,194,186,193,210,187,233,234,200,Norte,4620
7,Maranhão,2901,3043,2980,2974,3179,2883,2780,2464,2462,...,2278,2111,2238,2442,2469,2647,2666,2332,Nordeste,51953
8,Piauí,1568,1519,1448,1563,1551,1306,1357,1224,1108,...,908,813,754,812,791,884,877,758,Nordeste,22158
9,Ceará,4240,4385,4586,4461,4656,4140,4161,4473,4664,...,4052,3879,4012,4045,4289,4555,4544,3696,Nordeste,85787


---

## Exportação dos dados tratados

In [10]:
info = 'tuberculose'
place = 'brasil'
path_to_csv = '../data/loaded/{}/{}_{}.csv'.format(info, info, place)

In [11]:
useful_data.to_csv(path_to_csv)