# Coletando dados de vacinados de uma base de dados oficial

## Inicializando a análise

In [1]:
import pandas as pd

# Load the vaccination records from the local CSV file. The file is decoded as
# Latin-1 and its fields are comma-separated.
dados = pd.read_csv(
    "vaccinations.csv",
    encoding='latin-1',
    sep=',',
)
dados

Unnamed: 0,location,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million
0,Albania,ALB,2021-01-10,0.0,0.0,,,,0.00,0.00,,
1,Albania,ALB,2021-01-11,,,,,64.0,,,,22.0
2,Albania,ALB,2021-01-12,128.0,128.0,,,64.0,0.00,0.00,,22.0
3,Albania,ALB,2021-01-13,188.0,188.0,,60.0,63.0,0.01,0.01,,22.0
4,Albania,ALB,2021-01-14,266.0,266.0,,78.0,66.0,0.01,0.01,,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4729,Zimbabwe,ZWE,2021-02-24,7872.0,7872.0,,3831.0,1312.0,0.05,0.05,,88.0
4730,Zimbabwe,ZWE,2021-02-25,11007.0,11007.0,,3135.0,1572.0,0.07,0.07,,106.0
4731,Zimbabwe,ZWE,2021-02-26,12579.0,12579.0,,1572.0,1750.0,0.08,0.08,,118.0
4732,Zimbabwe,ZWE,2021-02-27,15705.0,15705.0,,3126.0,2150.0,0.11,0.11,,145.0


In [9]:
# Rename the 'location' column to 'country'.
# rename() returns a new DataFrame; rebinding the name (instead of using
# inplace=True) avoids mutating an object that an earlier cell already
# displayed and keeps the cell safe to re-run (a missing 'location' label is
# simply ignored by rename's default errors='ignore').
dados = dados.rename(columns={'location': 'country'})
dados

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million
0,Albania,ALB,2021-01-10,0.0,0.0,,,,0.00,0.00,,
1,Albania,ALB,2021-01-11,,,,,64.0,,,,22.0
2,Albania,ALB,2021-01-12,128.0,128.0,,,64.0,0.00,0.00,,22.0
3,Albania,ALB,2021-01-13,188.0,188.0,,60.0,63.0,0.01,0.01,,22.0
4,Albania,ALB,2021-01-14,266.0,266.0,,78.0,66.0,0.01,0.01,,23.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4729,Zimbabwe,ZWE,2021-02-24,7872.0,7872.0,,3831.0,1312.0,0.05,0.05,,88.0
4730,Zimbabwe,ZWE,2021-02-25,11007.0,11007.0,,3135.0,1572.0,0.07,0.07,,106.0
4731,Zimbabwe,ZWE,2021-02-26,12579.0,12579.0,,1572.0,1750.0,0.08,0.08,,118.0
4732,Zimbabwe,ZWE,2021-02-27,15705.0,15705.0,,3126.0,2150.0,0.11,0.11,,145.0


In [2]:
# Parse the 'date' column into pandas datetime values (it is read from the CSV
# as text), replacing the original column.
dados['date'] = dados['date'].pipe(pd.to_datetime)

In [10]:
# Show the column labels currently present in the DataFrame.
dados.columns

Index(['country', 'iso_code', 'date', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated',
       'daily_vaccinations_raw', 'daily_vaccinations',
       'total_vaccinations_per_hundred', 'people_vaccinated_per_hundred',
       'people_fully_vaccinated_per_hundred',
       'daily_vaccinations_per_million'],
      dtype='object')

## Pesquisando um valor específico dentro do dataframe usando o query()

https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html

In [11]:
# Use query() to select the rows for Brazil, then show the two most recent
# records (sorted by date, newest first).
(
    dados
    .query('country == "Brazil" ')
    .sort_values(by='date', ascending=False)
    .head(2)
)

Unnamed: 0,country,iso_code,date,total_vaccinations,people_vaccinated,people_fully_vaccinated,daily_vaccinations_raw,daily_vaccinations,total_vaccinations_per_hundred,people_vaccinated_per_hundred,people_fully_vaccinated_per_hundred,daily_vaccinations_per_million
588,Brazil,BRA,2021-02-28,8433568.0,6518628.0,1914940.0,111526.0,211824.0,3.97,3.07,0.9,997.0
587,Brazil,BRA,2021-02-27,8322042.0,6437836.0,1884206.0,220255.0,215553.0,3.92,3.03,0.89,1014.0


In [12]:
# Count the distinct values in the 'country' column (missing values are not
# counted).
dados['country'].nunique(dropna=True)

119

# 1) País com registro mais recente de atualização


In [15]:
# Latest record date for each country, ordered from newest to oldest.
(
    dados
    .groupby('country')['date']
    .max()
    .sort_values(ascending=False)
)

country
Zimbabwe          2021-02-28
European Union    2021-02-28
South Korea       2021-02-28
Czechia           2021-02-28
South Africa      2021-02-28
                     ...    
Myanmar           2021-02-02
Egypt             2021-01-30
Greenland         2021-01-27
Northern Cyprus   2021-01-22
Monaco            2021-01-18
Name: date, Length: 119, dtype: datetime64[ns]

# 2) País por quantidade de vacinação


In [17]:
# Ten entries with the highest maximum 'total_vaccinations' value.
(
    dados
    .groupby('country')['total_vaccinations']
    .max()
    .sort_values(ascending=False)
    .head(10)
)

country
World             244265679.0
United States      75236003.0
China              40520000.0
European Union     33042239.0
United Kingdom     20885683.0
England            17641792.0
India              14301266.0
Turkey              8547875.0
Brazil              8433568.0
Israel              8092725.0
Name: total_vaccinations, dtype: float64

### Agora, vamos salvar essa informação dentro de uma variável para podermos manipular depois

In [16]:
# Store the complete per-country ranking (maximum 'total_vaccinations' per
# country, highest first) so it can be manipulated in later cells.
imunizado_por_pais = (
    dados
    .groupby('country')['total_vaccinations']
    .max()
    .sort_values(ascending=False)
)
imunizado_por_pais

country
World                  244265679.0
United States           75236003.0
China                   40520000.0
European Union          33042239.0
United Kingdom          20885683.0
                          ...     
Paraguay                    1000.0
Trinidad and Tobago          440.0
Venezuela                    157.0
Saint Helena                 107.0
San Marino                    35.0
Name: total_vaccinations, Length: 119, dtype: float64

### Porém, ao fazer essa agregação, o resultado passa a ser uma Series (um array unidimensional indexado pelo país), e não é isso que queremos. Portanto, devemos convertê-la de volta para DataFrame e resetar o index.

In [19]:
# Turn the ranking Series into a DataFrame: reset_index() on a named Series
# moves the 'country' index into a regular column and numbers the rows from 0.
imunizado = imunizado_por_pais.reset_index()
imunizado.head(10)

Unnamed: 0,country,total_vaccinations
0,World,244265679.0
1,United States,75236003.0
2,China,40520000.0
3,European Union,33042239.0
4,United Kingdom,20885683.0
5,England,17641792.0
6,India,14301266.0
7,Turkey,8547875.0
8,Brazil,8433568.0
9,Israel,8092725.0


In [23]:
# The "World" entry is not useful for this analysis, so it is excluded.
# Filtering by name (instead of dropping row label 0 in place) does not depend
# on "World" being ranked first and can be re-run without raising KeyError.
# The remaining rows keep their original index labels.
imunizado = imunizado[imunizado['country'] != 'World']
imunizado

Unnamed: 0,country,total_vaccinations
1,United States,75236003.0
2,China,40520000.0
3,European Union,33042239.0
4,United Kingdom,20885683.0
5,England,17641792.0
...,...,...
114,Paraguay,1000.0
115,Trinidad and Tobago,440.0
116,Venezuela,157.0
117,Saint Helena,107.0


# 3) País com maior número de vacinados por dia

Criaremos uma variável para armazenar apenas os dados de que precisamos

In [27]:
# Highest 'daily_vaccinations' value recorded for each country, largest first.
(
    dados
    .groupby('country')['daily_vaccinations']
    .max()
    .sort_values(ascending=False)
)

country
World                  6209381.0
China                  1916190.0
United States          1735053.0
European Union          856854.0
India                   486318.0
                         ...    
Trinidad and Tobago         63.0
Venezuela                   31.0
Greenland                    NaN
Saint Helena                 NaN
San Marino                   NaN
Name: daily_vaccinations, Length: 119, dtype: float64

# 5) País com maior média de vacinados por dia

In [52]:
# Mean of 'daily_vaccinations' for each country, rounded to two decimals.
media_vacinados = (
    dados
    .groupby('country')['daily_vaccinations']
    .mean()
    .round(2)
)
media_vacinados

country
Albania              179.10
Algeria             3289.05
Andorra               62.00
Anguilla             167.20
Argentina          14642.03
                    ...    
United States     982394.57
Venezuela             31.00
Wales              12629.00
World            2793939.84
Zimbabwe            1145.50
Name: daily_vaccinations, Length: 119, dtype: float64

In [71]:
# Convert the per-country Series into a DataFrame with 'country' as a regular
# column. The isinstance guard keeps the cell safe to re-run: once the name
# already holds a DataFrame, calling .to_frame() again would raise
# AttributeError.
if isinstance(media_vacinados, pd.Series):
    media_vacinados = media_vacinados.to_frame().reset_index()
media_vacinados

Unnamed: 0,country,daily_vaccinations
0,Albania,179.10
1,Algeria,3289.05
2,Andorra,62.00
3,Anguilla,167.20
4,Argentina,14642.03
...,...,...
114,United States,982394.57
115,Venezuela,31.00
116,Wales,12629.00
117,World,2793939.84


In [77]:
# Rename 'daily_vaccinations' to 'mean_daily_vaccinations' so the column name
# reflects that it holds a per-country mean.
# rename() returns a new DataFrame; rebinding the name (instead of using
# inplace=True) avoids mutating the frame displayed by the previous cell.
media_vacinados = media_vacinados.rename(
    columns={'daily_vaccinations': 'mean_daily_vaccinations'}
)
# Display the countries ordered by their mean, highest first.
media_vacinados.sort_values(by='mean_daily_vaccinations', ascending=False)

Unnamed: 0,country,mean_daily_vaccinations
117,World,2793939.84
114,United States,982394.57
21,China,623616.00
34,European Union,375428.49
49,India,306434.45
...,...,...
63,Liechtenstein,52.62
115,Venezuela,31.00
42,Greenland,
92,Saint Helena,


# 6) - Com os nomes das colunas já alterados, vamos criar um novo data set

A função para isso é a merge(), que irá comparar os dados das duas tabelas por meio de chaves iguais, como o nome do país, e realizar a junção. Assim, teremos todos os dados tratados em uma única tabela.

In [80]:
# Combine the per-country totals and the per-country daily means in one table.
# The join key is given explicitly: without `on`, merge() joins on every column
# name the two frames share, which would silently change the result if they
# ever had another column in common.
# validate='one_to_one' makes pandas raise MergeError if a country appears more
# than once on either side, instead of silently duplicating rows.
dados_tratados = pd.merge(
    imunizado,
    media_vacinados,
    how='inner',
    on='country',
    validate='one_to_one',
)
dados_tratados

Unnamed: 0,country,total_vaccinations,mean_daily_vaccinations
0,United States,75236003.0,982394.57
1,China,40520000.0,623616.00
2,European Union,33042239.0,375428.49
3,United Kingdom,20885683.0,258688.14
4,England,17641792.0,218455.14
...,...,...,...
113,Paraguay,1000.0,250.00
114,Trinidad and Tobago,440.0,63.00
115,Venezuela,157.0,31.00
116,Saint Helena,107.0,


# Exportando o novo data set para arquivos CSV e JSON

In [81]:
# Write the merged table to disk as a semicolon-separated CSV; the row index is
# written as the first column.
dados_tratados.to_csv("Estatistica_vacinacao_pais.csv", sep=';', index=True)
# Also write it as JSON, one object per row.
# NOTE(review): the target name has no ".json" extension ("..._json") - confirm
# whether that is intentional before changing it.
dados_tratados.to_json("Estatistica_vacinacao_pais_json", orient='records')