In [116]:
import pandas as pd
import matplotlib as mp 
import seaborn as sns

In [117]:
# Column labels applied positionally to the raw file, in file order.
column_names = [
    'YearWeek', 'Country', 'Denominator', 'DosesReceived', 'DosesExported',
    'FirstDose', 'FirstDoseRefused', 'SecondDose', 'DoseAdditional1',
    'DoseAdditional2', 'DoseAdditional3', 'UnknownDose', 'Region',
    'TargetGroup', 'Vaccine', 'Population',
]
# Columns discarded right after loading.
unused_columns = [
    'UnknownDose', 'FirstDoseRefused', 'DoseAdditional1',
    'DoseAdditional2', 'DoseAdditional3', 'Region',
]

df = pd.read_csv("datasets/vacData.csv")
df.columns = column_names
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,YearWeek,Country,Denominator,DosesReceived,DosesExported,FirstDose,SecondDose,TargetGroup,Vaccine,Population
0,2020-W53,AT,7388778.0,61425.0,0.0,5343,0,ALL,COM,8901064
1,2020-W53,AT,7388778.0,0.0,0.0,0,0,ALL,UNK,8901064
2,2020-W53,AT,7388778.0,0.0,0.0,0,0,ALL,NVXD,8901064
3,2020-W53,AT,7388778.0,0.0,0.0,0,0,ALL,MOD,8901064
4,2020-W53,AT,7388778.0,0.0,0.0,0,0,ALL,VLA,8901064


In [118]:
# Share of rows whose Denominator is missing. The mean of a boolean mask is the
# fraction of True values, so no filtered copy of the frame has to be built.
df.Denominator.isna().mean()


0.46978671097051644

The missing values in the Denominator column account for nearly half of our dataframe (about 47%), so we shouldn't drop those rows or we would lose too much data.

In [119]:
# Boolean mask marking the rows that have no vaccine code; reused below to
# select the rows to drop.
missing_vaccine = df["Vaccine"].isna()
# Fraction of all rows that are affected.
missing_vaccine.sum() / df.shape[0]

2.3025721502127223e-05

This time the missing values are only a tiny fraction of the total (about 0.002% of the rows), so we can drop them without losing much information.

In [120]:
# Index labels of the rows flagged by `missing_vaccine`.
vrows_to_drop = df.loc[missing_vaccine, :].index
# Remove those rows from `df` in place.
df.drop(labels=vrows_to_drop, axis="index", inplace=True)
df.head(n=5)

Unnamed: 0,YearWeek,Country,Denominator,DosesReceived,DosesExported,FirstDose,SecondDose,TargetGroup,Vaccine,Population
0,2020-W53,AT,7388778.0,61425.0,0.0,5343,0,ALL,COM,8901064
1,2020-W53,AT,7388778.0,0.0,0.0,0,0,ALL,UNK,8901064
2,2020-W53,AT,7388778.0,0.0,0.0,0,0,ALL,NVXD,8901064
3,2020-W53,AT,7388778.0,0.0,0.0,0,0,ALL,MOD,8901064
4,2020-W53,AT,7388778.0,0.0,0.0,0,0,ALL,VLA,8901064


We want to drop entire rows so that no missing values remain in the Vaccine column. We first locate the affected rows with `loc[]` and take their `index`, then pass those index labels to the dataframe's `drop` method to remove the whole rows.

In [121]:
# Rows where neither a first nor a second dose was recorded. Both conditions
# are combined into a single mask on `df`, so the selection does not depend on
# pandas re-aligning a mask built on the unfiltered frame.
no_doses_given = (df.FirstDose == 0) & (df.SecondDose == 0)
missing_doses = df.loc[no_doses_given]
# Index labels of those rows (labels, not the rows themselves), ready to be
# passed to `df.drop(index=...)`.
drows_to_drop = missing_doses.index


This time a lot of rows are involved, but in every one of them both FirstDose and SecondDose are equal to 0. Since these columns represent the number of people getting vaccinated, such rows don't provide any information, so we can delete them.

In [122]:
vaccines = ['VLA','UNK','SPU','SIN','NVX','MOD','JANSS','COM','BHACOV','BECNBG','AZ']

# Total first doses per vaccine code, in the same order as `vaccines`.
# The match is a literal substring test, so a code also picks up longer codes
# that contain it (e.g. 'COM' matches 'COMBA.1' and 'NVX' matches 'NVXD').
# `na=False` treats a missing vaccine code as "no match" instead of failing.
# A comprehension is used so the builtin `sum` is not rebound at notebook scope.
valuesFirstDose = [
    df.loc[df.Vaccine.str.contains(code, regex=False, na=False), 'FirstDose'].sum()
    for code in vaccines
]
valuesFirstDose

[315500,
 289595189,
 1866721,
 64182,
 688714,
 70085039,
 41418128,
 453925219,
 406,
 2274694,
 71974115]

In [123]:
# Total second doses per vaccine code, in the same order as `vaccines`.
# The match is a literal substring test, so a code also picks up longer codes
# that contain it (e.g. 'COM' matches 'COMBA.1' and 'NVX' matches 'NVXD').
# `na=False` treats a missing vaccine code as "no match" instead of failing.
# A comprehension is used so the builtin `sum` is not rebound at notebook scope.
valuesSecondDose = [
    df.loc[df.Vaccine.str.contains(code, regex=False, na=False), 'SecondDose'].sum()
    for code in vaccines
]
valuesSecondDose

[81997,
 279659271,
 1829410,
 114939,
 432725,
 68209519,
 460223,
 432298457,
 519,
 2177869,
 64331666]

In [124]:
# Human-readable vaccine names, listed in the same order as the codes in
# `vaccines` so that each name lines up with its totals.
vaccine_labels = [
    'Valneva', 'Unknown', 'Sputnik', 'CoronaVac', 'Novavax', 'Moderna',
    'Janssen', 'Pfizer', 'Bharat', 'BECNBG', 'AstraZeneca',
]

# One row per vaccine with its first- and second-dose totals.
totals_by_vaccine = {
    'Vaccine': vaccine_labels,
    'FirstTotal': valuesFirstDose,
    'SecondTotal': valuesSecondDose,
}
df2 = pd.DataFrame(totals_by_vaccine)
df2

Unnamed: 0,Vaccine,FirstTotal,SecondTotal
0,Valneva,315500,81997
1,Unknown,289595189,279659271
2,Sputnik,1866721,1829410
3,CoronaVac,64182,114939
4,Novavax,688714,432725
5,Moderna,70085039,68209519
6,Janssen,41418128,460223
7,Pfizer,453925219,432298457
8,Bharat,406,519
9,BECNBG,2274694,2177869


In [125]:
# Index labels of the rows previously collected in `missing_doses`
# (rows with zero first doses and zero second doses).
drows_to_drop = missing_doses.index
# Remove those rows from `df` in place; the remaining rows keep their labels.
df.drop(labels=drows_to_drop, axis=0, inplace=True)
df

Unnamed: 0,YearWeek,Country,Denominator,DosesReceived,DosesExported,FirstDose,SecondDose,TargetGroup,Vaccine,Population
0,2020-W53,AT,7388778.0,61425.0,0.0,5343,0,ALL,COM,8901064
9,2020-W53,AT,7388778.0,0.0,0.0,1,0,ALL,COMBA.1,8901064
11,2020-W53,AT,433755.0,0.0,0.0,1,0,Age0_4,COM,8901064
34,2020-W53,AT,258398.0,0.0,0.0,5,0,Age15_17,COM,8901064
45,2020-W53,AT,683584.0,0.0,0.0,92,0,Age18_24,COM,8901064
...,...,...,...,...,...,...,...,...,...,...
564566,2022-W48,SK,4431608.0,0.0,0.0,1,2,ALL,COM,5459781
564567,2022-W48,SK,4431608.0,0.0,0.0,2,0,ALL,JANSS,5459781
564569,2022-W48,SK,159879.0,0.0,0.0,0,1,Age15_17,COM,5459781
564572,2022-W48,SK,2055643.0,0.0,0.0,2,0,Age25_49,JANSS,5459781


In [127]:
# Replace the index left with gaps by the row drops with a fresh 0..n-1 one;
# `drop=True` discards the old labels instead of keeping them as a column.
df = df.reset_index(drop=True)
df

Unnamed: 0,YearWeek,Country,Denominator,DosesReceived,DosesExported,FirstDose,SecondDose,TargetGroup,Vaccine,Population
0,2020-W53,AT,7388778.0,61425.0,0.0,5343,0,ALL,COM,8901064
1,2020-W53,AT,7388778.0,0.0,0.0,1,0,ALL,COMBA.1,8901064
2,2020-W53,AT,433755.0,0.0,0.0,1,0,Age0_4,COM,8901064
3,2020-W53,AT,258398.0,0.0,0.0,5,0,Age15_17,COM,8901064
4,2020-W53,AT,683584.0,0.0,0.0,92,0,Age18_24,COM,8901064
...,...,...,...,...,...,...,...,...,...,...
201945,2022-W48,SK,4431608.0,0.0,0.0,1,2,ALL,COM,5459781
201946,2022-W48,SK,4431608.0,0.0,0.0,2,0,ALL,JANSS,5459781
201947,2022-W48,SK,159879.0,0.0,0.0,0,1,Age15_17,COM,5459781
201948,2022-W48,SK,2055643.0,0.0,0.0,2,0,Age25_49,JANSS,5459781


In [128]:
# Persist the cleaned rows and the per-vaccine totals.
# NOTE(review): `to_csv` is called without `index=False`, so the row index is
# written as the first column of each file; confirm downstream readers expect it.
exports = (
    (df, 'datasets/vacDataClean.csv'),
    (df2, 'datasets/vacTotalData.csv'),
)
for frame, destination in exports:
    frame.to_csv(destination)