In [None]:
import pandas as pd
import matplotlib as mp 
import seaborn as sns

In [None]:
# Load the raw vaccination export and give its columns short names
# (the names are assigned positionally, in the order the file stores them).
df = pd.read_csv("datasets/vacData.csv")
df.columns = [
    'YearWeek', 'Country', 'Denominator', 'DosesReceived',
    'DosesExported', 'FirstDose', 'FirstDoseRefused', 'SecondDose',
    'DoseAdditional1', 'DoseAdditional2', 'DoseAdditional3', 'UnknownDose',
    'Region', 'TargetGroup', 'Vaccine', 'Population',
]

# Columns that the rest of the notebook never reads.
unused_columns = [
    'UnknownDose', 'FirstDoseRefused', 'DoseAdditional1',
    'DoseAdditional2', 'DoseAdditional3', 'Region',
]
df = df.drop(columns=unused_columns)
df.head()

We load the first dataset, rename its columns, and keep only the columns we plan to use.

In [None]:
# Share of rows whose Denominator is missing: count of missing entries
# divided by the total number of rows.
missing_denominator = df.Denominator.isna()
missing_denominator.sum() / len(df)


The missing values in the Denominator column account for nearly half of our dataframe (46%), so we shouldn't drop those rows, or else we would lose too much data.

In [None]:
# Boolean mask marking the rows that have no vaccine code.
missing_vaccine = df['Vaccine'].isna()
# Fraction of all rows that the mask flags.
missing_vaccine.sum() / missing_vaccine.size

This time the missing values are only a small fraction of the total, so we can drop them without losing much information.

In [None]:
# Index labels of the rows flagged by the missing-vaccine mask.
vrows_to_drop = df.loc[missing_vaccine, :].index
# Remove those rows entirely and rebind the cleaned frame to `df`.
df = df.drop(vrows_to_drop, axis='index')
df.head()

We want to drop entire rows so that no missing values remain in the Vaccine column. We select the affected rows with `loc[]` and take their `index`, then pass those index labels to the dataframe's `drop` method to remove the rows.

In [None]:
# Rows where both FirstDose and SecondDose are zero. The two conditions are
# combined into a single mask so the selection does not depend on pandas
# re-aligning a full-length mask against an already-filtered frame.
missing_doses = df.loc[(df.FirstDose == 0) & (df.SecondDose == 0)]
missing_doses

In [None]:
# Index labels of the zero-dose rows selected in `missing_doses`.
drows_to_drop = missing_doses.index
# Remove them and rebind the cleaned frame to `df`.
df = df.drop(drows_to_drop, axis='index')
df

This time many rows are affected, but in all of them both dose counts are 0. Since these columns represent the number of people getting vaccinated, such rows carry no information and we can safely delete them.

In [None]:
# Vaccine codes to aggregate; the order of this list fixes the order of the totals below.
vaccines = ['VLA','UNK','SPU','SIN','NVX','MOD','JANSS','COM','BHACOV','BECNBG','AZ']

# Total first doses per vaccine code.
# The match is a literal substring test (regex=False), so any code that merely
# contains one of these strings is counted under it as well.
# NOTE(review): confirm that folding such longer codes into the base code is intended.
# The result is accumulated without binding the name `sum`, which would otherwise
# replace the builtin for the rest of the notebook session.
valuesFirstDose = [
    df.loc[df.Vaccine.str.contains(code, regex=False), 'FirstDose'].sum()
    for code in vaccines
]
valuesFirstDose

In [None]:
# Total second doses per vaccine code, in the same order as `vaccines`.
# The match is a literal substring test (regex=False), so any code that merely
# contains one of these strings is counted under it as well.
# The result is accumulated without binding the name `sum`, which would otherwise
# replace the builtin for the rest of the notebook session.
valuesSecondDose = [
    df.loc[df.Vaccine.str.contains(code, regex=False), 'SecondDose'].sum()
    for code in vaccines
]
valuesSecondDose

Here we go through the FirstDose and SecondDose columns and sum the values corresponding to each vaccine. This gives us two lists of totals with the same length and order as the `vaccines` list, so we can combine them into a dataframe to use for a graph on the website.

In [None]:
# Display names for the vaccines, one per entry of the first/second dose totals.
vaccine_labels = [
    'Valneva', 'Unknown', 'Sputnik', 'CoronaVac', 'Novavax', 'Moderna',
    'Janssen', 'Pfizer', 'Bharat', 'BECNBG', 'AstraZeneca',
]

# One row per vaccine with its first- and second-dose totals.
df2 = pd.DataFrame(
    {
        'Vaccine': vaccine_labels,
        'FirstTotal': valuesFirstDose,
        'SecondTotal': valuesSecondDose,
    }
)
df2

In [None]:
# Distinct country values, in order of first appearance in the dataframe.
countries = df.Country.unique().tolist()

# Only rows whose target group is exactly 'ALL' are summed.
is_all_groups = df.TargetGroup == 'ALL'

# Total first doses per country. Countries are matched by exact equality rather
# than substring/regex search, so one country's value can never match another's.
# The result is accumulated without binding the name `sum`, which would otherwise
# replace the builtin for the rest of the notebook session.
valuesCountriesFirst = [
    df.loc[(df.Country == name) & is_all_groups, 'FirstDose'].sum()
    for name in countries
]
valuesCountriesFirst

In [None]:
# Only rows whose target group is exactly 'ALL' are summed.
is_all_groups = df.TargetGroup == 'ALL'

# Total second doses per country, in the same order as `countries`.
# Countries are matched by exact equality rather than substring/regex search,
# so one country's value can never match another's.
# The result is accumulated without binding the name `sum`, which would otherwise
# replace the builtin for the rest of the notebook session.
valuesCountriesSecond = [
    df.loc[(df.Country == name) & is_all_groups, 'SecondDose'].sum()
    for name in countries
]
valuesCountriesSecond

In [None]:
# One row per country with its first- and second-dose totals.
country_dose_columns = {
    'Countries': countries,
    'FirstTotal': valuesCountriesFirst,
    'SecondTotal': valuesCountriesSecond,
}
df3 = pd.DataFrame(country_dose_columns)
df3

We do the same thing as for the vaccines, but this time for the total number of vaccinated people in each country. We only sum the rows where the target group is 'ALL' so as not to count some values twice.

In [None]:
# Total doses received per country, in the same order as `countries`.
# Countries are matched by exact equality rather than substring/regex search,
# so one country's value can never match another's.
# NOTE(review): this sums every target-group row for a country — confirm that
# DosesReceived is not repeated across target groups, or the totals are inflated.
# The result is accumulated without binding the name `sum`, which would otherwise
# replace the builtin for the rest of the notebook session.
valuesDosesReceived = [
    df.loc[df.Country == name, 'DosesReceived'].sum()
    for name in countries
]
valuesDosesReceived

In [None]:
# Total doses exported per country, in the same order as `countries`.
# Countries are matched by exact equality rather than substring/regex search,
# so one country's value can never match another's.
# NOTE(review): this sums every target-group row for a country — confirm that
# DosesExported is not repeated across target groups, or the totals are inflated.
# The result is accumulated without binding the name `sum`, which would otherwise
# replace the builtin for the rest of the notebook session.
valuesDosesExported = [
    df.loc[df.Country == name, 'DosesExported'].sum()
    for name in countries
]
valuesDosesExported

In [None]:
# One row per country with its received and exported dose totals.
country_supply_columns = {
    'Countries': countries,
    'ReceivedTotal': valuesDosesReceived,
    'ExportedTotal': valuesDosesExported,
}
df4 = pd.DataFrame(country_supply_columns)
df4

Here we do the same thing as for the vaccine dataframe, but for the countries and the doses they received and exported.

In [None]:
# Renumber the rows from 0 after the earlier drops; the old index labels are discarded.
df = df.reset_index(drop=True)
df

In [None]:
# Each dataframe is written to its own CSV file with to_csv's default settings,
# so the row index is written as the first column.
exports = [
    (df, 'datasets/vacDataClean.csv'),
    (df2, 'datasets/vacTotalData.csv'),
    (df3, 'datasets/totalCountry.csv'),
    (df4, 'datasets/totalReceivedExported.csv'),
]
for frame, path in exports:
    frame.to_csv(path)

Now that all the data cleaning is done, we reset the index of the main dataframe to avoid index-related issues, and then we export all the dataframes to CSV files so they can be opened in the website.py file.