<a href="https://colab.research.google.com/github/Akashkunwar/dscodes/blob/master/cases_vaccination.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
###COVID DATA
## Retreving Data
# Assigning Github URLs
cases_url="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
deaths_url="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"
recovered_url="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"

# Reading URLs through Pandas liberary
total_cases = pd.read_csv(cases_url)
total_deaths = pd.read_csv(deaths_url)
total_recovered = pd.read_csv(recovered_url)

## Cleaning Data
# Extreacting and storing column dates in dates
dates = total_cases.columns[4:]

# Reshaping total cases column from wide shape to df shape
total_cases_df = total_cases.melt(
    id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'], 
    value_vars=dates, 
    var_name='Date', 
    value_name='Confirmed'
    )

# Reshaping total deaths column from wide shape to df shape
total_deaths_df = total_deaths.melt(
    id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'], 
    value_vars=dates, 
    var_name='Date', 
    value_name='Deaths'
    )

# Reshaping total recovered column from wide shape to df shape
total_recovered_df = total_recovered.melt(
    id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'], 
    value_vars=dates, 
    var_name='Date', 
    value_name='Recovered'
)

# Merging total_cases_df and total_deaths_df
covid_df = total_cases_df.merge(
  right=total_deaths_df, 
  how='left',
  on=['Province/State', 'Country/Region', 'Date', 'Lat', 'Long']
)

# Merging covid_df and total_recovered_df
covid_df = covid_df.merge(
  right=total_recovered_df, 
  how='left',
  on=['Province/State', 'Country/Region', 'Date', 'Lat', 'Long']
)

# Converting Date to proper datetime format
covid_df["Date"] = pd.to_datetime(covid_df["Date"])

# Inserting converted date to full table
covid_df = covid_df.sort_values(['Country/Region', 'Date'])
#covid_df = covid_df.sort_values(['Country/Region', 'Date'], ascending=[True, True])
# Reseting Index
# covid_df = covid_df.reset_index()

# Copying covid_df to covid_df
#covid_df = covid_df.copy()

# Renaming column names
covid_df = covid_df.rename(columns={'Country/Region': 'Country',
                                    'Province/State':'Province',
                                    'Confirmed': 'Total cases',
                                    'Deaths': 'Total deaths',
                                    'Recovered':'Total recovered'})

# Filling NaN value of provinve with country name and storing it in coiuntry column
covid_df["Province"] = covid_df["Province"].fillna(covid_df["Country"])
covid_df["Country"] = covid_df["Province"]

# Adding New cases, New deaths, New recovered to covid_df
covid_df["New cases"] = covid_df.groupby('Country')['Total cases'].diff().fillna(0)
covid_df["New deaths"] = covid_df.groupby('Country')['Total deaths'].diff().fillna(0)
covid_df["New recovered"] = covid_df.groupby('Country')['Total recovered'].diff().fillna(0)

# Converting float value to int in covid_df columns
covid_df["Total recovered"] = covid_df["Total recovered"].fillna(0)
covid_df["Total recovered"] = covid_df["Total recovered"].astype(int).fillna(0)
covid_df["New cases"] = covid_df["New cases"].astype(int)
covid_df["New deaths"] = covid_df["New deaths"].astype(int)
covid_df["New recovered"] = covid_df["New recovered"].astype(int)

# Arranging Columns
covid_df = covid_df[['Date', 'Province', 'Country','Lat','Long',
 'Total cases', 'Total deaths','Total recovered', 'New cases','New deaths','New recovered']]

# Removind unnecessary columns
covid_df = covid_df.drop(['Province', 'Lat', 'Long', 'Total recovered', 'New recovered'],axis=1)

# Reseting jumbled index in order
covid_df = covid_df.reset_index(drop=True)

In [3]:
###VACCINE DATA
## Retreving Data
# Assigning Github URLs
vaccine_data = pd.read_csv('https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/vaccinations/vaccinations.csv')
vaccine_data['date']=pd.to_datetime(vaccine_data.date)

#Date format change
vaccine_data['date'] = pd.to_datetime(vaccine_data['date'],format='%y-%m-%d').dt.date

#Dropping uncecessary data
vaccine_data.drop(['total_vaccinations_per_hundred',
                   'iso_code',
                   'people_vaccinated_per_hundred',
                   'people_fully_vaccinated_per_hundred',
                   'daily_vaccinations_per_million',
                   'daily_vaccinations_raw',
                   'total_vaccinations',
                   'people_vaccinated',
                   'daily_people_vaccinated_per_hundred',
                   'total_boosters_per_hundred'], inplace=True, axis=1)

#Rename vaccines to vaccine_name
# vaccine_data.rename(columns = {'vaccines':'vaccine_name'}, inplace = True) 

#Adding total vaccination column
Total_vaccinations = vaccine_data.groupby('location')['daily_vaccinations'].cumsum()
vaccine_data.insert(5,'Total_vaccinations',Total_vaccinations)

#Renaming columns
vaccine_df = vaccine_data.rename(columns={'location': 'Country',
                                    'date':'Date',
                                    'people_fully_vaccinated': 'Fully vaccinated',
                                    'total_boosters': 'Total boosters',
                                    'Total_vaccinations':'Total vaccinations',
                                    'daily_vaccinations': 'Daily vaccinations',
                                    'daily_people_vaccinated': 'Daily people vaccinated'})
#Arranging columns
vaccine_df = vaccine_df[['Date',
                         'Country',
                         "Fully vaccinated",
                         'Total vaccinations',
                         'Total boosters',
                         'Daily vaccinations',
                         'Daily people vaccinated']]

# Converting Date to proper datetime format
vaccine_df["Date"] = pd.to_datetime(vaccine_df["Date"])
vaccine_df = vaccine_df.fillna(0)
vaccine_df = vaccine_df.astype({"Fully vaccinated":'int',
                                'Total vaccinations':'int',
                                'Total boosters':'int',
                                'Daily vaccinations':'int',
                                'Daily people vaccinated':'int'})

#Removing unnecessary rows
# vaccine_df = vaccine_df.set_index("Country").drop("World")
# vaccine_df.reset_index(inplace=True)

In [34]:
a = pd.merge(covid_df, vaccine_df, how="outer", on=["Date", "Country"])

In [36]:
a[a.Country=='France']

Unnamed: 0,Date,Country,Total cases,Total deaths,New cases,New deaths,Fully vaccinated,Total vaccinations,Total boosters,Daily vaccinations,Daily people vaccinated
107291,2020-01-22,France,0.0,0.0,0.0,0.0,,,,,
107303,2020-01-23,France,0.0,0.0,0.0,0.0,,,,,
107315,2020-01-24,France,2.0,0.0,2.0,0.0,,,,,
107327,2020-01-25,France,3.0,0.0,1.0,0.0,,,,,
107339,2020-01-26,France,3.0,0.0,0.0,0.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
117959,2022-06-29,France,30083301.0,146171.0,124724.0,48.0,52906795.0,145680030.0,39600767.0,62868.0,1101.0
117971,2022-06-30,France,30216647.0,146213.0,133346.0,42.0,52909167.0,145748785.0,39705832.0,68755.0,1131.0
117983,2022-07-01,France,30341632.0,146265.0,124985.0,52.0,,,,,
117995,2022-07-02,France,30341632.0,146265.0,0.0,0.0,,,,,


In [37]:
vaccine_df[vaccine_df.Country=='France']

Unnamed: 0,Date,Country,Fully vaccinated,Total vaccinations,Total boosters,Daily vaccinations,Daily people vaccinated
35127,2020-12-27,France,0,0,0,0,0
35128,2020-12-28,France,1,673,0,673,672
35129,2020-12-29,France,1,1197,0,524,523
35130,2020-12-30,France,1,1693,0,496,496
35131,2020-12-31,France,2,2175,0,482,481
...,...,...,...,...,...,...,...
35673,2022-06-26,France,52899646,145508018,39358567,47859,1028
35674,2022-06-27,France,52901504,145559372,39415314,51354,1042
35675,2022-06-28,France,52904126,145617162,39519009,57790,1059
35676,2022-06-29,France,52906795,145680030,39600767,62868,1101


In [7]:
# merged = covid_df.combine_first(vaccine_df)
# merged = merged.fillna(0)
# merged = merged.astype({"Fully vaccinated":'int',
#                                 'Total vaccinations':'int',
#                                 'Total boosters':'int',
#                                 'Daily vaccinations':'int',
#                                 'Daily people vaccinated':'int'})
# merged = merged[['Date',
#                  'Country',
#                  'New cases',
#                  'New deaths',
#                  'New recovered',
#                  'Total boosters',
#                  'Total cases',
#                  'Total deaths',
#                  'Total recovered',
#                  'Daily people vaccinated',
#                  'Daily vaccinations',
#                  'Fully vaccinated',
#                  'Total vaccinations']]
# merged
covid_df.combine_first(vaccine_df)

Unnamed: 0,Country,Daily people vaccinated,Daily vaccinations,Date,Fully vaccinated,New cases,New deaths,Total boosters,Total cases,Total deaths,Total vaccinations
0,Afghanistan,0.0,0.0,2020-01-22,0.0,0,0,0.0,0,0,0.0
1,Afghanistan,1367.0,1367.0,2020-01-23,0.0,0,0,0.0,0,0,1367.0
2,Afghanistan,1367.0,1367.0,2020-01-24,0.0,0,0,0.0,0,0,2734.0
3,Afghanistan,1367.0,1367.0,2020-01-25,0.0,0,0,0.0,0,0,4101.0
4,Afghanistan,1367.0,1367.0,2020-01-26,0.0,0,0,0.0,0,0,5468.0
...,...,...,...,...,...,...,...,...,...,...,...
254785,Zimbabwe,,,2022-06-29,,137,4,,255520,5553,
254786,Zimbabwe,,,2022-06-30,,66,2,,255586,5555,
254787,Zimbabwe,,,2022-07-01,,0,0,,255586,5555,
254788,Zimbabwe,,,2022-07-02,,0,0,,255586,5555,


In [19]:
pd.merge(covid_df, vaccine_df, how="outer", on=["Date", "Country"])

Unnamed: 0,Date,Country,Total cases,Total deaths,New cases,New deaths,Fully vaccinated,Total vaccinations,Total boosters,Daily vaccinations,Daily people vaccinated
0,2020-01-22,Afghanistan,0.0,0.0,0.0,0.0,,,,,
1,2020-01-23,Afghanistan,0.0,0.0,0.0,0.0,,,,,
2,2020-01-24,Afghanistan,0.0,0.0,0.0,0.0,,,,,
3,2020-01-25,Afghanistan,0.0,0.0,0.0,0.0,,,,,
4,2020-01-26,Afghanistan,0.0,0.0,0.0,0.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
272925,2022-06-29,World,,,,,4.799123e+09,0.0,2.123141e+09,0.0,0.0
272926,2022-06-30,World,,,,,4.800716e+09,0.0,2.125459e+09,0.0,0.0
272927,2022-07-01,World,,,,,4.801855e+09,0.0,2.127183e+09,0.0,0.0
272928,2022-07-02,World,,,,,4.802835e+09,0.0,2.129358e+09,0.0,0.0


In [30]:
a = pd.merge(covid_df, vaccine_df, how="outer", on=["Date", "Country"])

Unnamed: 0,Date,Country,Total cases,Total deaths,New cases,New deaths,Fully vaccinated,Total vaccinations,Total boosters,Daily vaccinations,Daily people vaccinated
0,2020-01-22,Afghanistan,0.0,0.0,0.0,0.0,,,,,
1,2020-01-23,Afghanistan,0.0,0.0,0.0,0.0,,,,,
2,2020-01-24,Afghanistan,0.0,0.0,0.0,0.0,,,,,
3,2020-01-25,Afghanistan,0.0,0.0,0.0,0.0,,,,,
4,2020-01-26,Afghanistan,0.0,0.0,0.0,0.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
272925,2022-06-29,World,,,,,4.799123e+09,0.0,2.123141e+09,0.0,0.0
272926,2022-06-30,World,,,,,4.800716e+09,0.0,2.125459e+09,0.0,0.0
272927,2022-07-01,World,,,,,4.801855e+09,0.0,2.127183e+09,0.0,0.0
272928,2022-07-02,World,,,,,4.802835e+09,0.0,2.129358e+09,0.0,0.0


In [31]:
a[a.Country=='India']

Unnamed: 0,Date,Country,Total cases,Total deaths,New cases,New deaths,Fully vaccinated,Total vaccinations,Total boosters,Daily vaccinations,Daily people vaccinated
132312,2020-01-22,India,0.0,0.0,0.0,0.0,,,,,
132313,2020-01-23,India,0.0,0.0,0.0,0.0,,,,,
132314,2020-01-24,India,0.0,0.0,0.0,0.0,,,,,
132315,2020-01-25,India,0.0,0.0,0.0,0.0,,,,,
132316,2020-01-26,India,0.0,0.0,0.0,0.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
133201,2022-06-29,India,43452164.0,525116.0,18819.0,39.0,913642889.0,1.970128e+09,43868135.0,1432271.0,177132.0
133202,2022-06-30,India,43469234.0,525139.0,17070.0,23.0,914479703.0,1.971543e+09,44348776.0,1414474.0,178313.0
133203,2022-07-01,India,43486326.0,525168.0,17092.0,29.0,915190727.0,1.972913e+09,44697360.0,1369941.0,175455.0
133204,2022-07-02,India,43502429.0,525199.0,16103.0,31.0,915792386.0,1.974191e+09,45002940.0,1278094.0,165575.0


In [32]:
vaccine_df[vaccine_df.Country=='India']

Unnamed: 0,Date,Country,Fully vaccinated,Total vaccinations,Total boosters,Daily vaccinations,Daily people vaccinated
45795,2021-01-15,India,0,0,0,0,0
45796,2021-01-16,India,0,191181,0,191181,191181
45797,2021-01-17,India,0,303331,0,112150,112150
45798,2021-01-18,India,0,454681,0,151350,151350
45799,2021-01-19,India,0,623390,0,168709,168709
...,...,...,...,...,...,...,...
46325,2022-06-29,India,913642889,1970128269,43868135,1432271,177132
46326,2022-06-30,India,914479703,1971542743,44348776,1414474,178313
46327,2022-07-01,India,915190727,1972912684,44697360,1369941,175455
46328,2022-07-02,India,915792386,1974190778,45002940,1278094,165575


In [33]:
covid_df[covid_df.Country=='India']

Unnamed: 0,Date,Country,Total cases,Total deaths,New cases,New deaths
132312,2020-01-22,India,0,0,0,0
132313,2020-01-23,India,0,0,0,0
132314,2020-01-24,India,0,0,0,0
132315,2020-01-25,India,0,0,0,0
132316,2020-01-26,India,0,0,0,0
...,...,...,...,...,...,...
133201,2022-06-29,India,43452164,525116,18819,39
133202,2022-06-30,India,43469234,525139,17070,23
133203,2022-07-01,India,43486326,525168,17092,29
133204,2022-07-02,India,43502429,525199,16103,31


In [20]:
pd.concat([covid_df, vaccine_df], axis=0, ignore_index=True)

Unnamed: 0,Date,Country,Total cases,Total deaths,New cases,New deaths,Fully vaccinated,Total vaccinations,Total boosters,Daily vaccinations,Daily people vaccinated
0,2020-01-22,Afghanistan,0.0,0.0,0.0,0.0,,,,,
1,2020-01-23,Afghanistan,0.0,0.0,0.0,0.0,,,,,
2,2020-01-24,Afghanistan,0.0,0.0,0.0,0.0,,,,,
3,2020-01-25,Afghanistan,0.0,0.0,0.0,0.0,,,,,
4,2020-01-26,Afghanistan,0.0,0.0,0.0,0.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
366697,2022-06-22,Zimbabwe,,,,,4588340.0,11898709.0,1043552.0,7816.0,2587.0
366698,2022-06-23,Zimbabwe,,,,,4616555.0,11915330.0,1064036.0,16621.0,5081.0
366699,2022-06-24,Zimbabwe,,,,,4620396.0,11931309.0,1065656.0,15979.0,4958.0
366700,2022-06-25,Zimbabwe,,,,,4622428.0,11947500.0,1067073.0,16191.0,5001.0


In [21]:
pd.merge(covid_df, vaccine_df, how="outer", on=['Date','Country'])

Unnamed: 0,Date,Country,Total cases,Total deaths,New cases,New deaths,Fully vaccinated,Total vaccinations,Total boosters,Daily vaccinations,Daily people vaccinated
0,2020-01-22,Afghanistan,0.0,0.0,0.0,0.0,,,,,
1,2020-01-23,Afghanistan,0.0,0.0,0.0,0.0,,,,,
2,2020-01-24,Afghanistan,0.0,0.0,0.0,0.0,,,,,
3,2020-01-25,Afghanistan,0.0,0.0,0.0,0.0,,,,,
4,2020-01-26,Afghanistan,0.0,0.0,0.0,0.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
272925,2022-06-29,World,,,,,4.799123e+09,0.0,2.123141e+09,0.0,0.0
272926,2022-06-30,World,,,,,4.800716e+09,0.0,2.125459e+09,0.0,0.0
272927,2022-07-01,World,,,,,4.801855e+09,0.0,2.127183e+09,0.0,0.0
272928,2022-07-02,World,,,,,4.802835e+09,0.0,2.129358e+09,0.0,0.0


In [22]:
vaccine_df

Unnamed: 0,Date,Country,Fully vaccinated,Total vaccinations,Total boosters,Daily vaccinations,Daily people vaccinated
0,2021-02-22,Afghanistan,0,0,0,0,0
1,2021-02-23,Afghanistan,0,1367,0,1367,1367
2,2021-02-24,Afghanistan,0,2734,0,1367,1367
3,2021-02-25,Afghanistan,0,4101,0,1367,1367
4,2021-02-26,Afghanistan,0,5468,0,1367,1367
...,...,...,...,...,...,...,...
111907,2022-06-22,Zimbabwe,4588340,11898709,1043552,7816,2587
111908,2022-06-23,Zimbabwe,4616555,11915330,1064036,16621,5081
111909,2022-06-24,Zimbabwe,4620396,11931309,1065656,15979,4958
111910,2022-06-25,Zimbabwe,4622428,11947500,1067073,16191,5001


In [23]:
len(set(list(covid_df.Country.unique())) ^ set(list(vaccine_df.Country.unique())))

131

In [24]:
len(set(list(covid_df.Country.unique()))) - len(set(list(vaccine_df.Country.unique())))

49

In [25]:
set(list(vaccine_df.Country.unique())) - set(list(covid_df.Country.unique()))
# 'United States'
# 'South Korea'
# 'Australia'
# 'China'
# 'Myanmar'
# 'Canada'
# 'Democratic Republic of Congo','Congo'
# 'Falkland Islands'
# 'Taiwan'
# 'Saint Helena'

{'Africa',
 'Asia',
 'Australia',
 'Bonaire Sint Eustatius and Saba',
 'Canada',
 'Cape Verde',
 'China',
 'Congo',
 'Democratic Republic of Congo',
 'England',
 'Europe',
 'European Union',
 'Faeroe Islands',
 'Falkland Islands',
 'High income',
 'Low income',
 'Lower middle income',
 'Macao',
 'Myanmar',
 'Nauru',
 'Niue',
 'North America',
 'Northern Cyprus',
 'Northern Ireland',
 'Oceania',
 'Palestine',
 'Pitcairn',
 'Saint Helena',
 'Scotland',
 'Sint Maarten (Dutch part)',
 'South America',
 'South Korea',
 'Taiwan',
 'Timor',
 'Tokelau',
 'Turkmenistan',
 'Tuvalu',
 'United States',
 'Upper middle income',
 'Wales',
 'World'}

In [26]:
set(list(covid_df.Country.unique())) - set(list(vaccine_df.Country.unique()))
# 'Korea, South'
# 'US'
# 'Australian Capital Territory','Western Australia',
# 'Congo (Brazzaville)','Congo (Kinshasa)
# 'Falkland Islands (Malvinas)'
# 'Taiwan*'
# 'Saint Helena, Ascension and Tristan da Cunha',

{'Alberta',
 'Anhui',
 'Antarctica',
 'Australian Capital Territory',
 'Beijing',
 'Bonaire, Sint Eustatius and Saba',
 'British Columbia',
 'Burma',
 'Cabo Verde',
 'Channel Islands',
 'Chongqing',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Diamond Princess',
 'Eritrea',
 'Falkland Islands (Malvinas)',
 'Faroe Islands',
 'French Guiana',
 'Fujian',
 'Gansu',
 'Grand Princess',
 'Guadeloupe',
 'Guangdong',
 'Guangxi',
 'Guizhou',
 'Hainan',
 'Hebei',
 'Heilongjiang',
 'Henan',
 'Holy See',
 'Hubei',
 'Hunan',
 'Inner Mongolia',
 'Jiangsu',
 'Jiangxi',
 'Jilin',
 'Korea, North',
 'Korea, South',
 'Liaoning',
 'MS Zaandam',
 'Macau',
 'Manitoba',
 'Marshall Islands',
 'Martinique',
 'Mayotte',
 'Micronesia',
 'New Brunswick',
 'New South Wales',
 'Newfoundland and Labrador',
 'Ningxia',
 'Northern Territory',
 'Northwest Territories',
 'Nova Scotia',
 'Nunavut',
 'Ontario',
 'Palau',
 'Prince Edward Island',
 'Qinghai',
 'Quebec',
 'Queensland',
 'Repatriated Travellers',
 'Reunion',

In [27]:
list(covid_df.Country.unique())
list(vaccine_df.Country.unique())
len(list(set(list(covid_df.Country.unique())).intersection(list(vaccine_df.Country.unique()))))

194

In [28]:
set(list(covid_df.Country.unique())) ^ set(list(vaccine_df.Country.unique()))

{'Africa',
 'Alberta',
 'Anhui',
 'Antarctica',
 'Asia',
 'Australia',
 'Australian Capital Territory',
 'Beijing',
 'Bonaire Sint Eustatius and Saba',
 'Bonaire, Sint Eustatius and Saba',
 'British Columbia',
 'Burma',
 'Cabo Verde',
 'Canada',
 'Cape Verde',
 'Channel Islands',
 'China',
 'Chongqing',
 'Congo',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Democratic Republic of Congo',
 'Diamond Princess',
 'England',
 'Eritrea',
 'Europe',
 'European Union',
 'Faeroe Islands',
 'Falkland Islands',
 'Falkland Islands (Malvinas)',
 'Faroe Islands',
 'French Guiana',
 'Fujian',
 'Gansu',
 'Grand Princess',
 'Guadeloupe',
 'Guangdong',
 'Guangxi',
 'Guizhou',
 'Hainan',
 'Hebei',
 'Heilongjiang',
 'Henan',
 'High income',
 'Holy See',
 'Hubei',
 'Hunan',
 'Inner Mongolia',
 'Jiangsu',
 'Jiangxi',
 'Jilin',
 'Korea, North',
 'Korea, South',
 'Liaoning',
 'Low income',
 'Lower middle income',
 'MS Zaandam',
 'Macao',
 'Macau',
 'Manitoba',
 'Marshall Islands',
 'Martinique',
 'Mayotte',