# The following is a Data Science project that makes an analysis on Covid-19 data obtained from the John Hopkins dataset on Github. 

## In this notebook, I am going to obtain the data from Github and prepare the data for the Visualisations.

In [1]:
%matplotlib inline 
import pandas as pd
import matplotlib
import matplotlib.ticker as mticker
from matplotlib.ticker import ScalarFormatter, FormatStrFormatter
import matplotlib.pyplot as plt
import numpy as np
import datetime
from datetime import timedelta
import wget
import math
from numpy.random import rand
from matplotlib.pyplot import text

### Obtaining data sets through the command 'read_csv()'. I passed the url to the raw data of each data set.

In [2]:
cases_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
recovered_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"
deaths_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"

cases = pd.read_csv(cases_url)
recovered = pd.read_csv(recovered_url)
deaths = pd.read_csv(deaths_url)

### Checking each data set to see how it looks for each data set. This is mainly to check if the data sets differ in anyway.

In [3]:
cases.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,6/22/20,6/23/20,6/24/20,6/25/20,6/26/20,6/27/20,6/28/20,6/29/20,6/30/20,7/1/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,29157,29481,29640,30175,30451,30616,30967,31238,31517,31836
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,1995,2047,2114,2192,2269,2330,2402,2466,2535,2580
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,11920,12076,12248,12445,12685,12968,13273,13571,13907,14272
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,855,855,855,855,855,855,855,855,855,855
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,186,189,197,212,212,259,267,276,284,291


In [4]:
recovered.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,6/22/20,6/23/20,6/24/20,6/25/20,6/26/20,6/27/20,6/28/20,6/29/20,6/30/20,7/1/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,8841,9260,9869,10174,10306,10674,12604,13934,14131,15651
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,1159,1195,1217,1250,1298,1346,1384,1438,1459,1516
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,8559,8674,8792,8920,9066,9202,9371,9674,9897,10040
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,796,797,797,797,799,799,799,799,799,799
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,77,77,77,81,81,81,81,93,93,97


In [5]:
deaths.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,6/22/20,6/23/20,6/24/20,6/25/20,6/26/20,6/27/20,6/28/20,6/29/20,6/30/20,7/1/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,598,618,639,675,683,703,721,733,746,774
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,44,45,47,49,51,53,55,58,62,65
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,852,861,869,878,885,892,897,905,912,920
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,52,52,52,52,52,52,52,52,52,52
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,10,10,10,10,10,10,11,11,13,15


### Due to the similar form of each data set, I am proposing the removal of columns Lat and Long because at this moment, they are not needed for creating the graphs or representing the data in any meaningful way because they seem to be irrevelant to the task at hand. Afterwards I will remove the Province/State column in each data set and then group the provinces of the same country together under the name of that country. For example the different areas of Australia are put in separately so I will just group these together under the name Australia with the correct summation of data from each of its provinces.

In [6]:
cases.isna() # Shows that Province/State can be removed because it contains null values

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,6/22/20,6/23/20,6/24/20,6/25/20,6/26/20,6/27/20,6/28/20,6/29/20,6/30/20,7/1/20
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
261,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
262,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
263,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
264,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
def remove_Province_Lat_Long(dataset):
    dataset.drop('Province/State', axis = 1, inplace = True)
    dataset.drop('Lat', axis = 1, inplace = True)
    dataset.drop('Long', axis = 1, inplace = True)

In [8]:
remove_Province_Lat_Long(cases)
remove_Province_Lat_Long(recovered)
remove_Province_Lat_Long(deaths)

In [9]:
columns = cases.columns.values.tolist()

## The next section prepares the code for the Visualisations. I change the format of dates from 'Day/Month/Year' to 'Month Day'. Afterwards I perform a groupby. This groupby takes all countries that have multiple provinces (China for example) and sums up the data on each date into one specific entry. 
## For example, if the Hunan province in China had 100 cases on the 22nd January and the Sichuan province has 200, then the groupby would group these pronvinces together under the name China to have 300 cases on the 22nd January.

In [10]:
for i in columns[1:]:
    cases = cases.rename(columns={i : pd.to_datetime(i).month_name()[0:3] + ' ' + str(pd.to_datetime(i).day)})
    recovered = recovered.rename(columns={i : pd.to_datetime(i).month_name()[0:3] + ' ' + str(pd.to_datetime(i).day) })
    deaths = deaths.rename(columns={i : pd.to_datetime(i).month_name()[0:3] + ' ' + str(pd.to_datetime(i).day) })   

In [11]:
def groupby_and_aggregate_regions(df):
    return df.groupby('Country/Region').agg(sum)


cases = groupby_and_aggregate_regions(cases)
recovered = groupby_and_aggregate_regions(recovered)
deaths = groupby_and_aggregate_regions(deaths)

In [12]:
# I felt that obtaining additional data like the following may be useful in the creation of other graphs in the future
# and to see if there's a trend among countries regarding mean values and maximum values.

def adding_mean_min_max_to_dfs(dataset):
    mean = []
    mmin = []
    mmax = []
    for i in range(len(dataset)):
        mean.append(sum(dataset.iloc[i].values[1:]) / len(dataset.iloc[0].values[1:]))
        a = (list(filter(lambda x : x > 0, cases.iloc[0].values[1:])))
        a.sort()
        mmin.append(a[0])
        mmax.append(max(dataset.iloc[i].values[1:]))
    dataset.insert(0, 'Mean', mean, True)
    dataset.insert(1, 'Min', mmin, True)
    dataset.insert(2, 'Max', mmax, True)
    

adding_mean_min_max_to_dfs(cases)
adding_mean_min_max_to_dfs(recovered)
adding_mean_min_max_to_dfs(deaths)

In [13]:
#Checking to see if it worked
cases.head()

Unnamed: 0_level_0,Mean,Min,Max,Jan 22,Jan 23,Jan 24,Jan 25,Jan 26,Jan 27,Jan 28,...,Jun 22,Jun 23,Jun 24,Jun 25,Jun 26,Jun 27,Jun 28,Jun 29,Jun 30,Jul 1
Country/Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,6439.981366,1,31836,0,0,0,0,0,0,0,...,29157,29481,29640,30175,30451,30616,30967,31238,31517,31836
Albania,622.341615,1,2580,0,0,0,0,0,0,0,...,1995,2047,2114,2192,2269,2330,2402,2466,2535,2580
Algeria,3961.552795,1,14272,0,0,0,0,0,0,0,...,11920,12076,12248,12445,12685,12968,13273,13571,13907,14272
Andorra,445.695652,1,855,0,0,0,0,0,0,0,...,855,855,855,855,855,855,855,855,855,855
Angola,44.962733,1,291,0,0,0,0,0,0,0,...,186,189,197,212,212,259,267,276,284,291


In [14]:
recovered.head()

Unnamed: 0_level_0,Mean,Min,Max,Jan 22,Jan 23,Jan 24,Jan 25,Jan 26,Jan 27,Jan 28,...,Jun 22,Jun 23,Jun 24,Jun 25,Jun 26,Jun 27,Jun 28,Jun 29,Jun 30,Jul 1
Country/Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,1413.049689,1.0,15651,0,0,0,0,0,0,0,...,8841,9260,9869,10174,10306,10674,12604,13934,14131,15651
Albania,398.515528,1.0,1516,0,0,0,0,0,0,0,...,1159,1195,1217,1250,1298,1346,1384,1438,1459,1516
Algeria,2347.26087,1.0,10040,0,0,0,0,0,0,0,...,8559,8674,8792,8920,9066,9202,9371,9674,9897,10040
Andorra,299.478261,1.0,799,0,0,0,0,0,0,0,...,796,797,797,797,799,799,799,799,799,799
Angola,14.664596,1.0,97,0,0,0,0,0,0,0,...,77,77,77,81,81,81,81,93,93,97


In [15]:
deaths.head()

Unnamed: 0_level_0,Mean,Min,Max,Jan 22,Jan 23,Jan 24,Jan 25,Jan 26,Jan 27,Jan 28,...,Jun 22,Jun 23,Jun 24,Jun 25,Jun 26,Jun 27,Jun 28,Jun 29,Jun 30,Jul 1
Country/Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,133.937888,1.0,774,0,0,0,0,0,0,0,...,598,618,639,675,683,703,721,733,746,774
Albania,19.130435,1.0,65,0,0,0,0,0,0,0,...,44,45,47,49,51,53,55,58,62,65
Algeria,316.409938,1.0,920,0,0,0,0,0,0,0,...,852,861,869,878,885,892,897,905,912,920
Andorra,25.285714,1.0,52,0,0,0,0,0,0,0,...,52,52,52,52,52,52,52,52,52,52
Angola,2.335404,1.0,15,0,0,0,0,0,0,0,...,10,10,10,10,10,10,11,11,13,15
