### Data Collection and Preprocessing

In [1]:
import pandas as pd
# Load the main dataset. The first CSV column (the serial number) becomes the
# index and 'Last Update' is parsed into datetimes; two columns are then given
# shorter names for the rest of the notebook.
df = pd.read_csv(
    'covid_19_data.csv',
    index_col=0,
    parse_dates=['Last Update'],
).rename(columns={'ObservationDate': 'Date', 'Country/Region': 'Country'})

In [2]:
# Latitude/longitude lookup, used for the map later.
df_confirmed = pd.read_csv("time_series_covid_19_confirmed.csv")
df_confirmed = df_confirmed.rename(columns={'Country/Region': 'Country'})
# Keep only the join keys and the coordinates.
df_confirmed = df_confirmed[["Province/State", "Lat", "Long", "Country"]]

# Work on a copy so `df` itself keeps its original country labels.
df_temp = df.copy()
# Assign the result back rather than calling replace(inplace=True) on the
# column selection: with pandas Copy-on-Write the chained in-place call acts
# on a temporary and leaves df_temp unchanged.
# Presumably 'China' is the spelling used by the time-series file -- verify.
df_temp['Country'] = df_temp['Country'].replace({'Mainland China': 'China'})

# pd.merge defaults to an inner join, so only rows whose
# (Country, Province/State) pair exists in both frames are kept.
df_latlong = pd.merge(df_temp, df_confirmed, on=["Country", "Province/State"])

### Exploratory Data Analysis (EDA)

In [3]:
# Quick structural overview of the dataset: size, column names, total number
# of missing cells and the number of distinct values per column.
n_rows, n_cols = df.shape
total_missing = df.isnull().sum().sum()

print("Rows : ", n_rows)
print("\nColumns : ", n_cols)
print("\nFeatures : ", df.columns.tolist())
print("\nMissing Values : ", total_missing)
print("\nUnique values : \n", df.nunique())

Rows :  10984

Columns :  7

Features :  ['Date', 'Province/State', 'Country', 'Last Update', 'Confirmed', 'Deaths', 'Recovered']

Missing Values :  5132

Unique values : 
 Date                71
Province/State     293
Country            215
Last Update       1814
Confirmed         1613
Deaths             352
Recovered          768
dtype: int64


In [4]:
# Column dtypes, non-null counts and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10984 entries, 1 to 10984
Data columns (total 7 columns):
Date              10984 non-null object
Province/State    5852 non-null object
Country           10984 non-null object
Last Update       10984 non-null datetime64[ns]
Confirmed         10984 non-null float64
Deaths            10984 non-null float64
Recovered         10984 non-null float64
dtypes: datetime64[ns](1), float64(3), object(3)
memory usage: 686.5+ KB


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) as plain text.
print(
    "Basic Statistics : \n",
    df.describe(),
)

Basic Statistics : 
            Confirmed        Deaths     Recovered
count   10984.000000  10984.000000  10984.000000
mean     1050.407502     43.769938    298.845776
std      6524.120490    426.967500   3051.738866
min         0.000000      0.000000      0.000000
25%         3.000000      0.000000      0.000000
50%        33.000000      0.000000      0.000000
75%       245.000000      2.000000     15.000000
max    110574.000000  13155.000000  63326.000000


In [6]:
# First five rows of the dataset.
print(
    "Earliest Cases : \n",
    df.head(),
)

Earliest Cases : 
            Date Province/State         Country         Last Update  Confirmed  \
SNo                                                                             
1    01/22/2020          Anhui  Mainland China 2020-01-22 17:00:00        1.0   
2    01/22/2020        Beijing  Mainland China 2020-01-22 17:00:00       14.0   
3    01/22/2020      Chongqing  Mainland China 2020-01-22 17:00:00        6.0   
4    01/22/2020         Fujian  Mainland China 2020-01-22 17:00:00        1.0   
5    01/22/2020          Gansu  Mainland China 2020-01-22 17:00:00        0.0   

     Deaths  Recovered  
SNo                     
1       0.0        0.0  
2       0.0        0.0  
3       0.0        0.0  
4       0.0        0.0  
5       0.0        0.0  


In [7]:
# Last five rows of the dataset.
print(
    "Latest Cases : \n",
    df.tail(),
)

Latest Cases : 
              Date Province/State         Country         Last Update  \
SNo                                                                    
10980  04/01/2020        Wyoming              US 2020-04-01 22:04:58   
10981  04/01/2020       Xinjiang  Mainland China 2020-04-01 22:04:58   
10982  04/01/2020          Yukon          Canada 2020-04-01 22:04:58   
10983  04/01/2020         Yunnan  Mainland China 2020-04-01 22:04:58   
10984  04/01/2020       Zhejiang  Mainland China 2020-04-01 22:04:58   

       Confirmed  Deaths  Recovered  
SNo                                  
10980      130.0     0.0        0.0  
10981       76.0     3.0       73.0  
10982        5.0     0.0        0.0  
10983      182.0     2.0      172.0  
10984     1257.0     1.0     1226.0  


In [8]:
print("\n\t\t Date-wise number of cases in each category\n")
# Total number of Confirmed cases, Deaths and Recoveries per day.
# The three count columns are selected explicitly: a bare groupby().sum()
# also tries to aggregate the text and datetime columns, which recent pandas
# versions no longer drop silently (summing 'Last Update' raises).
df.groupby('Date')[['Confirmed', 'Deaths', 'Recovered']].sum()


		 Date-wise number of cases in each category



Unnamed: 0_level_0,Confirmed,Deaths,Recovered
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
01/22/2020,555.0,17.0,28.0
01/23/2020,653.0,18.0,30.0
01/24/2020,941.0,26.0,36.0
01/25/2020,1438.0,42.0,39.0
01/26/2020,2118.0,56.0,52.0
01/27/2020,2927.0,82.0,61.0
01/28/2020,5578.0,131.0,107.0
01/29/2020,6165.0,133.0,126.0
01/30/2020,8235.0,171.0,143.0
01/31/2020,9925.0,213.0,222.0


In [9]:
print("\n\t\tMaximum number of Confirmed,Deaths and Recovered Cases\n")
# Sum the three counts over every row of each date. A list selector is used
# because tuple-style selection on a groupby is rejected by pandas >= 2.0.
df1 = df.groupby('Date')[['Confirmed', 'Deaths', 'Recovered']].sum().reset_index()

# 'Date' holds MM/DD/YYYY strings, so a plain max() compares them
# lexicographically and would pick the wrong day once the data spans more
# than one calendar year. Compare parsed dates instead; the displayed column
# itself stays a string.
parsed_dates = pd.to_datetime(df1['Date'], format='%m/%d/%Y')
df1 = df1[parsed_dates == parsed_dates.max()].reset_index(drop=True)

# Single-row frame for the most recent date, rendered with a colour gradient.
df1.style.background_gradient(cmap='Pastel1')


		Maximum number of Confirmed,Deaths and Recovered Cases



Unnamed: 0,Date,Confirmed,Deaths,Recovered
0,04/01/2020,932605,46809,193177


In [10]:
print("\n\t\tWorld View - Country wise\n")
# Per-country sums of the three count columns. A list selector is used
# because tuple-style selection on a groupby is rejected by pandas >= 2.0.
df_grouped = df.groupby('Country')[['Confirmed', 'Deaths', 'Recovered']].sum().reset_index()

# Rank countries by their summed Confirmed value, largest first, and renumber
# the rows from 0.
df2 = df_grouped.sort_values(by='Confirmed', ascending=False)
df2 = df2.reset_index(drop=True)

# NOTE(review): each row appears to hold a running total for its date, so
# summing over all dates overstates the real counts -- consider aggregating
# only the latest date per country. TODO confirm.
df2.style.background_gradient(cmap='summer')


		World View - Country wise



Unnamed: 0,Country,Confirmed,Deaths,Recovered
0,Mainland China,4257800.0,148955,2459140.0
1,Italy,1321660.0,129684,162971.0
2,US,1300820.0,22960,27926.0
3,Spain,846650.0,63950,125911.0
4,Germany,666629.0,4796,87128.0
5,Iran,575157.0,37407,188316.0
6,France,467914.0,25386,64099.0
7,South Korea,266905.0,2847,60094.0
8,UK,195591.0,11479,2320.0
9,Switzerland,172905.0,3054,12034.0


In [11]:
print("\n\t\tChina \n")
# Exploring the data for China: keep only the 'Mainland China' rows, then sum
# the three counts for every distinct 'Last Update' timestamp.
# NOTE(review): rows recorded on the same day can carry different
# 'Last Update' timestamps, which would split one day over several output
# rows -- grouping by 'Date' may be what was intended. TODO confirm.
china_rows = df[df['Country'] == "Mainland China"]
china_counts = china_rows.groupby("Last Update")[['Confirmed', 'Deaths', 'Recovered']]
China = china_counts.sum().reset_index()
China


		China 



Unnamed: 0,Last Update,Confirmed,Deaths,Recovered
0,2020-01-22 17:00:00,547.0,17.0,28.0
1,2020-01-23 17:00:00,639.0,18.0,30.0
2,2020-01-24 17:00:00,916.0,26.0,36.0
3,2020-01-25 17:00:00,1399.0,42.0,39.0
4,2020-01-26 16:00:00,2062.0,56.0,49.0
5,2020-01-27 23:59:00,2863.0,82.0,58.0
6,2020-01-28 23:00:00,5494.0,131.0,101.0
7,2020-01-29 19:30:00,6070.0,133.0,120.0
8,2020-01-30 16:00:00,8124.0,171.0,135.0
9,2020-01-31 15:20:00,29.0,0.0,2.0


In [12]:
print("\n\t\tCountry Wise - Sorted(Alphabetically) order\n")
# groupby sorts its keys, so the per-country sums come out ordered by the
# country label.
(
    df.groupby("Country")[['Confirmed', 'Deaths', 'Recovered']]
    .sum()
    .reset_index()
)


		Country Wise - Sorted(Alphabetically) order



Unnamed: 0,Country,Confirmed,Deaths,Recovered
0,Azerbaijan,1.0,0.0,0.0
1,"('St. Martin',)",2.0,0.0,0.0
2,Afghanistan,1462.0,33.0,31.0
3,Albania,2491.0,106.0,308.0
4,Algeria,5675.0,382.0,655.0
5,Andorra,3177.0,53.0,47.0
6,Angola,56.0,8.0,2.0
7,Antigua and Barbuda,68.0,0.0,0.0
8,Argentina,7548.0,186.0,1171.0
9,Armenia,4670.0,16.0,236.0
