In [1]:
import pandas as pd
import numpy as np

In [5]:
# Load the COVID-19 observations CSV into a DataFrame and preview the first rows.
data = pd.read_csv(filepath_or_buffer="/content/covid_19_data.csv")
data.head()

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0


In [6]:
# Discard the serial-number and last-update columns; they are not used below.
data = data.drop(columns=['SNo', 'Last Update'])
data.head()

Unnamed: 0,ObservationDate,Province/State,Country/Region,Confirmed,Deaths,Recovered
0,01/22/2020,Anhui,Mainland China,1.0,0.0,0.0
1,01/22/2020,Beijing,Mainland China,14.0,0.0,0.0
2,01/22/2020,Chongqing,Mainland China,6.0,0.0,0.0
3,01/22/2020,Fujian,Mainland China,1.0,0.0,0.0
4,01/22/2020,Gansu,Mainland China,0.0,0.0,0.0


In [7]:
# Report each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306429 entries, 0 to 306428
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   ObservationDate  306429 non-null  object 
 1   Province/State   228329 non-null  object 
 2   Country/Region   306429 non-null  object 
 3   Confirmed        306429 non-null  float64
 4   Deaths           306429 non-null  float64
 5   Recovered        306429 non-null  float64
dtypes: float64(3), object(3)
memory usage: 14.0+ MB


In [8]:
# Convert the date strings to timestamps and the float case counts to integers,
# using one column -> dtype mapping instead of four separate assignments.
data = data.astype({
    'ObservationDate': 'datetime64[ns]',
    'Confirmed': 'int64',
    'Deaths': 'int64',
    'Recovered': 'int64',
})

In [9]:
# Preview the first rows after the dtype conversion.
data.head()

Unnamed: 0,ObservationDate,Province/State,Country/Region,Confirmed,Deaths,Recovered
0,2020-01-22,Anhui,Mainland China,1,0,0
1,2020-01-22,Beijing,Mainland China,14,0,0
2,2020-01-22,Chongqing,Mainland China,6,0,0
3,2020-01-22,Fujian,Mainland China,1,0,0
4,2020-01-22,Gansu,Mainland China,0,0,0


In [None]:
# Re-check column dtypes and non-null counts now that the columns were converted.
data.info()

In [11]:
### Active cases = confirmed minus recovered minus deaths
data['Active'] = data['Confirmed'].sub(data['Recovered']).sub(data['Deaths'])
data.head()

Unnamed: 0,ObservationDate,Province/State,Country/Region,Confirmed,Deaths,Recovered,Active
0,2020-01-22,Anhui,Mainland China,1,0,0,1
1,2020-01-22,Beijing,Mainland China,14,0,0,14
2,2020-01-22,Chongqing,Mainland China,6,0,0,6
3,2020-01-22,Fujian,Mainland China,1,0,0,1
4,2020-01-22,Gansu,Mainland China,0,0,0,0


In [15]:
### Most recent observation date present in the dataset
data['ObservationDate'].max()

Timestamp('2021-05-29 00:00:00')

In [16]:
### Keep only the rows recorded on the most recent observation date
latest_data = data.loc[data['ObservationDate'].eq(data['ObservationDate'].max())]
latest_data.head()

Unnamed: 0,ObservationDate,Province/State,Country/Region,Confirmed,Deaths,Recovered,Active
305664,2021-05-29,,Afghanistan,70111,2899,57281,9931
305665,2021-05-29,,Albania,132297,2449,129215,633
305666,2021-05-29,,Algeria,128456,3460,89419,35577
305667,2021-05-29,,Andorra,13693,127,13416,150
305668,2021-05-29,,Angola,34180,757,27646,5777


In [17]:
# (rows, columns) of the latest-date subset.
latest_data.shape

(765, 7)

In [18]:
# Number of latest-date rows for each country/region, most frequent first.
latest_data['Country/Region'].value_counts()

Russia       83
US           58
Japan        49
India        37
Colombia     34
             ..
Iceland       1
Indonesia     1
Iran          1
Iraq          1
Macau         1
Name: Country/Region, Length: 195, dtype: int64

In [19]:
# Number of distinct countries/regions present on the latest date.
latest_data['Country/Region'].nunique()

195

In [None]:
# Distinct country/region names present on the latest date.
latest_data['Country/Region'].unique()

In [28]:
## Count of cases for each country
# Sum the latest-date rows per country so that countries reported by
# province/state collapse into a single row.
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# keys is deprecated and raises on newer pandas versions.
countries = (
    latest_data
    .groupby("Country/Region")[["Confirmed", "Deaths", "Recovered", "Active"]]
    .sum()
    .reset_index()
)

  countries = latest_data.groupby("Country/Region")["Confirmed","Deaths","Recovered","Active"].sum()


In [29]:
# Display the per-country totals.
countries

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active
0,Afghanistan,70111,2899,57281,9931
1,Albania,132297,2449,129215,633
2,Algeria,128456,3460,89419,35577
3,Andorra,13693,127,13416,150
4,Angola,34180,757,27646,5777
...,...,...,...,...,...
190,Vietnam,6908,47,2896,3965
191,West Bank and Gaza,307838,3492,300524,3822
192,Yemen,6731,1319,3399,2013
193,Zambia,94751,1276,91594,1881


In [25]:
# Per-country totals row for India.
countries.loc[countries['Country/Region'].eq("India")]

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active
78,India,27894800,325972,25454320,2114508


In [26]:
# Per-country totals row for Mainland China.
countries.loc[countries['Country/Region'].eq("Mainland China")]

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active
107,Mainland China,91072,4636,86117,319


In [27]:
# Boolean mask that is True only on the row whose country is Mainland China.
countries['Country/Region'].eq("Mainland China")

0      False
1      False
2      False
3      False
4      False
       ...  
190    False
191    False
192    False
193    False
194    False
Name: Country/Region, Length: 195, dtype: bool

In [30]:
### Plot on World Map

In [31]:
import plotly.express as px

In [40]:
# World choropleth of confirmed cases per country, matched by country name.
# The colour scale is capped at 1,000,000 via range_color.
world_map = px.choropleth(
    countries,
    locations="Country/Region",
    locationmode="country names",
    color="Confirmed",
    color_continuous_scale="reds",
    range_color=[0, 1000000],
)
world_map

In [42]:
# World choropleth of recovered cases per country, matched by country name.
# The colour scale is capped at 10,000,000 via range_color.
world_map = px.choropleth(
    countries,
    locations="Country/Region",
    locationmode="country names",
    color="Recovered",
    color_continuous_scale="greens",
    range_color=[0, 10000000],
)
world_map

In [None]:
### count of cases datewise
# Sum all rows per observation date.
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# keys is deprecated and raises on newer pandas versions.
total_cases = (
    data
    .groupby("ObservationDate")[["Confirmed", "Deaths", "Recovered", "Active"]]
    .sum()
    .reset_index()
)
total_cases

In [44]:
### Top 20 countries having maximum confirmed and recovered cases

In [46]:
# Per-country confirmed and recovered totals on the latest date, ordered by
# confirmed cases (largest first).
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# keys is deprecated and raises on newer pandas versions.
top = latest_data.groupby("Country/Region")[['Confirmed', 'Recovered']].sum().reset_index()
top = top.sort_values(['Confirmed'], ascending=False)
top.head()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,Country/Region,Confirmed,Recovered
182,US,33251939,0
78,India,27894800,25454320
23,Brazil,16471600,14496224
60,France,5719877,390878
180,Turkey,5235978,5094279


In [None]:
# First 20 rows of the table already sorted by confirmed cases.
top_20 = top.iloc[:20]
top_20

In [48]:
#### India

In [None]:
# All rows for India, across every observation date.
# .copy() makes this an independent frame, so assigning to its columns later
# neither triggers SettingWithCopyWarning nor depends on view/copy semantics
# of the slice taken from `data`.
india = data[data['Country/Region']=="India"].copy()
india

In [50]:
# Number of distinct province/state values for India (nunique skips NaN by default).
india['Province/State'].nunique()

38

In [53]:
# Distinct province/state values for India, including NaN if present.
india['Province/State'].unique()

array([nan, 'Andaman and Nicobar Islands', 'Andhra Pradesh',
       'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh',
       'Chhattisgarh', 'Dadar Nagar Haveli', 'Delhi', 'Goa', 'Gujarat',
       'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir', 'Jharkhand',
       'Karnataka', 'Kerala', 'Ladakh', 'Madhya Pradesh', 'Maharashtra',
       'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha',
       'Puducherry', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu',
       'Telangana', 'Tripura', 'Unknown', 'Uttar Pradesh', 'Uttarakhand',
       'West Bengal', 'Dadra and Nagar Haveli and Daman and Diu',
       'Lakshadweep'], dtype=object)

In [None]:
# Label rows without a province/state as "Unknown".
# DataFrame.fillna with a per-column mapping returns a new frame, so nothing is
# assigned into a slice of `data` (which would raise SettingWithCopyWarning).
india = india.fillna({'Province/State': "Unknown"})

In [58]:
# Distinct province/state values again, to confirm that NaN was replaced.
india['Province/State'].unique()

array(['Unknown', 'Andaman and Nicobar Islands', 'Andhra Pradesh',
       'Arunachal Pradesh', 'Assam', 'Bihar', 'Chandigarh',
       'Chhattisgarh', 'Dadar Nagar Haveli', 'Delhi', 'Goa', 'Gujarat',
       'Haryana', 'Himachal Pradesh', 'Jammu and Kashmir', 'Jharkhand',
       'Karnataka', 'Kerala', 'Ladakh', 'Madhya Pradesh', 'Maharashtra',
       'Manipur', 'Meghalaya', 'Mizoram', 'Nagaland', 'Odisha',
       'Puducherry', 'Punjab', 'Rajasthan', 'Sikkim', 'Tamil Nadu',
       'Telangana', 'Tripura', 'Uttar Pradesh', 'Uttarakhand',
       'West Bengal', 'Dadra and Nagar Haveli and Daman and Diu',
       'Lakshadweep'], dtype=object)

In [56]:
# Number of distinct province/state values for India.
india['Province/State'].nunique()

38

In [None]:
# Restrict the India rows to the most recent observation date.
india_latest_data = india.loc[india['ObservationDate'].eq(india['ObservationDate'].max())]
india_latest_data

In [None]:
# Per-state confirmed and recovered totals on the latest date, ordered by
# confirmed cases (largest first).
# The columns are selected with a list: indexing a GroupBy with a bare tuple of
# keys is deprecated and raises on newer pandas versions.
top_state = india_latest_data.groupby("Province/State")[['Confirmed', 'Recovered']].sum().reset_index()
top_state = top_state.sort_values(['Confirmed'], ascending=False)
top_state.head(20)

In [65]:
# Largest confirmed-case total among the states.
top_state['Confirmed'].max()

5713215

In [73]:
# Name of the first state whose confirmed total equals the maximum.
top_state.loc[top_state['Confirmed'].eq(top_state['Confirmed'].max()), 'Province/State'].iloc[0]

'Maharashtra'