## Data Source

https://api.covid19india.org/

## Import libraries

In [1]:
# to get web contents
import requests
# to parse json contents
import json
# to parse csv files
import csv

# for numerical operations
import numpy as np
# to store and analysis data in dataframes
import pandas as pd

## Get data

### df_1 - Till Apr 19

In [2]:
# df_1 - Till Apr 19
# ==================

# get response from the web page
response = requests.get('https://api.covid19india.org/raw_data1.json')
# get contents from the response
content = response.content
# parse the json file
parsed = json.loads(content)
# keys
parsed.keys()

dict_keys(['raw_data'])

In [3]:
# save to df
df_1 = pd.DataFrame(parsed['raw_data'])

# shape of the dataframe
print(df_1.shape)

# # list of columns
print(df_1.columns)

# # first few rows
# df_1.head(2)

(17306, 20)
Index(['agebracket', 'backupnotes', 'contractedfromwhichpatientsuspected',
       'currentstatus', 'dateannounced', 'detectedcity', 'detecteddistrict',
       'detectedstate', 'estimatedonsetdate', 'gender', 'nationality', 'notes',
       'patientnumber', 'source1', 'source2', 'source3', 'statecode',
       'statepatientnumber', 'statuschangedate', 'typeoftransmission'],
      dtype='object')


### df_2 - Till Apr 26

In [4]:
# df_2 - Till Apr 26
# ==================

# get response from the web page
response = requests.get('https://api.covid19india.org/raw_data2.json')

# get contents from the response
content = response.content

# parse the json file
parsed = json.loads(content)

# keys
parsed.keys()

dict_keys(['raw_data'])

In [5]:
# save to df
df_2 = pd.DataFrame(parsed['raw_data'])

# shape of the dataframe
print(df_2.shape)

# # list of columns
print(df_2.columns)

# # first few rows
# df_2.head(2)

(10585, 20)
Index(['agebracket', 'backupnotes', 'contractedfromwhichpatientsuspected',
       'currentstatus', 'dateannounced', 'detectedcity', 'detecteddistrict',
       'detectedstate', 'estimatedonsetdate', 'gender', 'nationality', 'notes',
       'patientnumber', 'source1', 'source2', 'source3', 'statecode',
       'statepatientnumber', 'statuschangedate', 'typeoftransmission'],
      dtype='object')


### df_3 - Live

In [6]:
# df_3 - Live
# ===========

# get response from the web page
response = requests.get('https://api.covid19india.org/raw_data3.json')

# get contents from the response
content = response.content

# parse the json file
parsed = json.loads(content)

# keys
parsed.keys()

dict_keys(['raw_data'])

In [7]:
# save to df
df_3 = pd.DataFrame(parsed['raw_data'])

# shape of the dataframe
print(df_3.shape)

# # list of columns
print(df_3.columns)

# # first few rows
# df_3.head(2)

(7606, 20)
Index(['agebracket', 'contractedfromwhichpatientsuspected', 'currentstatus',
       'dateannounced', 'detectedcity', 'detecteddistrict', 'detectedstate',
       'entryid', 'gender', 'nationality', 'notes', 'numcases',
       'patientnumber', 'source1', 'source2', 'source3', 'statecode',
       'statepatientnumber', 'statuschangedate', 'typeoftransmission'],
      dtype='object')


In [8]:
# np.setdiff1d(df_1.columns, df_3.columns)

### full data

In [9]:
# full data
# =========

# drop unwanted rows
# df_1 = df_1.drop('_dnp34', axis = 1)
df_3 = df_3.drop(['entryid', 'numcases'], axis = 1)

# rename columns
# df_3 = df_3.rename({'entryid' : 'patientnumber'})

# columns
df_3.columns

Index(['agebracket', 'contractedfromwhichpatientsuspected', 'currentstatus',
       'dateannounced', 'detectedcity', 'detecteddistrict', 'detectedstate',
       'gender', 'nationality', 'notes', 'patientnumber', 'source1', 'source2',
       'source3', 'statecode', 'statepatientnumber', 'statuschangedate',
       'typeoftransmission'],
      dtype='object')

In [10]:
# df_3[['entryid', 'patientnumber']]

In [11]:
# concatenate data
df = pd.concat([df_1, df_2, df_3])

# shape of the data
df.shape

(35497, 20)

In [12]:
# list of columns
df.columns

Index(['agebracket', 'backupnotes', 'contractedfromwhichpatientsuspected',
       'currentstatus', 'dateannounced', 'detectedcity', 'detecteddistrict',
       'detectedstate', 'estimatedonsetdate', 'gender', 'nationality', 'notes',
       'patientnumber', 'source1', 'source2', 'source3', 'statecode',
       'statepatientnumber', 'statuschangedate', 'typeoftransmission'],
      dtype='object')

In [13]:
# first 3 rows of the dataframe
df.head(3)

Unnamed: 0,agebracket,backupnotes,contractedfromwhichpatientsuspected,currentstatus,dateannounced,detectedcity,detecteddistrict,detectedstate,estimatedonsetdate,gender,nationality,notes,patientnumber,source1,source2,source3,statecode,statepatientnumber,statuschangedate,typeoftransmission
0,20.0,Student from Wuhan,,Recovered,30/01/2020,Thrissur,Thrissur,Kerala,,F,India,Travelled from Wuhan,1,https://twitter.com/vijayanpinarayi/status/122...,https://weather.com/en-IN/india/news/news/2020...,,KL,KL-TS-P1,14/02/2020,Imported
1,,Student from Wuhan,,Recovered,02/02/2020,Alappuzha,Alappuzha,Kerala,,,India,Travelled from Wuhan,2,https://www.indiatoday.in/india/story/kerala-r...,https://weather.com/en-IN/india/news/news/2020...,,KL,KL-AL-P1,14/02/2020,Imported
2,,Student from Wuhan,,Recovered,03/02/2020,Kasaragod,Kasaragod,Kerala,,,India,Travelled from Wuhan,3,https://www.indiatoday.in/india/story/kerala-n...,https://twitter.com/ANI/status/122422148580539...,https://weather.com/en-IN/india/news/news/2020...,KL,KL-KS-P1,14/02/2020,Imported


In [14]:
# creating patient id column from patient number
# ===============================================

df['p_id'] = df['patientnumber'].apply(lambda x : 'P'+str(x))
df.columns

Index(['agebracket', 'backupnotes', 'contractedfromwhichpatientsuspected',
       'currentstatus', 'dateannounced', 'detectedcity', 'detecteddistrict',
       'detectedstate', 'estimatedonsetdate', 'gender', 'nationality', 'notes',
       'patientnumber', 'source1', 'source2', 'source3', 'statecode',
       'statepatientnumber', 'statuschangedate', 'typeoftransmission', 'p_id'],
      dtype='object')

## Rearrange and rename columns

In [15]:
# order of columns
cols = ['patientnumber', 'p_id', 'statepatientnumber', 
        'dateannounced', 'agebracket', 'gender', 
        'detectedcity', 'detecteddistrict', 'detectedstate', 'statecode', 'nationality',
        'typeoftransmission', 'contractedfromwhichpatientsuspected',
        'statuschangedate', 'currentstatus', 'estimatedonsetdate',
        'source1', 'source2', 'source3', 'notes', 'backupnotes']

# rearrange columns
df = df[cols]

# rename columns
df.columns = ['patient_number', 'p_id', 'state_patient_number', 
              'date_announced', 'age_bracket', 'gender', 
              'detected_city', 'detected_district', 'detected_state', 'state_code', 'nationality',
              'type_of_transmission', 'contracted_from_which_patient_suspected',
              'status_change_date', 'current_status', 'estimated_onset_date',
              'source1', 'source2', 'source3', 'notes', 'backup_notes']

# dataframe shape
df.shape

(35497, 21)

In [16]:
# first 3 rows of the dataframe
df.head(3)

Unnamed: 0,patient_number,p_id,state_patient_number,date_announced,age_bracket,gender,detected_city,detected_district,detected_state,state_code,...,type_of_transmission,contracted_from_which_patient_suspected,status_change_date,current_status,estimated_onset_date,source1,source2,source3,notes,backup_notes
0,1,P1,KL-TS-P1,30/01/2020,20.0,F,Thrissur,Thrissur,Kerala,KL,...,Imported,,14/02/2020,Recovered,,https://twitter.com/vijayanpinarayi/status/122...,https://weather.com/en-IN/india/news/news/2020...,,Travelled from Wuhan,Student from Wuhan
1,2,P2,KL-AL-P1,02/02/2020,,,Alappuzha,Alappuzha,Kerala,KL,...,Imported,,14/02/2020,Recovered,,https://www.indiatoday.in/india/story/kerala-r...,https://weather.com/en-IN/india/news/news/2020...,,Travelled from Wuhan,Student from Wuhan
2,3,P3,KL-KS-P1,03/02/2020,,,Kasaragod,Kasaragod,Kerala,KL,...,Imported,,14/02/2020,Recovered,,https://www.indiatoday.in/india/story/kerala-n...,https://twitter.com/ANI/status/122422148580539...,https://weather.com/en-IN/india/news/news/2020...,Travelled from Wuhan,Student from Wuhan


## Missing values

In [17]:
# no. of empty values in each column
# ==================================

print(df.shape, '\n')

for i in df.columns:
    print(i, '\t', df[df[i]==''].shape[0])

(35497, 21) 

patient_number 	 1328
p_id 	 0
state_patient_number 	 29203
date_announced 	 451
age_bracket 	 29401
gender 	 26968
detected_city 	 32857
detected_district 	 8351
detected_state 	 638
state_code 	 638
nationality 	 32790
type_of_transmission 	 32507
contracted_from_which_patient_suspected 	 33701
status_change_date 	 7717
current_status 	 631
estimated_onset_date 	 27891
source1 	 1048
source2 	 31578
source3 	 35018
notes 	 7239
backup_notes 	 27530


In [18]:
# no. of non-empty values in each column
# ===================================

print(df.shape, '\n')

for i in df.columns:
    print(i, '\t', df[df[i]!=''].shape[0])

(35497, 21) 

patient_number 	 34169
p_id 	 35497
state_patient_number 	 6294
date_announced 	 35046
age_bracket 	 6096
gender 	 8529
detected_city 	 2640
detected_district 	 27146
detected_state 	 34859
state_code 	 34859
nationality 	 2707
type_of_transmission 	 2990
contracted_from_which_patient_suspected 	 1796
status_change_date 	 27780
current_status 	 34866
estimated_onset_date 	 7606
source1 	 34449
source2 	 3919
source3 	 479
notes 	 28258
backup_notes 	 7967


In [19]:
# replacing empty strings with np.nan
# ==================================-

print(df.shape)

df = df.replace(r'', np.nan, regex=True)
df.isna().sum()

(35497, 21)


patient_number                              1328
p_id                                           0
state_patient_number                       29203
date_announced                               451
age_bracket                                29401
gender                                     26968
detected_city                              32857
detected_district                           8351
detected_state                               638
state_code                                   638
nationality                                32790
type_of_transmission                       32507
contracted_from_which_patient_suspected    33701
status_change_date                          7717
current_status                               631
estimated_onset_date                       35497
source1                                     1048
source2                                    31578
source3                                    35018
notes                                       7239
backup_notes        

In [20]:
# droping empty rows (row with just row number but without patient entry
# ======================================================================

print(df.shape)

# df.dropna(subset=['detected_state'], inplace=True)

print(df.shape)
df.isna().sum()

(35497, 21)
(35497, 21)


patient_number                              1328
p_id                                           0
state_patient_number                       29203
date_announced                               451
age_bracket                                29401
gender                                     26968
detected_city                              32857
detected_district                           8351
detected_state                               638
state_code                                   638
nationality                                32790
type_of_transmission                       32507
contracted_from_which_patient_suspected    33701
status_change_date                          7717
current_status                               631
estimated_onset_date                       35497
source1                                     1048
source2                                    31578
source3                                    35018
notes                                       7239
backup_notes        

## Save data

In [21]:
# save to csv`
df.to_csv('patients_data.csv', index=False)

# State tested data

In [22]:
# get response from the web page
response = requests.get('https://api.covid19india.org/state_test_data.json')

# get contents from the response
content = response.content

# parse the json file
parsed = json.loads(content)

# keys
parsed.keys()

dict_keys(['states_tested_data'])

In [23]:
# save data in a dataframe
th = pd.DataFrame(parsed['states_tested_data'])

# first few rows
th.head(3)

Unnamed: 0,negative,numcallsstatehelpline,numicubeds,numisolationbeds,numventilators,populationsourcecovid19india,positive,source,source2,state,...,testpositivityrate,testspermillion,testsperthousand,totalpeopleinquarantine,totalpeoplereleasedfromquarantine,totaltested,unconfirmed,updatedon,_d415a,_d5fpr
0,1210.0,,,50,,397000.0,12,https://t.me/indiacovid/2550,,Andaman and Nicobar Islands,...,0.86%,3534,3.53,,,1403,181,17/04/2020,,
1,,280.0,,50,,,27,https://t.me/indiacovid/3147?single,,Andaman and Nicobar Islands,...,1.01%,6748,6.75,,,2679,246,24/04/2020,1000.0,1000000.0
2,,298.0,,50,,,33,https://t.me/indiacovid/3365?single,,Andaman and Nicobar Islands,...,1.16%,7174,7.17,,,2848,106,27/04/2020,,


In [24]:
th.columns

Index(['negative', 'numcallsstatehelpline', 'numicubeds', 'numisolationbeds',
       'numventilators', 'populationsourcecovid19india', 'positive', 'source',
       'source2', 'state', 'tags', 'testpositivityrate', 'testspermillion',
       'testsperthousand', 'totalpeopleinquarantine',
       'totalpeoplereleasedfromquarantine', 'totaltested', 'unconfirmed',
       'updatedon', '_d415a', '_d5fpr'],
      dtype='object')

In [25]:
cols = ['state', 'updatedon', 'totaltested', 'testspermillion', 'testsperthousand',
        'positive', 'negative', 'unconfirmed', 'testpositivityrate',
        'totalpeopleinquarantine', 'totalpeoplereleasedfromquarantine', 'populationsourcecovid19india',
        'numicubeds', 'numisolationbeds', 'numventilators', 'numcallsstatehelpline',
        '_d415a', '_d5fpr', 'source', 'source2']

th = th[cols]

th.head()

Unnamed: 0,state,updatedon,totaltested,testspermillion,testsperthousand,positive,negative,unconfirmed,testpositivityrate,totalpeopleinquarantine,totalpeoplereleasedfromquarantine,populationsourcecovid19india,numicubeds,numisolationbeds,numventilators,numcallsstatehelpline,_d415a,_d5fpr,source,source2
0,Andaman and Nicobar Islands,17/04/2020,1403,3534,3.53,12,1210.0,181,0.86%,,,397000.0,,50,,,,,https://t.me/indiacovid/2550,
1,Andaman and Nicobar Islands,24/04/2020,2679,6748,6.75,27,,246,1.01%,,,,,50,,280.0,1000.0,1000000.0,https://t.me/indiacovid/3147?single,
2,Andaman and Nicobar Islands,27/04/2020,2848,7174,7.17,33,,106,1.16%,,,,,50,,298.0,,,https://t.me/indiacovid/3365?single,
3,Andaman and Nicobar Islands,01/05/2020,3754,9456,9.46,33,,199,0.88%,643.0,,,,50,,340.0,,,https://t.me/indiacovid/3781,
4,Andhra Pradesh,02/04/2020,1800,34,0.03,132,1175.0,493,7.33%,179.0,,52221000.0,2680.0,23479,330.0,,,,https://twitter.com/ArogyaAndhra/status/124558...,


In [26]:
# save to csv`
th.to_csv('tests.csv', index=False)

## Zones

In [27]:
# get response from the web page
response = requests.get('https://api.covid19india.org/zones.json')

# get contents from the response
content = response.content

# parse the json file
parsed = json.loads(content)

# keys
parsed.keys()

dict_keys(['zones'])

In [28]:
zo = pd.DataFrame(parsed['zones'])
zo.head(3)

Unnamed: 0,district,districtcode,lastupdated,source,state,statecode,zone
0,Nicobars,AN_Nicobars,01/05/2020,https://www.facebook.com/airnewsalerts/photos/...,Andaman and Nicobar Islands,AN,Green
1,North and Middle Andaman,AN_North and Middle Andaman,01/05/2020,https://www.facebook.com/airnewsalerts/photos/...,Andaman and Nicobar Islands,AN,Green
2,South Andaman,AN_South Andaman,01/05/2020,https://www.facebook.com/airnewsalerts/photos/...,Andaman and Nicobar Islands,AN,Red


In [29]:
# save to csv`
zo.to_csv('zones.csv', index=False)

## State wise Daily

In [30]:
# response = requests.get('https://api.covid19india.org/states_daily.json')
# content = response.content
# parsed = json.loads(content)

# df = pd.DataFrame(parsed['states_daily'])

In [31]:
# df = df.melt(id_vars = ['date', 'status'], 
#              value_vars = ['an', 'ap', 'ar', 'as', 'br', 'ch', 'ct', 'dd', 
#                     'dl', 'dn', 'ga', 'gj', 'hp', 'hr', 'jh', 'jk', 
#                     'ka', 'kl', 'la', 'ld', 'mh', 'ml', 'mn', 'mp',
#                     'mz', 'nl', 'or', 'pb', 'py', 'rj', 'sk', 'tg', 
#                     'tn', 'tr', 'tt', 'up', 'ut', 'wb'], 
#              var_name='state', value_name='count')

# df = df.set_index(['date', 'state'])

# df = df.pivot(columns = 'status').reset_index()

# df.columns = df.columns.droplevel(0)
# df.columns.name = ''

# df.columns = ['Date', 'State', 'Confirmed', 'Deceased', 'Recovered']
# df.head()

In [32]:
# response = requests.get('https://api.covid19india.org/csv/')
# parsed = response.content.decode('utf-8')
# parsed

# df = pd.DataFrame(parsed, sep=',')
# df.head()

In [33]:
# pd.DataFrame('http://api.covid19india.org/states_daily_csv/confirmed.csv')

In [34]:
# pd.read_csv('https://api.covid19india.org/csv/')

## States Daily changes

In [35]:
# response = requests.get('https://api.covid19india.org/states_daily.json')
# content = response.content
# parsed = json.loads(content)

In [36]:
# pd.DataFrame(parsed['states_daily'])

## National time series, statewise stats and test counts

In [37]:
# response = requests.get('https://api.covid19india.org/data.json')
# content = response.content
# parsed = json.loads(content)
# parsed.keys()

In [38]:
# day_wise = pd.DataFrame(parsed['cases_time_series'])
# day_wise.head()

In [39]:
# state_wise = pd.DataFrame(parsed['statewise'])
# state_wise.head()

In [40]:
# tested = pd.DataFrame(parsed['tested'])
# tested.head()

## District wise

In [41]:
# response = requests.get('https://api.covid19india.org/state_district_wise.json')
# content = response.content
# parsed = json.loads(content)
# parsed.keys()

In [42]:
# pd.DataFrame(parsed['Goa'])

In [43]:
# parsed['Goa'].keys()

In [44]:
# pd.DataFrame(parsed['Goa']['districtData'])

## District wise v2

In [45]:
# response = requests.get('https://api.covid19india.org/v2/state_district_wise.json')
# content = response.content
# parsed = json.loads(content)
# len(parsed)

In [46]:
# pd.DataFrame(parsed[1]['districtData'])

## Travel history (no more updated)

In [47]:
# response = requests.get('https://api.covid19india.org/travel_history.json')
# content = response.content
# parsed = json.loads(content)
# parsed.keys()

In [48]:
# th = pd.DataFrame(parsed['travel_history'])
# th.head()