## Data Source

https://api.covid19india.org/

## Import libraries

In [37]:
# to get web contents
import requests
# to parse json contents
import json
# to parse csv files
import csv

# for numerical operations
import numpy as np
# to store and analyze data in dataframes
import pandas as pd

## Get data

In [38]:
# fetch the patient-level raw data; the timeout keeps the cell from hanging
# forever on a stalled connection
response = requests.get('https://api.covid19india.org/raw_data.json', timeout=30)
# fail loudly on 4xx/5xx instead of trying to parse an error page as JSON
response.raise_for_status()
# raw bytes of the response body
content = response.content
# decode the JSON payload into Python objects
parsed = json.loads(content)

In [39]:
# top-level keys of the parsed JSON payload
parsed.keys()

dict_keys(['raw_data'])

## Load data into a dataframe

In [40]:
# build a dataframe from the records stored under the 'raw_data' key
raw_records = parsed['raw_data']
df = pd.DataFrame(raw_records)
# preview the first 3 rows
df.head(n=3)

Unnamed: 0,agebracket,backupnotes,contractedfromwhichpatientsuspected,currentstatus,dateannounced,detectedcity,detecteddistrict,detectedstate,estimatedonsetdate,gender,nationality,notes,patientnumber,source1,source2,source3,statecode,statepatientnumber,statuschangedate,typeoftransmission
0,20.0,Student from Wuhan,,Recovered,30/01/2020,Thrissur,Thrissur,Kerala,,F,India,Travelled from Wuhan,1,https://twitter.com/vijayanpinarayi/status/122...,https://weather.com/en-IN/india/news/news/2020...,,KL,KL-TS-P1,14/02/2020,Imported
1,,Student from Wuhan,,Recovered,02/02/2020,Alappuzha,Alappuzha,Kerala,,,India,Travelled from Wuhan,2,https://www.indiatoday.in/india/story/kerala-r...,https://weather.com/en-IN/india/news/news/2020...,,KL,KL-AL-P1,14/02/2020,Imported
2,,Student from Wuhan,,Recovered,03/02/2020,Kasaragod,Kasaragod,Kerala,,,India,Travelled from Wuhan,3,https://www.indiatoday.in/india/story/kerala-n...,https://twitter.com/ANI/status/122422148580539...,https://weather.com/en-IN/india/news/news/2020...,KL,KL-KS-P1,14/02/2020,Imported


In [41]:
# shape of the dataframe as (rows, columns)
df.shape

(24763, 20)

In [42]:
# column labels of the dataframe
df.columns

Index(['agebracket', 'backupnotes', 'contractedfromwhichpatientsuspected',
       'currentstatus', 'dateannounced', 'detectedcity', 'detecteddistrict',
       'detectedstate', 'estimatedonsetdate', 'gender', 'nationality', 'notes',
       'patientnumber', 'source1', 'source2', 'source3', 'statecode',
       'statepatientnumber', 'statuschangedate', 'typeoftransmission'],
      dtype='object')

In [43]:
# patient id column: the patient number prefixed with 'P'
# =======================================================

df['p_id'] = df['patientnumber'].map('P{}'.format)
df.columns

Index(['agebracket', 'backupnotes', 'contractedfromwhichpatientsuspected',
       'currentstatus', 'dateannounced', 'detectedcity', 'detecteddistrict',
       'detectedstate', 'estimatedonsetdate', 'gender', 'nationality', 'notes',
       'patientnumber', 'source1', 'source2', 'source3', 'statecode',
       'statepatientnumber', 'statuschangedate', 'typeoftransmission', 'p_id'],
      dtype='object')

## Rearrange and rename columns

In [44]:
# raw column name -> cleaned snake_case name, listed in the desired column order
column_names = {
    'patientnumber': 'patient_number',
    'p_id': 'p_id',
    'statepatientnumber': 'state_patient_number',
    'dateannounced': 'date_announced',
    'agebracket': 'age_bracket',
    'gender': 'gender',
    'detectedcity': 'detected_city',
    'detecteddistrict': 'detected_district',
    'detectedstate': 'detected_state',
    'statecode': 'state_code',
    'nationality': 'nationality',
    'typeoftransmission': 'type_of_transmission',
    'contractedfromwhichpatientsuspected': 'contracted_from_which_patient_suspected',
    'statuschangedate': 'status_change_date',
    'currentstatus': 'current_status',
    'estimatedonsetdate': 'estimated_onset_date',
    'source1': 'source1',
    'source2': 'source2',
    'source3': 'source3',
    'notes': 'notes',
    'backupnotes': 'backup_notes',
}

# order of columns (dict keys keep their insertion order)
cols = list(column_names)

# rearrange, then rename, in one pass
df = df[cols].rename(columns=column_names)

# dataframe shape
df.shape

(24763, 21)

In [45]:
# preview the first 3 rows of the dataframe
df.head(3)

Unnamed: 0,patient_number,p_id,state_patient_number,date_announced,age_bracket,gender,detected_city,detected_district,detected_state,state_code,...,type_of_transmission,contracted_from_which_patient_suspected,status_change_date,current_status,estimated_onset_date,source1,source2,source3,notes,backup_notes
0,1,P1,KL-TS-P1,30/01/2020,20.0,F,Thrissur,Thrissur,Kerala,KL,...,Imported,,14/02/2020,Recovered,,https://twitter.com/vijayanpinarayi/status/122...,https://weather.com/en-IN/india/news/news/2020...,,Travelled from Wuhan,Student from Wuhan
1,2,P2,KL-AL-P1,02/02/2020,,,Alappuzha,Alappuzha,Kerala,KL,...,Imported,,14/02/2020,Recovered,,https://www.indiatoday.in/india/story/kerala-r...,https://weather.com/en-IN/india/news/news/2020...,,Travelled from Wuhan,Student from Wuhan
2,3,P3,KL-KS-P1,03/02/2020,,,Kasaragod,Kasaragod,Kerala,KL,...,Imported,,14/02/2020,Recovered,,https://www.indiatoday.in/india/story/kerala-n...,https://twitter.com/ANI/status/122422148580539...,https://weather.com/en-IN/india/news/news/2020...,Travelled from Wuhan,Student from Wuhan


## Missing values

In [46]:
# no. of empty-string values in each column
# =========================================

print(df.shape, '\n')

for col in df.columns:
    # summing the boolean mask counts the cells equal to ''
    n_empty = (df[col] == '').sum()
    print(col, '\t', n_empty)

(24763, 21) 

patient_number 	 0
p_id 	 0
state_patient_number 	 21913
date_announced 	 148
age_bracket 	 22637
gender 	 19675
detected_city 	 22456
detected_district 	 6092
detected_state 	 150
state_code 	 150
nationality 	 22066
type_of_transmission 	 21781
contracted_from_which_patient_suspected 	 23225
status_change_date 	 259
current_status 	 148
estimated_onset_date 	 24763
source1 	 532
source2 	 20893
source3 	 24313
notes 	 1466
backup_notes 	 24402


In [47]:
# no. of non-empty values in each column
# ======================================

print(df.shape, '\n')

for col in df.columns:
    # summing the boolean mask counts the cells that differ from ''
    n_filled = (df[col] != '').sum()
    print(col, '\t', n_filled)

(24763, 21) 

patient_number 	 24763
p_id 	 24763
state_patient_number 	 2850
date_announced 	 24615
age_bracket 	 2126
gender 	 5088
detected_city 	 2307
detected_district 	 18671
detected_state 	 24613
state_code 	 24613
nationality 	 2697
type_of_transmission 	 2982
contracted_from_which_patient_suspected 	 1538
status_change_date 	 24504
current_status 	 24615
estimated_onset_date 	 0
source1 	 24231
source2 	 3870
source3 	 450
notes 	 23297
backup_notes 	 361


In [48]:
# replacing empty strings with np.nan
# ===================================

print(df.shape)

# exact-match replacement: only cells equal to '' become NaN.
# (regex=True is deliberately not used: an empty regex matches every string,
# so the regex form only behaves correctly through a pandas special case.)
df = df.replace('', np.nan)
# missing values per column after the replacement
df.isna().sum()

(24763, 21)


patient_number                                 0
p_id                                           0
state_patient_number                       21913
date_announced                               148
age_bracket                                22637
gender                                     19675
detected_city                              22456
detected_district                           6092
detected_state                               150
state_code                                   150
nationality                                22066
type_of_transmission                       21781
contracted_from_which_patient_suspected    23225
status_change_date                           259
current_status                               148
estimated_onset_date                       24763
source1                                      532
source2                                    20893
source3                                    24313
notes                                       1466
backup_notes        

In [49]:
# dropping empty rows (rows with just a row number but no patient entry)
# ======================================================================

print(df.shape)

# keep only the rows that have a detected state
df = df.dropna(subset=['detected_state'])

print(df.shape)
df.isna().sum()

(24763, 21)
(24613, 21)


patient_number                                 0
p_id                                           0
state_patient_number                       21763
date_announced                                 0
age_bracket                                22488
gender                                     19526
detected_city                              22306
detected_district                           5943
detected_state                                 0
state_code                                     0
nationality                                21917
type_of_transmission                       21632
contracted_from_which_patient_suspected    23075
status_change_date                           110
current_status                                 0
estimated_onset_date                       24613
source1                                      384
source2                                    20744
source3                                    24164
notes                                       1317
backup_notes        

## Save data

In [50]:
# write the dataframe to patients_data.csv, omitting the row index
df.to_csv('patients_data.csv', index=False)

## State wise Daily

In [51]:
# response = requests.get('https://api.covid19india.org/states_daily.json')
# content = response.content
# parsed = json.loads(content)

# df = pd.DataFrame(parsed['states_daily'])

In [52]:
# df = df.melt(id_vars = ['date', 'status'], 
#              value_vars = ['an', 'ap', 'ar', 'as', 'br', 'ch', 'ct', 'dd', 
#                     'dl', 'dn', 'ga', 'gj', 'hp', 'hr', 'jh', 'jk', 
#                     'ka', 'kl', 'la', 'ld', 'mh', 'ml', 'mn', 'mp',
#                     'mz', 'nl', 'or', 'pb', 'py', 'rj', 'sk', 'tg', 
#                     'tn', 'tr', 'tt', 'up', 'ut', 'wb'], 
#              var_name='state', value_name='count')

# df = df.set_index(['date', 'state'])

# df = df.pivot(columns = 'status').reset_index()

# df.columns = df.columns.droplevel(0)
# df.columns.name = ''

# df.columns = ['Date', 'State', 'Confirmed', 'Deceased', 'Recovered']
# df.head()

In [53]:
# response = requests.get('https://api.covid19india.org/csv/')
# parsed = response.content.decode('utf-8')
# parsed

# df = pd.DataFrame(parsed, sep=',')
# df.head()

In [54]:
# pd.DataFrame('http://api.covid19india.org/states_daily_csv/confirmed.csv')

In [55]:
# pd.read_csv('https://api.covid19india.org/csv/')

## States Daily changes

In [56]:
# response = requests.get('https://api.covid19india.org/states_daily.json')
# content = response.content
# parsed = json.loads(content)

In [57]:
# pd.DataFrame(parsed['states_daily'])

## National time series, statewise stats and test counts

In [58]:
# response = requests.get('https://api.covid19india.org/data.json')
# content = response.content
# parsed = json.loads(content)
# parsed.keys()

In [59]:
# day_wise = pd.DataFrame(parsed['cases_time_series'])
# day_wise.head()

In [60]:
# state_wise = pd.DataFrame(parsed['statewise'])
# state_wise.head()

In [61]:
# tested = pd.DataFrame(parsed['tested'])
# tested.head()

## District wise

In [62]:
# response = requests.get('https://api.covid19india.org/state_district_wise.json')
# content = response.content
# parsed = json.loads(content)
# parsed.keys()

In [63]:
# pd.DataFrame(parsed['Goa'])

In [64]:
# parsed['Goa'].keys()

In [65]:
# pd.DataFrame(parsed['Goa']['districtData'])

## District wise v2

In [66]:
# response = requests.get('https://api.covid19india.org/v2/state_district_wise.json')
# content = response.content
# parsed = json.loads(content)
# len(parsed)

In [67]:
# pd.DataFrame(parsed[1]['districtData'])

## Travel history (no longer updated)

In [68]:
# response = requests.get('https://api.covid19india.org/travel_history.json')
# content = response.content
# parsed = json.loads(content)
# parsed.keys()

In [69]:
# th = pd.DataFrame(parsed['travel_history'])
# th.head()