In [1]:
#import libraries

import requests
from bs4 import BeautifulSoup
import glob
from datetime import datetime
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import calmap
%matplotlib inline

In [2]:
# Ministry of Health and Family Welfare url
URL = 'https://www.mohfw.gov.in'

# Without a timeout a stalled server blocks the kernel indefinitely, and without
# a status check an HTTP error page would be parsed as if it were the data page.
page = requests.get(URL, timeout=30)
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")

# Only the last <thead>/<tbody> pair on the page is used.
html_thead = soup.find_all('thead')[-1]
html_th = list(html_thead.find_all('tr'))

html_tbody = soup.find_all('tbody')[-1]
html_text = list(html_tbody.find_all('tr'))

# One list of header-cell texts per header row.
headings = [[cell.text for cell in tr.find_all(['th'])] for tr in html_th]

print(headings)

# One list of data-cell texts per body row.
content = [[cell.text for cell in tr.find_all(['td'])] for tr in html_text]

[['S. No.', 'Name of State / UT', 'Total Confirmed cases* ', 'Cured/Discharged/Migrated', 'Deaths**']]


In [3]:
# Build the table from the scraped rows, leaving out the final six rows
# (presumably totals/footnote rows rather than states -- TODO confirm against the page).
state_rows = content[:-6]
data = pd.DataFrame(data=state_rows, columns=headings[0])

In [4]:
# Display the table built from the scraped rows.
data

Unnamed: 0,S. No.,Name of State / UT,Total Confirmed cases*,Cured/Discharged/Migrated,Deaths**
0,1,Andaman and Nicobar Islands,33,33,0
1,2,Andhra Pradesh,3171,2057,58
2,3,Arunachal Pradesh,2,1,0
3,4,Assam,781,87,4
4,5,Bihar,3061,1083,15
5,6,Chandigarh,279,187,4
6,7,Chhattisgarh,369,83,0
7,8,Dadar Nagar Haveli,2,0,0
8,9,Delhi,15257,7264,303
9,10,Goa,68,37,0


In [5]:
# Remove the 'S. No.' serial-number column from the table.
data = data.drop(columns=['S. No.'])

In [6]:
# Preview the first five rows to confirm the 'S. No.' column is gone.
data.head()

Unnamed: 0,Name of State / UT,Total Confirmed cases*,Cured/Discharged/Migrated,Deaths**
0,Andaman and Nicobar Islands,33,33,0
1,Andhra Pradesh,3171,2057,58
2,Arunachal Pradesh,2,1,0
3,Assam,781,87,4
4,Bihar,3061,1083,15


In [7]:
# Rename the scraped headers to short, stable column names.
# Several header spellings are handled; a rename whose source column is not
# present simply leaves the frame unchanged.
# The patterns are raw strings so that '\(' reaches the regex engine as an
# escaped parenthesis instead of being an invalid string escape sequence.
data = data.rename(columns=lambda x: re.sub(r'Total Confirmed cases \(Including .* foreign Nationals\) ',
                                            'Total Cases', x))
data = data.rename(columns=lambda x: re.sub(r'Deaths \( more than 70% cases due to comorbidities \)',
                                            'Deaths', x))
# Applied after the regex renames so that their 'Deaths' result is also mapped to 'Death'.
data = data.rename(columns={
    'Name of State / UT': 'States',
    'Total Confirmed cases* ': 'Total Cases',
    'Deaths**': 'Death',
    'Cured/Discharged/Migrated': 'Recovered',
    'Deaths': 'Death',
})

# Remove '#' characters from the state names (literal match, not a regex).
data['States'] = data['States'].str.replace('#', '', regex=False)

# Remove '#' characters from the death counts (literal match, not a regex).
data['Death'] = data['Death'].str.replace('#', '', regex=False)

In [8]:
# Preview the first five rows with the renamed columns.
data.head()

Unnamed: 0,States,Total Cases,Recovered,Death
0,Andaman and Nicobar Islands,33,33,0
1,Andhra Pradesh,3171,2057,58
2,Arunachal Pradesh,2,1,0
3,Assam,781,87,4
4,Bihar,3061,1083,15


In [9]:
# Stamp every row with the date on which the scrape was run.
# `now` is kept as a notebook-level name because the output file name is built from it.
now = datetime.now()
date_format = "%m/%d/%Y"
data['Date'] = now.strftime(date_format)
# Convert the text dates into real datetimes (this also discards the time of day).
data['Date'] = pd.to_datetime(data['Date'], format=date_format)
data.head()

Unnamed: 0,States,Total Cases,Recovered,Death,Date
0,Andaman and Nicobar Islands,33,33,0,2020-05-28
1,Andhra Pradesh,3171,2057,58,2020-05-28
2,Arunachal Pradesh,2,1,0,2020-05-28
3,Assam,781,87,4,2020-05-28
4,Bihar,3061,1083,15,2020-05-28


In [10]:
# Add latitude and longitude.
# A single (latitude, longitude) table keeps the two lookups from drifting apart.
# Keys must match the text in data['States'] exactly, including the
# 'Telengana' spelling used here.
coordinates = {
    'Delhi': (28.7041, 77.1025),
    'Haryana': (29.0588, 76.0856),
    'Kerala': (10.8505, 76.2711),
    'Rajasthan': (27.0238, 74.2179),
    'Telengana': (18.1124, 79.0193),
    'Uttar Pradesh': (26.8467, 80.9462),
    'Ladakh': (34.2996, 78.2932),
    'Tamil Nadu': (11.1271, 78.6569),
    'Andhra Pradesh': (15.9129, 79.7400),
    'Odisha': (20.9517, 85.0985),
    'Uttarakhand': (30.0668, 79.0193),
    'West Bengal': (22.9868, 87.8550),
    'Puducherry': (11.9416, 79.8083),
    'Chandigarh': (30.7333, 76.7794),
    'Chhattisgarh': (21.2787, 81.8661),
    'Jammu and Kashmir': (33.7782, 76.5762),
    'Punjab': (31.1471, 75.3412),
    'Karnataka': (15.3173, 75.7139),
    'Maharashtra': (19.7515, 75.7139),
    'Gujarat': (22.2587, 71.1924),
    'Himachal Pradesh': (31.1048, 77.1734),
    'Madhya Pradesh': (22.9734, 78.6569),
    'Bihar': (25.0961, 85.3131),
    'Manipur': (24.6637, 93.9063),
    'Mizoram': (23.1645, 92.9376),
    'Goa': (15.2993, 74.1240),
    'Andaman and Nicobar Islands': (11.7401, 92.6586),
    'Assam': (26.2006, 92.9376),
    'Jharkhand': (23.6102, 85.2799),
    'Arunachal Pradesh': (28.2180, 94.7278),
    'Tripura': (23.9408, 91.9882),
    'Nagaland': (26.1584, 94.5624),
    'Meghalaya': (25.4670, 91.3662),
}

# `lat` and `long` are kept as separate dicts for any code that already uses them.
lat = {state: pair[0] for state, pair in coordinates.items()}
long = {state: pair[1] for state, pair in coordinates.items()}

data['Latitude'] = data['States'].map(lat)
data['Longitude'] = data['States'].map(long)

# States missing from the table get NaN coordinates; report them instead of
# letting the gaps pass silently into the saved CSV.
unmapped_states = data.loc[~data['States'].isin(coordinates), 'States'].unique().tolist()
if unmapped_states:
    print('No coordinates for:', unmapped_states)

In [11]:
# Preview the first five rows, now including Latitude and Longitude.
data.head()

Unnamed: 0,States,Total Cases,Recovered,Death,Date,Latitude,Longitude
0,Andaman and Nicobar Islands,33,33,0,2020-05-28,11.7401,92.6586
1,Andhra Pradesh,3171,2057,58,2020-05-28,15.9129,79.74
2,Arunachal Pradesh,2,1,0,2020-05-28,28.218,94.7278
3,Assam,781,87,4,2020-05-28,26.2006,92.9376
4,Bihar,3061,1083,15,2020-05-28,25.0961,85.3131


In [12]:
# Folder that holds one CSV snapshot per day. The trailing slash is kept
# because paths are built from it by plain string concatenation.
my_folder = './.daily_update/'
# exist_ok=True replaces the separate existence check and keeps the cell re-runnable.
os.makedirs(my_folder, exist_ok=True)

# Snapshot file for the scrape date, e.g. 2020_05_28.csv
file_name = my_folder + now.strftime("%Y_%m_%d") + '.csv'

In [13]:
# Save today's snapshot; index=False keeps the row index out of the CSV.
data.to_csv(file_name, index=False)

In [14]:
# Collect every CSV snapshot in the daily folder and read each one into its own frame.
csv_files = glob.glob(my_folder + '*.csv')

all_data = [pd.read_csv(snapshot_path) for snapshot_path in csv_files]
    

In [15]:
# Stack the daily snapshots into one long table.
final_data = pd.concat(all_data, ignore_index=True)
# Parse the dates before sorting so rows are ordered as dates, not as text.
final_data['Date'] = pd.to_datetime(final_data['Date'])
# Active cases = confirmed cases minus recoveries and deaths.
final_data['Active'] = final_data['Total Cases'] - final_data['Recovered'] - final_data['Death']
# A single sort by date and then state replaces the earlier date-only sort,
# whose ordering this one overrode anyway.
final_data = final_data.sort_values(['Date', 'States']).reset_index(drop=True)
final_data

Unnamed: 0,States,Total Cases,Recovered,Death,Date,Latitude,Longitude,Active
0,Andaman and Nicobar Islands,16,11,0,2020-04-21,11.7401,92.6586,5
1,Andhra Pradesh,757,96,22,2020-04-21,15.9129,79.7400,639
2,Arunachal Pradesh,1,1,0,2020-04-21,28.2180,94.7278,0
3,Assam,35,19,1,2020-04-21,26.2006,92.9376,15
4,Bihar,114,42,2,2020-04-21,25.0961,85.3131,70
5,Chandigarh,26,13,0,2020-04-21,30.7333,76.7794,13
6,Chhattisgarh,36,25,0,2020-04-21,21.2787,81.8661,11
7,Delhi,2081,431,47,2020-04-21,28.7041,77.1025,1603
8,Goa,7,7,0,2020-04-21,15.2993,74.1240,0
9,Gujarat,2066,131,77,2020-04-21,22.2587,71.1924,1858


In [16]:
# Write the combined table to COVID19INDIA.csv, without the row index.
final_data.to_csv('COVID19INDIA.csv', index=False)