In [1]:
#import libraries

import requests
from bs4 import BeautifulSoup
import glob
from datetime import datetime
import pandas as pd
import os
import re
import matplotlib.pyplot as plt
import calmap
%matplotlib inline

In [2]:
# Ministry of Health and Family Welfare url
URL = 'https://www.mohfw.gov.in'
page = requests.get(URL)

soup = BeautifulSoup(page.content, "html.parser")

html_thead = soup.find_all('thead')[-1]
html_th = [tr for tr in html_thead.find_all('tr')]

html_tbody = soup.find_all('tbody')[-1]
html_text = [tr for tr in html_tbody.find_all('tr')]

headings = []

for tr in html_th:
    th = tr.find_all(['th'])
    row = [i.text for i in th]
    headings.append(row)

print(headings)

content = []
for tr in html_text:
    td = tr.find_all(['td'])
    row = [i.text for i in td]
    content.append(row)
    
#print(content)

[['S. No.', 'Name of State / UT', 'Active Cases*', 'Cured/Discharged/Migrated*', 'Deaths**', 'Total Confirmed cases*']]


In [3]:
data= pd.DataFrame(content[:-6], columns=headings[0])

In [4]:
data

Unnamed: 0,S. No.,Name of State / UT,Active Cases*,Cured/Discharged/Migrated*,Deaths**,Total Confirmed cases*
0,1,Andaman and Nicobar Islands,53,159,0,212
1,2,Andhra Pradesh,32336,25574,758,58668
2,3,Arunachal Pradesh,552,303,3,858
3,4,Assam,7291,18033,58,25382
4,5,Bihar,10220,18515,217,28952
5,6,Chandigarh,221,518,12,751
6,7,Chhattisgarh,1586,4114,29,5729
7,8,Dadra and Nagar Haveli and Daman and Diu,240,463,2,705
8,9,Delhi,15288,106118,3690,125096
9,10,Goa,1552,2449,26,4027


In [5]:
# drop S.NO column
data = data.drop('S. No.', axis=1)

In [6]:
data.head()

Unnamed: 0,Name of State / UT,Active Cases*,Cured/Discharged/Migrated*,Deaths**,Total Confirmed cases*
0,Andaman and Nicobar Islands,53,159,0,212
1,Andhra Pradesh,32336,25574,758,58668
2,Arunachal Pradesh,552,303,3,858
3,Assam,7291,18033,58,25382
4,Bihar,10220,18515,217,28952


In [7]:
#rename column names
data = data.rename(columns={'Name of State / UT':'States'})
data = data.rename(columns=lambda x: re.sub('Total Confirmed cases \(Including .* foreign Nationals\) ',
                                                      'Total Cases',x))
data = data.rename(columns=lambda x: re.sub('Deaths \( more than 70% cases due to comorbidities \)',
                                                      'Deaths', x))
data = data.rename(columns={'Total Confirmed cases*': 'Total Cases'})
data = data.rename(columns={'Deaths**':'Death'})
data = data.rename(columns={'Cured/Discharged/Migrated*':'Recovered'})
data = data.rename(columns={'Deaths':'Death'})
data = data.rename(columns={'Active Cases*':'Active Cases'})

# remove extra characters from 'Name of State/UT' column
data['States'] = data['States'].str.replace('#', '')

# remove extra characters from 'Name of State/UT' column
data['Death'] = data['Death'].str.replace('#', '')

# drop active cases colum
data = data.drop('Active Cases', axis=1)

In [8]:
data.head()

Unnamed: 0,States,Recovered,Death,Total Cases
0,Andaman and Nicobar Islands,159,0,212
1,Andhra Pradesh,25574,758,58668
2,Arunachal Pradesh,303,3,858
3,Assam,18033,58,25382
4,Bihar,18515,217,28952


In [9]:
#add todays date and create new column
now  = datetime.now()
data['Date'] = now.strftime("%m/%d/%Y") 
data['Date'] = pd.to_datetime(data['Date'], format='%m/%d/%Y')
data.head()

Unnamed: 0,States,Recovered,Death,Total Cases,Date
0,Andaman and Nicobar Islands,159,0,212,2020-07-22
1,Andhra Pradesh,25574,758,58668,2020-07-22
2,Arunachal Pradesh,303,3,858,2020-07-22
3,Assam,18033,58,25382,2020-07-22
4,Bihar,18515,217,28952,2020-07-22


In [10]:
#add latitude and longitude
lat = {'Delhi':28.7041, 'Haryana':29.0588, 'Kerala':10.8505, 'Rajasthan':27.0238,
       'Telengana':18.1124, 'Uttar Pradesh':26.8467, 'Ladakh':34.2996, 'Tamil Nadu':11.1271,
       'Andhra Pradesh':15.9129, 'Odisha':20.9517, 'Uttarakhand':30.0668, 'West Bengal':22.9868, 
       'Puducherry': 11.9416, 'Chandigarh': 30.7333, 'Chhattisgarh':21.2787, 
       'Jammu and Kashmir':33.7782, 'Punjab':31.1471, 'Karnataka':15.3173, 
       'Maharashtra':19.7515, 'Gujarat': 22.2587, 
       'Himachal Pradesh': 31.1048, 'Madhya Pradesh': 22.9734, 'Bihar': 25.0961, 'Manipur':24.6637, 
       'Mizoram':23.1645, 'Goa': 15.2993, 'Andaman and Nicobar Islands': 11.7401, 'Assam' : 26.2006, 
       'Jharkhand': 23.6102, 'Arunachal Pradesh': 28.2180, 'Tripura': 23.9408, 'Nagaland': 26.1584, 
       'Meghalaya' : 25.4670}

long = {'Delhi':77.1025, 'Haryana':76.0856, 'Kerala':76.2711, 'Rajasthan':74.2179,
        'Telengana':79.0193, 'Uttar Pradesh':80.9462, 'Ladakh':78.2932, 'Tamil Nadu':78.6569,
        'Andhra Pradesh':79.7400, 'Odisha':85.0985, 'Uttarakhand':79.0193, 'West Bengal':87.8550, 
        'Puducherry': 79.8083, 'Chandigarh': 76.7794, 'Chhattisgarh':81.8661,
        'Jammu and Kashmir':76.5762, 'Punjab':75.3412, 'Karnataka':75.7139, 
        'Maharashtra':75.7139, 'Gujarat': 71.1924, 
        'Himachal Pradesh': 77.1734, 'Madhya Pradesh': 78.6569, 'Bihar': 85.3131, 'Manipur':93.9063, 
        'Mizoram':92.9376, 'Goa': 74.1240, 'Andaman and Nicobar Islands': 92.6586, 'Assam' : 92.9376, 
        'Jharkhand': 85.2799, 'Arunachal Pradesh': 94.7278, 'Tripura': 91.9882, 'Nagaland': 94.5624,
        'Meghalaya' : 91.3662}

data['Latitude'] = data['States'].map(lat)
data['Longitude'] = data['States'].map(long)

In [11]:
data.head()

Unnamed: 0,States,Recovered,Death,Total Cases,Date,Latitude,Longitude
0,Andaman and Nicobar Islands,159,0,212,2020-07-22,11.7401,92.6586
1,Andhra Pradesh,25574,758,58668,2020-07-22,15.9129,79.74
2,Arunachal Pradesh,303,3,858,2020-07-22,28.218,94.7278
3,Assam,18033,58,25382,2020-07-22,26.2006,92.9376
4,Bihar,18515,217,28952,2020-07-22,25.0961,85.3131


In [12]:
my_folder = './.daily_update/'
if not os.path.exists(my_folder):
    os.makedirs(my_folder)

file_name = my_folder + now.strftime("%Y_%m_%d") + '.csv'

In [13]:
data.to_csv(file_name, index=False)

In [14]:
csv_files = glob.glob(my_folder + '*.csv')

all_data = []
for i in csv_files:
    temp = pd.read_csv(i)
    all_data.append(temp)
    

In [15]:
final_data = pd.concat(all_data, ignore_index=True).sort_values(['Date'], ascending=True).reset_index(drop=True)
final_data['Date'] = pd.to_datetime(final_data['Date'])
#final_data.drop(final_data.index[[1005]])
final_data['Active'] = final_data['Total Cases'] - final_data['Recovered'] - final_data['Death']
final_data = final_data.sort_values(['Date', 'States']).reset_index(drop=True)
final_data

Unnamed: 0,States,Recovered,Death,Total Cases,Date,Latitude,Longitude,Active
0,Andaman and Nicobar Islands,11,0,16,2020-04-21,11.7401,92.6586,5
1,Andhra Pradesh,96,22,757,2020-04-21,15.9129,79.7400,639
2,Arunachal Pradesh,1,0,1,2020-04-21,28.2180,94.7278,0
3,Assam,19,1,35,2020-04-21,26.2006,92.9376,15
4,Bihar,42,2,114,2020-04-21,25.0961,85.3131,70
...,...,...,...,...,...,...,...,...
2630,Tamil Nadu,126670,2626,180643,2020-07-22,11.1271,78.6569,51347
2631,Telangana,37385,429,47705,2020-07-22,,,9891
2632,Tripura,1926,8,3331,2020-07-22,23.9408,91.9882,1397
2633,Uttar Pradesh,31855,1229,53288,2020-07-22,26.8467,80.9462,20204


In [16]:
final_data.to_csv('COVID19INDIA.csv', index=False)