In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

# STEP 1 (GETTING THE DATA FROM WIKIPEDIA)

In [2]:
res = requests.get('https://en.wikipedia.org/wiki/2019–20_coronavirus_pandemic_by_country_and_territory')

The above information is obtained directly from [WIKIPEDIA](https://en.wikipedia.org/wiki/2019–20_coronavirus_pandemic_by_country_and_territory)

In [3]:
type(res)

requests.models.Response

# STEP 2(CONVERTING INTO BEAUTIFULSOUP OBJECT)

In [4]:
# Beautiful Soup is a library that makes it easy to scrape information from web pages. 
#It sits atop an HTML or XML parser, providing Pythonic idioms for iterating, searching, and modifying the parse tree.
soup = BeautifulSoup(res.text, 'lxml')


In [7]:
#fetching the table having the container 
covid_19_table = soup.find('div', {'id':'covid19-container'})


In [8]:
len(covid_19_table)

4

In [9]:
#Prettify applies various stylistic formatting conventions to it. 
#These formatting conventions usually adjust positioning, spacing, 
#and similar modifications intended to make the content easier for people to view, read, and understand.
covid_19_table_ = covid_19_table.prettify()

In [10]:
#reading the html
results = pd.read_html((covid_19_table_.encode('utf8')))     
res_l = list(results)
res_l[0].head()


Unnamed: 0_level_0,Countries and territories [b],Countries and territories [b],Cases [a],Deaths [c],Recov. [d],Ref.
Unnamed: 0_level_1,â 225,â 225.1,"â 3,083,467","â 213,824","â 915,988",[4]
0,,United States [e],1033721,58947,117690,[11]
1,,Spain,210773,23822,102548,[12]
2,,Italy,201505,27359,68941,[13] [14]
3,,United Kingdom [f],161145,21678,â,[16]
4,,Germany [g],159735,6280,110041,[17] [18]


# STEP 3 DATA CLEANING 

In [11]:
#droping unused columns
df = res_l[0]
df.drop(df.columns[[0, 5]], axis = 1, inplace = True)
df.head()

Unnamed: 0_level_0,Countries and territories [b],Cases [a],Deaths [c],Recov. [d]
Unnamed: 0_level_1,â 225.1,"â 3,083,467","â 213,824","â 915,988"
0,United States [e],1033721,58947,117690
1,Spain,210773,23822,102548
2,Italy,201505,27359,68941
3,United Kingdom [f],161145,21678,â
4,Germany [g],159735,6280,110041


In [12]:
#renaming columns 
df.columns = ('Countries and territories', 'Cases', 'Deaths', 'Recov')
df.head()

Unnamed: 0,Countries and territories,Cases,Deaths,Recov
0,United States [e],1033721,58947,117690
1,Spain,210773,23822,102548
2,Italy,201505,27359,68941
3,United Kingdom [f],161145,21678,â
4,Germany [g],159735,6280,110041


In [13]:
#replacing  missing values with NaN
df.replace('â', np.NaN, inplace=True)

In [14]:
# Top 10 affected countries
df.head()

Unnamed: 0,Countries and territories,Cases,Deaths,Recov
0,United States [e],1033721,58947,117690.0
1,Spain,210773,23822,102548.0
2,Italy,201505,27359,68941.0
3,United Kingdom [f],161145,21678,
4,Germany [g],159735,6280,110041.0


In [15]:
# least infected
df = df.truncate(after = 227)
df.tail()


Unnamed: 0,Countries and territories,Cases,Deaths,Recov
223,USS Kidd,47,0,0.0
224,MS Zaandam [at],13,4,
225,Coral Princess [au],12,2,
226,HNLMS Dolfijn [av],8,0,
227,Leopold I,1,0,0.0


In [16]:
#simple function to save the table into a csv file
def Make_csv(dataframe):
    date = pd.Timestamp.now().strftime(format='%Y-%m-%d_%H-%M_')
    df.to_csv(f'{date}COVID_19_table.csv', index=True, header=True)

In [17]:
Make_csv(df)