**Scraping whole table at a time**

In [38]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
import os
from time import sleep
from random import randint
import numpy as np

headers = dict()
headers[
    "User-Agent"
] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"


def data_table():
  newurl = 'https://www.worldometers.info/coronavirus/'
  resp = requests.get(newurl, headers = headers) # send a get request to the url, get response
  soup = BeautifulSoup(resp.text, 'html5lib') # Yummy HTML soup
  table = soup.find('table', {"class": "main_table_countries"}) # get the table from html
  trs = table.findAll('tr') # extract all rows of the table
  if len(trs[1:])!=0:
    csv_filename = 'Corona Virus.csv'
    if os.path.exists(csv_filename): os.remove(csv_filename) # remove the file it already exists, can result in data duplicacy
    with open(csv_filename, 'a') as f:
      writer = csv.writer(f)
      columns = [th.text for th in trs[0].findChildren('th')]					
      writer.writerow(columns)
      for tr in trs[1:]:
        row = []
        tds = tr.findChildren('td')
        for td in tds:
          span = td.findChildren('span', {'id':'Regular_season'})
          if span:
            row.append(span[0].text.strip())
          else:
            row.append(td.text.strip())
        assert len(row) == len(columns)
        writer.writerow(row)
data_table()


In [41]:
df = pd.read_csv('/content/Corona Virus.csv')
df

Unnamed: 0,#,"Country,Other",TotalCases,NewCases,TotalDeaths,NewDeaths,TotalRecovered,NewRecovered,ActiveCases,"Serious,Critical",Tot Cases/1M pop,Deaths/1M pop,TotalTests,Tests/\n1M pop\n,Population,Continent,1 Caseevery X ppl,1 Deathevery X ppl,1 Testevery X ppl
0,,North America,37378156,+18986,846577,+676,29038932,+12487,7492647,17106,,,,,,North America,,,
1,,Asia,33781298,+396098,467260,+2999,29440130,+303995,3873908,29379,,,,,,Asia,,,
2,,South America,23315120,+8555,620770,+131,20882653,+8686,1811697,24542,,,,,,South America,,,
3,,Europe,42800565,+85519,975287,+1904,37088535,+110817,4736743,32412,,,,,,Europe,,,
4,,Africa,4461772,+2496,117994,+44,3981902,+2072,361876,3802,,,,,,Africa,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232,,Total:,42800565,+85519,975287,+1904,37088535,+110817,4736743,32412,,,,,,Europe,,,
233,,Total:,4461772,+2496,117994,+44,3981902,+2072,361876,3802,,,,,,Africa,,,
234,,Total:,61206,+246,1173,+5,34783,+15,25250,4,,,,,,Australia/Oceania,,,
235,,Total:,721,,15,,706,,0,0,,,,,,,,,


**Scraping Only Specific Columns**

In [46]:

newurl = 'https://www.worldometers.info/coronavirus/'
resp = requests.get(newurl) # send a get request to the url, get response
soup = BeautifulSoup(resp.text, 'html5lib') # Yummy HTML soup
table = soup.find('table', {"class": "main_table_countries"}) # get the table from html

data_col = pd.DataFrame(columns=['Country', 'Total Cases',	'New Cases',	'Total Deaths', 'New Deaths'])

for row in table.tbody.find_all("tr"):
    col = row.find_all("td")
    if (col != []):
        Country = col[1].text.strip()
        Total_Cases = col[2].text.strip()
        New_Cases = col[3].text.strip()
        Total_Deaths = col[4].text.strip()
        New_Deaths = col[5].text.strip()
        data_col = data_col.append({"Country":Country, "Total Cases":Total_Cases, 
                                    "New Casses":New_Cases, "Total Deaths":Total_Deaths, 
                                    "New Deaths":New_Deaths}, ignore_index=True)
data_col.to_csv('Corona Data.csv', index = False)
data_col.head()

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,New Casses
0,North America,37380822,,846587,686,21652
1,Asia,33781384,,467260,2999,396184
2,South America,23315120,,620770,131,8555
3,Europe,42800733,,975288,1905,85687
4,Africa,4461772,,117994,44,2496


**We scrap 7 Specific columns here. In this method, we can select which columns we need from the table**

**Scraping Multiple pages(page by page)**

In [None]:
pip install requests_html

**Scraping Multiple pages with oncurrent features**

In [75]:
%%time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from random import randint
from time import sleep  
import concurrent.futures
from multiprocessing.dummy import Pool as ThreadPool
from csv import reader
from sqlalchemy import create_engine
import sqlite3

conn = sqlite3.connect('PGCB.db')
c = conn.cursor()
base_url = "https://web.pgcb.gov.bd/view_generations_bn?page="

all_urls = []
ress =[]
Date = []
Time = []
Produced = []
Demand = []
Depriciation = []

def generate_urls():
  for i in range(1,100):
    all_urls.append(base_url + str(i))
    #time.sleep(0.15)
generate_urls()
def scrape(url):
  #for url in all_urls:
  sleep(randint(1,3))
  result = requests.get(url)
  sleep(randint(1,7))
  soup = BeautifulSoup(result.content, 'html.parser')
  tab = soup.find_all('table', class_ = 'table table-bordered')
  sleep(randint(1,3))
  for tabi in tab:
    for row in tabi.tbody.find_all("tr"):
      col = row.find_all("td")
      if (col != []):
        date = col[0].text
        time = col[1].text
        produced = col[2].text.strip()
        demand = col[3].text.strip()
        depriciation = col[4].text.strip()

        Date.append(date)
        Time.append(time)
        Produced.append(produced)
        Demand.append(demand)
        Depriciation.append(depriciation)

  data_df = pd.DataFrame({"Date":Date, "Time":Time, "Produced":Produced, "Demand":Demand, "Depriciation":Depriciation})  
  data_df.to_csv('Data Multiple Page.csv', index = False)
  #df.to_sql('Data', conn, if_exists='replace', index = False )
  print('Completed:', url)

with concurrent.futures.ThreadPoolExecutor(max_workers=15) as executor:
  executor.map(scrape, all_urls)

Completed: https://web.pgcb.gov.bd/view_generations_bn?page=1
Completed: https://web.pgcb.gov.bd/view_generations_bn?page=8
Completed: https://web.pgcb.gov.bd/view_generations_bn?page=2
Completed: https://web.pgcb.gov.bd/view_generations_bn?page=10
Completed: https://web.pgcb.gov.bd/view_generations_bn?page=6
Completed: https://web.pgcb.gov.bd/view_generations_bn?page=4
Completed: https://web.pgcb.gov.bd/view_generations_bn?page=5
Completed: https://web.pgcb.gov.bd/view_generations_bn?page=9
Completed: https://web.pgcb.gov.bd/view_generations_bn?page=7
Completed: https://web.pgcb.gov.bd/view_generations_bn?page=11
Completed: https://web.pgcb.gov.bd/view_generations_bn?page=15
Completed: https://web.pgcb.gov.bd/view_generations_bn?page=13
Completed: https://web.pgcb.gov.bd/view_generations_bn?page=3
Completed: https://web.pgcb.gov.bd/view_generations_bn?page=12
Completed: https://web.pgcb.gov.bd/view_generations_bn?page=14
Completed: https://web.pgcb.gov.bd/view_generations_bn?page=16
C

**With Concurrent Features this takes us only 1 min and 15 sec, to scrap all 100 pages**

In [77]:
df = pd.read_csv('/content/Data Multiple Page.csv')
df

Unnamed: 0,Date,Time,Produced,Demand,Depriciation
0,১৮-০৪-২০২১,২১:০০:০০,১২৬৬১,১২৬৬১,০.০০
1,১৮-০৪-২০২১,২০:০০:০০,১২৪৫১,১২৪৫১,০.০০
2,১৮-০৪-২০২১,১৯:৩০:০০,১২৩৯৮,১২৩৯৮,০.০০
3,১৮-০৪-২০২১,১৯:০০:০০,১১৮০৩,১১৮০৩,০.০০
4,১৮-০৪-২০২১,১৮:০০:০০,১০১৮২,১০১৮২,০.০০
...,...,...,...,...,...
5044,০৭-১০-২০২০,০৪:০০:০০,৯৬৩৭,৯৬৩৭,০.০০
5045,০৭-১০-২০২০,০৩:০০:০০,৯৯২২,৯৯২২,০.০০
5046,০৭-১০-২০২০,০২:০০:০০,১০১৬২,১০১৬২,০.০০
5047,০৭-১০-২০২০,০১:০০:০০,১০৬০০,১০৬০০,০.০০


**Translating the data from Bengali to English**

In [80]:

df = pd.read_csv('/content/Data Multiple Page.csv')

# creating dictionary for translation table 
trans_dict ={"০":"0","১": "1", "২": "2", "৩": "3", "৪":"4", "৫":"5","৬":"6","৭":"7","৮":"8","৯":"9"} 
  
# creating translate table from dictionary 
trans_table ="০১২৩৪৫৬৭৮৯".maketrans(trans_dict) 
  
# translating through passed transtable 
df['Produced']= df['Produced'].str.translate(trans_table)
df['Date'] = df['Date'].str.translate(trans_table)
df['Time'] = df['Time'].str.translate(trans_table)
df['Demand'] = df['Demand'].str.translate(trans_table)
df['Depriciation'] = df['Depriciation'].str.translate(trans_table)
df.to_csv('Data(Multiple Pages).csv', index = False )
df.head()


Unnamed: 0,Date,Time,Produced,Demand,Depriciation
0,18-04-2021,21:00:00,12661,12661,0.0
1,18-04-2021,20:00:00,12451,12451,0.0
2,18-04-2021,19:30:00,12398,12398,0.0
3,18-04-2021,19:00:00,11803,11803,0.0
4,18-04-2021,18:00:00,10182,10182,0.0
