In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Import necessary packages
from bs4 import BeautifulSoup
import requests
import re
url="https://en.wikipedia.org/wiki/2019_Nigerian_general_election"
# Make a GET request to fetch the raw HTML content
html_content = requests.get(url).text
# Parse HTML code for the entire site
soup = BeautifulSoup(html_content, "lxml")
#print(soup.prettify()) # print the parsed data of html

In [3]:
election_tables = soup.find_all("table", attrs={"class": "wikitable"})
print("Number of tables on site: ",len(election_tables))

Number of tables on site:  8


In [4]:
# Lets go ahead and scrape first table with HTML code election_tables[0]
table1 = election_tables[4]
# the head will form our column names
body = table1.find_all("tr")
# Head values (Column names) are the first items of the body list
head = body[0] # 0th item is the header row
body_rows = body[1:] # All other items becomes the rest of the rows

headings = []
for item in head.find_all("th"): 
    # convert the th elements to text and strip "\n"
    item = (item.text).rstrip("\n")
    headings.append(item)
print(headings)

['State', 'Buhari', 'Atiku', 'Nicolas', 'Sowore', 'Moghalu', 'Durotoye', 'Duke', 'Mailafia']


In [5]:
all_rows = [] # will be a list for list for all rows
for row_num in range(len(body_rows)): # A row at a time
    row = [] # this will old entries for one row
    for row_item in body_rows[row_num].find_all("td"): 
        # remove \xa0 and \n and comma from row_item.text
        aa = re.sub("(\xa0)|(\n)|,","",row_item.text)
        row.append(aa)
    # append one row to all_rows
    all_rows.append(row)

In [6]:
new_rows = []
for row in all_rows:
    new_row = [t for i,t in enumerate(row) if i==0 or i%2!=0]
    new_rows.append(new_row)
print(new_rows)

[[], ['Abia', '85058', '219698', '1489', '212', '88', '720', '472', '336'], ['Adamawa', '378078', '410266', '3670', '282', '109', '162', '978', '3989'], ['Akwa Ibom', '175429', '395832', '1902', '222', '118', '95', '92', '230'], ['Anambra', '33298', '524738', '4374', '124', '4091', '45', '932', '227'], ['Bauchi', '798428 ', '209313', '2104', '183', '112', '46', '516', '296'], ['Bayelsa', '118821', '197933', '1584', '126', '50', '37', '124', '1078'], ['Benue', '347668', '356817', '2793', '309', '557', '201', '4927', '554'], ['Borno', '836496', '71788', '', '269', '78', '29', '322', '301'], ['Cross River', '117302', '295737', '', '242', '217', '88', '1395', '326'], ['Delta', '221292', '594068', '', '1626', '497', '320', '1745', '1075'], ['Ebonyi', '90726', '258573', '', '205', '192', '683', '452', '213'], ['Edo', '267842', '275691', '', '3106', '531', '273', '184', '850'], ['Ekiti', '219231', '154032', '', '400', '68', '88', '48', '406'], ['Enugu', '54423', '355553', '', '219', '1379', '

In [7]:
# all_rows becomes our data and headings the column names
df = pd.DataFrame(data=new_rows,columns=headings)
df.head()

Unnamed: 0,State,Buhari,Atiku,Nicolas,Sowore,Moghalu,Durotoye,Duke,Mailafia
0,,,,,,,,,
1,Abia,85058.0,219698.0,1489.0,212.0,88.0,720.0,472.0,336.0
2,Adamawa,378078.0,410266.0,3670.0,282.0,109.0,162.0,978.0,3989.0
3,Akwa Ibom,175429.0,395832.0,1902.0,222.0,118.0,95.0,92.0,230.0
4,Anambra,33298.0,524738.0,4374.0,124.0,4091.0,45.0,932.0,227.0


In [8]:
df.tail()

Unnamed: 0,State,Buhari,Atiku,Nicolas,Sowore,Moghalu,Durotoye,Duke,Mailafia
35,Yobe,497914.0,50763.0,,137.0,36.0,37.0,180.0,162.0
36,Zamfara,438682.0,125423.0,,186.0,44.0,24.0,81.0,186.0
37,FCT,152224.0,259997.0,,583.0,1083.0,652.0,410.0,246.0
38,Total,15191847.0,11262978.0,110196.0,33953.0,21886.0,16779.0,34746.0,97874.0
39,Source: BBC This Day Vanguard[49][50][51],,,,,,,,


In [9]:
url="https://www.bbc.co.uk/news/resources/idt-f0b25208-4a1d-4068-a204-940cbe88d1d3"

html_content = requests.get(url).text
soup = BeautifulSoup(html_content, "lxml")
bbc_tables = soup.find_all("table", attrs={"class": "r-table__table"})
print("Number of tables on site: ",len(bbc_tables))

Number of tables on site:  1


In [10]:
df1=pd.read_html(str(bbc_tables))
# convert list to dataframe
df_=pd.DataFrame(df1[0])
print(df_.head())

       State     APC     PDP   PCP   ADC   APGA
0       Abia   85058  219698  1489   336   9638
1    Adamawa  378078  410266  3670  3989    159
2  Akwa Ibom  175429  395832  1902   230     61
3    Anambra   33298  524738  4374   227  30034
4     Bauchi  798428  209313  2104   296    149


In [11]:
election_da = pd.merge(df, df_, on='State', how='left')

In [12]:
election_da.head()

Unnamed: 0,State,Buhari,Atiku,Nicolas,Sowore,Moghalu,Durotoye,Duke,Mailafia,APC,PDP,PCP,ADC,APGA
0,,,,,,,,,,,,,,
1,Abia,85058.0,219698.0,1489.0,212.0,88.0,720.0,472.0,336.0,85058.0,219698.0,1489.0,336.0,9638.0
2,Adamawa,378078.0,410266.0,3670.0,282.0,109.0,162.0,978.0,3989.0,378078.0,410266.0,3670.0,3989.0,159.0
3,Akwa Ibom,175429.0,395832.0,1902.0,222.0,118.0,95.0,92.0,230.0,175429.0,395832.0,1902.0,230.0,61.0
4,Anambra,33298.0,524738.0,4374.0,124.0,4091.0,45.0,932.0,227.0,33298.0,524738.0,4374.0,227.0,30034.0


In [13]:
election_da.shape

(40, 14)

In [14]:
election_da['State'].unique()

array([None, 'Abia', 'Adamawa', 'Akwa Ibom', 'Anambra', 'Bauchi',
       'Bayelsa', 'Benue', 'Borno', 'Cross River', 'Delta', 'Ebonyi',
       'Edo', 'Ekiti', 'Enugu', 'Gombe', 'Imo', 'Jigawa', 'Kaduna',
       'Kano', 'Katsina', 'Kebbi', 'Kogi', 'Kwara', 'Lagos', 'Nassarawa',
       'Niger', 'Ogun', 'Ondo', 'Osun', 'Oyo', 'Plateau', 'Rivers',
       'Sokoto', 'Taraba', 'Yobe', 'Zamfara', 'FCT', 'Total',
       'Source: BBC This Day Vanguard[49][50][51]'], dtype=object)

In [15]:
election_data = pd.merge(df, df_, on='State', how='outer')

In [16]:
election_data.head()

Unnamed: 0,State,Buhari,Atiku,Nicolas,Sowore,Moghalu,Durotoye,Duke,Mailafia,APC,PDP,PCP,ADC,APGA
0,,,,,,,,,,,,,,
1,Abia,85058.0,219698.0,1489.0,212.0,88.0,720.0,472.0,336.0,85058.0,219698.0,1489.0,336.0,9638.0
2,Adamawa,378078.0,410266.0,3670.0,282.0,109.0,162.0,978.0,3989.0,378078.0,410266.0,3670.0,3989.0,159.0
3,Akwa Ibom,175429.0,395832.0,1902.0,222.0,118.0,95.0,92.0,230.0,175429.0,395832.0,1902.0,230.0,61.0
4,Anambra,33298.0,524738.0,4374.0,124.0,4091.0,45.0,932.0,227.0,33298.0,524738.0,4374.0,227.0,30034.0


In [17]:
election_data.tail()

Unnamed: 0,State,Buhari,Atiku,Nicolas,Sowore,Moghalu,Durotoye,Duke,Mailafia,APC,PDP,PCP,ADC,APGA
37,FCT,152224.0,259997.0,,583.0,1083.0,652.0,410.0,246.0,152224.0,259997.0,2921.0,246.0,255.0
38,Total,15191847.0,11262978.0,110196.0,33953.0,21886.0,16779.0,34746.0,97874.0,,,,,
39,Source: BBC This Day Vanguard[49][50][51],,,,,,,,,,,,,
40,Nasarawa,,,,,,,,,289903.0,283847.0,1868.0,339.0,1523.0
41,Total:,,,,,,,,,15191847.0,11262978.0,107286.0,97874.0,66851.0


In [18]:
election_data["State"].unique()

array([None, 'Abia', 'Adamawa', 'Akwa Ibom', 'Anambra', 'Bauchi',
       'Bayelsa', 'Benue', 'Borno', 'Cross River', 'Delta', 'Ebonyi',
       'Edo', 'Ekiti', 'Enugu', 'Gombe', 'Imo', 'Jigawa', 'Kaduna',
       'Kano', 'Katsina', 'Kebbi', 'Kogi', 'Kwara', 'Lagos', 'Nassarawa',
       'Niger', 'Ogun', 'Ondo', 'Osun', 'Oyo', 'Plateau', 'Rivers',
       'Sokoto', 'Taraba', 'Yobe', 'Zamfara', 'FCT', 'Total',
       'Source: BBC This Day Vanguard[49][50][51]', 'Nasarawa', 'Total:'],
      dtype=object)

In [19]:
election_data.shape

(42, 14)

In [20]:
election_data.to_csv("election_data.csv", index= False)