# Web scraping with Selenium

## Nigeria's 2019 General Elections

In [3]:
#importing our libraries
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import time
import pandas as pd

In [30]:
# Start a Chrome session for the scrape: point Selenium at the chromedriver
# binary, load the results page, then maximize the browser window.
website = 'https://www.bbc.co.uk/news/resources/idt-f0b25208-4a1d-4068-a204-940cbe88d1d3'
path = 'C:/Users/user/chromedriver_win32/chromedriver.exe'

driver = webdriver.Chrome(path)
driver.get(website)
driver.maximize_window()

In [31]:
# Locate the <tbody> of the results table; later cells read its <tr> rows.
results_table_xpath = "//div[@class='r-display']/div[@class='r-info-table']/table/tbody"
election_results = driver.find_element_by_xpath(results_table_xpath)

In [44]:
# The page exposes each state's figures through a dynamic dropdown, so the
# Select helper is used to drive it programmatically.
state_dropdown = Select(driver.find_element_by_xpath("//select[@class= 'r-display__dropdown']"))

# Accumulators for the scraped values.
# states / accredited_votes / valid_votes receive one entry per dropdown option;
# candidate / pol_party / votes / percent receive one entry per result row.
states = []
candidate = []
pol_party = []
votes = []
percent = []
accredited_votes = []
valid_votes = []

# Read the option labels once, from the very dropdown that is being driven
# (rather than from a generic '//select' lookup), so the recorded state name
# always belongs to the option that gets selected.
option_labels = [option.text for option in state_dropdown.options]
last_state = len(option_labels)

# Walk the dropdown by position. select_by_index is 0-based, so a single
# counter is enough to both select the option and look up its label.
# NOTE(review): election_results is located once, before this loop; this
# relies on the page updating the table in place when the selection changes.
for index in range(last_state):
    state_dropdown.select_by_index(index)
    time.sleep(3)  # give the page time to refresh the figures for this state
    states.append(option_labels[index])

    # Each info-list item is read as two lines of text; the value is the
    # second line (index 1 after splitting on the newline).
    accredited_votes.append(driver.find_element_by_xpath('//ul[@class="r-display__info-list"]/li[1]').text.split('\n')[1])
    valid_votes.append(driver.find_element_by_xpath('//ul[@class="r-display__info-list"]/li[2]').text.split('\n')[1])

    for result in election_results.find_elements_by_tag_name('tr'):
        # The first cell holds both pieces: text before '(' is the candidate,
        # and the last space-separated token (parentheses stripped) is the
        # party. Read it once instead of querying the browser twice.
        name_cell = result.find_element_by_xpath('./td[1]').text
        candidate.append(name_cell.split('(')[0].strip())  # drop the space left before '('
        pol_party.append(name_cell.split(' ')[-1].strip('()'))
        votes.append(result.find_element_by_xpath('./td[2]').text)
        percent.append(result.find_element_by_xpath('./td[3]').text)
    
    

## Converting our data to a dataframe using Pandas

In [45]:
# Gather the per-state figures into one table, strip every comma from the
# string cells, and save the result as a CSV file.
turnout_columns = {
    'states': states,
    'accredited_votes': accredited_votes,
    'valid_votes': valid_votes,
}
votes_num_df = pd.DataFrame(turnout_columns).replace(',', '', regex=True)
votes_num_df.to_csv('nig_elect2019_voters_num.csv', index=False)

# preview the first 10 rows of the dataframe
votes_num_df.head(10)

Unnamed: 0,states,accredited_votes,valid_votes
0,Abia,361561,323291
1,Adamawa,874920,811534
2,Akwa Ibom,695677,578775
3,Anambra,675273,605734
4,Bauchi,1075330,1024307
5,Bayelsa,344237,321767
6,Benue,786069,728912
7,Borno,987290,919786
8,Cross River,461033,421901
9,Delta,891647,829762


In [46]:
# Every state contributes several result rows (candidate, party, votes,
# percent) but only one entry in `states`, so the column lengths do not match:
#   states = ['Abia'], pol_party = ['PDP', 'APC', 'APGA', ...]
# Repeat each state name once per result row so all columns line up:
#   state_name = ['Abia', 'Abia', 'Abia', ...], pol_party = ['PDP', 'APC', 'APGA', ...]
#
# The number of rows per state is derived from the scraped data instead of
# being hard-coded, so the cell keeps working if the page lists a different
# number of candidates per state.
rows_per_state, leftover = divmod(len(candidate), len(states)) if states else (0, 0)
if leftover:
    raise ValueError(
        f'{len(candidate)} result rows cannot be split evenly across {len(states)} states'
    )

state_name = [name for name in states for _ in range(rows_per_state)]

election_result_df = pd.DataFrame({'state_name': state_name, 'candidates': candidate,
                                   'political_party': pol_party, 'votes': votes, 'percent_diff%': percent})

# Strip thousands separators and percent signs, save to CSV, and preview the
# first 10 rows.
election_result_df.replace({',': '', '%': ''}, regex=True, inplace=True)
election_result_df.to_csv('nig_elect2019_results.csv', index=False)
election_result_df.head(10)

Unnamed: 0,state_name,candidates,political_party,votes,percent_diff%
0,Abia,Atiku Abubakar,PDP,219698,67.96
1,Abia,Muhammadu Buhari,APC,85058,26.31
2,Abia,Gbor John Wilson Terwase,APGA,9638,2.98
3,Abia,Felix Nicolas,PCP,1489,0.46
4,Abia,Kingsley Moghalu,YPP,720,0.22
5,Adamawa,Atiku Abubakar,PDP,410266,50.55
6,Adamawa,Muhammadu Buhari,APC,378078,46.59
7,Adamawa,Obadiah Mailafia,ADC,3989,0.49
8,Adamawa,Felix Nicolas,PCP,3670,0.45
9,Adamawa,Omoyele Sowore,AAC,282,0.03


In [47]:
# Scraping is finished: end the WebDriver session, which closes the browser
# window that was showing the page being scraped.
driver.quit()