### A Step-by-step Guide on Python Web Scraping a Wikipedia Page

***Step 1: Fetch the web page and convert the HTML page into text with the help of the Python requests library***

In [10]:
# Import the requests library, used to query a website over HTTP.
import requests
# URL of the Wikipedia page we want to scrape.
Link = "https://en.wikipedia.org/wiki/List_of_cities_by_number_of_billionaires"
# Fetch the page. The timeout stops the cell from hanging forever on a stalled
# connection, and raise_for_status() turns an HTTP error (4xx/5xx) into an
# exception instead of letting the later steps silently parse an error page.
response = requests.get(Link, timeout=30)
response.raise_for_status()
# Body of the response decoded to text (the raw HTML of the page).
Link_text = response.text
print(Link_text)

<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>List of cities by number of billionaires - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"XelGbApAAEQAAJZD63sAAABU","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_cities_by_number_of_billionaires","wgTitle":"List of cities by number of billionaires","wgCurRevisionId":929405489,"wgRevisionId":929405489,"wgArticleId":45271876,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Al

***Step 2***: In order to fetch useful information, convert Link_text (which is of string data type) into a BeautifulSoup object. Import the BeautifulSoup class from the bs4 library

In [11]:
# BeautifulSoup pulls data out of HTML and XML documents.
from bs4 import BeautifulSoup
# Parse the raw HTML text into a BeautifulSoup object using the lxml parser.
soup = BeautifulSoup(Link_text, features='lxml')
print(soup)

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of cities by number of billionaires - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"XelGbApAAEQAAJZD63sAAABU","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_cities_by_number_of_billionaires","wgTitle":"List of cities by number of billionaires","wgCurRevisionId":929405489,"wgRevisionId":929405489,"wgArticleId":45271876,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Al

***Step 3***: With the help of the prettify() function, make the indentation proper

In [12]:
# prettify() returns the parsed document as a string, re-indented so that
# nested tags are easier to read; print it to inspect the page structure.
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of cities by number of billionaires - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"XelGbApAAEQAAJZD63sAAABU","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_cities_by_number_of_billionaires","wgTitle":"List of cities by number of billionaires","wgCurRevisionId":929405489,"wgRevisionId":929405489,"wgArticleId":45271876,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"

***Step 4***: To fetch the web page title, use soup.title

In [13]:
# Show the page's <title> element, including its opening and closing tags.
print(soup.title)

<title>List of cities by number of billionaires - Wikipedia</title>


***Step 5***: We want only the string part of the title, not the tags

In [14]:
# .string gives just the text inside the <title> element, without the tags.
print(soup.title.string)

List of cities by number of billionaires - Wikipedia


***Step 6***: We can also explore <a></a> tags in the soup object

In [15]:
# Attribute access on the soup returns the first <a> tag in the document.
soup.a

<a id="top"></a>

***Step 7***: Explore all <a></a> tags

In [16]:
# find_all returns every <a> tag in the document as a list-like result.
soup.find_all('a')

[<a id="top"></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#p-search">Jump to search</a>,
 <a class="mw-redirect" href="/wiki/Forbes_Magazine" title="Forbes Magazine">Forbes Magazine</a>,
 <a href="/wiki/New_York_City" title="New York City">New York City</a>,
 <a href="/wiki/Moscow" title="Moscow">Moscow</a>,
 <a href="/wiki/Hong_Kong" title="Hong Kong">Hong Kong</a>,
 <a href="#cite_note-1">[1]</a>,
 <a href="#Business_Insider"><span class="tocnumber">1</span> <span class="toctext"><i>Business Insider</i></span></a>,
 <a href="#The_Times_of_London"><span class="tocnumber">2</span> <span class="toctext"><i>The Times</i> of London</span></a>,
 <a href="#UBS_Wealth_Census"><span class="tocnumber">3</span> <span class="toctext"><i>UBS Wealth Census</i></span></a>,
 <a href="#See_also"><span class="tocnumber">4</span> <span class="toctext">See also</span></a>,
 <a href="#References"><span class="tocnumber">5</span> <span class="toctex

***Step 8***: Again, just the way we fetched all the <a> tags, we will fetch all the table tags

In [17]:
# Collect every <table> element on the page and print them.
all_table = soup.find_all('table')
print(all_table)

[<table class="wikitable">
<tbody><tr>
<td><b>City</b>
</td>
<td><b>Country</b>
</td>
<td><b>Billionaires</b>
</td></tr>
<tr>
<td><a href="/wiki/New_York_City" title="New York City">New York City</a>
</td>
<td><span class="flagicon"><img alt="" class="thumbborder" data-file-height="650" data-file-width="1235" decoding="async" height="12" src="//upload.wikimedia.org/wikipedia/en/thumb/a/a4/Flag_of_the_United_States.svg/23px-Flag_of_the_United_States.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/a/a4/Flag_of_the_United_States.svg/35px-Flag_of_the_United_States.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/a/a4/Flag_of_the_United_States.svg/46px-Flag_of_the_United_States.svg.png 2x" width="23"/> </span><a href="/wiki/United_States" title="United States">USA</a>
</td>
<td>103
</td></tr>
<tr>
<td><a href="/wiki/Hong_Kong" title="Hong Kong">Hong Kong</a>
</td>
<td><span class="flagicon"><img alt="" class="thumbborder" data-file-height="600" data-file-width="900" decodi

***Step 9***: Since our aim is to get the list of cities from the wiki page, we need to find out the table's class name. Go to the web page, right-click the table and choose ‘Inspect’ (or press Ctrl+Shift+C and click the table) to see its class.

***Step 10***: Now, fetch the first table tag with the class name ‘wikitable’

In [20]:
# Fetch the first <table> whose class is "wikitable" (find returns only the
# first match); this is the table with the City / Country / Billionaires rows.
our_table = soup.find('table', class_= 'wikitable')
print(our_table)

<table class="wikitable">
<tbody><tr>
<td><b>City</b>
</td>
<td><b>Country</b>
</td>
<td><b>Billionaires</b>
</td></tr>
<tr>
<td><a href="/wiki/New_York_City" title="New York City">New York City</a>
</td>
<td><span class="flagicon"><img alt="" class="thumbborder" data-file-height="650" data-file-width="1235" decoding="async" height="12" src="//upload.wikimedia.org/wikipedia/en/thumb/a/a4/Flag_of_the_United_States.svg/23px-Flag_of_the_United_States.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/a/a4/Flag_of_the_United_States.svg/35px-Flag_of_the_United_States.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/a/a4/Flag_of_the_United_States.svg/46px-Flag_of_the_United_States.svg.png 2x" width="23"/> </span><a href="/wiki/United_States" title="United States">USA</a>
</td>
<td>103
</td></tr>
<tr>
<td><a href="/wiki/Hong_Kong" title="Hong Kong">Hong Kong</a>
</td>
<td><span class="flagicon"><img alt="" class="thumbborder" data-file-height="600" data-file-width="900" decodin

***Step 11***: We can see that the information we want to retrieve from the table is inside <a> tags. So, find all the <a> tags in our_table and store them in table_links.

In [21]:
# Within the selected table, collect every <a> tag (the city and country links).
table_links = our_table.find_all('a')
print(table_links)

[<a href="/wiki/New_York_City" title="New York City">New York City</a>, <a href="/wiki/United_States" title="United States">USA</a>, <a href="/wiki/Hong_Kong" title="Hong Kong">Hong Kong</a>, <a href="/wiki/China" title="China">China</a>, <a href="/wiki/San_Francisco" title="San Francisco">San Francisco</a>, <a href="/wiki/United_States" title="United States">USA</a>, <a href="/wiki/Moscow" title="Moscow">Moscow</a>, <a href="/wiki/Russia" title="Russia">Russia</a>, <a href="/wiki/London" title="London">London</a>, <a href="/wiki/United_Kingdom" title="United Kingdom">UK</a>, <a href="/wiki/Beijing" title="Beijing">Beijing</a>, <a href="/wiki/China" title="China">China</a>, <a href="/wiki/Mumbai" title="Mumbai">Mumbai</a>, <a href="/wiki/India" title="India">India</a>, <a href="/wiki/Singapore" title="Singapore">Singapore</a>, <a href="/wiki/Singapore" title="Singapore">Singapore</a>, <a href="/wiki/Dubai" title="Dubai">Dubai</a>, <a href="/wiki/United_Arab_Emirates" title="United Arab

***Step 12***: To collect the titles into a list, iterate over table_links and read each link's title attribute using the get() method

In [24]:
# Build the list of link titles in one pass: the title attribute of each
# <a> tag (city and country names, in table order).
billionaires = [link.get('title') for link in table_links]
print(billionaires)

['New York City', 'United States', 'Hong Kong', 'China', 'San Francisco', 'United States', 'Moscow', 'Russia', 'London', 'United Kingdom', 'Beijing', 'China', 'Mumbai', 'India', 'Singapore', 'Singapore', 'Dubai', 'United Arab Emirates', 'Shenzhen', 'China']


***Step 13***: Now that we have our required data in the form of a list, we will be using Python Pandas library to save the data in an Excel file. Before that, we have to convert the list into a DataFrame

In [25]:
# Convert the list into a single-column DataFrame; the column is labelled 0
# because no column name is given.
import pandas as pd
df = pd.DataFrame(billionaires)
print(df)

                       0
0          New York City
1          United States
2              Hong Kong
3                  China
4          San Francisco
5          United States
6                 Moscow
7                 Russia
8                 London
9         United Kingdom
10               Beijing
11                 China
12                Mumbai
13                 India
14             Singapore
15             Singapore
16                 Dubai
17  United Arab Emirates
18              Shenzhen
19                 China


***Step 14***: Use the following method to write data into an Excel file.

In [26]:
# Save the DataFrame to an Excel workbook, on a sheet named 'List'.
# ExcelWriter is used as a context manager so the workbook is written and
# closed when the block exits, even if to_excel raises. The older explicit
# writer.save() call is deprecated and no longer exists in pandas 2.x.
with pd.ExcelWriter('cities_with_billionaires.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='List')

Now our data has been saved into an Excel workbook with the name ‘cities_with_billionaires.xlsx’ and inside a sheet named ‘List’.

***Step 15***: To verify that the Excel workbook was saved correctly, read the file back using read_excel

In [28]:
# Read the workbook back to check that it was written correctly.
df1= pd.read_excel('cities_with_billionaires.xlsx')
df1

Unnamed: 0.1,Unnamed: 0,0
0,0,New York City
1,1,United States
2,2,Hong Kong
3,3,China
4,4,San Francisco
5,5,United States
6,6,Moscow
7,7,Russia
8,8,London
9,9,United Kingdom


In [30]:
# Select the column at position 1, i.e. the data column (named 0) that follows
# the saved index column.
df1.iloc[:,1]

0            New York City
1            United States
2                Hong Kong
3                    China
4            San Francisco
5            United States
6                   Moscow
7                   Russia
8                   London
9           United Kingdom
10                 Beijing
11                   China
12                  Mumbai
13                   India
14               Singapore
15               Singapore
16                   Dubai
17    United Arab Emirates
18                Shenzhen
19                   China
Name: 0, dtype: object

In [31]:
# Drop the leftover index column that to_excel wrote into the sheet.
# drop(..., errors='ignore') with a rebind keeps this cell idempotent:
# `del df1['Unnamed: 0']` raises KeyError when the cell is run a second time,
# because the column is already gone.
df1 = df1.drop(columns=['Unnamed: 0'], errors='ignore')
df1

Unnamed: 0,0
0,New York City
1,United States
2,Hong Kong
3,China
4,San Francisco
5,United States
6,Moscow
7,Russia
8,London
9,United Kingdom
