# Scraping Data

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# Source page: Wikipedia's list of the largest companies in the United Kingdom.
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_Kingdom'

# Send a descriptive User-Agent (Wikimedia asks automated clients to identify
# themselves) and bound the wait so a stalled connection cannot hang the notebook.
page = requests.get(
    url,
    headers={'User-Agent': 'uk-companies-scraper/1.0 (educational notebook)'},
    timeout=30,
)
# Fail here on an HTTP error instead of silently parsing an error page below.
page.raise_for_status()

# Name the parser explicitly: the generic 'html' feature lets BeautifulSoup pick
# whichever HTML parser happens to be installed, so results could vary by machine.
soup = BeautifulSoup(page.text, 'html.parser')

In [3]:
#print(soup)

In [4]:
# The company table was located by inspecting the page in the browser:
# it is the first <table> matching the class string "wikitable sortable".
table = soup.find('table', class_="wikitable sortable")

# find() returns None when nothing matches; stop with a clear message rather
# than letting a later cell fail with an AttributeError on None.
if table is None:
    raise ValueError('No table with class "wikitable sortable" was found on the page')

In [5]:
# Column headings are marked up as <th> cells, so gather every <th> in the table.
titles = table.find_all(name='th')

In [6]:
# Reduce each header cell to its text, trimming surrounding whitespace and newlines.
table_titles = [header.get_text().strip() for header in titles]

In [7]:
# Show the extracted column headings so they can be checked against the page.
print(table_titles)

['Rank', 'Fortune 500rank', 'Name', 'Industry', 'Revenue(USD millions)', 'Profits(USD millions)', 'Assets(USD millions)', 'Employees', 'Headquarters']


In [8]:
# Start from an empty frame whose columns are the headings scraped from the page.
df = pd.DataFrame(data=None, columns=table_titles)
df

Unnamed: 0,Rank,Fortune 500rank,Name,Industry,Revenue(USD millions),Profits(USD millions),Assets(USD millions),Employees,Headquarters


In [9]:
# Each record of the table sits in its own <tr> element in the HTML,
# so collect every <tr> found inside the table.
column_data = table.find_all(name='tr')

In [10]:
# Turn every table row into a list of cleaned cell strings.
# Rows that contain no <td> cell (such as the heading row, which only holds
# <th> cells) are skipped explicitly instead of relying on a fixed start offset.
table_rows = []
for row in column_data:
    cells = row.find_all('td')
    if not cells:
        continue
    # .text flattens the cell to a string; strip() trims whitespace/newlines.
    table_rows.append([cell.text.strip() for cell in cells])

# Build the frame in a single step rather than appending with df.loc[len(df)]
# once per row: this avoids repeatedly resizing the frame and makes the cell
# safe to re-run (re-running no longer appends the same rows a second time).
# A row whose cell count differs from the column count still raises ValueError.
df = pd.DataFrame(table_rows, columns=df.columns)

df.head(10)

Unnamed: 0,Rank,Fortune 500rank,Name,Industry,Revenue(USD millions),Profits(USD millions),Assets(USD millions),Employees,Headquarters
0,1,15,Shell plc,Oil and Gas,272657,20101,404379,82000,London
1,2,35,BP,Oil and Gas,164195,7565,287272,65000,London
2,3,126,Tesco,Retail,84192,2031,66219,231223,Welwyn Garden City
3,4,149,HSBC,Banking,77330,13917,2957939,219697,London
4,5,198,Aviva,Insurance,64240,2703,485481,22062,London
5,6,201,Rio Tinto,Mining,63495,21094,102896,49345,London
6,7,203,Legal & General,Insurance,62504,2819,789066,10743,London
7,8,205,Unilever,Consumer goods,62006,7151,85383,148044,London
8,9,222,Lloyds Banking Group,Banking,58476,7954,1200620,57955,London
9,10,247,Vodafone,Telecommunication,52931,2424,170749,96941,Newbury


In [11]:
# Save the scraped table as CSV. The path is relative to the notebook's working
# directory so the cell runs on any machine, not only one that has a
# C:\Users\Dean folder. index=False leaves the row index out of the file.
df.to_csv('uk_companies.csv', index=False)