In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:
# Step 1: Address of the English Wikipedia article about the company 3i
url = "https://en.wikipedia.org/wiki/3i"

In [4]:
# Step 2: Sending a request to the Wikipedia page and getting the content.
# The timeout (in seconds) keeps this cell from hanging indefinitely if the
# server never answers.
page = requests.get(url, timeout=30)

# Stop here with an HTTPError on a 4xx/5xx answer instead of silently parsing
# an error page in the next step.
page.raise_for_status()

# Step 3: Parsing the content of the page using BeautifulSoup to make it readable
soup = BeautifulSoup(page.text, 'html.parser')

In [5]:
# Step 4: Printing the beginning of the page's HTML (useful for understanding the structure).
# Only the first 2000 characters are shown so the notebook output is not flooded
# with the entire document.
print(str(soup)[:2000])

<!DOCTYPE html>

<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>3i - Wikipedia</title>
<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limi

In [13]:
# Step 5: Looking at the first table on the page (there can be multiple tables on Wikipedia pages).
# limit=1 stops the search after the first match, since only that one is displayed here.
soup.find_all('table', limit=1)[0]

<table class="infobox ib-company vcard"><caption class="infobox-title fn org">3i Group plc</caption><tbody><tr><td class="infobox-image ib-company-logo logo" colspan="2"><span class="mw-default-size" typeof="mw:File/Frameless"><a class="mw-file-description" href="/wiki/File:3i_Group_Plc_logo.svg"><img class="mw-file-element" data-file-height="311" data-file-width="322" decoding="async" height="212" src="//upload.wikimedia.org/wikipedia/en/thumb/7/76/3i_Group_Plc_logo.svg/220px-3i_Group_Plc_logo.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/7/76/3i_Group_Plc_logo.svg/330px-3i_Group_Plc_logo.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/7/76/3i_Group_Plc_logo.svg/440px-3i_Group_Plc_logo.svg.png 2x" width="220"/></a></span></td></tr><tr><th class="infobox-label" scope="row">Formerly</th><td class="infobox-data nickname"><style data-mw-deduplicate="TemplateStyles:r1126788409">.mw-parser-output .plainlist ol,.mw-parser-output .plainlist ul{line-height:inherit;list-sty

In [16]:
# Step 6: Locating the one table that holds the company information.
# The table is picked out by its 'class' attribute value ("infobox ib-company vcard").
soup.find('table', attrs={'class': 'infobox ib-company vcard'})

<table class="infobox ib-company vcard"><caption class="infobox-title fn org">3i Group plc</caption><tbody><tr><td class="infobox-image ib-company-logo logo" colspan="2"><span class="mw-default-size" typeof="mw:File/Frameless"><a class="mw-file-description" href="/wiki/File:3i_Group_Plc_logo.svg"><img class="mw-file-element" data-file-height="311" data-file-width="322" decoding="async" height="212" src="//upload.wikimedia.org/wikipedia/en/thumb/7/76/3i_Group_Plc_logo.svg/220px-3i_Group_Plc_logo.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/7/76/3i_Group_Plc_logo.svg/330px-3i_Group_Plc_logo.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/7/76/3i_Group_Plc_logo.svg/440px-3i_Group_Plc_logo.svg.png 2x" width="220"/></a></span></td></tr><tr><th class="infobox-label" scope="row">Formerly</th><td class="infobox-data nickname"><style data-mw-deduplicate="TemplateStyles:r1126788409">.mw-parser-output .plainlist ol,.mw-parser-output .plainlist ul{line-height:inherit;list-sty

In [17]:
# Keep a handle on the company infobox for the extraction steps that follow.
# A CSS selector matches the three classes regardless of their order in the
# attribute, whereas comparing against the exact string
# "infobox ib-company vcard" breaks if the order or spacing ever changes.
table = soup.select_one('table.infobox.ib-company.vcard')

# Fail with a clear message here rather than with an AttributeError on None later.
if table is None:
    raise ValueError('Company infobox table not found on the page')

In [18]:
# Step 7: Starting with two empty lists - one collects the labels
# (like "Founded", "Revenue"), the other collects the matching information
Labels, Information = [], []

In [20]:
# Walk through the infobox rows and collect each label/value pair.
# Both lists are emptied first so that re-running this cell replaces the
# previous results instead of appending a second copy of every row.
Labels.clear()
Information.clear()

for row in table.find_all('tr'):
    header = row.find('th', {'class': 'infobox-label'})
    value = row.find('td', {'class': 'infobox-data'})

    # Rows without both cells (e.g. the logo row) carry no label/value pair; skip them.
    if header is None or value is None:
        continue

    Labels.append(header.text.strip())
    Information.append(value.text.strip())

In [21]:
# Importing pandas to work with data in table format
import pandas as pd

In [22]:
# Step 8: Building a two-column DataFrame from the collected labels and information
df = pd.DataFrame(dict(Labels=Labels, Information=Information))

In [23]:
# Step 9: Printing the DataFrame for reference (long values are shortened with "..." in this view)
print(df)

                 Labels                                        Information
0              Formerly  Finance for Industry Public Limited Company (1...
1          Company type                             Public limited company
2             Traded as                         LSE: IIIFTSE 100 Component
3                  ISIN                                       GB00B1YW4409
4              Industry                                  Corporate finance
5               Founded                          1945; 79 years ago (1945)
6               Founder   Bank of England and a syndicate of British banks
7          Headquarters                                London, England, UK
8            Key people  David Hutchison (chairman)Simon Borrows (CEO) [2]
9              Services            Private equityInfrastructure investment
10     Operating income                           £4,579 million (2023)[3]
11           Net income                           £4,577 million (2023)[3]
12                  AUM  

In [24]:
# Step 10: Printing the DataFrame in a cleaner way, without row numbers.
# to_string() renders the frame as plain text, so long values appear in full.
print(df.to_string(index=False))

             Labels                                                                                           Information
           Formerly Finance for Industry Public Limited Company (1973–1983)Investors in Industry Group plc (1983–1988)[1]
       Company type                                                                                Public limited company
          Traded as                                                                            LSE: IIIFTSE 100 Component
               ISIN                                                                                          GB00B1YW4409
           Industry                                                                                     Corporate finance
            Founded                                                                             1945; 79 years ago (1945)
            Founder                                                      Bank of England and a syndicate of British banks
       Headquarters     

In [26]:
# Importing regular expressions library to help clean the data
import re

In [27]:
# Step 11: Defining a function to remove references (like [1], [2]) from the information
def remove_references(text):
    """Remove numeric citation markers such as ``[1]`` or ``[23]`` from ``text``.

    A marker that was separated from the text by a space (``'CEO [2]'``) would
    otherwise leave that space dangling at the end, so the result is stripped
    of leading and trailing whitespace.

    Args:
        text: String that may contain bracketed numeric markers.

    Returns:
        The string without the markers and without surrounding whitespace.
    """
    # Only all-digit markers are matched; bracketed words are left untouched.
    return re.sub(r'\[\d+\]', '', text).strip()

In [28]:
# Step 12: Running remove_references over every value of the 'Information' column
df['Information'] = df['Information'].map(remove_references)

In [31]:
# Step 13: Printing the DataFrame after removing references
# (index=False leaves out the row numbers)
print(df.to_string(index=False))

             Labels                                                                                          Information
           Formerly Finance for Industry Public Limited Company (1973–1983)\nInvestors in Industry Group plc (1983–1988)
       Company type                                                                               Public limited company
          Traded as                                                                           LSE: IIIFTSE 100 Component
               ISIN                                                                                         GB00B1YW4409
           Industry                                                                                    Corporate finance
            Founded                                                                            1945; 79 years ago (1945)
            Founder                                                     Bank of England and a syndicate of British banks
       Headquarters             

In [32]:
# Step 14: A helper that puts a space wherever the information contains a new line character
def remove_newlines(text):
    """Return ``text`` with every newline character turned into a single space."""
    pieces = text.split('\n')
    return ' '.join(pieces)

# Step 15: Running remove_newlines over every 'Information' value so each entry sits on a single line
df['Information'] = df['Information'].map(remove_newlines)

# Step 16: Showing the DataFrame once more to check the result
print(df.to_string(index=False))

             Labels                                                                                         Information
           Formerly Finance for Industry Public Limited Company (1973–1983) Investors in Industry Group plc (1983–1988)
       Company type                                                                              Public limited company
          Traded as                                                                          LSE: IIIFTSE 100 Component
               ISIN                                                                                        GB00B1YW4409
           Industry                                                                                   Corporate finance
            Founded                                                                           1945; 79 years ago (1945)
            Founder                                                    Bank of England and a syndicate of British banks
       Headquarters                     

In [33]:
# Step 17: Final output of the DataFrame (can be used for further analysis).
# Plain print() shows the default view again: row numbers included, long values shortened.
print(df)

                 Labels                                        Information
0              Formerly  Finance for Industry Public Limited Company (1...
1          Company type                             Public limited company
2             Traded as                         LSE: IIIFTSE 100 Component
3                  ISIN                                       GB00B1YW4409
4              Industry                                  Corporate finance
5               Founded                          1945; 79 years ago (1945)
6               Founder   Bank of England and a syndicate of British banks
7          Headquarters                                London, England, UK
8            Key people     David Hutchison (chairman)Simon Borrows (CEO) 
9              Services            Private equityInfrastructure investment
10     Operating income                              £4,579 million (2023)
11           Net income                              £4,577 million (2023)
12                  AUM  

In [36]:
# Step 18: Writing the DataFrame out as a UTF-8 encoded CSV file without the index column
# (for external use, like Excel)
df.to_csv(path_or_buf='3i_group_wikipedia_data.csv', encoding='utf-8', index=False)