Web Scraping from Financial Times Websites

In [12]:
#Import Libraries for Web Scraping
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import timedelta, date, datetime

In [13]:
# Timestamp layout: date, a space, then HHMMSS with no separators (e.g. "2023-10-30 213030")
TIMESTAMP_FORMAT = "%Y-%m-%d %H%M%S"

# Capture the current local time once and render it with the layout above
current_time = datetime.now()
format_date_time = current_time.strftime(TIMESTAMP_FORMAT)

In [14]:
# Page to scrape: the profile tearsheet for KNM:KLS on markets.ft.com
url_to_scrape = 'https://markets.ft.com/data/equities/tearsheet/profile?s=KNM:KLS'

# A Session persists cookies and connection settings across requests
s_request = requests.Session()

# Send the GET request for the page.
# - timeout (seconds) prevents the cell from hanging forever on a stalled connection.
# - raise_for_status() fails fast on a 4xx/5xx reply instead of letting an error page be parsed.
# NOTE: soup_content holds the HTTP response here; the parsing cell later rebinds it to the parsed document.
soup_content = s_request.get(url_to_scrape, timeout=30)
soup_content.raise_for_status()

In [15]:
# Parse the raw response body with Python's built-in "html.parser" and
# rebind soup_content from the HTTP response to the parsed BeautifulSoup document
soup_content = BeautifulSoup(markup=soup_content.content, features='html.parser')

In [6]:
# Save the parsed HTML as "<timestamp>.html" so the scraped page can be inspected later.
# The with-statement closes the file when the block exits, so no explicit close() is needed.
with open(format_date_time + ".html", "w", encoding='utf-8') as files_f:
    files_f.write(str(soup_content))

In [16]:
# Find the first <table> in the parsed page (soup_content) whose class attribute
# matches "mod-ui-table mod-ui-table--freeze-pane"
table = soup_content.find("table", attrs={"class": "mod-ui-table mod-ui-table--freeze-pane"})

# find() returns None when nothing matches; stop here with a clear message
# rather than failing in a later cell with an AttributeError on None
if table is None:
    raise ValueError(
        "No <table> with class 'mod-ui-table mod-ui-table--freeze-pane' was found in the page"
    )


In [17]:
# Collect every <th> (header cell) inside the table; these carry the column titles
header = table.find_all('th')

In [18]:
# Display the raw <th> elements that were collected
header


[<th class="mod-ui-table__header--text">Company</th>,
 <th data-mod-view-category="keyinformation">Revenue (TTM)</th>,
 <th data-mod-view-category="keyinformation">Net income (TTM)</th>,
 <th data-mod-view-category="keyinformation">Market cap</th>,
 <th data-mod-view-category="keyinformation">Employees</th>,
 <th data-mod-view-category="stockperformance">Price/earnings (TTM)</th>,
 <th data-mod-view-category="stockperformance">Price/book value (MRQ)</th>,
 <th data-mod-view-category="stockperformance">Price/cash flow (TTM)</th>,
 <th data-mod-view-category="stockperformance">Price/sales (TTM)</th>,
 <th data-mod-view-category="persharedata">EPS (excl. extraordinary items, TTM)</th>,
 <th data-mod-view-category="persharedata">EPS (incl. extraordinary items, TTM)</th>,
 <th data-mod-view-category="persharedata">Revenue per share (TTM)</th>,
 <th data-mod-view-category="persharedata">Book value per share (MRQ)</th>,
 <th data-mod-view-category="efficiency">Asset turnover (TTM)</th>,
 <th 

In [19]:
# Turn a collection of header cells into a plain list of their text
def get_header(tr):
    """Return the text of every header element in ``tr``.

    Args:
        tr: Iterable of parsed elements (for example the ``<th>`` tags found
            in the table); each one must provide a ``get_text()`` method.

    Returns:
        list: The ``get_text()`` result of each element, in the same order
        as ``tr``. An empty iterable gives an empty list.
    """
    return [cell.get_text() for cell in tr]

# Extract the text of each <th> element held in `header`
all_headerow = get_header(header)

In [22]:
# Show the extracted column titles
all_headerow

['Company',
 'Revenue (TTM)',
 'Net income (TTM)',
 'Market cap',
 'Employees',
 'Price/earnings (TTM)',
 'Price/book value (MRQ)',
 'Price/cash flow (TTM)',
 'Price/sales (TTM)',
 'EPS (excl. extraordinary items, TTM)',
 'EPS (incl. extraordinary items, TTM)',
 'Revenue per share (TTM)',
 'Book value per share (MRQ)',
 'Asset turnover (TTM)',
 'Inventory turnover (TTM)',
 'Receivables turnover (TTM)',
 'Revenues per employee (TTM)',
 'Return on avg assets (TTM)',
 'Return on avg assets (5 yr avg)',
 'Return on investment (TTM)',
 'Return on investment (5 yr avg)',
 'Gross margin (TTM)',
 'Gross margin(5 yr avg)',
 'Net profit margin (TTM)',
 'Net Profit margin(5 yr avg)',
 'Quick ratio (MRQ)',
 'Interest coverage (TTM)',
 'Total debt to capital (MRQ)',
 'Payout ratio(5 yr avg)',
 'Revenue(YOY change %)',
 'Revenue(5 yr growth rate)',
 'Net income(YOY change %)',
 'Net income(5 yr growth rate)',
 'Capital expenditure(5 yr growth rate)',
 'Dividend(5 yr growth rate)']

In [24]:
# Collect every <tr> (table row) element inside the table
# NOTE(review): find_all('tr') also returns any header row that is written as a <tr>;
# such a row has no <td> cells, so get_row would produce an empty list for it —
# confirm against the page markup.
all_tr_in_table = table.find_all('tr')
def get_row(tr):
    """Return the cell texts of every table row in ``tr``.

    Args:
        tr: Iterable of parsed row elements (for example ``<tr>`` tags); each
            one must provide ``find_all('td')`` returning elements that have
            a ``get_text()`` method.

    Returns:
        list: One inner list per row, holding the ``get_text()`` result of
        each ``<td>`` cell in order. A row without ``<td>`` cells gives an
        empty inner list.
    """
    return [
        [cell.get_text() for cell in row_tag.find_all('td')]
        for row_tag in tr
    ]

# Pass the collected <tr> elements to get_row to build the list of row values
data = get_row(all_tr_in_table)

In [27]:
# Optional: uncomment the next line to print the DataFrame without wrapping columns across lines
# pd.set_option('expand_frame_repr', False)
# Build a DataFrame from the scraped rows, then label its columns with the header texts
df_table = pd.DataFrame(data)
df_table.columns = all_headerow
# Print the table, then write it (with the index column, the to_csv default) to WebSraping.csv
print(df_table)
df_table.to_csv("WebSraping.csv")

                                     Company Revenue (TTM) Net income (TTM)  \
0                      Carimin Petroleum Bhd       254.74m           23.11m   
1                                Uzma Berhad       474.02m           38.99m   
2                          Icon Offshore Bhd       236.06m          158.82m   
3                              T7 Global Bhd       446.10m           25.65m   
4                                 Deleum Bhd       780.48m           44.43m   
5                      Perdana Petroleum Bhd       235.90m           32.32m   
6                           KNM Group Berhad       585.50m         -201.09m   
7                                  Wasco Bhd        2.81bn           17.09m   
8                          Sapura Energy Bhd        4.59bn          -3.06bn   
9   Malaysia Marine and Heavy Engg Hldgs Bhd        2.39bn         -342.08m   
10                        Velesto Energy Bhd       983.42m           20.59m   
11         Dayang Enterprise Holdings Berhad       9