In [1]:
# -------------------- Imports -------------------- #
import json
import os
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

In [2]:
# -------------------- Variables -------------------- #
# Everything a reader may want to tune before running the scraper.

TARGET_URL = 'https://www.ft.com/technology'

# The default below is a machine-specific location. Set the CHROME_DRIVER_PATH
# environment variable to use a different chromedriver without editing the notebook.
CHROME_DRIVER_PATH = os.environ.get(
    "CHROME_DRIVER_PATH",
    r"C:\Users\miran\Desktop\GitHub\WEB-DRIVERS\chromedriver.exe",
)

# Upper bound on the number of listing pages the scraper loop visits.
NUMBER_OF_PAGES = 100

# Placeholder; the driver-configuration cell rebinds this to the Selenium WebDriver.
driver = None

In [3]:
# -------------------- Driver Configuration -------------------- #
# Starts Chrome through the chromedriver at CHROME_DRIVER_PATH, opens TARGET_URL,
# maximises the window and sets a 10 second implicit wait for element lookups.
# NOTE(review): passing the driver path positionally matches older Selenium
# releases; newer Selenium 4 versions expect it wrapped in a Service object --
# confirm against the installed version.
try:
    driver = webdriver.Chrome(CHROME_DRIVER_PATH)
    driver.get(TARGET_URL)
    driver.maximize_window()
    driver.implicitly_wait(10)
    print('Configuration successful. Website loaded')
except Exception as exc:
    # Catch Exception (not a bare except) so kernel interrupts still propagate,
    # and show the cause so a failed start can be diagnosed.
    print(f'configuration failed! {type(exc).__name__}: {exc}')

Configuration successful. Website loaded


In [4]:
# -------------------- Accepting Cookies -------------------- #
# Clicks the cookie banner button. This is best-effort: a failure is reported
# but does not stop the notebook.
try:
    # find_element(By..., value) replaces find_element_by_class_name, which
    # was removed from Selenium in 4.3.
    driver.find_element(By.CLASS_NAME, 'o-cookie-message__button').click()
    print('Cookies accepted successfully.')
except Exception as exc:
    # Name the failure type (e.g. element not found, driver missing) instead of
    # swallowing it silently.
    print(f'Cookies are not accepted ({type(exc).__name__})')

Cookies accepted successfully.


In [5]:
# $x('//li[@class="o-teaser-collection__item o-grid-row"]//a[@class="o-teaser__tag"]'); PUBLISHER
# $x('//li[@class="o-teaser-collection__item o-grid-row"]//a[@class="js-teaser-heading-link"]'); TITLES
# $x('//li[@class="o-teaser-collection__item o-grid-row"]//a[@class="js-teaser-standfirst-link"]'); CONTENT

# XPATH I used
# $x('//li[@class="o-teaser-collection__item o-grid-row"]//*[@class="o-teaser__content"]'); FOR CONTAINER WHICH CONTAINS: Publisher, Title, Content description
# $x('//li[@class="o-teaser-collection__item o-grid-row"]//time[@class="o-date"]'); DATES

In [6]:
# -------------------- start timer -------------------- #
# Wall-clock timestamp (time.time()) taken before scraping; the "end timer"
# cell subtracts it to report the elapsed time.
startTime = time.time()

In [7]:
# -------------------- WEB SCRAPER VARIABLES -------------------- #
# Four independent accumulators, one per output column; the scraper cell
# appends to them page by page.
dates, publishers, titles, content = [], [], [], []

In [8]:
# container = driver.find_elements_by_xpath('//li[@class="o-teaser-collection__item o-grid-row"]//*[@class="o-teaser__content"]')
# frontier = container[0].text.split('\n')
# frontier.pop() # last element is "save" bookmark. So it is not really useful.
# if ' content' in frontier:
#     frontier.remove(' content')
# print(frontier)

In [9]:
# -------------------- WEB SCRAPER -------------------- #
# Visits up to NUMBER_OF_PAGES listing pages and appends what it finds to the
# module-level lists `dates`, `publishers`, `titles` and `content`.

ITEM_XPATH = '//li[@class="o-teaser-collection__item o-grid-row"]'
TEASER_XPATH = ITEM_XPATH + '//*[@class="o-teaser__content"]'  # publisher, title, description
DATE_XPATH = ITEM_XPATH + '//time[@class="o-date"]'  # dates


def parse_teaser(text):
    """Split the visible text of one teaser into (publisher, title, description).

    Parameters
    ----------
    text : str
        Newline-separated text of a teaser container.

    Returns
    -------
    tuple of str or None
        ``(publisher, title, description)``. The description is the
        ``'NO CONTENT'`` placeholder when the teaser has only two usable lines.
        ``None`` is returned when the text has any other number of lines.
    """
    lines = text.split('\n')
    lines.pop()  # last element is the "save" bookmark label, not article data
    if ' content' in lines:
        lines.remove(' content')
    if len(lines) == 4:
        # Four-line teasers carry one extra trailing line; only the first three are kept.
        lines.pop()

    if len(lines) == 3:
        return tuple(lines)
    if len(lines) == 2:
        return lines[0], lines[1], 'NO CONTENT'
    return None


def fill_empty_dates(values, start=0):
    """Replace empty strings with the preceding entry, in place.

    Parameters
    ----------
    values : list of str
        Date strings; ``''`` marks a slot the page left blank.
    start : int, optional
        First index to inspect, so entries filled on earlier pages are not
        rescanned.

    Returns
    -------
    list of str
        The same list object, for convenience.

    Notes
    -----
    Index 0 has no predecessor and is left untouched, so a blank first entry
    stays blank rather than wrapping around to the last element.
    """
    for index in range(max(start, 1), len(values)):
        if values[index] == '':
            values[index] = values[index - 1]
    return values


skipped_teasers = 0

try:
    try:
        for page in range(NUMBER_OF_PAGES):
            teasers = driver.find_elements(By.XPATH, TEASER_XPATH)
            date_elements = driver.find_elements(By.XPATH, DATE_XPATH)

            for teaser in teasers:
                parsed = parse_teaser(teaser.text)
                if parsed is None:
                    skipped_teasers += 1
                    continue
                publisher, title, description = parsed
                publishers.append(publisher)
                titles.append(title)
                content.append(description)

            # Only the dates added by this page need the blank-slot fix.
            first_new_date = len(dates)
            dates.extend(element.text for element in date_elements)
            fill_empty_dates(dates, start=first_new_date)

            try:
                driver.find_element(By.LINK_TEXT, 'Next page').click()
            except Exception:
                # No usable "Next page" link: stop here, otherwise the same
                # page would be scraped again on every remaining iteration.
                print(f'Stopped after page {page + 1}: "Next page" could not be clicked.')
                break
    finally:
        # Always release the browser, including when the loop above failed.
        if driver is not None:
            driver.quit()

    print(f'Scraper parsed {len(dates)} rows of content')
    if skipped_teasers:
        # NOTE(review): dates of skipped teasers are still recorded, so the
        # date column may be longer than the others in this case.
        print(f'Warning: {skipped_teasers} teasers had an unexpected layout and were skipped')

except Exception as exc:
    print(f'Scraper crashed for some reason: {type(exc).__name__}: {exc}')



Scraper parsed 2500 rows of content


In [10]:
 # -------------------- end timer -------------------- #
# Elapsed wall-clock time since `startTime` was recorded, printed rounded to
# two decimals.
endTime = time.time()
totalTime = endTime - startTime
print(f"Total time required to complete parsing: {round(totalTime, 2)} seconds\n")

Total time required to complete parsing: 242.83 seconds



In [11]:
 # -------------------- Printing results -------------------- #
# print(f'\nPreview of publishers: {publishers}')
# print(f'Number of parsed publishers: {int(len(publishers))}')

# print(f'\nPreview of titles: {titles}')
# print(f'Number of parsed titles: {int(len(titles))}')

# print(f'\nPreview of content: {content}')
# print(f'Number of parsed content: {int(len(content))}')

# print(f'\nPreview of dates: {dates}')
# print(f'Number of parsed dates: {int(len(dates))}')

In [12]:
 # -------------------- Adding results to dictionary -------------------- #
# Column name -> collected list, in the order date, publisher, title, content.
data = dict(
    zip(
        ("date", "publisher", "title", "content"),
        (dates, publishers, titles, content),
    )
)

In [13]:
# -------------------- Saving data to a JSON file -------------------- #
# Serialise the collected columns with a 4-space indent and write them to a
# file whose name records the configured page count.
json_object = json.dumps(data, indent=4)

with open(f"FT_data_from_{NUMBER_OF_PAGES}_pages.json", mode="w") as outfile:
    outfile.write(json_object)


In [14]:
# Report how many entries each column holds, then confirm the four columns
# have the same length before a DataFrame is built from them.
n_dates, n_publishers, n_titles, n_content = (
    len(data[key]) for key in ('date', 'publisher', 'title', 'content')
)

print(
    f"\nnumber of dates: {n_dates}"
    f"\nnumber of publishers: {n_publishers}"
    f"\nnumber of titles: {n_titles}"
    f"\nnumber of content: {n_content}\n"
)

# All four counts are equal exactly when the set of counts has one member.
if len({n_titles, n_content, n_publishers, n_dates}) == 1:
    print('Arrays are the same length. Dataframe can be created.')
else:
    print('\n    Length is not equal. Dataframe cannot be created. \n    ')


number of dates: 2500
number of publishers: 2500
number of titles: 2500
number of content: 2500

Arrays are the same length. Dataframe can be created.


In [15]:
# -------------------- Creating a DataFrame Object -------------------- #
# Build the frame with a 1-based row index and preview its last rows as a
# quick check that construction works.
row_count = len(data['publisher'])
my_df = pd.DataFrame(data, index=range(1, row_count + 1))
my_df.tail()

Unnamed: 0,date,publisher,title,content
2496,JANUARY 22 2022,News in-depthSenseTime,China’s SenseTime ponders future after US blac...,Xu Li has pulled off an IPO despite accusation...
2497,JANUARY 21 2022,#techFT,Investors stream out of Netflix,"DeepMind co-founder quits, Intel picks Ohio, c..."
2498,JANUARY 21 2022,The Big Read,Why gaming is the new Big Tech battleground,Microsoft’s $75bn purchase of Activision could...
2499,JANUARY 21 2022,The FT ViewThe editorial board,A much-needed market correction,Sell-off led by technology stocks may reflect ...
2500,JANUARY 21 2022,LexNetflix Inc,Netflix: don’t tune out just yet Premium,The focus on subscriber growth ignores the fac...


### Reading our generated JSON file

In [16]:
# -------------------- Loading our generated JSON -------------------- #
# Read back the file written by the save step; this rebinds `my_df` to the
# frame parsed from disk.
json_path = f'FT_data_from_{NUMBER_OF_PAGES}_pages.json'
my_df = pd.read_json(json_path)
my_df.head()

Unnamed: 0,date,publisher,title,content
0,2022-11-25,US-China relations,Chinese telecoms groups Huawei and ZTE barred ...,Washington cites national security in order th...
1,2022-11-25,The FT ViewThe editorial board,How to make Britain a bigger force in tech,The UK cannot create the next Silicon Valley b...
2,2022-11-25,LexTesla Inc,Tesla/China: recalls draw attention at a risky...,Electric carmaker is far too invested in China...
3,2022-11-25,UK politics & policy,UK limits use of Chinese-made surveillance sys...,Decision taken on security grounds following M...
4,2022-11-25,Stuart Kirk,Stock investors needn’t lose sleep over rates,Our columnist has bought into US equities and ...


In [17]:
# (rows, columns) of the frame loaded from the JSON file.
my_df.shape

(2500, 4)

In [18]:
# Column dtypes and non-null counts of the reloaded frame.
my_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       2500 non-null   datetime64[ns]
 1   publisher  2500 non-null   object        
 2   title      2500 non-null   object        
 3   content    2500 non-null   object        
dtypes: datetime64[ns](1), object(3)
memory usage: 78.2+ KB


In [19]:
# Rows whose description is the "NO CONTENT" placeholder.
has_placeholder = my_df['content'] == "NO CONTENT"
my_df.loc[has_placeholder]

Unnamed: 0,date,publisher,title,content
11,2022-11-24,FT live news,Live news updates from November 24: Hopes rise...,NO CONTENT
12,2022-11-23,FT live news,Live news updates from November 23: Fed offici...,NO CONTENT
43,2022-11-22,FT live news,Live news updates from November 22: Glazers mu...,NO CONTENT
58,2022-11-21,FT live news,Live news updates from November 21: Iger repla...,NO CONTENT
114,2022-11-15,FT live news,Live news updates from November 15: Two killed...,NO CONTENT
125,2022-11-14,FT live news,Live news updates from November 14: Amazon pre...,NO CONTENT
128,2022-11-14,LexAveva Group,Aveva/Schneider: bumpitrage yields small victo...,NO CONTENT
181,2022-11-08,FT live news,Live news updates from November 8: Brussels to...,NO CONTENT
200,2022-11-07,FT live news,Live news updates from November 7: Ukraine rec...,NO CONTENT
245,2022-11-01,FT live news,Live news updates from November 1: US ‘concern...,NO CONTENT


In [20]:
# removing rows with no data in content
# One boolean mask replaces the row-by-row in-place drop: a single pass, and the
# cell can be re-run safely (a second run simply drops 0 rows). The original
# index labels of the surviving rows are kept.
placeholder_rows = my_df["content"].isin(["NO CONTENT", "LIVE"])
counter = int(placeholder_rows.sum())
my_df = my_df.loc[~placeholder_rows]

print(f'Number of rows dropped: {counter}')

Number of rows dropped: 47


In [21]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
my_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2453 entries, 0 to 2499
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       2453 non-null   datetime64[ns]
 1   publisher  2453 non-null   object        
 2   title      2453 non-null   object        
 3   content    2453 non-null   object        
dtypes: datetime64[ns](1), object(3)
memory usage: 160.4+ KB
None


In [22]:
# Preview of the first 20 rows after the placeholder rows were removed.
my_df.head(20)

Unnamed: 0,date,publisher,title,content
0,2022-11-25,US-China relations,Chinese telecoms groups Huawei and ZTE barred ...,Washington cites national security in order th...
1,2022-11-25,The FT ViewThe editorial board,How to make Britain a bigger force in tech,The UK cannot create the next Silicon Valley b...
2,2022-11-25,LexTesla Inc,Tesla/China: recalls draw attention at a risky...,Electric carmaker is far too invested in China...
3,2022-11-25,UK politics & policy,UK limits use of Chinese-made surveillance sys...,Decision taken on security grounds following M...
4,2022-11-25,Stuart Kirk,Stock investors needn’t lose sleep over rates,Our columnist has bought into US equities and ...
5,2022-11-24,UK business & economy,Small businesses that tapped UK Covid loans fa...,Anti-corruption campaigners seek release of ba...
6,2022-11-24,Electric vehicles,Arrival’s founder to step down as chief of ele...,Denis Sverdlov will be replaced by former head...
7,2022-11-24,#techAsia,JD.com’s pay cuts and LG’s US battery bet Premium,The inside story on the Asia tech trends that ...
8,2022-11-24,Special Report,Next Tech Growth Markets,The FT-Omdia Digital Economies Index highlight...
9,2022-11-24,US midterm elections,How #electiontwitter took over the political p...,A clutch of amateur polling enthusiasts has ga...


In [23]:
# (rows, columns) remaining after the placeholder rows were removed.
my_df.shape

(2453, 4)