In [1]:
### IMPORTS AND DRIVER SETUP

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, ElementNotVisibleException, UnexpectedAlertPresentException, NoAlertPresentException
from selenium.common.exceptions import SessionNotCreatedException, WebDriverException
from selenium.webdriver.chrome.options import Options

import time
import pandas as pd
import os
import re
import sys
import linecache
import numpy as np
import datefinder

from bs4 import BeautifulSoup as soup
from pandas import NA

# Location of the chromedriver executable. Set the CHROMEDRIVER_PATH
# environment variable to run the notebook on another machine; when it is not
# set, the original hard-coded location is used.
chrome_driver_binary = os.environ.get(
    "CHROMEDRIVER_PATH",
    "/Users/andrin/Library/CloudStorage/OneDrive-LondonSchoolofEconomics/DS Project/chromedriver",
)

# Start the Chrome session that all later cells drive through `driver`.
# NOTE(review): passing the driver path positionally looks like the older
# Selenium calling convention - confirm against the installed Selenium version.
driver = webdriver.Chrome(chrome_driver_binary)



In [48]:
# Entry link that is opened first in the browser session. The literal is split
# into adjacent pieces (joined by the parser) so each query parameter is
# readable on its own line.
url = (
    'https://librarysearch.lse.ac.uk/view/action/uresolver.do'
    '?operation=resolveService'
    '&package_service_id=10444710030002021'
    '&institutionId=2021'
    '&customerId=2020'
    '&VE=true'
)

# Address of a search-builder results page on the proxied Factiva host.
# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
spec_url = (
    "https://global-factiva-com.gate3.library.lse.ac.uk/ha/default.aspx"
    "?page_driver=searchBuilder_Search"
    "#./!?&_suid=1699211604258013312993362481285"
)

In [49]:
# Point the Selenium-controlled browser at `url`.
# NOTE(review): presumably any login / search set-up is done by hand in the
# browser window before the pagination cell is run - TODO confirm.
driver.get(url)

In [None]:
# Every href found on each results page, in page order. Duplicates are
# possible (a page is re-read after an alert-triggered refresh), so later
# processing should not assume the entries are unique.
inspect_urls = []

while True:
    try:
        # Use the By-based locator API, consistent with the calls below.
        elems = driver.find_elements(By.XPATH, "//a[@href]")
        for elem in elems:
            try:
                inspect_urls.append(elem.get_attribute("href"))
            except StaleElementReferenceException:
                # The anchor was re-rendered while the page was being read.
                # Skip it rather than letting the error fall through to the
                # generic handler below and end pagination early.
                continue

        time.sleep(5)

        # Wait (up to 20 s) for the "next page" link, scroll it into view and click it.
        # A timeout here is how the loop detects that there is no further page.
        next_link = WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a.nextItem")))
        driver.execute_script("return arguments[0].scrollIntoView(true);", next_link)
        driver.find_element(By.CSS_SELECTOR, 'a.nextItem').click()
        print("Navigating to Next Page")

        # Give the next results page time to load before reading its links.
        time.sleep(15)

    except UnexpectedAlertPresentException as uap:
        # A browser alert interrupted the page: dismiss it if it is still
        # there, reload, and let the loop read the same page again.
        try:
            driver.switch_to.alert.accept()
            print(uap)
            driver.refresh()
        except NoAlertPresentException as nap:
            print(nap)
            driver.refresh()

    except (TimeoutException, WebDriverException) as e:
        time.sleep(2)
        if not isinstance(e, TimeoutException):
            # Anything other than the expected timeout is reported so that an
            # unexpected failure is not mistaken for a normal end of results.
            print("Pagination stopped by", type(e).__name__, ":", e)
        print("Last page reached")
        break

display(inspect_urls)



In [None]:
# Keep only the links that carry an "accessionno" parameter; the remaining
# hrefs collected from the results pages are dropped.
all_article_links = [link for link in inspect_urls if link and "accessionno" in link]

# De-duplication key for each article link: the value of its accessionno
# parameter. Extracting the value directly works for every article, whereas
# stripping a fixed prefix only fits the one accession number written into
# the pattern. A link in which no "accessionno=<value>" can be found falls
# back to the full link as its key.
short_links = []
for link in all_article_links:
    accession_match = re.search(r"accessionno=([^&#]+)", link)
    short_links.append(accession_match.group(1) if accession_match else link)

# Index of the first occurrence of every distinct key, in original page order.
first_seen = {}
for i, short_link in enumerate(short_links):
    first_seen.setdefault(short_link, i)
unique_indices = sorted(first_seen.values())

# One full link per distinct article, in the order the articles were first seen.
unique_articles = [all_article_links[i] for i in unique_indices]


display(unique_articles)
print(len(unique_articles))

In [45]:
### SCRAPING ARTICLES FUNCTION

def _scrape_single_article(article_number, article):
    """Load one article page in the shared `driver` and extract its fields.

    Parameters:
        article_number: 1-based number stored in the row and used in messages.
        article: URL of the article page.

    Returns:
        A ``(row, failed)`` tuple. ``row`` is
        ``[article_number, headline, author, date, content, error_message]``
        with ``pd.NA`` for fields that were not found. ``failed`` is True when
        the page text contains the site's "unable to process your request"
        message (or the page had no body), i.e. the article did not load.

    Raises:
        TimeoutException: if the page times out again on the second load attempt.
    """
    load_error_text = "We are unable to process your request at this time. Please try again in a few minutes."

    try:
        driver.get(article)
    except TimeoutException:
        # Reload, wait a minute, then try the article once more.
        time.sleep(3)
        driver.refresh()
        time.sleep(60)
        driver.get(article)

    time.sleep(3)

    #Grab All Text Content
    try:
        all_text = str(driver.find_element(By.XPATH, "/html/body").text)
    except NoSuchElementException:
        # Treat a page without a body like the site's error page so that it is
        # flagged below and retried.
        all_text = load_error_text
        print(article_number," !!! WARNING: BLANK PAGE --------------------------------------------------")

    #Find Headline
    try:
        headline = str(driver.find_element(By.XPATH, "//*[@class='enHeadline']").text).replace("FT.com site : ","")
    except NoSuchElementException:
        headline = pd.NA
        print(article_number," NO HEADLINE DETECTED - CONTINUING...")

    #Find Author
    try:
        author = driver.find_element(By.XPATH, "//*[@class='author']").text
    except NoSuchElementException:
        author = pd.NA
        print(article_number," NO AUTHOR DETECTED - CONTINUING...")

    #Find Text Paragraphs
    # find_elements returns an empty list when nothing matches (it does not
    # raise NoSuchElementException), so emptiness is what has to be checked.
    text_lines = driver.find_elements(By.XPATH, "//*[@class='articleParagraph enarticleParagraph']")
    if not text_lines:
        print(article_number," !!! WARNING: NO TEXT CONTENT DETECTED - CONTINUING...")

    #Find Date
    # The first date datefinder recognises anywhere in the page text, cut to
    # the first 10 characters of its string form.
    dates_in_text = list(datefinder.find_dates(all_text, strict = True))
    date = str(dates_in_text[0])[:10] if dates_in_text else pd.NA

    #Aggregate Article Text Contents
    content = "".join(str(line.text) + "\n" for line in text_lines)

    #Article Did Not Load Warning
    failed = load_error_text in all_text
    if failed:
        print("ERROR: ARTICLE DID NOT LOAD !!!")
        time.sleep(3)
        driver.refresh()
        time.sleep(3)
        error_message = "ARTICLE DID NOT LOAD"
    else:
        error_message = pd.NA

    return [article_number, headline, author, date, content, error_message], failed


def Scrape(article_urls,scraped_articles=None,max_retries=3):
    """Scrape every article in `article_urls` and return one row per attempt.

    Each URL is opened in the module-level `driver`. Articles whose page did
    not load are attempted again in up to `max_retries` further passes; every
    attempt (failed or not) is appended as its own row, and a retried article
    keeps the number it had in `article_urls` so its rows can be matched up.

    Parameters:
        article_urls: sequence of article URLs, numbered from 1 in the output.
        scraped_articles: optional list to append rows to; a new list is
            created when omitted.
        max_retries: how many extra passes to make over articles that did not
            load (0 disables retrying). Bounding this keeps a permanently
            failing article from being retried forever.

    Returns:
        The list of rows, each
        ``[article_number, headline, author, date, content, error_message]``.
        This is `scraped_articles` itself when one was passed in.
    """
    if scraped_articles is None:
        scraped_articles = []

    # (article number, url) pairs still to be fetched in the current pass.
    pending = [(position + 1, article) for position, article in enumerate(article_urls)]
    retries_left = max_retries

    while pending:
        error_articles = []

        for position, (article_number, article) in enumerate(pending, start=1):
            #Create Article Element and Add to List of Scraped Articles
            article_element, failed = _scrape_single_article(article_number, article)
            scraped_articles.append(article_element)
            if failed:
                error_articles.append((article_number, article))

            time.sleep(1)

            print("Article Number: ", article_number, " Progress: ", round(position*100/len(pending),2), "%")

        if not error_articles:
            break

        if retries_left <= 0:
            print("Giving up on", len(error_articles), "article(s) that still did not load")
            break

        retries_left -= 1
        pending = error_articles

    print(len(scraped_articles))

    return scraped_articles


In [None]:
# Scrape every unique article link; each returned row is
# [number, headline, author, date, text, error message].
article_list_scraped = Scrape(unique_articles)

# The column names are passed to the constructor instead of being assigned to
# `.columns` afterwards: with an empty result the frame would have no columns
# and assigning six names would raise a length-mismatch error.
articles_df = pd.DataFrame(
    article_list_scraped,
    columns=['ARTICLE_NUMBER','TITLE','AUTHOR', 'DATE','TEXT','ERRORS'],
)

# Write the scraped articles to a CSV file in the current working directory.
articles_df.to_csv("Factiva_NYT.csv", encoding='utf-8', index=False)