# Part 1: Scraping Articles
Scrape Insights articles from the KPMG website (https://kpmg.com/xx/en/our-insights.html).
Extract key information such as:
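1. title: Article title
2. url: Article URL
3. title_description: Short description shown on the article's listing tile ("N/A" if empty)
4. content: Full text of the article page
5. pdf_content: If an article has an associated PDF, download the PDF and extract its text (empty otherwise).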

In [11]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import sys
import time
import requests
import fitz  
import io

# Launch a headless Chrome session driven by a locally installed chromedriver.
service = Service(r"C:\Users\ravis\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe")
options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(service=service, options=options)

# Open the insights listing page.
insights_url = "https://kpmg.com/xx/en/our-insights.html"
driver.get(insights_url)

# Block (for up to 10 seconds) until the listing tiles are present in the DOM.
wait = WebDriverWait(driver, 10)
tile_locator = (By.CLASS_NAME, "cmp-filterlist__tile")
tiles = wait.until(EC.presence_of_all_elements_located(tile_locator))

# Scroll every tile into view in turn, pausing half a second after each one.
# NOTE(review): presumably this triggers lazy-loaded tile content -- confirm.
for tile in tiles:
    driver.execute_script("arguments[0].scrollIntoView(true);", tile)
    time.sleep(0.5)

def extract_articles():
    """Collect title, URL and description from every tile on the current page.

    Reads the listing tiles from the module-level ``driver``.

    Returns:
        A list of dicts with the keys ``"title"``, ``"url"`` and
        ``"title_description"``. A tile whose link or title cannot be read is
        reported on stdout and skipped. A tile with a missing or empty
        description is kept with ``"N/A"`` as its description.
    """
    page_articles = []
    for tile in driver.find_elements(By.CLASS_NAME, "cmp-filterlist__tile"):
        try:
            link = tile.find_element(By.CLASS_NAME, "cmp-filterlist__tile--action-link")
            url = link.get_attribute("href")
            title = tile.find_element(By.CLASS_NAME, "cmp-filterlist__tile--title").text.strip()

            # find_elements returns an empty list instead of raising, so a tile
            # without a description element falls back to "N/A" rather than
            # being dropped by the except clause below.
            description_nodes = tile.find_elements(By.CLASS_NAME, "cmp-filterlist__tile--description")
            description_text = description_nodes[0].text.strip() if description_nodes else ""
            description = description_text or "N/A"

            page_articles.append({
                "title": title,
                "url": url,
                "title_description": description
            })
        except Exception as e:
            # Best effort: one unreadable tile must not abort the whole page.
            print(f"Error extracting article: {e}")
    return page_articles

# Gather listing entries page by page until enough articles are collected,
# the page cap is reached, or the "next" control is disabled.
max_articles = 400
page_cap = 236  # the loop only runs while current_page is below this value
current_page = 2

articles = extract_articles()
print("Processed page 1")
total_articles = len(articles)

while total_articles < max_articles and current_page < page_cap:
    try:
        next_button = wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, "cmp-filterlist__pagination--next"))
        )
        button_classes = next_button.get_attribute("class")
        if "disabled" in button_classes:
            # No further pages are available.
            break

        # Click through JavaScript, then give the next page a fixed 2 seconds to render.
        driver.execute_script("arguments[0].click();", next_button)
        time.sleep(2)

        articles += extract_articles()
        total_articles = len(articles)
        print(f"Processed page {current_page}")
        current_page += 1
        # The loop condition stops the scrape once max_articles is reached.
    except Exception as e:
        print(f"Error navigating to next page: {e}")
        break

# PDF content extractor using PyMuPDF
def extract_text_from_pdf_url(url, timeout=30):
    """Download a PDF and return the text of all its pages joined by newlines.

    Args:
        url: Absolute URL, or a site-relative path that is resolved against
            ``https://kpmg.com``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled download would block the scrape.

    Returns:
        The extracted text, or the sentinel ``"[Failed to extract PDF]"`` when
        the download or the parsing fails for any reason (the error is printed).
    """
    try:
        if not url.startswith("http"):
            url = "https://kpmg.com" + url
        response = requests.get(url, timeout=timeout)
        # Fail on HTTP errors here instead of handing an error page to the PDF parser.
        response.raise_for_status()
        # The context manager closes the document once the text has been read.
        with fitz.open(stream=response.content, filetype="pdf") as doc:
            return "\n".join(page.get_text() for page in doc)
    except Exception as e:
        # Deliberately broad: PDF extraction is best effort and must never abort the scrape.
        print(f"Error reading PDF from {url}: {e}")
        return "[Failed to extract PDF]"

# Extract article content and PDF content.
# Each article dict gains a "content" and a "pdf_content" entry.
for article in articles:
    try:
        driver.get(article['url'])
        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        # Fixed extra pause after <body> is present.
        # NOTE(review): presumably gives client-side rendering time to finish -- confirm.
        time.sleep(2)

        # Extract page text. Each block's text is read once, because every
        # .text access is a separate WebDriver call.
        block_texts = [block.text.strip() for block in driver.find_elements(By.CLASS_NAME, "cmp-text")]
        article["content"] = "\n".join(text for text in block_texts if text)

        # Find the first download section whose first link is a KPMG-hosted PDF
        # and extract its text. Failures here are best effort and leave
        # pdf_content empty, but "except Exception" (not a bare except) lets
        # Ctrl-C and SystemExit stop the run.
        pdf_text = ""
        try:
            for section in driver.find_elements(By.CLASS_NAME, "cmp-download"):
                try:
                    href = section.find_element(By.TAG_NAME, "a").get_attribute("href")
                except Exception:
                    # This section has no readable link; try the next one.
                    continue
                if href and href.endswith(".pdf") and "/content/dam/kpmgsites/" in href:
                    pdf_text = extract_text_from_pdf_url(href)
                    break
        except Exception as e:
            print(f"Error locating PDF for {article['url']}: {e}")

        article["pdf_content"] = pdf_text

    except Exception as e:
        print(f"Error extracting article content from {article['url']}: {e}")
        article["content"] = ""
        article["pdf_content"] = "[Failed to extract]"

# Save final output: keep at most max_articles rows and write them to CSV
# without the DataFrame index column.
selected_articles = articles[:max_articles]
df = pd.DataFrame(selected_articles)
df.to_csv("kpmg_insights_full.csv", index=False)

# Shut the browser down once the data has been written.
driver.quit()

saved_count = len(df)
print(f"Extracted {saved_count} articles to 'kpmg_insights_full.csv'")


Processed page 1
Processed page 2
Processed page 3
Processed page 4
Processed page 5
Processed page 6
Processed page 7
Processed page 8
Processed page 9
Processed page 10
Processed page 11
Processed page 12
Processed page 13
Processed page 14
Processed page 15
Processed page 16
Processed page 17
Processed page 18
Processed page 19
Processed page 20
Processed page 21
Processed page 22
Processed page 23
Processed page 24
Processed page 25
Processed page 26
Processed page 27
Processed page 28
Processed page 29
Processed page 30
Processed page 31
Processed page 32
Processed page 33
Processed page 34
Processed page 35
Processed page 36
Processed page 37
Processed page 38
Processed page 39
Processed page 40
Processed page 41
Processed page 42
Processed page 43
Processed page 44
Processed page 45
Processed page 46
Processed page 47
Processed page 48
Processed page 49
Processed page 50
Extracted 400 articles to 'kpmg_insights_full.csv'
