In [1]:
import time
from urllib.parse import urljoin

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
import urllib.request
import requests
import bs4
from bs4 import BeautifulSoup


Description:
    This script utilizes Selenium WebDriver to automate the task of scraping content from a Medium author's profile page.

Steps:
1. Initialize WebDriver:
    - Set up the ChromeDriver service using the provided path to the ChromeDriver executable.
    - Initialize the Chrome WebDriver, passing the service.

2. Navigate to the Author's Profile Page:
    - Open the author's Medium profile page specified by the URL.

3. Load Dynamic Content:
    - Record the initial height of the page, then repeatedly scroll to the bottom and pause so that dynamically loaded content can appear.
    - After each pause, read the page height again and compare it with the previous value.
    - Break the loop once no more content is loaded, indicated by no increase in the page height.

4. Extract HTML Content:
    - After loading all content, switch to the newly opened window (if any).
    - Get the HTML content of the page using JavaScript execution.

5. Close WebDriver:
    - Quit the WebDriver to release resources.


In [3]:
# Start Chrome through the ChromeDriver executable at this path.
chromedriver_path = "chromedriver.exe"
service = Service(executable_path=chromedriver_path)
driver = webdriver.Chrome(service=service)

author_profile_url = "https://williamkoehrsen.medium.com/"
scroll_pause_seconds = 10  # pause after each scroll so further content can load

# try/finally guarantees the browser is closed even when navigation or a
# script call raises; without it a failed run leaves a Chrome process behind.
try:
    driver.get(author_profile_url)
    main_window_handle = driver.current_window_handle
    last_height = driver.execute_script("return document.body.scrollHeight")

    # Scroll to the bottom repeatedly until the page height stops changing.
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        time.sleep(scroll_pause_seconds)

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break

        last_height = new_height

    # If any window other than the original one exists, switch to the first such window.
    for window_handle in driver.window_handles:
        if window_handle != main_window_handle:
            driver.switch_to.window(window_handle)
            break

    # Full HTML of the current document, as rendered after scrolling.
    res = driver.execute_script("return document.documentElement.outerHTML")
finally:
    driver.quit()

In [4]:
# Parse the HTML string captured from the browser with the 'html.parser' backend.
soup = BeautifulSoup(markup=res, features='html.parser')

In [6]:
# Keep only the part of the page title that precedes the first " – " separator.
page_title = soup.title.string
author = page_title.partition(" – ")[0]
author

'Will Koehrsen'

In [None]:
# Using BeautifulSoup to Extract the Links and Tags of the Blog Posts

## Extract Tags of Blogs


In [7]:
import re

In [8]:
# Collect every <div> whose class attribute matches the pattern below.
tags_element = soup.find_all("div", class_=re.compile("nf dl fk ng nh ni nj be b do z"))
# Only the matches at even positions (0, 2, 4, ...) are kept; their stripped text is the tag.
tags_temp = [tag.get_text(strip=True) for tag in tags_element[::2]]

In [9]:
# Preview the first five extracted tags.
tags_temp[:5]

['Data Science', 'Data Science', 'Reading', 'Reading', 'Productivity']

In [10]:
# Number of extracted tags.
len(tags_temp)

155

## Extract Links of Blogs

In [11]:
# Article links are read from the <div class="ab ks"> containers; only the
# containers at even positions (0, 2, 4, ...) are used.
base_url = "https://williamkoehrsen.medium.com"
filtered_links = soup.find_all('div', class_='ab ks')
href_links = []
for div in filtered_links[::2]:
    link = div.find('a')
    if not link:
        continue
    href = link.get('href')
    if not href:
        # An <a> without an href cannot be visited; skip it rather than
        # failing on `str + None`.
        continue
    # Drop the query string, then resolve against the profile host.
    # urljoin leaves an already-absolute URL untouched, whereas plain string
    # concatenation would glue the host in front of it and corrupt the link.
    href_links.append(urljoin(base_url, href.split('?')[0]))

In [12]:
# Preview the first two article URLs.
href_links[:2]

['https://williamkoehrsen.medium.com/drivendata-interview-3ab44269ef84',
 'https://williamkoehrsen.medium.com/a-data-science-conversation-9ca398573d2f']

In [13]:
# Number of extracted article links.
len(href_links)

155

In [14]:
from tqdm.notebook import tqdm # TO VIsualize the Scraping Process

In [41]:
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException

In [46]:
def scrape_data(cssSelector, webDriverWait, max_retry=3):
    """
    Description:
        Return the text of the element matching a CSS selector, waiting (via the
        supplied WebDriverWait) until that element is clickable.

    Parameters:
        cssSelector (str): A CSS selector used to locate the desired element on the web page.
        webDriverWait (WebDriverWait): An instance of WebDriverWait configured with the desired timeout.
        max_retry (int, optional): The maximum number of attempts made when a
                                   StaleElementReferenceException occurs. Defaults to 3.

    Returns:
        str: The text content of the located element, stripped of leading and trailing whitespace.
             "0" is returned when the element does not become clickable before the wait
             times out, when it has no text content, or when it is still stale after
             max_retry attempts.
    """
    locator = (By.CSS_SELECTOR, cssSelector)
    for _ in range(max_retry):
        try:
            element = webDriverWait.until(EC.element_to_be_clickable(locator))
            # Read the text once: every access to `.text` talks to the browser and
            # is another chance for the element to have gone stale.
            text = element.text.strip()
            return text if text else "0"
        except StaleElementReferenceException:
            print("Stale Element Exception occurred. Retrying...")
        except (TimeoutException, NoSuchElementException):
            # WebDriverWait.until signals a missing element with TimeoutException,
            # so it must be handled here for the "0" fallback to take effect.
            return "0"
    return "0"

In [47]:
def scrape_paragraph(cssSelector, wait, max_retry=3):
    """
    Description:
        Return the text of all elements matching a CSS selector, joined into one string.
        The supplied WebDriverWait is used to wait until matching elements are present.

    Parameters:
        cssSelector (str): A CSS selector used to locate the desired paragraph elements on the web page.
        wait (WebDriverWait): An instance of WebDriverWait configured with the desired timeout.
        max_retry (int, optional): The maximum number of attempts made when a
                                   StaleElementReferenceException occurs. Defaults to 3.

    Returns:
        str: The concatenated text content of all located paragraph elements, separated by spaces.
             "0" is returned when no elements are found before the wait times out, when
             the elements carry no text, or when they are still stale after max_retry attempts.
    """
    locator = (By.CSS_SELECTOR, cssSelector)
    for _ in range(max_retry):
        try:
            all_elements = wait.until(EC.presence_of_all_elements_located(locator))
            text = " ".join(element.text for element in all_elements)
            # Whitespace-only output means no paragraph had any text.
            return text if text.strip() else "0"
        except StaleElementReferenceException:
            # The elements were replaced while being read; look them up again.
            print("Stale Element Exception occurred. Retrying...")
        except TimeoutException:
            # WebDriverWait.until raises this when nothing matched in time.
            return "0"
    return "0"

In [48]:
# Scrape each article collected in href_links. One value per article is
# appended to every column list below, so the lists always stay the same length.
upvotes = []
comment_count = []
title = []
read_time = []
publish_date = []
paragraphs = []
author = []  # NOTE: rebinds `author`, which an earlier cell set to the author's name string
links = []
tags = []

chromedriver_path = "chromedriver.exe"  # identical for every article, so set once

for ind, link in tqdm(enumerate(href_links), desc="Processing Links", total=len(href_links)):
    # A new browser is started for each article and always closed in `finally`.
    service = Service(executable_path=chromedriver_path)
    driver = webdriver.Chrome(service=service)

    try:
        driver.get(link)
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        wait = WebDriverWait(driver, 30)

        title_element = scrape_data('h1[data-testid*="storyTitle"]', wait, 3)
        author_element = scrape_data('a[data-testid*="authorName"]', wait, 3)
        upvotes_element = scrape_data('div.pw-multi-vote-count button', wait, 3)
        comment_count_element = scrape_data('span[class*="pw-responses-count"]', wait, 3)
        read_time_element = scrape_data('span[data-testid*="storyReadTime"]', wait, 3)
        publish_date_element = scrape_data('span[data-testid*="storyPublishDate"]', wait, 3)
        paragraph_element = scrape_paragraph('p.pw-post-body-paragraph', wait, 3)
        # Fall back to an empty tag when fewer tags than links were extracted,
        # instead of raising IndexError for this article.
        tag_element = tags_temp[ind] if ind < len(tags_temp) else ''

    except Exception as e:
        print(f"An error occurred: {str(e)}")

    else:
        # Append only once every value is available: a failure part-way through
        # can then never leave the columns with different lengths (which would
        # shift values into the wrong rows).
        title.append(title_element)
        author.append(author_element)
        upvotes.append(upvotes_element)
        comment_count.append(comment_count_element)
        read_time.append(read_time_element)
        publish_date.append(publish_date_element)
        paragraphs.append(paragraph_element)
        links.append(link)
        tags.append(tag_element)

    finally:
        driver.quit()

Processing Links:   0%|          | 0/155 [00:00<?, ?it/s]

In [18]:
import pandas as pd

In [19]:
# Length of the longest result column.
max_length = max(map(len, (title, author, upvotes, comment_count,
                           publish_date, read_time, paragraphs, links, tags)))

In [33]:
max_length # Length of the longest column; the shorter columns are padded up to this length

155

In [20]:
# Extend every column in place up to max_length. Each column is paired with
# the placeholder used for its missing entries.
for column, filler in (
    (title, ''),
    (author, ''),
    (upvotes, 0),
    (comment_count, 0),
    (publish_date, ''),
    (read_time, 0),
    (paragraphs, ''),
    (links, ''),
    (tags, ''),
):
    column.extend([filler] * (max_length - len(column)))

In [21]:
# Assemble the scraped columns into one table (one row per scraped article).
df = pd.DataFrame(
    {
        "title": title,
        "Author": author,
        "Upvote": upvotes,
        "CommentCount": comment_count,
        "Publish Date": publish_date,
        "Read Time": read_time,
        "Paragraph": paragraphs,
        "Links": links,
        "Tag": tags,
    }
)

In [22]:
# Display the assembled table.
df

Unnamed: 0,title,Author,Upvote,CommentCount,Publish Date,Read Time,Paragraph,Links,Tag
0,DrivenData Interview,Will Koehrsen,544,2,"Dec 14, 2020",13 min read,"In October 2020, I was interviewed by DrivenDa...",https://williamkoehrsen.medium.com/drivendata-...,Data Science
1,A Data Science Conversation,Will Koehrsen,411,43,"Mar 10, 2020",3 min read,Talking is a lot like writing in that it force...,https://williamkoehrsen.medium.com/a-data-scie...,Data Science
2,"12 Lessons from 55,000 pages of books",Will Koehrsen,1.4K,10,"Jan 2, 2020",14 min read,Reading 136 books in a year does not get you t...,https://williamkoehrsen.medium.com/12-lessons-...,Reading
3,Books of 2019,Will Koehrsen,464,1,"Jan 1, 2020",58 min read,Before we get started: reading books does not ...,https://williamkoehrsen.medium.com/books-of-20...,Reading
4,“Just Do It” Won’t Get You to Your Goals,Will Koehrsen,398,2,"Dec 27, 2019",12 min read,Rule number one for achieving goals: don’t tak...,https://williamkoehrsen.medium.com/just-do-it-...,Productivity
...,...,...,...,...,...,...,...,...,...
150,The Triumph of Peace,Will Koehrsen,7.9K,43,"Jul 5, 2017",14 min read,A review of The Better Angels of Our Nature: W...,https://williamkoehrsen.medium.com/the-triumph...,Books
151,Home of the Scared,Will Koehrsen,7.9K,43,"Jul 1, 2017",9 min read,A review of A Culture of Fear: Why Americans a...,https://williamkoehrsen.medium.com/home-of-the...,Politics
152,Capstone Project: Mercedes-Benz Greener Manufa...,Will Koehrsen,120,43,"Jun 30, 2017",42 min read,Author’s Note: This is the report I completed ...,https://williamkoehrsen.medium.com/capstone-pr...,Machine Learning
153,"The Vanquishing of War, Plague and Famine",Will Koehrsen,70,43,"Jun 18, 2017",14 min read,Part 1 of the Optimist’s Guide to the 21st Cen...,https://williamkoehrsen.medium.com/the-vanquis...,Climate Change


In [36]:
# Write the table to a CSV file, omitting the DataFrame index.
output_csv = 'MediumBlogScrapped.csv'
df.to_csv(output_csv, index=False)