In [6]:
import json
import os
import time
from datetime import datetime, date

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Cut-off date: scrape_post reports any post published before this as "old".
MIN_DATE = date(2025, 6, 10)  # Use simple date object (no time)
BASE_URL = "https://www.teamblind.com"
# Listing page that scrape_layoffs opens in the browser.
TOPIC_URL = f"{BASE_URL}/topics/General-Topics/Layoffs"
# save_results dumps the full post list here as ONE indented JSON array (not JSON Lines).
OUTPUT_FILE = 'posttesting.json'


# Sent with every requests.get call; the User-Agent is also applied to the Selenium driver.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
}


In [34]:

# Configure Selenium
def setup_driver():
    """Create a headless Chrome WebDriver configured for scraping.

    Chrome is started with automation hints switched off, and the browser's
    user agent is overridden through a CDP command so it matches the
    ``User-Agent`` in ``HEADERS``.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller owns it and must
        call ``quit()`` when finished.

    Raises:
        Whatever starting Chrome or applying the user-agent override raises.
        If the override fails, the browser is shut down before re-raising.
    """
    chrome_options = Options()
    for flag in (
        "--headless=new",
        "--disable-blink-features=AutomationControlled",
        "--window-size=1200,800",
    ):
        chrome_options.add_argument(flag)
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option('useAutomationExtension', False)

    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.execute_cdp_cmd(
            "Network.setUserAgentOverride",
            {"userAgent": HEADERS["User-Agent"]}
        )
    except BaseException:
        # The browser is already running at this point; without this the
        # Chrome process would be orphaned because the caller never gets
        # a handle to quit.
        driver.quit()
        raise
    return driver

In [13]:
def extract_json_ld(html):
    """Extract the JSON-LD metadata embedded in a post page.

    Args:
        html (str): HTML of the post page.

    Returns:
        The decoded content of the ``<script>`` tag with id
        ``article-discussion-forum-posting-schema`` and type
        ``application/ld+json``, or ``None`` when the tag is missing, has no
        single text payload, or does not hold valid JSON.
    """
    soup = BeautifulSoup(html, 'lxml')
    script = soup.find(
        'script',
        {'id': 'article-discussion-forum-posting-schema', 'type': 'application/ld+json'},
    )
    if script is None:
        return None

    # Tag.string is None for an empty tag or one with several children.
    # json.loads(None) raises TypeError (not JSONDecodeError), so guard here
    # to keep the "return None on unusable data" contract.
    payload = script.string
    if not payload:
        return None

    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        return None

In [28]:


def extract_post_details(html_text):
    """
    Pull engagement counters and author information out of a post page.

    Args:
        html_text (str): The HTML content of the page as a string.

    Returns:
        dict: A dictionary with the extracted details:
              - like_count (int): ``data-count`` of the "Like this post" button, 0 if unavailable
              - view_count (int): ``data-count`` of the "Views" button, 0 if unavailable
              - author_company (str): text of the first link in the author block, "" if absent
              - author_id (str): last non-empty text fragment of the author block, "" if absent
              - button_container (bool): True if the button container was found
    """
    soup = BeautifulSoup(html_text, 'lxml')

    # Both counters sit on buttons inside one container and default to 0.
    tallies = {'Like this post': 0, 'Views': 0}
    button_container = soup.find('div', class_='flex gap-2 md:gap-4')
    if button_container:
        for aria_label in ('Like this post', 'Views'):
            button = button_container.find('button', {'aria-label': aria_label})
            if not (button and 'data-count' in button.attrs):
                continue
            try:
                tallies[aria_label] = int(button['data-count'])
            except ValueError:
                # Non-numeric counter: keep the default of 0
                continue

    # Author block: the company is the first link, the author id is the last text fragment.
    author_company = ""
    author_id = ""
    author_div = soup.find('div', class_='flex h-full items-center text-xs text-gray-800')
    if author_div:
        company_link = author_div.find('a')
        if company_link:
            author_company = company_link.get_text(strip=True)

        fragments = list(author_div.stripped_strings)
        if fragments:
            author_id = fragments[-1]

    return {
        "like_count": tallies['Like this post'],
        "view_count": tallies['Views'],
        "author_company": author_company,
        "author_id": author_id,
        "button_container": bool(button_container)
    }

In [None]:


def scrape_post(url):
    """Fetch a single post page and flatten it into one record.

    Returns:
        tuple: ``(data, is_old)`` where
            - ``(dict, False)`` means the post was scraped,
            - ``(None, True)`` means the post was published before ``MIN_DATE``,
            - ``(None, False)`` means the page had no JSON-LD data or an error
              occurred (the error is printed, never raised).
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        page_html = response.text

        json_ld = extract_json_ld(page_html)
        if not json_ld:
            print(f"No JSON-LD data found for {url}")
            return None, False

        # Only the leading YYYY-MM-DD of datePublished is used; the time part is dropped
        published = json_ld["datePublished"][:10]
        if datetime.strptime(published, "%Y-%m-%d").date() < MIN_DATE:
            return None, True  # Post is too old

        # Counters and author info come from the rendered HTML, not from JSON-LD
        details = extract_post_details(page_html)

        record = {
            "headline": json_ld["headline"],
            "text": json_ld["text"],
            "date": published,  # Store only the date part
            "url": json_ld["url"],
            "author": details["author_id"],
            "authorCompany": details["author_company"],
            "likeCount": details["like_count"],
            "commentCount": json_ld["commentCount"],
            "viewCount": details["view_count"],
        }
        return record, False
    except Exception as e:
        print(f"Error scraping {url}: {str(e)}")
        return None, False


In [33]:
#scrape_post("https://www.teamblind.com/post/any-layoffs-at-navan-in-dec-2024-46yqmj4g")
#scrape_post("https://www.teamblind.com/post/advice-needed-my-wife-just-returned-from-maternity-leavecompany-pushing-her-to-return-in-person-jvx4fqkh")


scrape_post("https://www.teamblind.com/post/laid-off-right-before-parental-leave-starts-zvuwxlzt")  # Manual check on a single post URL; displays the returned (data, is_old) tuple

({'headline': 'Laid off right before parental leave starts',
  'text': 'My firm offers 10 weeks of parental (paternal) leave. I always had a good relationship with my manager (or at least I thought so), excellent review ratings, good standing among teammates. But the caveat is I’m the only IC who’s remote with one more manager but he frequently travels onsite (like once every two months).\n\nI initially took 3 weeks off earlier this year and assured my manager I wouldn’t leave longer stretches and overburden the team. Now I submitted another 2 weeks which was approved but was blindsided last week when he moved our 1-1 from Wed to Fri (he happens to do that at times) and I saw the HR on the call which is when he broke the news knowing Monday is when my parental leave starts. They did offer me 2 months severance though\n\nFeeling betrayed 😒',
  'date': '2025-06-22',
  'url': 'https://www.teamblind.com/post/laid-off-right-before-parental-leave-starts-zvuwxlzt',
  'author': 'ohdbdj9',
  'a

In [None]:



def process_new_posts(driver, processed_links, all_posts):
    """Scrape every post card on the page that has not been handled yet.

    Args:
        driver: Selenium WebDriver currently showing the topic listing.
        processed_links (set): URLs already handled. Newly discovered URLs are
            added in place, before they are scraped.
        all_posts (list): Accumulated post records. New records are appended
            in place.

    Returns:
        ``"no_new_posts"`` when the page shows no unseen links,
        ``"old_post_found"`` as soon as ``scrape_post`` flags a post as too
        old (the rest of the batch is skipped), otherwise ``None``.

    Side effects:
        Calls ``save_results(all_posts)`` once per batch if the batch added
        at least one record. The save runs in a ``finally`` block, so records
        gathered before an exception or a keyboard interrupt are still
        written out.
    """
    # Get all current post elements
    cards = driver.find_elements(By.CSS_SELECTOR, 'article[data-testid="article-preview-card"]')

    new_posts = []
    for card in cards:
        try:
            link = card.find_element(By.CSS_SELECTOR, 'a[data-testid="article-preview-click-box"]')
            href = link.get_attribute('href')
        except Exception:
            # Best effort: skip cards whose link cannot be read
            continue
        if href and href not in processed_links:
            new_posts.append(href)
            processed_links.add(href)

    if not new_posts:
        return "no_new_posts"  # No new posts found

    print(f"Processing {len(new_posts)} new posts...")

    count_before = len(all_posts)
    try:
        for post_url in new_posts:
            post_data, is_old = scrape_post(post_url)

            if is_old:
                print(f"Found old post ({post_url}), stopping processing")
                return "old_post_found"

            if post_data:
                all_posts.append(post_data)
                if len(all_posts) % 1000 == 0:
                    print(f"Total posts scraped until now: {len(all_posts)}. Last post date: {post_data['date']}")
    finally:
        # Persist once per batch instead of rewriting the whole file after
        # every single post; runs on early return and on errors as well.
        if len(all_posts) > count_before:
            save_results(all_posts)

    return None

def save_results(posts):
    """Write all scraped posts to ``OUTPUT_FILE`` as one indented JSON array.

    The data is first written to a temporary sibling file and then moved over
    the target with ``os.replace``. An interrupt or a serialisation error in
    the middle of a save therefore leaves the previous output file untouched
    instead of truncated.

    Args:
        posts (list): JSON-serialisable post records.

    Raises:
        TypeError / ValueError: if ``posts`` cannot be serialised.
        OSError: if the file cannot be written or replaced.
    """
    tmp_path = f"{OUTPUT_FILE}.tmp"
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(posts, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, OUTPUT_FILE)
    except BaseException:
        # Don't leave a half-written temp file behind
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise



In [9]:
def scrape_layoffs():
    """Drive the browser through the topic feed and scrape it batch by batch.

    The topic page is opened, the first batch of post cards is processed, and
    the page is then scrolled to the bottom repeatedly, processing whatever
    new cards appear after each scroll.

    Returns:
        tuple: ``(all_posts, stop_reason)``, the list of scraped records and a
        message describing why scraping ended: an old post was found (in the
        initial batch or later), or 15 consecutive scrolls produced no new
        posts.

    The browser is always closed before returning, also on errors.
    """
    all_posts = []
    processed_links = set()
    max_consecutive_no_new = 15  # Safety limit for no new posts
    idle_scrolls = 0
    stop_reason = None

    driver = setup_driver()
    try:
        print("Loading initial page...")
        driver.get(TOPIC_URL)

        # Block until at least one post card is present
        wait = WebDriverWait(driver, 20)
        wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'article[data-testid="article-preview-card"]'))
        )

        # Initial batch, before any scrolling
        if process_new_posts(driver, processed_links, all_posts) == "old_post_found":
            stop_reason = "Stopped due to old post in initial batch"
            return all_posts, stop_reason

        # Keep scrolling until one of the stop conditions sets stop_reason
        while stop_reason is None:
            print("Scrolling to bottom...")
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2.5)  # Allow content to load

            outcome = process_new_posts(driver, processed_links, all_posts)
            print(f"Total posts scraped until now: {len(all_posts)}")

            if outcome == "old_post_found":
                stop_reason = "Stopped due to old post found"
                print(stop_reason)
            elif outcome != "no_new_posts":
                idle_scrolls = 0  # New posts arrived, so the streak is over
            else:
                idle_scrolls += 1
                print(f"No new posts detected ({idle_scrolls}/{max_consecutive_no_new})")
                if idle_scrolls >= max_consecutive_no_new:
                    stop_reason = f"Stopped after {max_consecutive_no_new} consecutive scrolls with no new posts"
                    print(stop_reason)
    finally:
        # Close the browser
        driver.quit()

    return all_posts, stop_reason

In [35]:
# Run the full scrape. Results are written to OUTPUT_FILE while scraping
# (process_new_posts calls save_results), so no explicit save is needed here.
posts, reason = scrape_layoffs()
#save_results(posts)
# Report the file that was actually written rather than a hard-coded name.
print(f"Saved {len(posts)} posts to {OUTPUT_FILE}")
print(f"Stopping reason: {reason}")

Loading initial page...


KeyboardInterrupt: 