In [47]:
import feedparser
import json
import os
import re
import requests
from email.utils import parsedate_to_datetime
from bs4 import BeautifulSoup
import bleach
from urllib.parse import urljoin
from datetime import datetime

In [109]:
url_list = [
    "https://therealdeal.com/new-york/",
    "https://therealdeal.com/miami/",
    "https://therealdeal.com/la/",
    "https://therealdeal.com/chicago/",
    "https://therealdeal.com/san-francisco/",
    "https://therealdeal.com/texas/"
]

In [114]:
# Prepend the domain to URIs that start with "/"
def extract_post_fields(post):
    original_uri = post.get('uri', '')
    full_url = "www.therealdeal.com" + original_uri if original_uri.startswith("/") else original_uri
    return {
        'title': post.get('title'),
        'url': full_url,
        'date': post.get('date')
    }

def fetch_and_extract(url, session):
    try:
        response = session.get(url)
        response.raise_for_status()  # Raise exception for HTTP errors
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []
    
    soup = BeautifulSoup(response.content, 'html.parser')
    script_tag = soup.find("script", {"type": "application/json"})
    if not (script_tag and script_tag.string):
        print(f"No JSON found in {url}")
        return []
    
    try:
        json_text = script_tag.string.strip()
        data = json.loads(json_text)
    except json.JSONDecodeError as e:
        print(f"JSON decode error in {url}: {e}")
        return []
    
    # Navigate through the JSON structure
    editorial_posts = data.get('props', {}) \
                           .get('pageProps', {}) \
                           .get('data', {}) \
                           .get('editorialPickPosts', [])
    regular_posts = data.get('props', {}) \
                        .get('pageProps', {}) \
                        .get('data', {}) \
                        .get('posts', {}) \
                        .get('nodes', [])
    
    # Extract fields from both lists
    extracted_editorial = [extract_post_fields(post) for post in editorial_posts]
    extracted_regular = [extract_post_fields(post) for post in regular_posts]
    
    return extracted_editorial + extracted_regular

# Assuming url_list is defined
combined_posts_list = []

with requests.Session() as session:
    for url in url_list:
        posts = fetch_and_extract(url, session)
        # If you prefer a flat list (all posts in one list) rather than a list per URL:
        combined_posts_list.extend(posts)
        # Otherwise, if you want to keep the posts per URL:
        # combined_posts_list.append(posts)


In [123]:
def get_pub_date(soup):
    """
    Returns a cleaned publication date string from known container classes.
    Checks for an updated date if present.
    """
    pub_div = soup.find('div', class_='PublishedDate_root__Rn_Fz RightRailCommon_publishedDate__FW5gI')
    if pub_div:
        updated_span = pub_div.find('span', class_='updated')
        if updated_span:
            return updated_span.get_text().replace("Updated", "").strip()
        first_span = pub_div.find('span')
        if first_span:
            return first_span.get_text().strip()
    # Fallback: check for full-width published date.
    pub_div = soup.find('div', class_='PublishedDate_root__Rn_Fz FullWidthCommon_publishedDate__Ba6lp')
    if pub_div:
        return pub_div.get_text().strip()
    return None

# Your parse_article function (as defined)
def parse_article(url):
    """
    Fetches the article page at `url`, extracts and cleans:
      - Title, subhead, authors, publication date, and full article content.
    Unwanted HTML elements are removed.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return {}
    
    soup = BeautifulSoup(response.content, 'html.parser')
    
    title_tag = soup.find('h1', class_='Heading_root__aznJy')
    title = title_tag.get_text(strip=True) if title_tag else ""
    
    subhead_tag = soup.find('p', class_='Subheading_root__MWlO8')
    subhead = subhead_tag.get_text(strip=True) if subhead_tag else ""
    
    authors_tag = soup.find('section', class_='Authors_root__depgJ')
    authors = authors_tag.get_text(strip=True) if authors_tag else ""
    # Ensures that 'ByAuthor Name' becomes 'By Author Name'
    authors = re.sub(r"^By(?=\S)", "By ", authors)

    
    # Assuming you have a helper function get_pub_date() defined elsewhere:
    pub_date_str = get_pub_date(soup)
    pub_date_dt = None
    if pub_date_str:
        try:
            pub_date_dt = datetime.strptime(pub_date_str, '%b %d, %Y, %I:%M %p')
        except Exception as e:
            print(f"Date conversion error for {url}: {e}")
    
    article_tag = soup.find('article', id='the-content')
    related_links = []  # To capture any links from the article.
    content = ""
    if article_tag:
        # Remove unwanted elements.
        for tag in article_tag.find_all(["div", "button", "figure", "figcaption"]):
            tag.decompose()
        # Extract and remove all anchor tags, storing their absolute URLs.
        for a in article_tag.find_all('a'):
            href = a.get('href')
            if href:
                related_links.append(urljoin("https://therealdeal.com", href))
            a.replace_with(a.get_text())
        content = article_tag.get_text(separator="\n", strip=True)
    
    clean_text = bleach.clean(content, tags=[], strip=True)
    for unwanted in ["Sign Up for the undefined Newsletter", "Read more"]:
        clean_text = clean_text.replace(unwanted, "")
    
    return {
        "url": url,
        "title": title,
        "subhead": subhead,
        "authors": authors,
        "pub_date": pub_date_str,
        "pub_date_dt": pub_date_dt.isoformat() if pub_date_dt else "",
        "content": clean_text,
        "related_links": related_links,
    }

# Suppose combined_posts_list is your list of JSON-extracted post data:
# Each entry in combined_posts_list looks like:
# {
#     'title': 'Some JSON title',
#     'url': 'www.therealdeal.com/new-york/2025/03/18/some-post/',
#     'date': '2025-03-18T15:47:01'
# }

final_articles = []

for post in combined_posts_list:
    # Ensure the URL is fully qualified.
    url = post.get("url", "")
    if not url.startswith("http"):
        url = "https://" + url
    # Fetch additional details from the article page.
    article_details = parse_article(url)
    # Merge the two dictionaries. In case of key overlap,
    # article_details will override the JSON values.
    merged = {**post, **article_details}
    final_articles.append(merged)

# final_articles now contains the combined data for each URL.

In [125]:
final_articles

[{'title': 'Mortgage fraud co-conspirator sentenced to “significant” 30 months',
  'url': 'https://www.therealdeal.com/new-york/tristate/2025/03/18/moshe-silber-sentenced-after-mortgage-fraud/',
  'date': '2025-03-18T15:15:50',
  'subhead': 'Moshe Silber, now banned from industry, once held $1B in real estate',
  'authors': 'By Keith Larsen',
  'pub_date': 'Mar 18, 2025, 3:15 PM',
  'pub_date_dt': '2025-03-18T15:15:00',
  'content': 'A federal judge in New Jersey sentenced Moshe Silber to 30 months in prison for his involvement in a\nbroad mortgage fraud scandal\n.\nSilber\n, 35, had turned himself in three weeks ago for violating the terms of his bond. He faced up to five years after pleading guilty to one count of conspiracy to commit mortgage fraud in 2024.\nThe U.S. district court judge, Robert Kirsch, repeatedly said that Silber had committed a “significant crime” and a “significant sentence was necessary.” Silber, a co-conspirator of Boruch Drillman, had teamed up to buy an Ohio 