In [1]:
%pip install requests beautifulsoup4 pandas


Note: you may need to restart the kernel to use updated packages.


In [2]:
import time, random, csv
from typing import List, Dict, Optional
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Thread being scraped. page_url() returns this URL as-is for page 1 and
# appends "/p<N>" for later pages.
BASE_THREAD_URL = "https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans"
# Request headers; merged into the shared session's defaults further down.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36 (edu-demo)",
}


def page_url(page: int) -> str:
    """Return the URL of thread page ``page``.

    Page numbers of 1 or less map to the bare thread URL; any larger
    number is appended as a ``/p<page>`` suffix.
    """
    if page > 1:
        return f"{BASE_THREAD_URL}/p{page}"
    return BASE_THREAD_URL


In [3]:
# Shared HTTP session; HEADERS is merged into its default headers so every
# request made through `session` sends the custom User-Agent.
session = requests.Session()
session.headers.update(HEADERS)

# Smoke test: fetch one page, print the status, and preview the raw HTML.
test_page = 1  # try 1, or 300/400+ to see recent pages
url = page_url(test_page)
resp = session.get(url=url, timeout=30)
print(f"{resp.status_code} {url}")
html = resp.text
html[:1500]  # preview


200 https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans


'<!DOCTYPE html>\n<html lang="en">\n\n<head>\n    <meta charset="utf-8">\n    <meta http-equiv="X-UA-Compatible" content="IE=edge">\n    <meta name="viewport" content="width=device-width, initial-scale=1">\n    <style type="text/css">@font-face {font-family:Open Sans;font-style:normal;font-weight:400;src:url(/cf-fonts/s/open-sans/5.0.20/latin/400/normal.woff2);unicode-range:U+0000-00FF,U+0131,U+0152-0153,U+02BB-02BC,U+02C6,U+02DA,U+02DC,U+0304,U+0308,U+0329,U+2000-206F,U+2074,U+20AC,U+2122,U+2191,U+2193,U+2212,U+2215,U+FEFF,U+FFFD;font-display:swap;}@font-face {font-family:Open Sans;font-style:normal;font-weight:400;src:url(/cf-fonts/s/open-sans/5.0.20/vietnamese/400/normal.woff2);unicode-range:U+0102-0103,U+0110-0111,U+0128-0129,U+0168-0169,U+01A0-01A1,U+01AF-01B0,U+0300-0301,U+0303-0304,U+0308-0309,U+0323,U+0329,U+1EA0-1EF9,U+20AB;font-display:swap;}@font-face {font-family:Open Sans;font-style:normal;font-weight:400;src:url(/cf-fonts/s/open-sans/5.0.20/hebrew/400/normal.woff2);unicod

In [4]:
def gettext(el) -> str:
    """Return ``el.get_text(" ", strip=True)``, or ``""`` when ``el`` is falsy.

    Accepting a falsy ``el`` (e.g. ``None`` from a failed lookup) lets
    callers pass the result of an element search straight through.
    """
    return el.get_text(" ", strip=True) if el else ""

def parse_page(html: str, current_page: int) -> List[Dict]:
    """Extract one row per forum post from a thread page's HTML.

    Candidate post containers are ``li``/``div``/``article`` elements with
    class ``Comment`` plus any element whose ``id`` starts with ``Comment_``.
    These selectors can overlap: an ``id``-bearing wrapper may contain an
    inner ``.Comment`` container for the same post. Candidates are therefore
    de-duplicated both by element ``id`` and by the message node they resolve
    to, so a post matched twice is emitted only once.

    Args:
        html: Raw HTML of one thread page.
        current_page: Page number stored in each row's ``"page"`` field.

    Returns:
        A list of dicts with keys ``userid``, ``date``, ``message``,
        ``comment_id``, ``permalink`` and ``page``. Candidates lacking either
        a user name or message text are skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    candidates = [
        *soup.select("li.Comment"),
        *soup.select("div.Comment"),
        *soup.select("article.Comment"),
        *(el for el in soup.find_all(id=True)
          if str(el.get("id", "")).startswith("Comment_")),
    ]

    rows: List[Dict] = []
    seen_ids = set()
    # Maps id(message node) -> emitted row. The soup keeps every node alive
    # for the duration of this call, so id() values are stable and unique.
    row_by_message: Dict[int, Dict] = {}

    for item in candidates:
        cid = item.get("id") or ""
        if cid and cid in seen_ids:
            continue

        # Fall back to the whole container when no dedicated message node exists.
        msg_el = (item.select_one(".Message") or item.select_one(".message")
                  or item.select_one(".UserContent") or item)

        # Two candidates resolving to the same message node are the same post
        # (e.g. an id-bearing wrapper and the .Comment container inside it).
        earlier = row_by_message.get(id(msg_el))
        if earlier is not None:
            # Keep the first row, but pick up the id if only this match has one.
            if cid and not earlier["comment_id"]:
                earlier["comment_id"] = cid
                seen_ids.add(cid)
            continue

        # ".Username" already covers "a.Username", so one lookup is enough.
        user_el = item.select_one(".Username") or item.select_one("a[href*='/profile/']")
        userid = gettext(user_el)
        message = gettext(msg_el)
        if not (userid and message):
            continue

        # Prefer the machine-readable datetime attribute over the visible label.
        time_el = (item.select_one("a.Permalink time") or item.select_one("time")
                   or item.select_one("a.Permalink"))
        date_text = ""
        if time_el:
            date_text = time_el.get("datetime") or gettext(time_el)

        perma_a = (item.select_one("a.Permalink")
                   or item.find("a", attrs={"class": lambda c: c and "Permalink" in c}))
        permalink = ""
        href = perma_a.get("href") if perma_a else None
        if href:
            permalink = href if href.startswith("http") else urljoin(BASE_THREAD_URL, href)

        row = {"userid": userid, "date": date_text, "message": message,
               "comment_id": cid, "permalink": permalink, "page": current_page}
        rows.append(row)
        row_by_message[id(msg_el)] = row
        if cid:
            seen_ids.add(cid)
    return rows

# Sanity check: parse the page fetched earlier and show the row count
# together with the first two parsed rows.
sample = parse_page(html, current_page=test_page)
len(sample), sample[:2]  # quick peek


(100,
 [{'userid': 'merc1',
   'date': '2002-03-25T05:54:02+00:00',
   'message': 'I personally think that with a few tweaks the C320 could also sit at the top of this group.  It\'s still more of a entry-level luxury sedan than Mercedes would lead you to believe.   The C320 "Sport" needs to have it\'s suspension retuned to accept 17 inch tires without a harsh ride.  MB also needs to stop forcing metal trim and black leather on everyone that wants the sport package.  The C240\'s 6-speed wouldn\'t hurt either if offered on the C320, though it needs to be "fixed" first. That said, my favorite of the group is (gasp!) the new A4 3.0.   The A4 easily has the best interior of any sub-40K car, combine that with stunning looks and a body that is as rigid as any ever made and you have my personal favorite.   The single thing I don\'t like about the A4 is the grey trim around the bottom of the car.  This trim doesn\'t match any of the body colors, as it should be body color anyway.   This is an e

In [5]:
# HTTP statuses treated as temporary blocking/unavailability and retried.
_RETRY_STATUSES = (403, 429, 503)


def _fetch_page_html(url: str, page: int, max_retries: int) -> Optional[str]:
    """Fetch ``url`` through the shared ``session``; return its HTML or ``None``.

    Request errors and the statuses in ``_RETRY_STATUSES`` are retried up to
    ``max_retries`` times, waiting 5 s and 10 s respectively between
    attempts. Any other non-200 status is not retried. ``page`` is used only
    in log messages.
    """
    attempts = max(0, max_retries) + 1
    for attempt in range(1, attempts + 1):
        try:
            r = session.get(url, timeout=30)
        except Exception as e:  # best effort: log and retry instead of aborting the run
            print(f"[WARN] Request error on page {page}: {e}")
            wait = 5
        else:
            if r.status_code == 200:
                return r.text
            print(f"[WARN] HTTP {r.status_code} on page {page}")
            if r.status_code not in _RETRY_STATUSES:
                return None
            wait = 10
        # No wait after the final attempt; the caller paces the next page.
        if attempt < attempts:
            time.sleep(wait)
    return None


def scrape(max_posts: int = 10000, out_csv: str = "edmunds_posts.csv",
           min_sleep: float = 1.0, max_sleep: float = 2.0,
           start_page: int = 1, max_pages: Optional[int] = None,
           max_retries: int = 3, max_failed_pages: int = 5) -> int:
    """Scrape consecutive thread pages into ``out_csv`` and return the row count.

    Pages are fetched starting at ``start_page`` and parsed with
    ``parse_page``; every parsed row is written to the CSV (the file is
    overwritten). Scraping stops when ``max_posts`` rows have been written,
    ``max_pages`` pages have been visited, a page yields no rows, or
    ``max_failed_pages`` pages in a row could not be fetched.

    Args:
        max_posts: Stop after writing this many rows; values <= 0 write none.
        out_csv: Destination CSV path.
        min_sleep: Lower bound, in seconds, of the random delay between pages.
        max_sleep: Upper bound, in seconds, of the random delay between pages.
        start_page: First page number to fetch.
        max_pages: Maximum number of pages to visit, or ``None`` for no limit.
        max_retries: Extra attempts per page after a request error or a
            403/429/503 response, so a blocked run cannot retry forever.
        max_failed_pages: Give up after this many consecutive unfetchable
            pages, so an unbounded run cannot loop past the end of the thread.

    Returns:
        Number of rows written to ``out_csv``.
    """
    fieldnames = ["userid", "date", "message", "comment_id", "permalink", "page"]
    count = 0
    page = start_page
    failed_pages = 0  # consecutive pages that could not be fetched
    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        while count < max_posts:
            if max_pages is not None and (page - start_page + 1) > max_pages:
                break
            url = page_url(page)
            print(f"[INFO] Fetching page {page}: {url}")
            page_html = _fetch_page_html(url, page, max_retries)
            if page_html is None:
                failed_pages += 1
                if failed_pages >= max_failed_pages:
                    print(f"[WARN] {failed_pages} consecutive pages failed. Stopping.")
                    break
                page += 1
                # Stay polite even when skipping a page.
                time.sleep(random.uniform(min_sleep, max_sleep))
                continue
            failed_pages = 0
            rows = parse_page(page_html, page)
            if not rows:
                print(f"[INFO] No rows parsed on page {page}. Stopping.")
                break
            for row in rows:
                writer.writerow(row)
                count += 1
                if count >= max_posts:
                    print(f"[DONE] Reached target {max_posts} posts.")
                    return count
            page += 1
            time.sleep(random.uniform(min_sleep, max_sleep))
    print(f"[DONE] Wrote {count} rows to {out_csv}")
    return count


In [6]:
# Small trial run: at most 300 posts taken from at most 10 pages.
written = scrape(
    max_posts=300,
    out_csv="edmunds_posts_sample.csv",
    start_page=1,
    max_pages=10,
)
written


[INFO] Fetching page 1: https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans
[INFO] Fetching page 2: https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans/p2
[INFO] Fetching page 3: https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans/p3
[DONE] Reached target 300 posts.


300

In [7]:
# Reload the sample scrape, keep only userid/date/message, and save the
# trimmed table as its own CSV.
df = pd.read_csv("edmunds_posts_sample.csv")
final = df.loc[:, ["userid", "date", "message"]].copy()
final.to_csv("edmunds_posts_final.csv", index=False)
final.head(10), final.shape


(          userid                       date  \
 0          merc1  2002-03-25T05:54:02+00:00   
 1         fredvh  2002-03-25T07:06:29+00:00   
 2  blueguydotcom  2002-03-25T17:02:27+00:00   
 3    hungrywhale  2002-03-25T23:04:37+00:00   
 4           riez  2002-03-26T00:44:13+00:00   
 5  blueguydotcom  2002-03-26T20:20:10+00:00   
 6    hungrywhale  2002-03-26T21:02:10+00:00   
 7  blueguydotcom  2002-03-26T21:45:33+00:00   
 8    hungrywhale  2002-03-27T00:00:02+00:00   
 9           riez  2002-03-27T02:00:56+00:00   
 
                                              message  
 0  I personally think that with a few tweaks the ...  
 1  I am debating a new purchase and these two are...  
 2  Great handling, RWD, excellent engine and the ...  
 3  And no manual tranny.  That may not matter to ...  
 4  One beauty of BMW 3 Series is that there are s...  
 5  good grief, so you wait 9 months for the manua...  
 6  I'll give it a fair shot when the manual comes...  
 7  I understand it's 

In [8]:
# Full run: up to 10,000 posts from at most 300 pages, then write a trimmed
# copy containing only userid/date/message.
scrape(
    max_posts=10000,
    out_csv="edmunds_posts_10k_full.csv",
    start_page=1,
    max_pages=300,
)
(
    pd.read_csv("edmunds_posts_10k_full.csv")
    .loc[:, ["userid", "date", "message"]]
    .to_csv("edmunds_posts_10k.csv", index=False)
)


[INFO] Fetching page 1: https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans
[INFO] Fetching page 2: https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans/p2
[INFO] Fetching page 3: https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans/p3
[INFO] Fetching page 4: https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans/p4
[INFO] Fetching page 5: https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans/p5
[INFO] Fetching page 6: https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans/p6
[INFO] Fetching page 7: https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans/p7
[INFO] Fetching page 8: https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans/p8
[INFO] Fetching page 9: https://forums.edmunds.com/discussi