In [None]:
# made with GPT5

# imports
from selenium import webdriver # selenium webdriver api
from selenium.webdriver.chrome.service import Service # control chromedriver process
from selenium.webdriver.common.by import By # locator strategies (css, xpath, etc.)
from selenium.webdriver.chrome.options import Options # configure chrome options
from selenium.webdriver.support.ui import WebDriverWait # explicit waits for elements
from selenium.webdriver.support import expected_conditions as EC # predicates for waits
import pandas as pd # tabular data and dataframe creation
import time # sleep between page loads
import re # regular expressions for text cleanup
from tqdm import tqdm # neat progress bar
# optional: if you still see weird output, force tqdm to stdout only
# import sys

# some initial parameters
# upper bound on rows kept: the crawl loop stops once this many are collected and the dataframe is clipped to it
TARGET_COUNT = 10_000 # desired total number of comments to scrape
# first page of the thread; page_url() appends /p{n} to this for page 2 and later
BASE_URL = "https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans"  # online thread base

# setup the browser
# chrome flags, applied in the order listed
chrome_options = Options()
for chrome_flag in (
    '--headless=new',            # no visible browser window
    '--no-sandbox',              # disable sandbox
    '--disable-dev-shm-usage',   # avoid shared memory issues on linux
    '--disable-gpu',             # disable gpu acceleration
    '--window-size=1920,1080',   # ensure large viewport
):
    chrome_options.add_argument(chrome_flag)

# chromedriver process, the browser session, and a reusable 20s explicit wait
service = Service('/usr/local/bin/chromedriver') # adjust this if necessary
driver = webdriver.Chrome(options=chrome_options, service=service)
wait = WebDriverWait(driver, 20)

# url helper for pagination
def page_url(page_num:int) -> str:
    """Return the thread url for ``page_num``.

    Page 1 (and any value at or below 1) is the bare base url; every later
    page is the base url with a ``/p{n}`` suffix.
    """
    return BASE_URL if page_num <= 1 else f"{BASE_URL}/p{page_num}"

# wait until posts render
def wait_for_posts():
    """Block until the ``.MessageList`` container is present, then pause briefly.

    Any exception raised by ``wait.until`` is propagated; the crawl loop treats
    that as "no posts container on this page".
    """
    container_locator = (By.CSS_SELECTOR, ".MessageList")
    container_present = EC.presence_of_element_located(container_locator)
    wait.until(container_present)
    time.sleep(0.5) # short settle time once the container shows up

# javascript to remove blockquotes
# Script passed to driver.execute_script with a message element as arguments[0].
# It deep-clones the element (so the live page is not modified), removes quoted-reply
# markup from the clone (blockquote plus the UserQuote / Quote* classes), and returns
# the clone's innerText.
REMOVE_QUOTES_JS = r"""
const message = arguments[0].cloneNode(true);
message.querySelectorAll(
  'blockquote, .UserQuote, .QuoteText, .QuoteAuthor, .QuoteFoldingWrapper'
).forEach(el => el.remove());
return message.innerText;
"""

# helper to normalize whitespace
def clean_text(s:str | None) -> str | None:
    if not s:
        return None
    s = re.sub(r'\r\n|\r', '\n', s) # normalize line endings
    s = re.sub(r'[ \t\f\v]+', ' ', s) # collapse extra spaces/tabs
    s = re.sub(r'\n{3,}', '\n\n', s) # limit blank lines
    return s.strip() # trim whitespace

# scrape all posts on the current page
def scrape_current_page() -> list[dict]:
    """Extract author, date and comment text for every post on the loaded page.

    Posts are located with the ``.MessageList .Item`` selector. Each field is
    read best-effort: when a field cannot be extracted it is stored as ``None``
    instead of aborting the page.

    Returns:
        One dict per post with the keys ``author``, ``date`` and ``comment``.
    """
    rows = [] # store results
    posts = driver.find_elements(By.CSS_SELECTOR, ".MessageList .Item") # find post containers
    for post in posts:
        # ``except Exception`` rather than a bare ``except`` keeps the
        # best-effort behaviour while still letting KeyboardInterrupt and
        # SystemExit stop a long crawl.

        # author
        try:
            author = post.find_element(By.CSS_SELECTOR, "a.Username").text.strip()
        except Exception:
            author = None

        # date: prefer the machine-readable attribute, fall back to visible text
        try:
            date_el = post.find_element(By.XPATH, ".//time | .//abbr")
            dtg = (date_el.get_attribute("datetime")
                   or date_el.get_attribute("title")
                   or date_el.text.strip())
        except Exception:
            dtg = None

        # comment (remove blockquotes)
        try:
            msg_el = post.find_element(By.CSS_SELECTOR, "div.Message")
            raw_text = driver.execute_script(REMOVE_QUOTES_JS, msg_el)
            comment = clean_text(raw_text)
        except Exception:
            comment = None

        rows.append({"author": author, "date": dtg, "comment": comment})
    return rows

# crawl loop until target count reached
# accumulated rows across pages and the 1-based page counter
all_rows = []
page = 1

# single, tidy progress bar (no stacked lines)
progress = tqdm(
    total=TARGET_COUNT,
    desc="comments scraped",
    unit="comment",
    dynamic_ncols=True,
    mininterval=0.3,
    smoothing=0.1,
    # file=sys.stdout,  # uncomment if your env sends tqdm to stderr weirdly
    bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}] {postfix}"
)

try:
    while len(all_rows) < TARGET_COUNT:
        url = page_url(page) # build url
        driver.get(url) # open page

        try:
            wait_for_posts() # wait until posts render
        except Exception as e:
            # use tqdm.write so it doesn't break the bar
            tqdm.write(f"no posts container on page {page} ({url}). stopping. error: {e}")
            break

        rows = scrape_current_page() # scrape rows
        if not rows:
            tqdm.write("no posts on this page — likely end of thread.")
            break

        # the final page can carry more rows than are still needed; advance the
        # bar by at most the remaining amount so it never runs past its total
        remaining = TARGET_COUNT - len(all_rows)
        all_rows.extend(rows) # add results
        progress.update(min(len(rows), remaining)) # update bar by number of comments
        progress.set_postfix_str(f"pages={page}") # show current page without new lines

        if len(all_rows) >= TARGET_COUNT:
            break # target reached: skip the pause below

        page += 1
        time.sleep(1.0) # polite pause
finally:
    # nested so the bar is closed even if shutting the browser down raises
    try:
        driver.quit() # close browser
    finally:
        progress.close() # close progress bar

# build dataframe
# the slice guards against overshoot: the last scraped page can push all_rows past TARGET_COUNT
car_comments_df = pd.DataFrame(all_rows[:TARGET_COUNT]) # clip to target count
print(f"\ntotal collected: {len(car_comments_df)}")

# bare last expression so the notebook renders the frame
car_comments_df


python3.12(60869) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
comments scraped: |          | 10001/? [13:52<00:00, 12.02comment/s] , pages=200



total collected: 10000


Unnamed: 0,author,date,comment
0,cybersol,2002-03-23T19:04:32+00:00,Entry level performance luxury sedans are a ho...
1,merc1,2002-03-25T05:54:02+00:00,I personally think that with a few tweaks the ...
2,fredvh,2002-03-25T07:06:29+00:00,I am debating a new purchase and these two are...
3,blueguydotcom,2002-03-25T17:02:27+00:00,"Great handling, RWD, excellent engine and the ..."
4,hungrywhale,2002-03-25T23:04:37+00:00,And no manual tranny. That may not matter to y...
...,...,...,...
9995,sevenfeet0,2007-09-26T17:29:17+00:00,"Well, part of the reason the back seat room is..."
9996,blueguydotcom,2007-09-26T17:43:26+00:00,"Wow, interesting. Can't argue with that. Why n..."
9997,circlew,2007-09-26T17:51:27+00:00,"Well, no ELLPS can be comfortable for you! Go ..."
9998,150mphclub,2007-09-26T17:55:02+00:00,I am with you sevenfeet. I sometimes drive 700...


In [None]:
# clean comments in place
# normalise the comment text in a single chain:
#   1. drop every character that is not an ascii letter, digit or whitespace
#   2. squeeze whitespace runs to one space and trim the ends
#   3. lowercase
car_comments_df['comment'] = (
    car_comments_df['comment']
    .str.replace(r'[^A-Za-z0-9\s]', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
    .str.lower()
)

# parse timestamps as timezone-aware UTC; invalid or missing values become NaT
car_comments_df['date'] = pd.to_datetime(car_comments_df['date'], errors='coerce', utc=True)

car_comments_df


Unnamed: 0,author,date,comment
0,cybersol,2002-03-23 19:04:32+00:00,entry level performance luxury sedans are a ho...
1,merc1,2002-03-25 05:54:02+00:00,i personally think that with a few tweaks the ...
2,fredvh,2002-03-25 07:06:29+00:00,i am debating a new purchase and these two are...
3,blueguydotcom,2002-03-25 17:02:27+00:00,great handling rwd excellent engine and the be...
4,hungrywhale,2002-03-25 23:04:37+00:00,and no manual tranny that may not matter to yo...
...,...,...,...
9995,sevenfeet0,2007-09-26 17:29:17+00:00,well part of the reason the back seat room isn...
9996,blueguydotcom,2007-09-26 17:43:26+00:00,wow interesting cant argue with that why not g...
9997,circlew,2007-09-26 17:51:27+00:00,well no ellps can be comfortable for you go wi...
9998,150mphclub,2007-09-26 17:55:02+00:00,i am with you sevenfeet i sometimes drive 700 ...


In [None]:
# sanity check: after the to_datetime conversion the date column should be a timezone-aware datetime dtype
car_comments_df.dtypes

author                  object
date       datetime64[ns, UTC]
comment                 object
dtype: object

In [None]:
# Save full dataset to CSV
# relative path: the file is written to the kernel's current working directory
# index=False leaves the row index out, so only the dataframe's columns are written
car_comments_df.to_csv("car_comments.csv", index=False, encoding="utf-8")

print("Saved to car_comments.csv")


Saved to car_comments.csv


In [None]:
# google.colab is only importable inside a Colab runtime; guard the import so
# running every cell in a local Jupyter kernel does not end in ModuleNotFoundError
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    # Trigger a download of the CSV to your computer
    files.download("car_comments.csv")
else:
    print("Not running in Colab: skipping browser download of car_comments.csv.")

# manually push back to git if changes made