In [1]:
import requests
from bs4 import BeautifulSoup
import re
import os
import time
import json

In [None]:
# Listing of all Cisco discussion threads; page N of the listing is at <url>N/
url = "https://www.examtopics.com/discussions/cisco/"
# Only threads whose title contains this exam code are collected
exam_name = "200-301"
# Seconds to wait on a single HTTP request before giving up (without a
# timeout, requests can block indefinitely on a stalled connection)
REQUEST_TIMEOUT = 30

# Browser-like headers sent with every listing request
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Referer": "https://google.com",
    "Connection": "keep-alive",
}

# Get the first page to find number of pages. Nothing can proceed without it,
# so an HTTP error here is raised immediately.
response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

# The page indicator's second <strong> element holds the total page count
page_indicator = soup.find("span", class_="discussion-list-page-indicator")
strong_tags = page_indicator.find_all("strong") if page_indicator else []
if len(strong_tags) < 2:
    raise RuntimeError(
        "Could not find the page count on the first listing page; "
        "the layout may have changed or the request was blocked."
    )
num_pages = int(strong_tags[1].text)

question_links = []
# Listing pages that could not be fetched, so the gap is visible afterwards
failed_pages = []
# Loop through all pages
for i in range(1, num_pages + 1):
    print(f"Page {i}/{num_pages}".ljust(20), end="\r")
    page_url = url + f"{i}/"

    try:
        page_response = requests.get(page_url, headers=headers, timeout=REQUEST_TIMEOUT)
        page_response.raise_for_status()
    except requests.RequestException as e:
        # One bad page should not discard the links collected so far
        failed_pages.append(i)
        print(f"\nSkipping page {i}: {e}")
        continue

    soup = BeautifulSoup(page_response.content, "html.parser")
    # NOTE: "dicussion" (sic) is the class name the recorded run matched
    # titles with; do not "correct" the spelling here.
    titles = soup.find_all("div", class_="dicussion-title-container")
    for title in titles:
        if exam_name not in title.text.strip():
            continue
        a_tag = title.find("a")
        if a_tag and "href" in a_tag.attrs:
            question_links.append(a_tag["href"])

print()
print("Number of questions found:", len(question_links))
if failed_pages:
    print("Pages that could not be fetched:", failed_pages)

Page 721/721        
Number of questions found: 1395


In [14]:
# Persist the collected discussion links as pretty-printed JSON so later
# cells (or a later session) can reuse them without re-crawling the listing.
with open('links.json', 'w') as links_file:
    links_file.write(json.dumps(question_links, indent=2))

In [17]:
# Browser-like headers sent with every question-page request. Built once at
# definition time instead of on every call.
_SCRAPE_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Referer": "https://google.com",
    "Connection": "keep-alive",
}


def _first_text(parent, tag_name, css_class):
    """Return the stripped text of the first matching descendant, or "" if none."""
    node = parent.find(tag_name, class_=css_class)
    return node.text.strip() if node is not None else ""


def _extract_question(question_div):
    """Return the inner HTML of the question's "card-text" paragraph, or ""."""
    if question_div is None:
        return ""
    paragraph = question_div.find("p", class_="card-text")
    return paragraph.decode_contents().strip() if paragraph is not None else ""


def _extract_most_voted(soup):
    """Return "voted_answers" of the tally entry flagged "is_most_voted", or None."""
    tally_div = soup.find("div", class_="voted-answers-tally")
    script = tally_div.find("script") if tally_div is not None else None
    if script is None or not script.string:
        return None
    try:
        tally = json.loads(script.string)
        winner = next((item for item in tally if item.get("is_most_voted")), None)
    except (ValueError, TypeError, AttributeError):
        # Malformed or unexpectedly shaped tally JSON: treat as "no vote data"
        # rather than failing the whole page.
        return None
    return winner.get("voted_answers") if winner else None


def _extract_answers(question_div):
    """Return the answer choices as whitespace-normalized strings."""
    if question_div is None:
        return []
    choices_div = question_div.find("div", class_="question-choices-container")
    if choices_div is None:
        return []
    return [re.sub(r"\s+", " ", item.text).strip() for item in choices_div.find_all("li")]


def _extract_comments(soup):
    """Return top-level comments, each with its selected answer and reply texts."""
    discussion_div = soup.find("div", class_="discussion-container")
    if discussion_div is None:
        return []

    comments = []
    # recursive=False: only direct children are top-level comments; replies
    # reuse the same "comment-container" class deeper in the tree.
    for comment_div in discussion_div.find_all("div", class_="comment-container", recursive=False):
        selected_div = comment_div.find("div", class_="comment-selected-answers")
        selected_span = selected_div.find("span") if selected_div is not None else None

        replies_div = comment_div.find("div", class_="comment-replies")
        # The recursive search means nested replies end up in one flat list.
        reply_divs = (
            replies_div.find_all("div", class_="comment-container")
            if replies_div is not None
            else []
        )

        comments.append({
            "content": _first_text(comment_div, "div", "comment-content"),
            "selected_answer": selected_span.text.strip() if selected_span is not None else "",
            "replies": [_first_text(reply, "div", "comment-content") for reply in reply_divs],
        })
    return comments


def scrape_page(link, timeout=30):
    """Download one discussion page and extract its question data.

    Args:
        link: Absolute URL of the discussion page. The question number is
            taken from a "question-<digits>" fragment of this URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict with keys "question" (inner HTML string), "answers" (list of
        strings), "comments" (list of dicts with "content",
        "selected_answer" and "replies"), "question_number" (digits as a
        string, or "unknown"), "link", "most_voted" and "error". "error" is
        None on success; when the request or HTML parsing fails it holds a
        message and the content fields are left empty. Missing page elements
        are not errors: the corresponding field is simply empty.
    """
    # Derived from the URL alone, so it is available even if the request fails.
    number_match = re.search(r"question-(\d+)", link)
    question_number = number_match.group(1) if number_match else "unknown"

    try:
        response = requests.get(link, headers=_SCRAPE_HEADERS, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")
    except Exception as e:
        # Deliberately broad: any failure to fetch or parse is reported to
        # the caller through the "error" field instead of being raised.
        return {
            "question": "",
            "answers": [],
            "comments": [],
            "most_voted": None,
            "link": link,
            "question_number": question_number,
            "error": f"Request or parsing failed: {e}"
        }

    question_div = soup.find("div", class_="question-body")

    return {
        "question": _extract_question(question_div),
        "answers": _extract_answers(question_div),
        "comments": _extract_comments(soup),
        "question_number": question_number,
        "link": link,
        "most_voted": _extract_most_voted(soup),
        "error": None,
    }

In [18]:
def load_saved_questions(json_path):
    """Load previously saved questions from a JSON file.

    Returns the decoded JSON content of ``json_path``. If the file does not
    exist, or its content is not valid JSON, an empty list is returned.
    """
    if os.path.exists(json_path):
        try:
            with open(json_path, 'r', encoding='utf-8') as saved_file:
                return json.loads(saved_file.read())
        except json.JSONDecodeError:
            # Unreadable save file: fall through and start from scratch.
            pass
    return []

In [61]:
# Seconds to pause after each scraped page, to avoid hammering the site
SCRAPE_DELAY_SECONDS = 5

# Resume from whatever an earlier run already saved
questions = load_saved_questions("questions.json")
prefix = "https://www.examtopics.com"
questions_num = len(question_links)

# Question numbers already present, kept as a set so the membership test is
# O(1) instead of rebuilding a list of every saved number on each iteration.
scraped_numbers = {q.get("question_number") for q in questions}

for i, link in enumerate(question_links):
    full_url = prefix + link
    question_number_match = re.search(r"question-(\d+)", link)
    question_number = question_number_match.group(1) if question_number_match else "unknown"

    if question_number in scraped_numbers:
        print(f"{i+1}/{questions_num} - Skipping {full_url}".ljust(60), end="\r")
        continue

    print(f"{i+1}/{questions_num} - Scraping {full_url}".ljust(60), end="\r")
    question_object = scrape_page(full_url)
    if question_object["error"]:
        # Stop at the first failure; already-scraped questions stay in
        # `questions` so a later run can pick up where this one stopped.
        print()
        print(f"Error: {question_object['error']}")
        break

    questions.append(question_object)
    # Keep the lookup set in sync with the list it mirrors
    scraped_numbers.add(question_object["question_number"])
    time.sleep(SCRAPE_DELAY_SECONDS)

1395/1395 - Skipping https://www.examtopics.com/discussions/cisco/view/38787-exam-200-301-topic-1-question-39-discussion//n/

In [60]:
# Write every scraped question to disk as pretty-printed UTF-8 JSON;
# ensure_ascii=False keeps non-ASCII characters readable in the file.
with open("questions.json", "w", encoding="utf-8") as questions_file:
    questions_file.write(json.dumps(questions, ensure_ascii=False, indent=2))