In [1]:
import requests
from bs4 import BeautifulSoup
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

class Scraper:
    """Collect ExamTopics discussion links for a single provider.

    One ``requests.Session`` carrying browser-like headers is reused for every
    request. Listing pages are fetched by a small thread pool and filtered by
    exam code.
    """

    # Defaults for the pause before each listing-page request (seconds) and
    # for the thread-pool size; ``__init__`` can override both per instance.
    request_delay = 0.5
    max_workers = 5

    def __init__(self, provider, request_delay=0.5, max_workers=5):
        """Create the HTTP session and derive the provider's listing URL.

        Args:
            provider: Provider slug used in the URL (lower-cased here).
            request_delay: Seconds to sleep before each listing-page request.
            max_workers: Number of threads used by ``get_discussion_links``.
        """
        self.session = requests.Session()
        # Pretend to be a real Chrome browser
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
        })
        self.provider = provider.lower()
        self.base_url = f"https://www.examtopics.com/discussions/{self.provider}/"
        self.request_delay = request_delay
        self.max_workers = max_workers

    def get_num_pages(self):
        """Return the total number of listing pages, or 0 when it cannot be determined.

        The count is read from the page indicator element, whose text looks
        like 'Page 1 of 571'. Any failure is printed and reported as 0 so the
        caller can fall back to a manual value.
        """
        try:
            response = self.session.get(self.base_url, timeout=15)
            if response.status_code != 200:
                print(f"Connection failed: Status {response.status_code}. Site may be blocking scripts.")
                return 0

            soup = BeautifulSoup(response.content, "html.parser")
            indicator = soup.find("span", {"class": "discussion-list-page-indicator"})
            if indicator is None:
                return 0

            # Extracts the '571' from 'Page 1 of 571'
            text = indicator.get_text(strip=True)
            return int(text.split("of")[-1].strip())
        except Exception as e:
            print(f"Error fetching page count: {e}")
            return 0

    def fetch_page_links(self, page, search_string):
        """Return absolute discussion URLs on one listing page that mention the exam code.

        Args:
            page: 1-based listing page number.
            search_string: Exam code to look for; matching is case-insensitive.

        Returns:
            A list of URLs. Empty when nothing matches or the page could not
            be fetched; in the latter case the reason is printed rather than
            silently dropped.
        """
        try:
            needle = search_string.upper()
            # Adding a tiny delay to avoid triggering anti-bot
            time.sleep(self.request_delay)
            response = self.session.get(f"{self.base_url}{page}/", timeout=10)
            if response.status_code != 200:
                print(f"Page {page}: skipped (status {response.status_code})")
                return []

            soup = BeautifulSoup(response.content, "html.parser")
            links = []
            for anchor in soup.find_all("a", {"class": "discussion-link"}):
                # An anchor without href is skipped on its own instead of
                # raising and discarding every other link on the page.
                href = anchor.get("href")
                if href and needle in anchor.text.upper():
                    links.append("https://www.examtopics.com" + href)
            return links
        except Exception as e:
            # Best effort: one bad page must not abort the whole scan.
            print(f"Page {page}: failed ({e})")
            return []

    def get_discussion_links(self, num_pages, search_string):
        """Scan listing pages 1..num_pages concurrently and return all matching links.

        The result may contain duplicates and is in completion order, not
        page order.
        """
        if num_pages <= 0:
            return []

        links = []
        # A small pool (default 5) is safer to avoid getting your IP banned.
        # NOTE(review): the single Session is shared by all worker threads;
        # confirm that is acceptable for the requests version in use.
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            futures = [
                executor.submit(self.fetch_page_links, page, search_string)
                for page in range(1, num_pages + 1)
            ]
            with tqdm(total=num_pages, desc="Searching Pages", unit="page") as pbar:
                for future in as_completed(futures):
                    links.extend(future.result())
                    pbar.update(1)
        return links

def extract_topic_question(link):
    """Parse ``(topic, question)`` numbers out of a discussion URL.

    Links that do not contain a 'topic-X-question-Y' fragment yield the
    sentinel ``(999, 999)``.
    """
    found = re.search(r'topic-(\d+)-question-(\d+)', link)
    if found is None:
        return (999, 999)
    topic_text, question_text = found.groups()
    return (int(topic_text), int(question_text))

def write_grouped_links_to_file(filename, links):
    """Write de-duplicated links to ``filename``, grouped under per-topic headers.

    Links are ordered by topic, then question, then by the URL text itself.
    The URL tie-break makes the output identical from run to run even when
    several links share a sort key (for example all links without a
    topic/question fragment).

    Args:
        filename: Path of the text file to create or overwrite (UTF-8).
        links: Iterable of discussion URLs; duplicates are dropped.
    """
    # Compute each link's (topic, question) key once and sort on (key, link).
    keyed_links = sorted((extract_topic_question(link), link) for link in set(links))

    with open(filename, 'w', encoding='utf-8') as f:
        current_topic = None
        for (topic, _question), link in keyed_links:
            if topic != current_topic:
                f.write(f"\n--- TOPIC {topic} ---\n")
                current_topic = topic
            f.write(f"{link}\n")

def main():
    """Interactively find all discussion links for one exam and save them to a file.

    Prompts for the provider and exam code, scans every listing page, and
    writes the unique links to '<EXAM-CODE>_links.txt'. Invalid input ends
    the run with a message instead of a traceback.
    """
    print("--- ExamTopics Scraper ---")
    provider = input("Enter provider (e.g., amazon, microsoft): ").strip()
    if not provider:
        print("No provider entered. Nothing to do.")
        return
    scraper = Scraper(provider)

    num_pages = scraper.get_num_pages()
    if num_pages == 0:
        # Fallback: sometimes the scraper can't see the page count but can see the pages
        answer = input("Could not detect page count. How many pages should I search? (e.g. 100): ").strip()
        try:
            num_pages = int(answer)
        except ValueError:
            num_pages = 0
        if num_pages <= 0:
            print(f"'{answer}' is not a positive page count. Nothing to do.")
            return

    print(f"Searching through {num_pages} pages...")
    search_string = input("Enter exam code (e.g., AIF-C01): ").strip().upper()
    if not search_string:
        # An empty code is a substring of every title and would match everything.
        print("No exam code entered. Nothing to do.")
        return

    links = scraper.get_discussion_links(num_pages, search_string)
    # The file is de-duplicated, so report the de-duplicated count as well.
    unique_count = len(set(links))

    if unique_count:
        # Keep the user-typed code from injecting path separators into the file name.
        safe_code = re.sub(r'[^A-Za-z0-9._-]+', '_', search_string)
        filename = f"{safe_code}_links.txt"
        write_grouped_links_to_file(filename, links)
        print(f"\nSuccess! Found {unique_count} links. Saved to {filename}")
    else:
        print("\nNo links found. Check if the exam code is correct or if the site is blocking requests.")

# Entry point: start the interactive link finder when this cell/script is run directly.
if __name__ == "__main__":
    main()

--- ExamTopics Scraper ---
Enter provider (e.g., amazon, microsoft): amazon
Searching through 571 pages...
Enter exam code (e.g., AIF-C01): AIF-C01


Searching Pages: 100%|██████████| 571/571 [04:23<00:00,  2.17page/s]


Success! Found 323 links. Saved to AIF-C01_links.txt





In [None]:
# Mount Google Drive at /content/drive. This needs the google.colab package,
# so the cell is only expected to work inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
import requests
from bs4 import BeautifulSoup
import time
from tqdm import tqdm

# --- Configuration ---
# INPUT_FILE is the link list read by main(); OUTPUT_FILE is the report it writes.
INPUT_FILE = "AIF-C01_links.txt"
OUTPUT_FILE = "AIF-C01_Questions_Answers.txt"

# Browser-like request headers, sent with every page request to avoid bot detection.
HEADERS = dict([
    (
        "User-Agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36",
    ),
])

def scrape_question_details(url):
    """Fetch one discussion page and extract its question, options and voted answer.

    Args:
        url: Absolute URL of the discussion page.

    Returns:
        A dict with keys 'url', 'question', 'options' (list of str) and
        'answer', or None when the page could not be fetched or contains no
        question text. Every None result is accompanied by a printed reason,
        so skipped pages are visible to the user.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        if response.status_code != 200:
            print(f"Skipping {url}: HTTP status {response.status_code}")
            return None

        soup = BeautifulSoup(response.content, 'html.parser')

        # 1. Extract Question Text
        question_element = soup.find('p', class_='card-text')
        if question_element is None:
            # Handle the missing element explicitly rather than relying on an
            # AttributeError being caught by the blanket handler below.
            print(f"Skipping {url}: question text not found")
            return None
        question_text = question_element.get_text(strip=True)

        # 2. Extract Options
        options = [
            opt.get_text(strip=True)
            for opt in soup.find_all('li', class_='multi-choice-item')
        ]

        # 3. Extract "Most Voted" Answer from Discussion
        # Note: The 'official' answer is often hidden, but the community 'most voted' is usually better
        voted_answer = "Answer not found"
        answer_element = soup.find('span', class_='most-voted-answer-badge')
        if answer_element is not None:
            # Usually looks like "Most Voted: A"; prefer the title attribute when present.
            if 'title' in answer_element.attrs:
                voted_answer = answer_element['title']
            else:
                voted_answer = answer_element.get_text(strip=True)

        return {
            'url': url,
            'question': question_text,
            'options': options,
            'answer': voted_answer
        }
    except Exception as e:
        # Best effort: one failing page must not stop the whole run, but say why.
        print(f"Skipping {url}: {e}")
        return None

def main():
    """Scrape every link listed in INPUT_FILE and write the results to OUTPUT_FILE.

    Only lines of INPUT_FILE that begin with 'http' (ignoring surrounding
    whitespace) are treated as links; topic headers and blank lines are
    ignored. Pages that cannot be scraped are skipped, and the final message
    reports how many questions were actually saved.
    """
    # Read links from your file
    try:
        with open(INPUT_FILE, 'r', encoding='utf-8') as f:
            stripped_lines = (line.strip() for line in f)
            links = [line for line in stripped_lines if line.startswith('http')]
    except FileNotFoundError:
        print(f"Input file {INPUT_FILE} not found. Generate the link list first.")
        return

    print(f"Starting to scrape {len(links)} questions...")

    saved = 0
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as out:
        for index, link in enumerate(tqdm(links, desc="Scraping Content")):
            if index:
                # Anti-ban delay between consecutive requests (not needed after the last one)
                time.sleep(1.5)

            data = scrape_question_details(link)
            if data:
                out.write(f"URL: {data['url']}\n")
                out.write(f"QUESTION: {data['question']}\n")
                out.write("OPTIONS:\n")
                for opt in data['options']:
                    out.write(f" - {opt}\n")
                out.write(f"COMMUNITY ANSWER: {data['answer']}\n")
                out.write("-" * 50 + "\n\n")
                saved += 1

    print(f"\nFinished! Saved {saved} of {len(links)} questions in {OUTPUT_FILE}")

# Entry point: scrape the questions when this cell/script is run directly.
if __name__ == "__main__":
    main()

Starting to scrape 323 questions...


Scraping Content: 100%|██████████| 323/323 [13:18<00:00,  2.47s/it]


Finished! Your data is saved in AIF-C01_Questions_Answers.txt





In [3]:
import requests
from bs4 import BeautifulSoup
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# --- SETTINGS ---
# Seconds to pause before each question-page request (used by
# ExamScraper.scrape_question_content). Raise it to reduce the risk of an
# IP ban; 1.5 - 2.0s is safer for large exams.
SLEEP_BETWEEN_REQUESTS = 1.5

class ExamScraper:
    """Find ExamTopics discussion links for a provider and scrape each question page.

    A single ``requests.Session`` with browser-like headers is reused for all
    requests. Network and parsing failures are reported with a printed
    message and a neutral return value; ``KeyboardInterrupt`` is never
    swallowed, so a long run can be stopped.
    """

    # Default pause (seconds) before each listing-page request;
    # ``__init__`` can override it per instance.
    request_delay = 0.5

    def __init__(self, provider, request_delay=0.5):
        """Create the HTTP session and derive the provider's listing URL.

        Args:
            provider: Provider slug used in the URL (lower-cased here).
            request_delay: Seconds to sleep before each listing-page request.
        """
        self.session = requests.Session()
        self.session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.5",
        })
        self.provider = provider.lower()
        self.base_url = f"https://www.examtopics.com/discussions/{self.provider}/"
        self.request_delay = request_delay

    def get_num_pages(self):
        """Return the total number of listing pages, or 0 when it cannot be determined."""
        try:
            response = self.session.get(self.base_url, timeout=15)
            if response.status_code != 200:
                print(f"Connection failed: Status {response.status_code}. Site may be blocking scripts.")
                return 0

            soup = BeautifulSoup(response.content, "html.parser")
            indicator = soup.find("span", {"class": "discussion-list-page-indicator"})
            if indicator is None:
                return 0
            # The indicator text looks like 'Page 1 of 571'; keep what follows 'of'.
            text = indicator.get_text(strip=True)
            return int(text.split("of")[-1].strip())
        except Exception as e:
            print(f"Error fetching page count: {e}")
            return 0

    def fetch_links_from_page(self, page, search_string):
        """Return absolute discussion URLs on one listing page that mention the exam code.

        Matching is case-insensitive. An empty list means nothing matched or
        the page could not be fetched; the latter is reported with a message.
        """
        try:
            needle = search_string.upper()
            time.sleep(self.request_delay)
            url = f"{self.base_url}{page}/"
            response = self.session.get(url, timeout=10)
            if response.status_code != 200:
                print(f"Page {page}: skipped (status {response.status_code})")
                return []

            soup = BeautifulSoup(response.content, "html.parser")
            links = []
            for anchor in soup.find_all("a", {"class": "discussion-link"}):
                # Skip anchors without href individually instead of losing the page.
                href = anchor.get("href")
                if href and needle in anchor.text.upper():
                    links.append("https://www.examtopics.com" + href)
            return links
        except Exception as e:
            # `Exception` (not a bare except) so Ctrl-C still interrupts the scan.
            print(f"Page {page}: failed ({e})")
            return []

    def scrape_question_content(self, url):
        """Fetch one discussion page and extract question, options and answers.

        Returns:
            A dict with keys 'url', 'question', 'options', 'community' and
            'official' (missing text fields are "N/A"), or None when the page
            could not be fetched or parsed.
        """
        try:
            time.sleep(SLEEP_BETWEEN_REQUESTS)
            response = self.session.get(url, timeout=15)
            if response.status_code != 200:
                print(f"Skipping {url}: HTTP status {response.status_code}")
                return None

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract Question
            q_elem = soup.find('p', class_='card-text')
            question = q_elem.get_text(strip=True) if q_elem is not None else "N/A"

            # Extract Options (collapse internal whitespace runs to single spaces)
            options = [
                " ".join(opt.get_text().split())
                for opt in soup.find_all('li', class_='multi-choice-item')
            ]

            # Extract Answers (Community & Official)
            community_ans = "N/A"
            vote_badge = soup.find('span', class_='most-voted-answer-badge')
            if vote_badge is not None:
                community_ans = vote_badge.get('title', vote_badge.get_text(strip=True))

            official_ans = "N/A"
            off_elem = soup.find('span', class_='correct-answer')
            if off_elem is not None:
                official_ans = off_elem.get_text(strip=True)

            return {
                'url': url,
                'question': question,
                'options': options,
                'community': community_ans,
                'official': official_ans
            }
        except Exception as e:
            # `Exception` (not a bare except) so Ctrl-C can stop a long scrape.
            print(f"Skipping {url}: {e}")
            return None

def extract_topic_q(link):
    """Return the ``(topic, question)`` numbers embedded in a discussion URL.

    A link without a 'topic-X-question-Y' fragment maps to ``(999, 999)``.
    """
    hit = re.search(r'topic-(\d+)-question-(\d+)', link)
    if not hit:
        return (999, 999)
    return tuple(int(part) for part in hit.group(1, 2))

def main():
    """Interactively build a study-guide text file for one exam.

    Phase 1 scans every listing page for links that mention the exam code;
    phase 2 scrapes each unique link and appends its question, options and
    answers to '<EXAM-CODE>_Study_Guide.txt'. Invalid input or an empty
    search ends the run with a message instead of a traceback or an empty
    file.
    """
    provider = input("Enter provider (e.g., amazon): ").strip()
    exam_code = input("Enter exam code (e.g., AIF-C01): ").strip().upper()
    if not provider or not exam_code:
        # An empty exam code is a substring of every title and would match everything.
        print("Provider and exam code are both required. Nothing to do.")
        return

    scraper = ExamScraper(provider)
    total_p = scraper.get_num_pages()

    if total_p == 0:
        answer = input("Total pages not found. Enter manual scan range (e.g. 50): ").strip()
        try:
            total_p = int(answer)
        except ValueError:
            total_p = 0
        if total_p <= 0:
            print(f"'{answer}' is not a positive page count. Nothing to do.")
            return

    # 1. FETCH ALL LINKS
    print(f"\n--- Phase 1: Finding {exam_code} Links ---")
    all_links = []
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(scraper.fetch_links_from_page, p, exam_code) for p in range(1, total_p + 1)]
        for f in tqdm(as_completed(futures), total=total_p):
            all_links.extend(f.result())

    # Tie-break on the URL so links sharing a (topic, question) key keep a stable order.
    unique_links = sorted(set(all_links), key=lambda link: (extract_topic_q(link), link))
    print(f"Found {len(unique_links)} questions.")
    if not unique_links:
        print("No links found. Check if the exam code is correct or if the site is blocking requests.")
        return

    # 2. FETCH CONTENT FOR EACH LINK
    print(f"\n--- Phase 2: Scraping Questions & Answers ---")
    # Keep the user-typed code from injecting path separators into the file name.
    safe_code = re.sub(r'[^A-Za-z0-9._-]+', '_', exam_code)
    output_file = f"{safe_code}_Study_Guide.txt"

    saved = 0
    with open(output_file, 'w', encoding='utf-8') as f:
        for link in tqdm(unique_links):
            data = scraper.scrape_question_content(link)
            if data:
                f.write(f"URL: {data['url']}\n")
                f.write(f"QUESTION: {data['question']}\n")
                f.write("OPTIONS:\n")
                for opt in data['options']:
                    f.write(f"  {opt}\n")
                f.write(f"COMMUNITY VOTED: {data['community']}\n")
                f.write(f"OFFICIAL ANSWER: {data['official']}\n")
                f.write("-" * 40 + "\n\n")
                saved += 1

    if saved:
        print(f"\nSuccess! Saved {saved} of {len(unique_links)} questions. File saved as: {output_file}")
    else:
        print(f"\nNo question pages could be scraped; {output_file} is empty. The site may be blocking requests.")

# Entry point: run both phases when this cell/script is run directly.
if __name__ == "__main__":
    main()


--- Phase 1: Finding AIF-C01 Links ---


100%|██████████| 571/571 [04:23<00:00,  2.17it/s]


Found 323 questions.

--- Phase 2: Scraping Questions & Answers ---


100%|██████████| 323/323 [13:05<00:00,  2.43s/it]


Success! File saved as: AIF-C01_Study_Guide.txt



