# In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time

_SKILLS_LABEL = "Skills you'll gain"


def _text_or_none(tag):
    """Return the stripped text of *tag*, or ``None`` when the tag is missing."""
    return tag.text.strip() if tag else None


def _extract_skills(card):
    """Return the skills text of a card with its label prefix removed, or ``None``."""
    # First try a <p> whose own string contains the label ...
    skills_tag = card.find('p', string=lambda t: t and _SKILLS_LABEL in t)
    if not skills_tag:
        # ... then fall back to a <p> that carries the label inside a <strong>.
        for p in card.select('p'):
            strong_tag = p.find('strong')
            if strong_tag and _SKILLS_LABEL in strong_tag.text:
                skills_tag = p
                break
    if not skills_tag:
        return None
    return skills_tag.text.replace(_SKILLS_LABEL + ":", "").strip()


def _extract_rating(card):
    """Return the text of the first <span> inside the 'Rating' div, or ``None``."""
    rating_div = card.find('div', attrs={'aria-label': 'Rating'})
    if not rating_div:
        return None
    return _text_or_none(rating_div.find('span'))


def _extract_total_reviews(card):
    """Return the text of the first 'div.css-vac8rf' mentioning "reviews", or ``None``."""
    for div in card.select('div.css-vac8rf'):
        text = div.text.strip()
        if "reviews" in text:
            return text
    return None


def _extract_level_and_duration(card):
    """Return ``(level, duration)`` parsed from the '·'-separated metadata line."""
    metadata = _text_or_none(
        card.select_one('div.cds-CommonCard-metadata p.css-vac8rf')
    )
    if not metadata:
        return None, None
    parts = [part.strip() for part in metadata.split('·')]
    # The first segment is the level; the third (when present) is the duration.
    level = parts[0]
    duration = parts[2] if len(parts) >= 3 else None
    return level, duration


def get_courses_from_page(soup, start_id):
    """Extract course records from one parsed search-results page.

    Every ``<li>`` element in *soup* is treated as a candidate card; only
    those that contain a non-empty course title are kept.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``select`` API.
        start_id: Value assigned to the ``'id'`` field of the first course
            found; later courses get consecutive ids.

    Returns:
        A ``(page_courses, next_id)`` tuple. ``page_courses`` is a list of
        dicts with the keys ``'id'``, ``'Course Name'``, ``'Provider'``,
        ``'Skills'``, ``'Rating'``, ``'Total Reviews'``, ``'Level'`` and
        ``'Duration'`` (fields that are not found are ``None``). ``next_id``
        is the first id not used, i.e. ``start_id + len(page_courses)``.
    """
    page_courses = []
    course_id = start_id

    for card in soup.select('li'):
        course_name = _text_or_none(
            card.select_one('a.cds-CommonCard-titleLink h3')
        )
        # Skip <li> elements that are not course cards before doing any
        # further lookups on them.
        if not course_name:
            continue

        level, duration = _extract_level_and_duration(card)
        page_courses.append({
            'id': course_id,
            'Course Name': course_name,
            'Provider': _text_or_none(
                card.select_one('p.cds-ProductCard-partnerNames')
            ),
            'Skills': _extract_skills(card),
            'Rating': _extract_rating(card),
            'Total Reviews': _extract_total_reviews(card),
            'Level': level,
            'Duration': duration,
        })
        course_id += 1

    return page_courses, course_id


# === Config ===
TARGET_COUNT = 100  # number of courses you want
BASE_URL = "https://www.coursera.org/search?productTypeDescription=Professional%20Certificates&sortBy=BEST_MATCH&page={}"
# Seconds to wait for the server; without a timeout requests.get() can block
# indefinitely on a stalled connection and hang the whole scrape.
REQUEST_TIMEOUT = 30
# Column order of the output CSV; matches the keys of each course record.
CSV_FIELDS = ['id', 'Course Name', 'Provider', 'Skills', 'Rating', 'Total Reviews', 'Level', 'Duration']

courses_data = []
page = 1
current_id = 1

# Fetch result pages one by one until enough courses are collected or a page
# yields no courses at all.
while len(courses_data) < TARGET_COUNT:
    print(f"Scraping page {page}...")

    url = BASE_URL.format(page)
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()  # abort on HTTP 4xx/5xx instead of parsing an error page

    soup = BeautifulSoup(response.text, 'html.parser')
    # current_id is threaded through so ids stay consecutive across pages.
    page_courses, current_id = get_courses_from_page(soup, current_id)

    if not page_courses:
        print("No more courses found.")
        break

    courses_data.extend(page_courses)
    page += 1
    time.sleep(1)  # polite delay

# Trim to exact target count (the last page may overshoot it)
courses_data = courses_data[:TARGET_COUNT]

# ✅ Save to CSV
csv_file = 'coursera_courses.csv'
with open(csv_file, mode='w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=CSV_FIELDS)
    writer.writeheader()
    writer.writerows(courses_data)

print(f"\n✅ Scraped {len(courses_data)} courses and saved to '{csv_file}'")


# Output:
# Scraping page 1...
# Scraping page 2...
# Scraping page 3...
# Scraping page 4...
# Scraping page 5...
# Scraping page 6...
# Scraping page 7...
# Scraping page 8...
# Scraping page 9...
#
# ✅ Scraped 100 courses and saved to 'coursera_courses.csv'
