In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re
import time
import os

# Define a class named Coursera
class Coursera:
    """Scraper for Coursera search-result pages and individual course pages.

    Typical use is two steps:

    1. ``course_url_scraper`` walks the paginated search results of
       ``main_url`` and stores the course links in
       ``CourseUrlsKeyword<keyword>.csv``.
    2. ``course_content_scraper`` reads such a file, visits every course
       page and stores the extracted fields in ``CoursesAbout<keyword>.csv``.

    Parameters
    ----------
    main_url : str
        Search URL whose ``page`` query parameter is rewritten per page.
        Not used by ``course_content_scraper``.
    header : dict
        HTTP headers sent with every request.
    keyword : str, optional
        Search keyword; only used to name the URL file.
    """

    # Seconds to wait for a response before a request is abandoned.
    REQUEST_TIMEOUT = 30
    # Seconds to pause after every request so the server is not overwhelmed.
    REQUEST_DELAY = 1
    # Number of links kept from each search-result page.
    COURSES_PER_PAGE = 12
    # An interim dump is written every time this many courses were collected.
    CHECKPOINT_EVERY = 100
    CHECKPOINT_FILE = 'trash.csv'
    URLS_FILE_PREFIX = 'CourseUrlsKeyword'
    BASE_URL = 'https://www.coursera.org'
    # Column order of the course CSV files.
    COLUMNS = (
        'Course Title', 'Rating', 'Level', 'Duration', 'Schedule', 'Review',
        'What you will learn', 'Skill gain', 'Modules', 'Instructor',
        'Offered By', 'Keyword', 'Course Url',
    )

    # Constructor method to initialize the class attributes
    def __init__(self, main_url, header, keyword=None):
        self.main_url = main_url  # URL of the main Coursera page
        self.header = header      # Headers to be used in HTTP requests
        self.keyword = keyword    # Optional keyword for filtering courses

    @staticmethod
    def _page_url(main_url, page_num):
        """Return ``main_url`` with its ``page`` query parameter set to ``page_num``."""
        pattern = r'([?&]page=)[^&#]*'
        if re.search(pattern, main_url):
            # A callable replacement avoids "\1" + digits being read as another group.
            return re.sub(pattern, lambda m: m.group(1) + str(page_num), main_url, count=1)
        separator = '&' if '?' in main_url else '?'
        return f'{main_url}{separator}page={page_num}'

    @classmethod
    def _keyword_from_filename(cls, urls_csv):
        """Return the keyword embedded in a ``CourseUrlsKeyword<keyword>.csv`` file name."""
        stem = os.path.splitext(os.path.basename(urls_csv))[0]
        if stem.startswith(cls.URLS_FILE_PREFIX):
            return stem[len(cls.URLS_FILE_PREFIX):]
        return stem

    @staticmethod
    def _first_or_nan(values):
        """Return the first item of ``values`` (page order), or NaN when it is empty."""
        return values[0] if values else np.nan

    def _fetch_soup(self, url):
        """Download ``url`` and return the parsed page, or ``None`` when the request fails."""
        try:
            response = requests.get(url, headers=self.header, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f'Request failed for {url}: {exc}')
            return None
        finally:
            # Pause after every request, successful or not.
            time.sleep(self.REQUEST_DELAY)
        return BeautifulSoup(response.content, 'html.parser')

    def _parse_course(self, soup):
        """Extract the course fields from a parsed course page.

        Returns a dict keyed by the course columns (without ``Keyword`` and
        ``Course Url``). Raises when the page has no ``<h1>`` title or no
        "Instructors"/"Instructor" heading; the caller skips such pages.
        """
        title = soup.find('h1').text

        # The same CSS class is shared by several facts; tell them apart by content.
        rating_texts, level_texts, duration_texts, schedule_texts = [], [], [], []
        for tag in soup.find_all('div', class_='cds-119 cds-Typography-base css-h1jogs cds-121'):
            text = tag.text
            if len(text) == 3:  # three-character texts are treated as the rating
                rating_texts.append(text)
            elif 'level' in text:
                level_texts.append(text)
            elif 'hour' in text:
                duration_texts.append(text)
            elif 'schedule' in text:
                schedule_texts.append(text)

        # First paragraph mentioning "reviews", without its first and last character.
        review = np.nan
        for paragraph in soup.find_all('p', class_='cds-119 cds-Typography-base css-dmxkm1 cds-121'):
            if 'reviews' in paragraph.text:
                review = paragraph.text[1:-1]
                break

        # A missing learn box is recorded as NaN instead of dropping the whole course.
        learn_box = soup.find('div', class_='css-15ko5n9')
        learn_parts = [child.text for child in learn_box] if learn_box is not None else []
        if "What you'll learn" in learn_parts and len(learn_parts) > 1:
            what_you_will_learn = learn_parts[1]
        else:
            what_you_will_learn = np.nan

        skill_gain = [tag.text for tag in soup.find_all('span', class_='css-1l1jvyr')]

        # Module titles are the <h3> headings that precede the instructor section.
        headings = [tag.text for tag in soup.find_all('h3', class_='cds-119 cds-Typography-base css-h1jogs cds-121')]
        for marker in ('Instructors', 'Instructor'):
            if marker in headings:
                modules = headings[:headings.index(marker)]
                break
        else:
            raise ValueError('no "Instructors"/"Instructor" heading found')

        offered_by = [
            tag.text.replace('Learn more', '')
            for tag in soup.find_all('div', class_='css-15g7tpu')
        ]

        # dict.fromkeys removes duplicates while keeping the order found on the page.
        instructors = list(dict.fromkeys(
            tag.text
            for tag in soup.find_all('span', class_='cds-119 cds-Typography-base css-e7lgfl cds-121')
            if tag.text not in offered_by
        ))

        return {
            'Course Title': title,
            'Rating': self._first_or_nan(rating_texts),
            'Level': self._first_or_nan(level_texts),
            'Duration': self._first_or_nan(duration_texts),
            'Schedule': self._first_or_nan(schedule_texts),
            'Review': review,
            'What you will learn': what_you_will_learn,
            'Skill gain': skill_gain,
            'Modules': modules,
            'Instructor': instructors,
            'Offered By': offered_by,
        }

    @classmethod
    def _write_courses(cls, records, filename):
        """Write the collected course records to ``filename`` with the fixed column order."""
        pd.DataFrame(records, columns=list(cls.COLUMNS)).to_csv(filename, index=False)

    # Method to scrape course URLs from multiple pages
    def course_url_scraper(self, last_page):
        """Collect course links from search pages 1..``last_page`` into a CSV file.

        Pages that cannot be downloaded or that lack the results container are
        reported and skipped, so the links gathered from the other pages are
        still written to ``CourseUrlsKeyword<keyword>.csv``.
        """
        course_urls_all_page = []  # List to store all course URLs
        for page_num in range(1, last_page + 1):
            url = self._page_url(self.main_url, page_num)
            soup = self._fetch_soup(url)
            if soup is None:
                continue

            container = soup.find('div', class_='cds-9 css-0 cds-11 cds-grid-item cds-56 cds-81')
            if container is None:
                print(f'No course list found on page {page_num} ({url})')
                continue

            # Anchors without an href would otherwise yield ".../None" links.
            anchors = container.find_all('a', href=True)[:self.COURSES_PER_PAGE]
            course_urls_all_page.extend(f"{self.BASE_URL}{a.get('href')}" for a in anchors)

            print(page_num)  # Print current page number for tracking progress

        # Write all course URLs to a CSV file
        pd.DataFrame({'Course Url': course_urls_all_page}).to_csv(
            f'{self.URLS_FILE_PREFIX}{self.keyword}.csv', index=False
        )

    # Method to scrape course content from URLs provided in a CSV file
    def course_content_scraper(self, path, urls_csv):
        """Scrape every course listed in ``path``/``urls_csv`` into ``CoursesAbout<keyword>.csv``.

        Parameters
        ----------
        path : str
            Folder that contains the URL file.
        urls_csv : str
            File name of the form ``CourseUrlsKeyword<keyword>.csv`` with a
            ``Course Url`` column.

        Search links (containing ``query``) are ignored. Courses whose page
        cannot be downloaded or parsed are reported and skipped. An interim
        copy is written to ``trash.csv`` every 100 collected courses.
        """
        df = pd.read_csv(os.path.join(path, urls_csv))
        k = self._keyword_from_filename(urls_csv)

        records = []
        for u in df['Course Url']:
            # Skip empty cells and internal search pages.
            if not isinstance(u, str) or 'query' in u:
                continue

            soup = self._fetch_soup(u)
            if soup is None:
                continue

            try:
                record = self._parse_course(soup)
            except Exception as exc:  # best effort: one odd page must not stop the run
                print(f'Skipped {u}: {exc!r}')
                continue

            record['Keyword'] = k
            record['Course Url'] = u
            records.append(record)

            # Print progress counter
            print(len(records))

            # Write interim results to a CSV file every CHECKPOINT_EVERY courses
            if len(records) % self.CHECKPOINT_EVERY == 0:
                try:
                    self._write_courses(records, self.CHECKPOINT_FILE)
                except OSError as exc:
                    print(f'Could not write {self.CHECKPOINT_FILE}: {exc}')

        # Write all collected information to a CSV file
        self._write_courses(records, f'CoursesAbout{k}.csv')

In [2]:
# HTTP headers handed to Coursera(...); replace the placeholder with a real User-Agent string.
header = {'User-Agent': 'Your User Agent'}

In [None]:
# Collect the course URLs for one search keyword; re-run this cell for each keyword.
# The page number inside main_url is only a placeholder: course_url_scraper
# substitutes every page from 1 up to last_page.

keyword = 'Math and Logic'
main_url = 'https://www.coursera.org/search?query=Math%20and%20Logic&page=80&sortBy=BEST_MATCH'
Coursera(main_url=main_url, header=header, keyword=keyword).course_url_scraper(last_page=41)

In [None]:
# Extract the course details for every URL list stored in `folder_path`
# (the folder where the CourseUrlsKeyword*.csv files are located).

folder_path = 'PathWhereUrlsFilesAre'
# Keep only the URL lists written by course_url_scraper. Other CSV files such as
# trash.csv or CoursesAbout*.csv also have a 'Course Url' column and would be
# scraped again under a meaningless keyword. Sorting makes the order reproducible.
csv_files = sorted(
    file for file in os.listdir(folder_path)
    if file.startswith('CourseUrlsKeyword') and file.endswith('.csv')
)
for url in csv_files:
    # course_content_scraper never reads main_url, so no search URL is needed here.
    Coursera(None, header).course_content_scraper(folder_path, url)

In [None]:
### Concatenation.

# Specify the folder path containing CSV files
folder_path = 'PathWhereDataFilesAre'

# Get a list of all CSV files in the folder (sorted so the row order is reproducible)
csv_files = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))

# Read every file first and concatenate once: calling pd.concat inside the loop
# copies the accumulated rows on each iteration and starts from an empty frame.
frames = [pd.read_csv(os.path.join(folder_path, file)) for file in csv_files]

# pd.concat raises on an empty list, so fall back to an empty frame when no CSV was found
merged_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Save the merged data to a new CSV file
merged_data.to_csv('PathWhereYouWantToCreateThisFile/CourseraDataset.csv', index=False)

# Print the merged data and show the number of fully duplicated rows
print(merged_data)
merged_data.duplicated().sum()