In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os # Import os to manage file paths

# --- Configuration ---
# Page to scrape, local cache file for the downloaded HTML, and the request
# headers sent with every fetch (a desktop-browser User-Agent string).

URL = "https://github.com/topics"
HTML_FILE = "webpage.html"
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    ),
}

def fetch_and_save_html(url, file_path):
    """Fetch the page at ``url`` and save its HTML to ``file_path``.

    Prints the HTTP status code and the first 100 characters of the body as
    progress output, then writes the decoded body to ``file_path`` as UTF-8.

    Args:
        url: Address to request; the module-level ``HEADERS`` are sent with it.
        file_path: Destination path for the downloaded HTML.

    Returns:
        True if the page was fetched and written to disk. False if the
        request failed (connection error, timeout, or an error status code)
        or if the file could not be written.
    """
    print(f"--- 1. Fetching HTML from {url} ---")

    # Only the network call lives in this try, so unrelated failures are not
    # misreported as fetch errors.
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)

        # 1.1. Print status code
        print(f"Status Code: {response.status_code}")

        # 1.2. Check for successful request
        response.raise_for_status()  # Raises an error for bad status codes
        print("Request successful.")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return False

    # When the server declares no charset, requests decodes text/* bodies as
    # ISO-8859-1, which garbles UTF-8 pages. Fall back to the encoding
    # detected from the body in that case only.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = response.apparent_encoding

    # 1.3. Print first 100 characters
    html_content = response.text
    print(f"First 100 characters:\n{html_content[:100]}")
    print("-" * 50)

    # 1.4. Save content to file with correct encoding. A write failure
    # (missing directory, permissions, full disk) is reported through the
    # boolean result instead of crashing the caller.
    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(html_content)
    except OSError as e:
        print(f"Error saving HTML to {file_path}: {e}")
        return False

    print(f"Successfully saved HTML content to {file_path}")
    return True

def parse_html_file(file_path):
    """Read a saved HTML page and extract topic titles and descriptions.

    Args:
        file_path: Path of the HTML file to read (opened as UTF-8).

    Returns:
        A ``(titles, descriptions)`` tuple of two lists of stripped strings,
        or None when the file does not exist or any error occurs while
        reading or parsing it. A short preview of each list is printed.
    """
    print(f"\n--- 2. Parsing {file_path} with BeautifulSoup ---")

    file_is_missing = not os.path.exists(file_path)
    if file_is_missing:
        print(f"Error: File not found at {file_path}")
        return None

    # Class strings observed on github.com/topics:
    #   titles       -> <p class="f3 lh-condensed mb-0 mt-1 Link--primary">
    #   descriptions -> <p class="f5 color-fg-muted mb-0 mt-1">
    title_classes = 'f3 lh-condensed mb-0 mt-1 Link--primary'
    description_classes = 'f5 color-fg-muted mb-0 mt-1'
    preview_count = 5  # only the first few entries are echoed, for brevity

    def stripped_texts(nodes):
        # Visible text of every node, with surrounding whitespace removed.
        return [node.get_text().strip() for node in nodes]

    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            markup = handle.read()

        # 2.1. Build the parse tree
        soup = BeautifulSoup(markup, 'html.parser')

        # 2.2. Collect the matching paragraphs and reduce them to text
        topic_titles = stripped_texts(soup.find_all('p', class_=title_classes))
        topic_descriptions = stripped_texts(
            soup.find_all('p', class_=description_classes)
        )

        # 2.3. Report how many were found and preview each list
        print("\n--- Extracted Titles ---")
        print(f"Found {len(topic_titles)} titles.")
        for rank, title in enumerate(topic_titles[:preview_count], start=1):
            print(f"  {rank}. {title}")

        print("\n--- Extracted Descriptions ---")
        print(f"Found {len(topic_descriptions)} descriptions.")
        for rank, desc in enumerate(topic_descriptions[:preview_count], start=1):
            print(f"  {rank}. {desc}")

        return topic_titles, topic_descriptions

    except Exception as e:
        print(f"Error parsing HTML file: {e}")
        return None

def create_dataframe(titles, descriptions):
    """Build, print and return a DataFrame of topic titles and descriptions.

    Titles and descriptions are paired purely by position. When the two
    lists differ in length a warning is printed and both are truncated to
    the shorter length, because the DataFrame needs equal-length columns.

    Args:
        titles: Sequence of topic title strings.
        descriptions: Sequence of topic description strings.

    Returns:
        A DataFrame with columns ``Title`` and ``Description``. It is also
        printed, together with its ``info()`` summary.
    """
    print("\n--- 3. Creating pandas DataFrame ---")

    # 3.1. Create dictionary
    # Ensure lists are of the same length for the DataFrame
    min_length = min(len(titles), len(descriptions))

    if len(titles) != len(descriptions):
        print(f"Warning: Mismatch in counts. Titles: {len(titles)}, Descriptions: {len(descriptions)}.")
        print(f"Truncating to the shorter length: {min_length}")

    data_dict = {
        'Title': titles[:min_length],
        'Description': descriptions[:min_length],
    }

    # 3.2. Convert to pandas DataFrame
    df = pd.DataFrame(data_dict)

    # 3.3. Print the DataFrame
    print("Successfully created DataFrame:")
    print(df)

    print("\nDataFrame Info:")
    df.info()

    # Hand the frame back so callers can keep working with it; previously it
    # was built and then discarded.
    return df

# Script entry point: runs the three pipeline steps in order, each one gated
# on the success of the step before it.
if __name__ == "__main__":
    print("Starting GitHub Topics Scraper...")

    # Step 1: Fetch and save HTML (returns False when the request fails)
    if fetch_and_save_html(URL, HTML_FILE):

        # Step 2: Parse the saved HTML (returns None when the file is
        # missing or parsing raises)
        extracted_data = parse_html_file(HTML_FILE)

        if extracted_data:
            titles, descriptions = extracted_data

            # Step 3: Create and display the DataFrame
            create_dataframe(titles, descriptions)

    # Printed unconditionally, whether or not the steps above succeeded.
    print("\nScraping process complete.")

Starting GitHub Topics Scraper...
--- 1. Fetching HTML from https://github.com/topics ---
Status Code: 200
Request successful.
First 100 characters:


<!DOCTYPE html>
<html
  lang="en"
  
  data-color-mode="auto" data-light-theme="light" data-dark-t
--------------------------------------------------
Successfully saved HTML content to webpage.html

--- 2. Parsing webpage.html with BeautifulSoup ---

--- Extracted Titles ---
Found 16 titles.
  1. Awesome Lists
  2. Chrome
  3. Code quality
  4. Compiler
  5. CSS

--- Extracted Descriptions ---
Found 16 descriptions.
  1. An awesome list is a list of awesome things curated by the community.
  2. Chrome is a web browser from the tech company Google.
  3. Automate your code review with style, quality, security, and test‑coverage checks when you need them.
  4. Compilers are software that translate higher-level programming languages to lower-level languages (e.g. machine code).
  5. Cascading Style Sheets (CSS) is a language used most ofte