In [1]:
import requests
from bs4 import BeautifulSoup
import re
import os
from urllib.parse import urljoin
import time

In [2]:
def getnames(filename = "names.txt"):
    with open(filename,"r",encoding='utf-8') as f:
        lines = f.readlines()
        examples = []
        for line in lines:
            examples.append(line.split(".html")[0])
        return examples
examples = getnames()

In [3]:
len(examples)

49

In [4]:
def get_all_example_links(base_url):
    """
    获取所有示例页面的链接
    """
    try:
        response = requests.get(base_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # 查找所有示例链接 - 根据图片中的左侧导航栏结构
        example_links = []
        
        # 方法1: 查找包含示例的导航区域
        nav_sections = soup.find_all('div', class_=re.compile(r'sidebar|nav|menu', re.I))
        for section in nav_sections:
            links = section.find_all('a', href=re.compile(r'/examples/'))
            for link in links:
                if 'anscombe' not in link.get('href', ''):  # 排除已处理的示例
                    full_url = urljoin(base_url, link['href'])
                    example_links.append(full_url)
        
        # 方法2: 如果没有找到，尝试其他选择器
        if not example_links:
            links = soup.find_all('a', href=re.compile(r'/examples/'))
            for link in links:
                full_url = urljoin(base_url, link['href'])
                example_links.append(full_url)
        
        # 去重
        example_links = list(set(example_links))
        return example_links
        
    except Exception as e:
        print(f"获取示例链接时出错: {e}")
        return []

In [5]:
def extract_example_code(url):
    """
    从单个示例页面提取Python代码
    """
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # 查找代码块 - 根据图片中的代码区域
        code_blocks = []
        
        # 方法1: 查找pre标签中的代码
        pre_blocks = soup.find_all('pre')
        for pre in pre_blocks:
            code = pre.get_text().strip()
            if 'import seaborn' in code or 'sns.' in code:
                code_blocks.append(code)
        
        # 方法2: 查找div中的代码块
        if not code_blocks:
            div_blocks = soup.find_all('div', class_=re.compile(r'code|highlight', re.I))
            for div in div_blocks:
                code = div.get_text().strip()
                if 'import seaborn' in code or 'sns.' in code:
                    code_blocks.append(code)
        
        # 方法3: 查找所有包含Python代码的标签
        if not code_blocks:
            all_code = soup.find_all(string=re.compile(r'import seaborn|sns\.|matplotlib|plt\.'))
            for code_text in all_code:
                if len(code_text.strip()) > 50:  # 确保是完整的代码块
                    code_blocks.append(code_text.strip())
        
        return code_blocks[0] if code_blocks else None
        
    except Exception as e:
        print(f"从 {url} 提取代码时出错: {e}")
        return None

In [6]:
def create_jupyter_notebook(examples_data, output_file='seaborn_examples.ipynb'):
    """
    创建包含所有示例的Jupyter notebook
    """
    notebook = {
        "cells": [],
        "metadata": {
            "kernelspec": {
                "display_name": "Python 3",
                "language": "python",
                "name": "python3"
            },
            "language_info": {
                "codemirror_mode": {
                    "name": "ipython",
                    "version": 3
                },
                "file_extension": ".py",
                "mimetype": "text/x-python",
                "name": "python",
                "nbconvert_exporter": "python",
                "pygments_lexer": "ipython3",
                "version": "3.8.0"
            }
        },
        "nbformat": 4,
        "nbformat_minor": 4
    }
    
    # 添加导入单元格
    import_cell = {
        "cell_type": "code",
        "execution_count": None,
        "metadata": {},
        "outputs": [],
        "source": [
            "import seaborn as sns\n",
            "import matplotlib.pyplot as plt\n",
            "import numpy as np\n",
            "import pandas as pd\n",
            "%matplotlib inline\n",
            "%config InlineBackend.figure_format='retina'",
            "sns.set_theme(style=\"whitegrid\")  # 设置默认样式"
        ]
    }
    notebook["cells"].append(import_cell)
    
    # 为每个示例添加单元格
    for title, code in examples_data:
        code = node.replace("import seaborn as sns","")
        code = node.replace("import matplotlib.pyplot as plt","")
        code = code.lstrrp("\n")
        if code:
            # 添加标题单元格
            title_cell = {
                "cell_type": "markdown",
                "metadata": {},
                "source": [f"# {title}\n\n示例代码:"]
            }
            notebook["cells"].append(title_cell)
            
            # 添加代码单元格
            code_cell = {
                "cell_type": "code",
                "execution_count": None,
                "metadata": {},
                "outputs": [],
                "source": code+"\nplt.show()"
            }
            notebook["cells"].append(code_cell)
    
    # 保存notebook文件
    import json
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(notebook, f, indent=2, ensure_ascii=False)
    
    print(f"Jupyter notebook已保存为: {output_file}")

In [7]:
def main():
    base_url = "https://seaborn.org.cn/examples/"
    example_names = examples
    example_links = [f"{base_url}{name}.html" for name in example_names]
    
    print(f"找到 {len(example_links)} 个示例")
    
    # 提取所有示例的代码
    examples_data = []
    for i, url in enumerate(example_links, 1):
        print(f"正在处理示例 {i}/{len(example_links)}: {url}")
        code = extract_example_code(url)
        
        if code:
            # 从URL提取示例标题
            title = url.split('/')[-1].replace('.html', '').replace('_', ' ').title()
            examples_data.append((title, code))
            print(f"  ✓ 成功提取代码")
        else:
            print(f"  ✗ 未找到代码")
        
        # 添加延迟避免请求过于频繁
        time.sleep(0)
    
    # 创建Jupyter notebook
    if examples_data:
        create_jupyter_notebook(examples_data)
        print(f"成功提取 {len(examples_data)} 个示例的代码")
    else:
        print("未提取到任何代码")

if __name__ == "__main__":
    main()

找到 49 个示例
正在处理示例 1/49: https://seaborn.org.cn/examples/anscombes_quartet.html
  ✓ 成功提取代码
正在处理示例 2/49: https://seaborn.org.cn/examples/different_scatter_variables.html
  ✓ 成功提取代码
正在处理示例 3/49: https://seaborn.org.cn/examples/errorband_lineplots.html
  ✓ 成功提取代码
正在处理示例 4/49: https://seaborn.org.cn/examples/faceted_histogram.html
  ✓ 成功提取代码
正在处理示例 5/49: https://seaborn.org.cn/examples/faceted_lineplot.html
  ✓ 成功提取代码
正在处理示例 6/49: https://seaborn.org.cn/examples/grouped_barplot.html
  ✓ 成功提取代码
正在处理示例 7/49: https://seaborn.org.cn/examples/grouped_boxplot.html
  ✓ 成功提取代码
正在处理示例 8/49: https://seaborn.org.cn/examples/grouped_violinplots.html
  ✓ 成功提取代码
正在处理示例 9/49: https://seaborn.org.cn/examples/heat_scatter.html
  ✓ 成功提取代码
正在处理示例 10/49: https://seaborn.org.cn/examples/hexbin_marginals.html
  ✓ 成功提取代码
正在处理示例 11/49: https://seaborn.org.cn/examples/histogram_stacked.html
  ✓ 成功提取代码
正在处理示例 12/49: https://seaborn.org.cn/examples/horizontal_boxplot.html
  ✓ 成功提取代码
正在处理示例 13/49: https://seaborn.org.c

NameError: name 'node' is not defined