## BS4 基本教程

In [1]:
from bs4 import BeautifulSoup

# Sample HTML content: a product list containing two product cards.
# NOTE: the mixed tab/space indentation inside the literal is part of the
# string itself and shows up verbatim when the parsed tags are printed.
html_doc = """
<html>
  <body>
    <h1>产品列表</h1>
    <div class="product-card" id="item1">
      <h2 class="product-name">机械键盘
	  	<h3 class="haha"> test </h3>
	  </h2>
	  <p class="price">499元
	  </p>
    </div>
    <div class="product-card" id="item2">
      <h2 class="product-name">蓝牙鼠标</h2>
      <p class="price">199元</p>
    </div>
  </body>
</html>
"""

# Create the BeautifulSoup object, parsing html_doc with the 'html.parser' backend
soup = BeautifulSoup(html_doc, 'html.parser')

In [2]:
# Find the first <h2> tag in the document
first_h2 = soup.find('h2')
print(first_h2)

<h2 class="product-name">机械键盘
	  	<h3 class="haha"> test </h3>
</h2>


In [3]:
# Find every <p> tag whose class attribute is "price", then print each match
all_prices = soup.find_all('p', class_='price')
for price_tag in all_prices:
    print(price_tag)

<p class="price">499元
	  </p>
<p class="price">199元</p>


In [4]:
# Use a CSS selector to grab the first <h3> nested inside an <h2> within a
# div.product-card. NOTE: despite the name first_card_h2, the selector ends in
# "h3", so this variable holds the <h3> tag, not the <h2>.
first_card_h2 = soup.select_one('div.product-card h2 h3')
# The braces build a one-element set, so print shows the repr of the text and
# the surrounding whitespace stays visible.
# NOTE(review): possibly a leftover from an f-string - confirm this is intended.
print({first_card_h2.text})

{' test '}


In [5]:
# Use a CSS selector to collect every product card, then print each card
# followed by a separator line
all_cards = soup.select('div.product-card')
for card in all_cards:
    print(card)
    print("-----------------------------------")

<div class="product-card" id="item1">
<h2 class="product-name">机械键盘
	  	<h3 class="haha"> test </h3>
</h2>
<p class="price">499元
	  </p>
</div>
-----------------------------------
<div class="product-card" id="item2">
<h2 class="product-name">蓝牙鼠标</h2>
<p class="price">199元</p>
</div>
-----------------------------------


In [6]:
# Demonstrate .parent: go from the first price <p> up to its enclosing tag
price_tag = soup.find('p', class_='price')
parent_div = price_tag.parent
print(f"价格标签的父级是: {parent_div.name}, class={parent_div.get('class')}")

价格标签的父级是: div, class=['product-card']


In [7]:
# Demonstrate .next_sibling: move from the <h1> to the element that follows it
h1_tag = soup.find('h1')
print("h1标签的下一个兄弟节点: ")
# A whitespace text node sits between the two tags, so .next_sibling has to be
# applied twice to reach the next element
print(h1_tag.next_sibling.next_sibling)

h1标签的下一个兄弟节点: 
<div class="product-card" id="item1">
<h2 class="product-name">机械键盘
	  	<h3 class="haha"> test </h3>
</h2>
<p class="price">499元
	  </p>
</div>


In [8]:
# Extract the text content of a tag. first_card_h2 was assigned in an earlier
# cell via select_one('div.product-card h2 h3'), so this is the <h3> text.
card_name = first_card_h2.text
print(f"提取的文本内容: {card_name}")

提取的文本内容:  test 


In [9]:
# Extract an attribute value: index the tag like a dict to read its "id"
# (all_cards was assigned in an earlier cell)
card_id = all_cards[0]['id']
print(f"提取的 id 属性值: {card_id}")

提取的 id 属性值: item1


In [10]:
# Extract the tag name (first_h2 was assigned in an earlier cell)
tag_name = first_h2.name
print(f"提取的标签名: {tag_name}")


提取的标签名: h2


## pandas 基本教程

In [11]:
import pandas as pd	


## 提取未预处理的函数代码

In [8]:
import os
import re
import subprocess
import json
import tempfile
import shutil
from bs4 import BeautifulSoup

from clang.cindex import Index, Config, CursorKind

In [14]:
def find_function_in_ast(node, original_file_path, target_line):
    """
    Find the function declaration whose source extent contains ``target_line``.

    Walks the AST rooted at ``node`` in pre-order (a node first, then its
    children in the order ``get_children()`` yields them) and returns the first
    ``FUNCTION_DECL`` cursor that both spans ``target_line`` and is located in
    ``original_file_path``.

    The walk uses an explicit stack instead of recursion, so a deeply nested
    AST cannot exhaust Python's recursion limit.

    Args:
        node: Root cursor to search from (e.g. ``translation_unit.cursor``).
        original_file_path: Path of the file the function must be located in;
            both sides are compared after ``os.path.normpath``.
        target_line: Line number that must fall inside the function's extent
            (same numbering as ``cursor.extent.start.line``).

    Returns:
        The matching cursor, or ``None`` when no function matches.
    """
    wanted_path = os.path.normpath(original_file_path)
    pending = [node]

    while pending:
        current = pending.pop()

        # Only function declarations are candidates
        if current.kind == CursorKind.FUNCTION_DECL:
            extent = current.extent
            if extent.start.line <= target_line <= extent.end.line:
                # Cursors without a file compare as the empty path
                source_file = current.location.file
                file_path = source_file.name if source_file else ''
                if os.path.normpath(file_path) == wanted_path:
                    return current

        # Children are pushed reversed so they are popped in their original
        # order, which keeps the traversal pre-order
        pending.extend(reversed(list(current.get_children())))

    return None

def extract_function_with_libclang_ast(original_file_path, target_line):
    """
    Return the source text of the function that contains ``target_line``.

    The file is parsed with libclang, the enclosing function is located with
    ``find_function_in_ast``, and the lines covered by that function's extent
    are read back from the file on disk.

    Any exception is caught and reported with ``print``; in that case, and when
    no enclosing function is found, ``None`` is returned.
    """
    try:
        # Parse the file into a translation unit and search from its root cursor
        unit = Index.create().parse(original_file_path)
        match = find_function_in_ast(unit.cursor, original_file_path, target_line)

        if not match:
            print(f"Error: Could not find function containing line {target_line} in AST.")
            return None

        # The slice below treats the extent's line numbers as 1-based, inclusive
        first = match.extent.start.line
        last = match.extent.end.line

        with open(original_file_path, 'r', encoding='utf-8') as source:
            source_lines = list(source)

        return "".join(source_lines[first - 1:last])

    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [None]:
# Extract, from the original (un-preprocessed) file, the function containing line 13
print(extract_function_with_libclang_ast("./main.c",13))

int main(){
	int myMacro(x,A);
	Loop(x1,5)
	printf("%d\n", x1); //test
}


In [None]:
## 提取预处理后的函数代码