#### 读取网页HTML ####

##### V0.2: leaf_nodes, interactive_elements #####

In [1]:
import os
import pandas as pd
from bs4 import BeautifulSoup
from collections import defaultdict
import glob

# Layout tags: traverse_dom counts only elements whose tag name is in this list.
target_tags = ['div', 'section', 'article', 'aside', 'nav']

# Count interactive elements
def count_interactive_elements(soup):
    """Return the number of distinct interactive elements under ``soup``.

    An element is counted once if it is any of:

    * an ``<a>`` tag with an ``href`` attribute,
    * a ``<button>`` tag,
    * a form control (``<input>``, ``<select>``, ``<textarea>``),
    * any other tag carrying an ``onclick`` attribute.

    Args:
        soup: A parsed document (or tag) exposing ``find_all``.

    Returns:
        The total count as an ``int``.
    """
    form_controls = ['input', 'select', 'textarea']

    links = soup.find_all('a', href=True)
    buttons = soup.find_all('button')
    controls = soup.find_all(form_controls)

    # Only keep onclick carriers that the three groups above have not already
    # counted; otherwise e.g. <input onclick=...> would be counted twice.
    # An <a onclick=...> without href is not in `links`, so it is kept here.
    other_clickables = [
        tag
        for tag in soup.find_all(attrs={'onclick': True})
        if tag.name != 'button'
        and tag.name not in form_controls
        and not (tag.name == 'a' and tag.has_attr('href'))
    ]

    return len(links) + len(buttons) + len(controls) + len(other_clickables)

# Walk the DOM, counting layout tags per level and leaf layout tags
def traverse_dom(node, current_level, level_counter, leaf_counter, max_level):
    """Accumulate layout-tag statistics for ``node`` and all its descendants.

    The walk uses an explicit stack instead of recursion, so very deeply
    nested documents cannot exhaust Python's recursion limit.

    Args:
        node: Root element of the walk; must expose ``name`` and
            ``find_all(recursive=False)`` returning its direct child tags.
        current_level: Level assigned to ``node``; each step down adds 1.
        level_counter: Mapping ``level -> count`` that is incremented in
            place (must supply missing keys, e.g. a ``defaultdict(int)``).
            Levels above 20 are folded into level 20.
        leaf_counter: One-element list; ``leaf_counter[0]`` is incremented
            for every layout tag that has no layout tag as a direct child.
        max_level: One-element list; ``max_level[0]`` is raised to the
            deepest (uncapped) level at which a layout tag was seen.

    Returns:
        None. All results are written into the three mutable arguments.
    """
    pending = [(node, current_level)]
    while pending:
        element, depth = pending.pop()
        # Fetch the direct children once and reuse them for both the leaf
        # test and the descent.
        children = element.find_all(recursive=False)

        if element.name in target_tags:
            level_counter[min(depth, 20)] += 1  # anything deeper than L20 counts as L20
            if not any(child.name in target_tags for child in children):
                leaf_counter[0] += 1
            if depth > max_level[0]:
                max_level[0] = depth

        pending.extend((child, depth + 1) for child in children)

# Main program: build one row per (website, year) and export the panel to Excel.
results = []

root_dir = '/home/allen1997/wayback_machine_downloader/websites/'

columns = ['website', 'year', 'max_depth'] + [f'L{i}' for i in range(1, 21)] + ['leaf_nodes', 'interactive_elements']
# Every column after website/year is a metric; placeholder rows must fill
# all of them so each row has the same width as `columns`.
metric_count = len(columns) - 2

# Sorted so the row order does not depend on the directory listing order.
for website in sorted(os.listdir(root_dir)):
    website_path = os.path.join(root_dir, website)
    if not os.path.isdir(website_path):
        continue

    for year in range(2009, 2020):
        year_dir = os.path.join(website_path, str(year))
        if not os.path.exists(year_dir):
            results.append([website, year] + ['NA'] * metric_count)
            continue

        # Look for files named *_index.html. glob returns matches in
        # arbitrary order, so sort them to make "the first one" reproducible.
        matched_files = sorted(glob.glob(os.path.join(year_dir, '*_index.html')))
        if not matched_files:
            results.append([website, year] + ['NA'] * metric_count)  # no snapshot file for this year
            continue
        year_path = matched_files[0]  # when several match, only the first is used

        try:
            with open(year_path, 'r', encoding='utf-8', errors='ignore') as file:
                html_content = file.read()
            soup = BeautifulSoup(html_content, 'html.parser')

            level_counter = defaultdict(int)
            leaf_counter = [0]
            max_level = [0]

            # Without a <body> all layout metrics stay at zero.
            body = soup.body
            if body:
                traverse_dom(body, 0, level_counter, leaf_counter, max_level)

            levels = [level_counter[i] for i in range(1, 21)]
            interactive = count_interactive_elements(soup)

            results.append([website, year, max_level[0]] + levels + [leaf_counter[0], interactive])
        except Exception as e:
            # Best effort: report the problem and keep going with the next page.
            print(e)
            results.append([website, year] + ['ERROR'] * metric_count)

df = pd.DataFrame(results, columns=columns)
output_path = 'outputs/web_layout_interaction_panel.xlsx'
# Create the output directory up front so a missing folder cannot discard
# the results after every page has been processed.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_excel(output_path, index=False)

print("统计完成，数据已保存为 web_layout_interaction_panel.xlsx")

  k = self.parse_starttag(i)
  soup = BeautifulSoup(html_content, 'html.parser')


统计完成，数据已保存为 web_layout_interaction_panel.xlsx


In [None]:
import os
import pandas as pd
from bs4 import BeautifulSoup

# Module keywords. Only referenced by the commented-out keyword check in
# is_effective_module, so this list currently has no effect on the results.
effective_keywords = ['module', 'block', 'wrapper', 'container', 'card', 'panel', 'section', 'content', 'box']

# 定义交互元素标签
def count_interactive_elements(soup):
    a_tags = len(soup.find_all('a', href=True))
    button_tags = len(soup.find_all('button'))
    onclick_tags = len([tag for tag in soup.find_all(attrs={'onclick': True}) if tag.name not in ['a', 'button']])
    input_tags = len(soup.find_all(['input', 'select', 'textarea']))
    return a_tags + button_tags + onclick_tags + input_tags

# Compute the maximum nesting depth
def calculate_max_depth(modules):
    """Return the deepest nesting of modules inside other modules.

    A module's depth is 1 plus the number of its ancestors that are
    themselves entries of ``modules``. Ancestors are matched by object
    identity: bs4 tags compare equal when their name, attributes and
    contents match, so ``parent in modules`` would both run a deep
    comparison against every module and accept look-alike tags that are
    not actually in the list.

    Args:
        modules: Elements exposing ``find_parent()``, which returns the
            parent element or ``None`` at the top of the tree.

    Returns:
        The largest depth found, or 0 when ``modules`` is empty.
    """
    module_ids = {id(module) for module in modules}

    max_depth = 0
    for module in modules:
        depth = 1
        ancestor = module.find_parent()
        while ancestor is not None:
            if id(ancestor) in module_ids:
                depth += 1
            ancestor = ancestor.find_parent()
        max_depth = max(max_depth, depth)
    return max_depth

# Compute the level at which an element sits
def get_module_level_in_list(element, modules):
    """Count the unbroken run of module ancestors directly above ``element``.

    Starting at ``element.parent`` the walk moves upward and stops at the
    first ancestor that is not an entry of ``modules`` (or at the top of the
    tree), so modules that sit above a non-module ancestor are not counted.

    Ancestors are matched by object identity instead of ``in modules``:
    bs4 tags compare equal by name, attributes and contents, which makes
    list membership slow and lets look-alike tags match.

    Args:
        element: Object exposing a ``parent`` attribute (``None`` at the top).
        modules: The elements regarded as modules.

    Returns:
        Number of consecutive module ancestors; 0 if the direct parent is
        not a module.
    """
    module_ids = {id(module) for module in modules}

    level = 0
    ancestor = element.parent
    while ancestor is not None and id(ancestor) in module_ids:
        level += 1
        ancestor = ancestor.parent
    return level

# Count leaf modules
def count_leaf_modules(modules):
    """Find the modules that contain no other module.

    A module is a leaf when none of its ``div``/``section``/``article``/
    ``aside``/``nav`` descendants is an entry of ``modules``. Descendants are
    matched by object identity through a precomputed id set; testing
    ``child in modules`` would compare every descendant against every module
    with bs4's recursive tag equality.

    Args:
        modules: Elements exposing ``find_all(names)`` for descendant lookup.

    Returns:
        A tuple ``(leaf_count, leaf_info)`` where ``leaf_info`` is a list of
        ``{'element': leaf, 'level': level}`` dicts and ``level`` comes from
        ``get_module_level_in_list(leaf, modules)``.
    """
    module_ids = {id(module) for module in modules}
    layout_tags = ['div', 'section', 'article', 'aside', 'nav']

    leaf_modules = [
        module
        for module in modules
        if not any(id(descendant) in module_ids for descendant in module.find_all(layout_tags))
    ]

    # Record the level of every leaf alongside the element itself.
    leaf_info = [
        {'element': leaf, 'level': get_module_level_in_list(leaf, modules)}
        for leaf in leaf_modules
    ]

    return len(leaf_modules), leaf_info

# Decide whether a tag counts as an effective module
def is_effective_module(tag):
    """Return True when ``tag`` carries a ``class`` or an ``id`` attribute.

    Keyword matching of the class/id values against ``effective_keywords``
    is currently disabled, so the presence of either attribute is the only
    criterion.
    """
    return tag.has_attr('class') or tag.has_attr('id')

# Count modules per level
def count_module_levels(modules):
    """Split ``modules`` into level-1, level-2 and deeper modules.

    * Level 1: no ``div``/``section``/``article``/``aside``/``nav`` ancestor
      of the module is itself an entry of ``modules``.
    * Level 2: the module's direct parent is a level-1 module.
    * Level 3+: every remaining module.

    Membership is decided by object identity. bs4 tags compare equal when
    name, attributes and contents match, so ``parent in level1`` would
    classify a module as level 2 whenever its parent merely looks like some
    level-1 module elsewhere in the page (common with repeated markup).

    Args:
        modules: Elements exposing ``find_parents(names)`` and
            ``find_parent()``.

    Returns:
        A tuple ``(level1_count, level2_count, level3plus_count)``.
    """
    layout_tags = ['div', 'section', 'article', 'aside', 'nav']
    module_ids = {id(module) for module in modules}

    level1 = [
        module
        for module in modules
        if not any(id(ancestor) in module_ids for ancestor in module.find_parents(layout_tags))
    ]
    level1_ids = {id(module) for module in level1}

    level2_count = sum(1 for module in modules if id(module.find_parent()) in level1_ids)
    level3plus = len(modules) - len(level1) - level2_count

    return len(level1), level2_count, level3plus

# Main program: build one row per (website, year) and export the panel to Excel.
results = []

root_dir = './test'

columns = ['website', 'year', 'max_depth', 'leaf_count', 'level1_modules', 'level2_modules', 'level3plus_modules', 'interactive_elements']
# Every column after website/year is a metric; placeholder rows must fill
# all of them so each row has the same width as `columns`.
metric_count = len(columns) - 2

# Sorted so the row order does not depend on the directory listing order.
for website in sorted(os.listdir(root_dir)):
    website_path = os.path.join(root_dir, website)
    if not os.path.isdir(website_path):
        continue

    for year in range(2009, 2020):
        year_path = os.path.join(website_path, str(year), 'index.html')
        if not os.path.exists(year_path):
            # One 'NA' per metric column (the row used to be two values short).
            results.append([website, year] + ['NA'] * metric_count)
            continue

        try:
            with open(year_path, 'r', encoding='utf-8', errors='ignore') as file:
                html_content = file.read()
            soup = BeautifulSoup(html_content, 'html.parser')

            modules = [tag for tag in soup.find_all(['div', 'section', 'article', 'aside', 'nav']) if is_effective_module(tag)]

            max_depth = calculate_max_depth(modules)  # maximum nesting depth
            leaf_count, leaf_info = count_leaf_modules(modules)  # number of leaf layout nodes
            l1, l2, l3 = count_module_levels(modules)  # layout nodes at level 1 / 2 / 3+

            interactive = count_interactive_elements(soup)  # number of interactive nodes

            print(website, year, max_depth, leaf_count, l1, l2, l3, interactive)

            results.append([website, year, max_depth, leaf_count, l1, l2, l3, interactive])

        except Exception as e:
            # Best effort: report the problem and keep going with the next page.
            print(e)
            results.append([website, year] + ['ERROR'] * metric_count)

# Save as an Excel workbook
df = pd.DataFrame(results, columns=columns)
df.to_excel('web_layout_interaction_panel.xlsx', index=False)

print("统计完成，数据已保存为 web_layout_interaction_panel.xlsx")


Amazon.com 2009 11 182 3 2 284 210
Amazon.com 2010 11 293 11 7 508 251
Amazon.com 2011 10 205 11 8 259 205
Amazon.com 2012 11 223 4 5 399 376
Amazon.com 2013 9 169 4 6 251 412
Amazon.com 2014 12 96 2 1 207 448
Amazon.com 2015 13 67 1 1 170 273
Amazon.com 2016 20 56 1 3 154 121
Amazon.com 2017 14 119 1 4 244 193
Amazon.com 2018 0 0 0 0 0 0
Amazon.com 2019 11 239 6 9 456 242
统计完成，数据已保存为 web_layout_interaction_panel.xlsx
