In [1]:
import json
# Load the paper list from disk.
# The context manager closes the file handle deterministically, and binary mode
# lets json.load detect the UTF-8/16/32 encoding itself instead of relying on the
# platform's default text encoding.
with open('eccv2024.json', 'rb') as paper_file:
    eccv24_paperlist = json.load(paper_file)

In [2]:
from bs4 import BeautifulSoup
import requests

# 获取网页内容

def get_abs(url, timeout=30):
    """Download a page and return the text of its ``abstractExample`` element.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the calling
            thread indefinitely.

    Returns:
        The element's text with a leading ``'Abstract:'`` label and surrounding
        whitespace removed, or ``''`` when the page has no element with
        ``id="abstractExample"``.

    Raises:
        requests.exceptions.RequestException: propagated unchanged from
            ``requests.get`` (connection failure, timeout, ...).
    """
    response = requests.get(url, timeout=timeout)

    # Parse the raw bytes so BeautifulSoup can work out the encoding itself.
    soup = BeautifulSoup(response.content, 'html.parser')

    element = soup.find(id="abstractExample")
    if element is None:
        return ''

    text = element.get_text(strip=True)
    # Drop only the leading label; an "Abstract:" appearing later in the body is
    # part of the abstract and must be kept.
    label = 'Abstract:'
    if text.startswith(label):
        text = text[len(label):]
    return text.strip()



In [3]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
# 假设 eccv24_paperlist 是包含字典的列表，每个字典代表一篇论文，且包含 'site' 键
# 假设 get_abs(site) 是一个函数，给定 site URL 返回该页面的摘要内容

def fetch_abstract(item):
    """Attach the abstract fetched from ``item['site']`` to ``item``.

    The record is modified in place: the text returned by ``get_abs`` is stored
    under the ``'abstract'`` key, and the same dict object is returned so the
    caller can collect it from a future.

    Raises:
        KeyError: if ``item`` has no ``'site'`` entry.
    """
    item['abstract'] = get_abs(item['site'])
    return item
# Fetch every abstract concurrently; the work is network-bound, so threads overlap the waits.
with ThreadPoolExecutor() as executor:
    # One task per paper, submitted in the same order as eccv24_paperlist.
    futures = [executor.submit(fetch_abstract, item) for item in eccv24_paperlist]

    # as_completed is used only to advance the progress bar as tasks finish.
    for _ in tqdm(as_completed(futures), total=len(futures), desc="Fetching Abstracts"):
        pass

# Collect results in submission order rather than completion order, so the rows of
# the exported sheet follow the input list and are identical from run to run.
# Future.result() re-raises any exception raised inside fetch_abstract.
excel_res = [future.result() for future in futures]


Fetching Abstracts: 100%|██████████| 2387/2387 [05:56<00:00,  6.70it/s]


In [6]:
import json
import pandas as pd

# 读取 JSON 文件

# 指定关键词
# Fields copied from each paper record into the spreadsheet, in column order.
# Replace these with the keys actually needed.
keywords = ["title", "github", "abstract", 'status', 'site', 'project']

# Project every record onto the selected fields; a field missing from a record
# is filled with the literal string 'None'.
extracted_data = [
    {key: item.get(key, 'None') for key in keywords}
    for item in excel_res
]

# Build the table and write it out as an Excel workbook without the index column.
df = pd.DataFrame(extracted_data)
df.to_excel('eccv24.xlsx', index=False)
print("数据已成功保存到 eccv24.xlsx")


数据已成功保存到 eccv24.xlsx
