In [1]:
import json
# Load the paper list; later cells iterate over it and read each entry's 'site' key.
# The `with` block closes the file handle as soon as parsing finishes instead of
# leaving it open until the garbage collector happens to reclaim it.
with open('nips2024.json') as paper_file:
    paperlist = json.load(paper_file)

In [2]:
from bs4 import BeautifulSoup
import requests

# 获取网页内容

def get_abs(url, timeout=30):
    """Download a page and return the text of its abstract element.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling thread
            indefinitely.

    Returns:
        The text of the element whose id is ``abstractExample`` with the first
        ``Abstract:`` label removed, or ``''`` when the page has no such element.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx so an error page is not mistaken for a page that
    # simply has no abstract.
    response.raise_for_status()

    # Parse the HTML
    soup = BeautifulSoup(response.content, 'html.parser')

    # Look up the element that holds the abstract
    element = soup.find(id="abstractExample")
    if element is None:
        return ''

    # Remove only the first "Abstract:" label; any later occurrence belongs to
    # the abstract text itself and must be kept.
    return element.get_text(strip=True).replace('Abstract:', '', 1)



In [3]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import pandas as pd
import json
import os

# Assumes `paperlist` is a list of dicts, one per paper, each containing a 'site' key.
# Assumes get_abs(site) takes the site URL and returns the abstract text of that page.

def fetch_abstract(item):
    """Fill in ``item['abstract']`` from the page at ``item['site']``.

    The dict is modified in place and also returned. If fetching raises any
    ``Exception``, the error is printed and the abstract is set to the
    sentinel string ``'Error'``.
    """
    page_url = item['site']
    try:
        item['abstract'] = get_abs(page_url)
    except Exception as exc:
        print(f"Error fetching abstract for site {page_url}: {exc}")
        item['abstract'] = 'Error'
    return item

# Columns written to the spreadsheet, in this order.
keywords = ["title", "github", "abstract", "status", "site", "project"]

# Checkpoint (resume) file and final output file.
temp_file = 'nips2024_temp.xlsx'
output_file = 'nips2024.xlsx'

# Rewriting the whole workbook after every single paper makes the total work
# quadratic in the number of papers, so checkpoint once per batch instead.
checkpoint_every = 50


def _records_to_frame(records):
    """Build the export table: one row per record, one column per entry in `keywords`."""
    rows = [{key: record.get(key, 'None') for key in keywords} for record in records]
    # Passing `columns` keeps the header row even when `records` is empty.
    return pd.DataFrame(rows, columns=keywords)


# Load a previously saved checkpoint, if there is one.
if os.path.exists(temp_file):
    df_existing = pd.read_excel(temp_file)

    # Rows marked 'Error' failed last time; drop them so they are fetched again.
    df_existing = df_existing[df_existing['abstract'] != 'Error']
    processed_sites = set(df_existing['site'].tolist())
    excel_res = df_existing.to_dict(orient='records')
    print(f"Loaded {len(excel_res)} valid items from existing file.")
else:
    processed_sites = set()
    excel_res = []

# Keep only the papers that are not already in the checkpoint.
remaining_papers = [item for item in paperlist if item['site'] not in processed_sites]
print(f"{len(remaining_papers)} papers remaining to process.")

# Fetch the abstracts concurrently.
with ThreadPoolExecutor() as executor:
    futures = [executor.submit(fetch_abstract, item) for item in remaining_papers]

    try:
        progress = tqdm(as_completed(futures), total=len(futures), desc="Fetching Abstracts")
        for done_count, future in enumerate(progress, start=1):
            excel_res.append(future.result())

            # Periodic checkpoint so an interrupted run can resume.
            if done_count % checkpoint_every == 0:
                _records_to_frame(excel_res).to_excel(temp_file, index=False)
    finally:
        # Flush whatever has been collected, both on normal completion and when
        # the loop is interrupted, so no finished fetch is lost.
        if futures:
            _records_to_frame(excel_res).to_excel(temp_file, index=False)

# Write the complete result. The frame is built from `excel_res` directly, so
# this also works when there was nothing left to fetch.
df_final = _records_to_frame(excel_res)
df_final.to_excel(output_file, index=False)
print(f"数据已成功保存到 {output_file}")

# Remove the checkpoint only when every abstract was fetched; otherwise keep it
# so the next run retries just the failed entries instead of starting over.
failed_count = int((df_final['abstract'] == 'Error').sum())
if failed_count:
    print(f"{failed_count} abstracts failed; keeping {temp_file} so a rerun retries only those.")
elif os.path.exists(temp_file):
    os.remove(temp_file)
    print(f"临时文件 {temp_file} 已删除。")


Loaded 4551 valid items from existing file.
2 papers remaining to process.


Fetching Abstracts: 100%|██████████| 2/2 [00:06<00:00,  3.32s/it]


数据已成功保存到 nips2024.xlsx
临时文件 nips2024_temp.xlsx 已删除。
