In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from datetime import datetime
import re
import json
import time
import sys
import os

In [4]:
class LinkExtractor:
    """Scrape report entries from a Datayes "fastreport" list page.

    Constructing an instance launches a headless Chrome driver, loads ``url``,
    tries to dismiss a modal popup, and fills ``self.data`` with one dict per
    report entry found (keys: ``title``, ``link``, ``time``, ``org_name``).

    The browser stays open after construction; call :meth:`close` to release
    it. :meth:`process_url` does this automatically.
    """

    # Maps each list-page URL to the JSON file name its records are stored in.
    TARGETS = {
        "https://robo.datayes.com/v2/fastreport/investment?subType=%E4%B8%8D%E9%99%90&induName=":"策略.json",
        "https://robo.datayes.com/v2/fastreport/financial?subType=%E4%B8%8D%E9%99%90&induName=":"金工.json",
        "https://robo.datayes.com/v2/fastreport/fund?subType=%E4%B8%8D%E9%99%90&induName=":"基金.json"
    }

    # A bare clock time such as "17:24" (no date part).
    _CLOCK_TIME = re.compile(r'^\d{1,2}:\d{2}$')

    def __init__(self, url, filename):
        """Open ``url`` in headless Chrome and collect its report entries.

        Args:
            url: Address of the report list page to scrape.
            filename: Path of the JSON file associated with this URL. It is
                only stored on the instance; nothing is written here.

        Raises:
            Exception: Whatever the driver raises while starting or loading
                the page. The browser is shut down before re-raising.
        """
        self.url = url
        self.filename = filename
        self.data = []

        # Initialise the browser driver (a matching ChromeDriver must be
        # installed separately).
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")  # run without a visible window
        options.add_argument("--disable-gpu")
        options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36")
        self.driver = webdriver.Chrome(options=options)

        try:
            self.driver.get(self.url)
            self.handle_popup()
            self.extract_links()
        except BaseException:
            # Do not leave an orphaned browser process behind on failure.
            self.close()
            raise

    def close(self):
        """Quit the browser driver, if one was created."""
        driver = getattr(self, "driver", None)
        if driver is not None:
            driver.quit()

    def handle_popup(self):
        """Click the modal close button if it becomes clickable within 10 s.

        This is best-effort: any failure (timeout, click error) is printed and
        otherwise ignored so that scraping can proceed.
        """
        try:
            # Explicitly wait (up to 10 seconds) for the close button.
            close_btn = WebDriverWait(self.driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "button.modal-close"))
            )
            close_btn.click()
            time.sleep(1)  # give the popup time to disappear completely
        except Exception as e:
            print("未找到弹窗关闭按钮或已关闭:", str(e))

    def extract_links(self):
        """Append one record per complete report entry on the page to ``self.data``.

        An entry is skipped (with a printed message) when one of its
        sub-elements cannot be located, and skipped silently when any of its
        title, link, time or organisation name is empty.
        """
        # Locate every entry container (partial class match).
        entries = self.driver.find_elements(
            By.XPATH,
            '//div[contains(@class, "ReportList__StyledReportItem")]'
        )

        for entry in entries:
            try:
                # Title and link live on the same anchor element.
                title_elem = entry.find_element(
                    By.XPATH, './/a[contains(@class, "title")]'
                )
                # get_attribute() may return None when the attribute is absent.
                title = (title_elem.get_attribute("title") or "").strip()
                href = (title_elem.get_attribute("href") or "").strip()

                raw_time = entry.find_element(
                    By.XPATH, './/span[@class="item time"]'
                ).text.strip()

                org_name = entry.find_element(
                    By.XPATH, './/span[@class="item org-name"]'
                ).text.strip()
            except NoSuchElementException as e:
                print(f"元素定位失败: {str(e)}")
                continue
            except Exception as e:
                print(f"处理条目时发生异常: {str(e)}")
                continue

            processed_time = self._normalize_time(raw_time)

            # Check the scraped time value here, not the ``time`` module
            # (which is always truthy and made this part of the check a no-op).
            if all([title, href, processed_time, org_name]):
                self.data.append({
                    "title": title,
                    "link": href,
                    "time": processed_time,
                    "org_name": org_name
                })

    @staticmethod
    def _normalize_time(raw_time):
        """Return today's date for a bare ``H:MM``/``HH:MM`` value, else ``raw_time``.

        NOTE(review): presumably the site shows only a clock time for reports
        published today - confirm against the page.
        """
        if LinkExtractor._CLOCK_TIME.match(raw_time):
            return datetime.now().strftime("%Y-%m-%d")
        return raw_time

    @staticmethod
    def _select_unique(existing_data, new_data):
        """Return the items of ``new_data`` whose ``link`` is not yet known.

        A link counts as known if it appears in ``existing_data`` or earlier
        in ``new_data``; the order of ``new_data`` is preserved.
        """
        seen = {item["link"] for item in existing_data}
        unique = []
        for item in new_data:
            link = item["link"]
            if link not in seen:
                seen.add(link)
                unique.append(item)
        return unique

    @staticmethod
    def get_element_text(element, selector):
        """Return the stripped text of the first CSS match under ``element``, or ``""``."""
        try:
            return element.find_element(By.CSS_SELECTOR, selector).text.strip()
        except NoSuchElementException:
            return ""

    @staticmethod
    def get_element_attr(element, selector, attr):
        """Return attribute ``attr`` of the first CSS match under ``element``, or ``""``.

        Both a missing element and a missing attribute yield ``""``.
        """
        try:
            value = element.find_element(By.CSS_SELECTOR, selector).get_attribute(attr)
        except NoSuchElementException:
            return ""
        return (value or "").strip()

    @staticmethod
    def process_url(url, filename):
        """Scrape ``url``, merge new records into ``filename``, and return counts.

        Args:
            url: Report list page to scrape.
            filename: Path of the JSON file holding previously saved records.
                It is rewritten only when new records were found.

        Returns:
            A dict with ``new_count`` (records added by this run) and
            ``total_count`` (records in the file after this run).

        Raises:
            RuntimeError: If scraping, reading, or writing fails; the original
                exception is attached as ``__cause__``.
        """
        try:
            # Scrape the page, then release the browser straight away.
            extractor = LinkExtractor(url, filename)
            try:
                new_data = extractor.data
            finally:
                extractor.close()

            # Load previously saved records, if any.
            existing_data = []
            if os.path.exists(filename):
                with open(filename, "r", encoding="utf-8") as f:
                    existing_data = json.load(f)

            # Drop records whose link is already stored or repeated.
            unique_new = LinkExtractor._select_unique(existing_data, new_data)

            # Persist only when there is something new to add.
            if unique_new:
                combined_data = existing_data + unique_new
                with open(filename, "w", encoding="utf-8") as f:
                    json.dump(combined_data, f, ensure_ascii=False, indent=2)

            return {
                "new_count": len(unique_new),
                "total_count": len(existing_data) + len(unique_new)
            }
        except Exception as e:
            raise RuntimeError(f"处理失败: {str(e)}") from e

if __name__ == "__main__":
    # Script entry point: scrape every configured target and report per-URL
    # and overall counts of newly stored records.
    output_dir = "raw-data"  # raw scraped data is saved under this directory
    os.makedirs(output_dir, exist_ok=True)

    added_overall = 0
    for target_url, json_name in LinkExtractor.TARGETS.items():
        destination = os.path.join(output_dir, json_name)
        try:
            stats = LinkExtractor.process_url(target_url, destination)
            print(f"URL: {target_url}")
            print(f"文件: {json_name}")
            added = stats['new_count']
            total = stats['total_count']
            print(f"新增: {added} 条, 当前总量: {total}\n")
            added_overall += added
        except Exception as err:
            # A failure on one target is reported and must not stop the rest.
            print(f"{target_url} 处理失败: {err!s}")

    print(f"所有任务完成，总计新增 {added_overall} 条数据")
    print("数据文件位置: ./raw-data/ 目录")

URL: https://robo.datayes.com/v2/fastreport/investment?subType=%E4%B8%8D%E9%99%90&induName=
文件: 策略.json
新增: 20 条, 当前总量: 72

URL: https://robo.datayes.com/v2/fastreport/financial?subType=%E4%B8%8D%E9%99%90&induName=
文件: 金工.json
新增: 3 条, 当前总量: 46

URL: https://robo.datayes.com/v2/fastreport/fund?subType=%E4%B8%8D%E9%99%90&induName=
文件: 基金.json
新增: 1 条, 当前总量: 43

所有任务完成，总计新增 24 条数据
数据文件位置: ./raw-data/ 目录
