# 前提说明
这是一个 demo 代码，主要用于展示整体流程，不包含批量处理等功能。

如有需要，可自行补充完整。

## 目录
- 爬取 YC S24 项目信息
    - 爬取项目总列表
    - 读取爬取后的列表文件，爬取项目完整信息
        - 爬取文章主体外的其他信息，如项目名片和创始人名片
        - 使用 jina.ai 获取文章主体内容
- 使用 LLM 进行信息抽取
- 生成海报的 HTML

In [3]:
# Configuration shared by the cells below

## Replace with the paths to your ChromeDriver executable and Chrome binary
chrome_driver_path = "./Chrome/ChromeDriver/chromedriver"
chrome_driver_path_bin = "./Chrome/chrome-linux64/chrome"

## Replace with your proxy address and port
proxy = "http://localhost:10809"


## LLM API endpoint and API key
Base_url = "https://open.bigmodel.cn/api/paas/v4"
API_key = "你的API_key"



## 爬取 YC S24 项目信息

### 爬取项目总列表

In [4]:
import csv
import json
import os
import time
from openai import OpenAI
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Personal environment setup: remove any all_proxy / ALL_PROXY variables from
# this process's environment before the browser is started.
os.environ.pop('all_proxy', None)
os.environ.pop('ALL_PROXY', None)

# Launch Chrome
options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # enable to run without a visible window
options.binary_location = chrome_driver_path_bin  # path to the Chrome binary

# Route browser traffic through the configured proxy
options.add_argument(f'--proxy-server={proxy}')

service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service, options=options)

url = "https://www.ycombinator.com/companies?batch=s24"

# try/finally guarantees the Chrome process is shut down even when loading or
# scrolling raises; the page source is a plain string, so the browser is no
# longer needed once it has been captured.
try:
    driver.get(url)

    # Keep scrolling to the bottom until the page height stops growing,
    # i.e. no further companies are being lazy-loaded.
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)  # give the page time to load the next batch

        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:  # unchanged height: bottom reached
            break
        last_height = new_height

    # Snapshot the fully expanded page
    html = driver.page_source
finally:
    driver.quit()

# Parse the captured HTML
soup = BeautifulSoup(html, "html.parser")

# Collect one [url, name] row per company link.
# NOTE(review): these class names look build-generated and may change when the
# site is redeployed - verify the selectors if no rows are found.
rows = []
for link in soup.find_all('a', class_='_company_86jzd_338'):
    # Use a dedicated name so the listing URL in `url` is not overwritten
    href = link.get('href')
    # A missing href is recorded as an empty string rather than dropped
    full_url = "https://www.ycombinator.com" + href if href else ''

    name_element = link.find('span', class_='_coName_86jzd_453')
    company_name = name_element.get_text() if name_element else ''

    rows.append([full_url, company_name])

# Save the rows to a CSV file
csv_filename = "companies.csv"
with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["URL", "Company Name"])  # header row
    writer.writerows(rows)  # data rows

print(f"Data has been written to {csv_filename}")

Data has been written to companies.csv


### 读取爬取后的列表文件，逐个爬取完整信息

#### 爬取除文章主体外的其他信息，如项目名片和创始人名片

In [5]:
from pprint import pprint

def _value_after_label(card, label):
    """Return the first tag following the text node equal to *label*, or None."""
    label_node = card.find(string=label)
    return label_node.find_next() if label_node else None


def _text_after_label(card, label, default):
    """Return the stripped text of the tag following *label*, or *default* if absent."""
    value_tag = _value_after_label(card, label)
    return value_tag.text.strip() if value_tag else default


# Launch Chrome
options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # enable to run without a visible window
options.binary_location = chrome_driver_path_bin  # path to the Chrome binary

# Route browser traffic through the configured proxy
options.add_argument(f'--proxy-server={proxy}')

service = Service(chrome_driver_path)
driver = webdriver.Chrome(service=service, options=options)

project_detail_url = "https://www.ycombinator.com/companies/void"

# try/finally guarantees the browser is closed even if the page load or the
# explicit wait below raises (e.g. on timeout).
try:
    driver.get(project_detail_url)

    # Explicit wait: block until a company card is present in the DOM
    WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.CLASS_NAME, 'ycdc-card')))

    detail_html = driver.page_source
finally:
    driver.quit()

soup = BeautifulSoup(detail_html, 'html.parser')

company_cards = soup.find_all('div', class_='ycdc-card space-y-1.5 sm:w-[300px]')
founders_cards = soup.find_all('div', class_='shrink-0 space-y-1.5 rounded-md border-[1px] border-[#999] bg-[#FDFDF7] p-6 sm:w-[300px]')

# Company records and founder records are collected separately so each can be
# printed under its own heading.
company_info = []
founder_profiles = []

for card in company_cards:
    # Logo
    img_tag = card.find('img')
    img = img_tag.get('src', '无图片') if img_tag else '无图片'

    # Company name
    company_name_div = card.find('div', class_='text-lg font-bold', recursive=False)
    print(company_name_div)
    company_name = company_name_div.text.strip() if company_name_div else '无公司名'

    # Labelled facts; every lookup falls back to a placeholder when the label is missing
    founded = _text_after_label(card, 'Founded:', '无成立年份')
    team_size = _text_after_label(card, 'Team Size:', '无团队规模')
    location = _text_after_label(card, 'Location:', '无位置')
    group_partner = _value_after_label(card, 'Group Partner:')
    group_partner_name = group_partner.text.strip() if group_partner else '无合伙人'
    # .get() avoids a KeyError when the following tag carries no href
    group_partner_link = group_partner.get('href', '无合伙人链接') if group_partner else '无合伙人链接'

    company_info.append({
        'logo': img,
        'company_name': company_name,
        'founded': founded,
        'team_size': team_size,
        'location': location,
        'group_partner_name': group_partner_name,
        'group_partner_link': group_partner_link
    })

for card in founders_cards:
    # Each field is optional on the page (e.g. a founder without a LinkedIn link),
    # so fall back to a placeholder instead of raising.
    photo_tag = card.find('img')
    photo_url = photo_tag.get('src', '无照片') if photo_tag else '无照片'

    name_tag = card.find('div', class_='font-bold')
    name = name_tag.text.strip() if name_tag else '无姓名'

    linkedin_tag = card.find('a', href=True, title='LinkedIn profile')
    linkedin_link = linkedin_tag['href'] if linkedin_tag else '无LinkedIn链接'

    founder_profiles.append({
        'photo_url': photo_url,
        'name': name,
        'linkedin': linkedin_link
    })

# Combined list kept for the later cells: company records first (the poster
# generator reads founders_info[0] as the company record), then the founders.
founders_info = company_info + founder_profiles

print("Company Info:")
pprint(company_info)

print("\nFounders Info:")
pprint(founder_profiles)

<div class="text-lg font-bold">Void</div>
Company Info:
[{'company_name': 'Void',
  'founded': '2024',
  'group_partner_link': 'https://www.ycombinator.com/people/jared-friedman',
  'group_partner_name': 'Jared Friedman',
  'location': 'San Francisco',
  'logo': 'https://bookface-images.s3.us-west-2.amazonaws.com/logos/c3f60489646b8949075e4fdc612cbb8365cc1720.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ASIAQC4NIECAAIVN5AAU%2F20241007%2Fus-west-2%2Fs3%2Faws4_request&X-Amz-Date=20241007T122407Z&X-Amz-Expires=2227&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEM%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJGMEQCICinVSBXaXDl%2Bn7PXDLE7WXz1LALZZ65wSS4InIjYHryAiBAB7i938pMcJLQ3AsO50eH7xLZXlyzL8EYM7B5qMaHXSrlAwgoEAAaDDAwNjIwMTgxMTA3MiIMi3whU1kFaq%2Fcb9CDKsIDXAqO%2Bzp7mB6Ii93JnjiXBPleLNPhG1tvPc6RF866X7fsLHuYpfPC8H9rLXP9X%2BHfnBo9qLTUbRufqaY0IPnB%2BdswveGzzN3gRGzPWbk6L8Mg30eY2pGQFTSas5VwDuoh%2BZu7ONQajxhCc1%2FkliXrdQer%2Fx8t4WmBW3esWeaDW0LGuIYg2VsApWj4PbXQP46%2Bw0EQ7Yij5LocgFbnwto2eFq97KMN

#### 使用 jina.ai 获取文章主体内容

In [6]:
import requests
from bs4 import BeautifulSoup

# Single link: prefix the project detail URL with the jina.ai reader endpoint
base_url = 'https://r.jina.ai/'
link = project_detail_url
full_url = f"{base_url}{link}"

# Send both HTTP and HTTPS requests through the same configured proxy
proxies = dict(http=proxy, https=proxy)

# Fetch a single link and reduce the response to plain text
def scrape_content(full_url):
    """Download *full_url* via the module-level ``proxies`` and return its text.

    The response body is parsed with BeautifulSoup's ``html.parser`` and only
    the text content is returned. Any ``requests`` error (including a non-2xx
    status raised by ``raise_for_status``) is printed and the literal string
    ``"Error"`` is returned instead of raising.
    """
    try:
        reply = requests.get(full_url, timeout=30, proxies=proxies)
        reply.raise_for_status()  # turn HTTP error statuses into exceptions
        parsed = BeautifulSoup(reply.content, 'html.parser')
        plain_text = parsed.get_text()
    except requests.exceptions.RequestException as err:
        print(f"Error fetching {full_url}: {err}")
        plain_text = "Error"  # sentinel returned when the request fails
    return plain_text

# Scrape the single link built above and show the returned text
print(f"Scraping {full_url}...")
scraped_content = scrape_content(full_url)
print(f"Scraped Content: {scraped_content}")


Scraping https://r.jina.ai/https://www.ycombinator.com/companies/void...
Scraped Content: Title: Void: The open source Cursor alternative | Y Combinator

URL Source: https://www.ycombinator.com/companies/void

Markdown Content:
![Image 1](https://bookface-images.s3.amazonaws.com/small_logos/0ab88e3a4fc8224ae094d26b68f966408fe4cf3f.png)

### The open source Cursor alternative

Void is an open source AI code editor. It provides developers with the AI features of Cursor, GitHub Copilot, and more, without sending their code to an external API.

![Image 2: Void](https://bookface-images.s3.us-west-2.amazonaws.com/logos/c3f60489646b8949075e4fdc612cbb8365cc1720.png?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=ASIAQC4NIECAFIKN3J4A%2F20241007%2Fus-west-2%2Fs3%2Faws4_request&X-Amz-Date=20241007T122413Z&X-Amz-Expires=2245&X-Amz-Security-Token=IQoJb3JpZ2luX2VjEM%2F%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJHMEUCIFYomW1JngLDk%2BB%2BqOyokIQZ3sLbrNKywrUKDp6NV3ZAAiEAthPerfTfr4AuX5zIs8fK3oOVy%

## 使用大语言模型总结

In [11]:
# OpenAI client pointed at the Base_url endpoint, authenticated with API_key
extract_client = OpenAI(base_url = Base_url, api_key=API_key)

def call_llm(system_prompt: str, text: str):
    """Send one system prompt and one user message to the LLM; return the reply text.

    Uses the module-level ``extract_client`` with the ``glm-4-plus`` model and
    returns the message content of the first choice.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    completion = extract_client.chat.completions.create(
        model="glm-4-plus",
        messages=conversation,
        temperature=0.8,
        max_tokens=4096,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Example of the JSON object the model is asked to return; it is serialized
# into the prompt below as the format reference.
rule = {"一句话总结": "通过xxx实现xxx",
     "Problem": "xxx行业存在xxx问题，导致xxx",
     "Solution": "xxx解决方案"}


# System prompt: sets the model's persona (a senior investor)
system_prompt = "你是一个资深的投资人，你有超强的信息整理能力和深刻的洞察力，你总是能深刻且清晰的挖掘出项目中的痛点和背景，以及解决方案。"
# User prompt: embeds the scraped article, the task description and the output rules.
# NOTE(review): the `"` at the end of the format-reference line inside <rule>
# looks like a typo; it is sent to the model as-is.
query_prompt = f"""
<article>
{scraped_content}
</article>

<task>
本次任务是文档中总结出项目的信息，包括"一句话总结"，"项目提出的背景/问题/行业痛点"，"项目的解决方案"
</task>
<rule>
你的回答应当简洁而深刻，并且使用中文回答
你的输出应当为json格式，并且避免```json 此类格式性内容
直接输出json即可,格式严格参考 {json.dumps(rule, ensure_ascii=False)}"
</rule>
"""

llm_summary = call_llm(system_prompt, query_prompt)

print(llm_summary)

# Models sometimes wrap the JSON in a Markdown code fence (```json ... ```)
# even when told not to, and the reply may be empty; normalise before parsing
# so json.loads only ever sees the JSON payload itself.
cleaned_summary = (llm_summary or "").strip()
if cleaned_summary.startswith("```"):
    cleaned_summary = cleaned_summary.strip("`").strip()
    if cleaned_summary.lower().startswith("json"):
        cleaned_summary = cleaned_summary[len("json"):]

# Raises json.JSONDecodeError if the reply is still not valid JSON
parsed_summary = json.loads(cleaned_summary)
if not isinstance(parsed_summary, dict):
    # A JSON list/string/number would otherwise fail later on .get()
    raise ValueError(f"Expected a JSON object from the LLM, got {type(parsed_summary).__name__}")

# Keep only the three expected fields, defaulting to "" when one is missing
summary_dict = {
    "一句话总结": parsed_summary.get("一句话总结", ""),
    "Problem": parsed_summary.get("Problem", ""),
    "Solution": parsed_summary.get("Solution", "")
}

print(summary_dict)

{"一句话总结": "Void是一个开源的AI代码编辑器，旨在为开发者提供Cursor、GitHub Copilot等AI功能，同时保持代码的隐私性。", "Problem": "现有的AI IDE如Cursor是闭源的，这导致开发者在使用这些工具时需要将代码发送到外部API，引发隐私担忧，同时使用成本高，且权力集中在单个实体手中。", "Solution": "Void通过开源的方式解决了这些问题，开发者可以选择自行托管AI模型，保持数据的私密性，也可以直接连接到Claude、GPT或Gemini等模型，而无需担心数据通过中间层进行通信。"}
{'一句话总结': 'Void是一个开源的AI代码编辑器，旨在为开发者提供Cursor、GitHub Copilot等AI功能，同时保持代码的隐私性。', 'Problem': '现有的AI IDE如Cursor是闭源的，这导致开发者在使用这些工具时需要将代码发送到外部API，引发隐私担忧，同时使用成本高，且权力集中在单个实体手中。', 'Solution': 'Void通过开源的方式解决了这些问题，开发者可以选择自行托管AI模型，保持数据的私密性，也可以直接连接到Claude、GPT或Gemini等模型，而无需担心数据通过中间层进行通信。'}


## 生成海报

### 生成海报html版

In [12]:
def generate_html(founders_info,summary_dict,big_logo_path):
    """Render the poster HTML for one company.

    Args:
        founders_info: list whose first element is the company record, a dict
            providing 'company_name', 'founded', 'team_size' and 'location'.
        summary_dict: dict providing '一句话总结', 'Problem' and 'Solution'.
        big_logo_path: value placed in the ``src`` attribute of the logo image.

    Returns:
        The complete HTML document as a string.

    Raises:
        IndexError: if ``founders_info`` is empty.
        KeyError: if a required key is missing from either mapping.

    All interpolated values are HTML-escaped, because they come from scraped
    pages and LLM output and could otherwise break or inject markup.
    """
    # Local import: the notebook rebinds the global name `html` to a page source string.
    from html import escape

    company = founders_info[0]
    html_template = """
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta
      name="viewport"
      content="width=device-width, initial-scale=1.0" />
    <title>Document</title>
    <link rel="stylesheet" href="style2.css" />
    <style>
    @font-face {{
        font-family: 'HYRunYuan';
        src: url('./汉仪润圆-65W.ttf') format('truetype');
    }}
    body {{
        font-family: 'HYRunYuan', sans-serif;
    }}
    </style>
  </head>
  <body>
    <div class="music_9_57">
      <div class="music_9_58">
        <img
          class="music_9_59"
          src="{big_logo_path}" />
        <div class="music_9_60">
          <span class="music_9_60_0_8">{company_name}</span><span class="music_9_60_8_44"><br />成立年份：{established_year}<br />团队规模：{team_size}<br />地理位置：{location}</span>
        </div>
      </div>
      <div class="music_9_66">
        <div class="music_9_67">
          <span class="music_9_67_0_52">🌟 {summary}</span><span class="music_9_67_52_53">。</span>
        </div>
        <div class="music_9_71"></div>
      </div>
      <div class="music_9_003">
        <div class="music_9_004"><span class="music_9_004_0_9">🤔 Problem</span><span class="music_9_004_9_10">m</span></div>
        <div class="music_9_008">
          <span class="music_9_009_0_99">{problem}</span>
        </div>
      </div>
      <div class="music_9_013">
        <div class="music_9_014"><span class="music_9_014_0_10">🧐 Solution</span><span class="music_9_014_10_11">n</span></div>
        <div class="music_9_018">
          <span class="music_9_019_0_129">{solution}</span>
        </div>
      </div>
      <div class="music_9_023">
        <span class="music_9_024">YC S2024 项目整理 | 特工宇宙</span
        ><img
          class="music_9_028"
          src="特工宇宙.png" />
      </div>
    </div>
  </body>
</html>""".format(
        # str() first so non-string values (e.g. a numeric team size) are accepted
        big_logo_path=escape(str(big_logo_path)),
        company_name=escape(str(company['company_name'])),
        established_year=escape(str(company['founded'])),
        team_size=escape(str(company['team_size'])),
        location=escape(str(company['location'])),
        summary=escape(str(summary_dict['一句话总结'])),
        problem=escape(str(summary_dict["Problem"])),
        solution=escape(str(summary_dict["Solution"]))
    )
    return html_template

big_logo_path = './example_logo/example.png'
poster_html = generate_html(founders_info, summary_dict, big_logo_path)

# Persist the rendered poster as poster_html/poster.html, creating the
# directory on first use.
output_dir = "poster_html"
output_file_path = os.path.join(output_dir, "poster.html")
os.makedirs(output_dir, exist_ok=True)

with open(output_file_path, "w", encoding="utf-8") as file:
    file.write(poster_html)

print(f"HTML has been saved to {output_file_path}")

HTML has been saved to poster_html/poster.html


In [None]:
def generate_html(logo_base64, agent_logo_base64, font_base64,company_name, established_year, team_size, location, summary, problem, solution):
    """Render a self-contained poster HTML document with embedded assets.

    This definition reuses the name of the file-path based ``generate_html``
    defined earlier in the file and therefore replaces it once executed.

    Args:
        logo_base64: base64 payload of the company logo (embedded as PNG data URI).
        agent_logo_base64: base64 payload of the footer logo (PNG data URI).
        font_base64: base64 payload of the TTF font embedded via ``@font-face``.
        company_name, established_year, team_size, location: company facts.
        summary, problem, solution: poster text sections.

    Returns:
        The complete HTML document as a string.

    Fixes relative to the previous version:
      * the ``@font-face`` rule now uses doubled braces, so ``str.format`` no
        longer raises ``KeyError`` on the CSS block;
      * the heading emoji are restored (they had been truncated to 16-bit code
        points and rendered as CJK ideographs), matching the earlier template;
      * text values are HTML-escaped before interpolation.
    """
    # Local import: the notebook rebinds the global name `html` to a page source string.
    from html import escape

    html_template = """
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Document</title>
    <style>
    /* 嵌入字体定义 */
    @font-face {{
        font-family: 'HYRunYuan';
        src: url(data:font/ttf;base64,{font_base64}) format('truetype');
    }}
    
    body {{
        font-family: 'HYRunYuan', sans-serif;
    }}

    /* 这里可以添加您的CSS样式 */
    /* 注意：CSS会在后续补充 */
    </style>
  </head>
  <body>
    <div class="music_9_57">
      <div class="music_9_58">
        <img
          class="music_9_59"
          src="data:image/png;base64,{logo_base64}" />
        <div class="music_9_60">
          <span class="music_9_60_0_8">{company_name}</span>
          <span class="music_9_60_8_44">
            <br />成立年份：{established_year}
            <br />团队规模：{team_size}
            <br />地理位置：{location}
          </span>
        </div>
      </div>
      <div class="music_9_66">
        <div class="music_9_67">
          <span class="music_9_67_0_52">🌟 {summary}</span>
          <span class="music_9_67_52_53">。</span>
        </div>
        <div class="music_9_71"></div>
      </div>
      <div class="music_9_003">
        <div class="music_9_004">
          <span class="music_9_004_0_9">🤔 Problem</span>
          <span class="music_9_004_9_10">m</span>
        </div>
        <div class="music_9_008">
          <span class="music_9_009_0_99">{problem}</span>
        </div>
      </div>
      <div class="music_9_013">
        <div class="music_9_014">
          <span class="music_9_014_0_10">🧐 Solution</span>
          <span class="music_9_014_10_11">n</span>
        </div>
        <div class="music_9_018">
          <span class="music_9_019_0_129">{solution}</span>
        </div>
      </div>
      <div class="music_9_023">
        <span class="music_9_024">YC S2024 项目整理 | 特工宇宙</span>
        <img
          class="music_9_028"
          src="data:image/png;base64,{agent_logo_base64}" />
      </div>
    </div>
  </body>
</html>""".format(
        # Attribute values are escaped so a stray quote cannot end the attribute
        logo_base64=escape(str(logo_base64)),
        agent_logo_base64=escape(str(agent_logo_base64)),
        # Left unescaped: it sits inside <style>, where HTML entities are not decoded
        font_base64=font_base64,
        company_name=escape(str(company_name)),
        established_year=escape(str(established_year)),
        team_size=escape(str(team_size)),
        location=escape(str(location)),
        summary=escape(str(summary)),
        problem=escape(str(problem)),
        solution=escape(str(solution))
    )
    return html_template

# Placeholder values for the poster fields. The handler below takes its values
# from args.input rather than from these module-level names.
logo_base64 = "你的公司logo base64字符串"
agent_logo_base64 = "特工宇宙logo base64字符串"
font_base64 = "字体文件 base64字符串"
established_year = ""
team_size = ""
location = ""
summary = ""
problem = ""
solution = ""
def handler(args: Args[Input])->Output:
    """Build the poster HTML from the fields on ``args.input``.

    Reads the three base64 assets and the seven text fields from
    ``args.input``, passes them to ``generate_html`` and returns the result
    as ``{"html_content": <html string>}``.

    NOTE(review): ``Args``, ``Input`` and ``Output`` are not defined in this
    file; presumably they are supplied by the hosting runtime - confirm.
    """
    fields = args.input
    html_content = generate_html(
        fields.logo_base64,
        fields.agent_logo_base64,
        fields.font_base64,
        fields.company_name,
        fields.established_year,
        fields.team_size,
        fields.location,
        fields.summary,
        fields.problem,
        fields.solution,
    )
    return {"html_content": html_content}



### 生成海报图片

该部分使用 Node.js 完成。

请运行 screenshot.js（需要先安装 puppeteer）。

  gc.collect()


BrowserError: Browser closed unexpectedly:
