In [4]:
from datetime import datetime, timedelta
from newsapi import NewsApiClient
import os

api_key = os.environ.get("NEWS_API_KEY")
newsapi = NewsApiClient(api_key=api_key)

start_date = datetime(2024, 12, 18)
end_date = datetime(2025, 2, 9)
date_step = timedelta(days=1) 

import pandas as pd
import time

query = "insurance"

news_data = []
page_size = 100 
max_pages = 10  

current_date = start_date
while current_date < end_date:
    next_date = current_date + date_step
    for page in range(1, max_pages + 1):
        try:
            response = newsapi.get_everything(
                qintitle=query,  
                language="en", 
                from_param=current_date.strftime("%Y-%m-%d"), 
                to=next_date.strftime("%Y-%m-%d"),  
                page_size=page_size, 
                page=page 
            )

            # 检查返回内容
            if "articles" in response:
                articles = response["articles"]
                for article in articles:
                    news_data.append({
                        "title": article["title"], 
                        "description": article["description"], 
                        "url": article["url"],  
                        "publishedAt": article["publishedAt"]  
                    })

            print(f"From {current_date} to {next_date}, Page {page} scraped, Total: {len(news_data)} articles.")

            if len(news_data) >= 10000:
                break

            # 延时以防触发 API 限制
            time.sleep(2)

        except Exception as e:
            print(f"Error from {current_date} to {next_date}, Page {page}: {e}")
            break

    current_date = next_date
    if len(news_data) >= 10000: 
        break

df = pd.DataFrame(news_data)
df.to_csv("insurance_news_1_10.csv", index=False, encoding="utf-8")

From 2024-12-18 00:00:00 to 2024-12-19 00:00:00, Page 1 scraped, Total: 100 articles.
Error from 2024-12-18 00:00:00 to 2024-12-19 00:00:00, Page 2: {'status': 'error', 'code': 'maximumResultsReached', 'message': 'You have requested too many results. Developer accounts are limited to a max of 100 results. You are trying to request results 100 to 200. Please upgrade to a paid plan if you need more results.'}
From 2024-12-19 00:00:00 to 2024-12-20 00:00:00, Page 1 scraped, Total: 200 articles.
Error from 2024-12-19 00:00:00 to 2024-12-20 00:00:00, Page 2: {'status': 'error', 'code': 'maximumResultsReached', 'message': 'You have requested too many results. Developer accounts are limited to a max of 100 results. You are trying to request results 100 to 200. Please upgrade to a paid plan if you need more results.'}
From 2024-12-20 00:00:00 to 2024-12-21 00:00:00, Page 1 scraped, Total: 290 articles.
Error from 2024-12-20 00:00:00 to 2024-12-21 00:00:00, Page 2: {'status': 'error', 'code': '