In [11]:
import requests
import pandas as pd
from lxml import etree
import time
from tqdm import tqdm

In [9]:
# Scrape configuration: Beijing weather data for 2019-2024.
start_year, end_year = 2019, 2024
city = "beijing"

In [7]:
def _cell_text(cell):
    """Return all text inside a table cell, including text nested in child tags."""
    return ''.join(cell.xpath('.//text()')).strip()


def _split_pair(text):
    """Split a 'first/second' string into two stripped parts ('' when a part is missing)."""
    parts = [part.strip() for part in text.split('/')]
    parts += ['', '']  # pad so both positions always exist
    return parts[0], parts[1]


def get_weather_data(city, start_year, end_year):
    """Scrape monthly weather-history pages and collect one record per table row.

    For every month of every year in ``[start_year, end_year]`` the page
    ``<base_url><city>/month/<YYYYMM>.html`` is requested and the rows of the
    ``<table class="b">`` element are parsed. Failed or non-200 requests are
    reported with ``print`` and skipped; they never abort the run.

    Args:
        city: City path segment inserted into the URL (e.g. ``'beijing'``).
        start_year: First year to fetch (inclusive).
        end_year: Last year to fetch (inclusive).

    Returns:
        pandas.DataFrame with the columns 日期, 白天天气, 夜间天气, 最高气温,
        最低气温, 白天风力, 夜间风力. Values are kept as the scraped strings
        (no numeric or date conversion). The columns are present even when
        nothing could be scraped.

    Side effects:
        Writes the result to ``<city>_weather_data.xlsx`` in the current
        working directory and prints the output path.
    """
    base_url = 'http://www.tianqihoubao.com/lishi/'
    headers = {
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
    }
    columns = ['日期', '白天天气', '夜间天气', '最高气温', '最低气温', '白天风力', '夜间风力']

    all_data = []

    # Walk every (year, month) pair in the requested range.
    for year in tqdm(range(start_year, end_year + 1), desc='爬取进度'):
        for month in range(1, 13):
            url = f"{base_url}{city}/month/{year}{month:02d}.html"

            try:
                response = requests.get(url, headers=headers, timeout=10)
                if response.status_code != 200:
                    # Report skipped months instead of dropping them silently.
                    print(f"获取数据失败: {url}, 错误: HTTP {response.status_code}")
                    continue

                # Without a charset in the response headers, requests falls back to
                # ISO-8859-1, which garbles Chinese text; use the detected encoding.
                if response.encoding is None or response.encoding.lower() == 'iso-8859-1':
                    response.encoding = response.apparent_encoding

                html = etree.HTML(response.text)
                if html is None:
                    print(f"获取数据失败: {url}, 错误: 页面内容为空")
                    continue

                # '//tr' (descendant axis) also matches rows wrapped in <tbody>;
                # the first row is the table header.
                rows = html.xpath('//table[@class="b"]//tr')[1:]

                for row in rows:
                    cells = row.xpath('./td')
                    if len(cells) < 4:
                        # Skip malformed rows rather than losing the rest of the month.
                        continue

                    date = _cell_text(cells[0])
                    # Weather (day/night), temperature (high/low), wind (day/night).
                    day_weather, night_weather = _split_pair(_cell_text(cells[1]))
                    high_temp, low_temp = _split_pair(_cell_text(cells[2]))
                    day_wind, night_wind = _split_pair(_cell_text(cells[3]))

                    all_data.append({
                        '日期': date,
                        '白天天气': day_weather,
                        '夜间天气': night_weather,
                        '最高气温': high_temp,
                        '最低气温': low_temp,
                        '白天风力': day_wind,
                        '夜间风力': night_wind
                    })

            except Exception as e:
                # Best-effort scraping: report the failure and move on to the next month.
                print(f"获取数据失败: {url}, 错误: {str(e)}")

            finally:
                # Throttle after every request, including failed ones.
                time.sleep(1)

    # Explicit columns keep the schema stable even when no rows were scraped.
    df = pd.DataFrame(all_data, columns=columns)

    # Persist the result next to the notebook.
    output_file = f"{city}_weather_data.xlsx"
    df.to_excel(output_file, index=False)
    print(f"数据已保存到: {output_file}")

    return df