In [2]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import csv
import os
from datetime import datetime, timedelta
# 获取网页内容
def fetch_page(url):
    response = requests.get(url)
    if response.status_code == 200:
        return BeautifulSoup(response.content, 'html.parser')
    else:
        print(f"Failed to retrieve {url}")
        return None

# 获取历史排名页面的选手数据和排名
def get_players_and_rankings(base_url, date):
    url = f"{base_url}/history/{date}.html"
    soup = fetch_page(url)
    if soup is None:
        return []

    # 找到排名表格
    ranking_table = soup.find('table')
    players = ranking_table.find_all('tr')[1:33]  # Skip the header row and get top 50 players

    # 提取选手名字、个人页面链接和排名
    player_data = []
    for player in players:
        rank_tag = player.find_all('td')[0].text.strip()  # 获取排名
        name_tag = player.find('a')
        if name_tag:
            name = name_tag.text.strip()
            profile_url = f"{base_url}/{name_tag['href']}"
            player_data.append((rank_tag, name, profile_url))
    
    return player_data

# 抓取选手的比赛记录（限制时间段）
def scrape_player_games(player_name, profile_url, start_date, end_date):
    games = []
    soup = fetch_page(profile_url)
    if soup is None:
        return games
    
    # 找到包含比赛数据的表格
    tables = soup.find_all('table')
    game_table = None
    for table in tables:
        if "日期" in table.get_text():
            game_table = table
            break
    
    if game_table is None:
        print(f"Could not find the game table for player {player_name}")
        return games

    # 提取比赛行
    game_rows = game_table.find_all('tr')[1:]  # Skip the header row
    for row in game_rows:
        cols = row.find_all('td')
        if len(cols) >= 5:
            date_str = cols[0].text.strip()
            try:
                date = datetime.strptime(date_str, "%Y-%m-%d")
            except ValueError:
                continue  # Skip rows with invalid dates
            
            # 根据时间范围进行筛选
            if start_date <= date <= end_date:
                rating = cols[1].text.strip()
                bw_game = cols[2].text.strip()
                result = cols[3].text.strip()
                opponent = cols[4].text.strip()
                
                games.append({
                    'Player': player_name,
                    'Date': date_str,
                    'Rating': rating,
                    'Black and White': bw_game,
                    'Result': result,
                    'Opponent': opponent
                })
    
    return games

# 将比赛记录写入文件
def save_to_file(games, file_name):
    with open(file_name, 'a', encoding='utf-8') as file:
        for game in games:
            line = (f"Player: {game['Player']}, Date: {game['Date']}, Rating: {game['Rating']}, "
                    f"Black and White: {game['Black and White']}, Result: {game['Result']}, Opponent: {game['Opponent']}\n")
            file.write(line)

# 保存选手排名和名字到CSV文件
def save_rankings_to_csv(player_data, csv_file):
    with open(csv_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Rank', 'Player', 'Profile URL'])  # CSV文件头
        for rank, name, url in player_data:
            writer.writerow([rank, name, url])

def create_output_paths(date):
    start_date = datetime.strptime(date, "%Y-%m-%d")
    end_date = start_date + timedelta(days=365)  # 加1年
    end_date_str = end_date.strftime("%Y-%m-%d")
    
    folder_path = f"output/{date}_to_{end_date_str}"
    output_file = f"{folder_path}/player_games.txt"
    rank_file = f"{folder_path}/player_rankings.txt"
    
    # 如果文件夹不存在，创建文件夹
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    
    return output_file, rank_file

def main():                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     
    for i in range(1980,2019):
        time=i
        base_url = "https://www.goratings.org/zh"
        date = f"{time}-01-01"  # 动态传入的日期
        
        # 动态生成输出文件路径并创建必要的文件夹
        output_file, csv_file = create_output_paths(date)
        
        # 清空之前的文件内容
        open(output_file, 'w').close()
        
        # 设置时间范围
        start_date = datetime(time, 1, 1)
        end_date = datetime(time+1, 1, 1)
        
        # 获取排名页面的选手列表和排名
        player_data = get_players_and_rankings(base_url, date)
        
        # 保存选手排名和名字到CSV文件
        save_rankings_to_csv(player_data, csv_file)
        
        # 抓取每个选手的比赛记录并存入文件
        for rank, name, url in player_data:
            print(f"Scraping games for {name} (Rank: {rank}) from {url}")
            games = scrape_player_games(name, url, start_date, end_date)
            save_to_file(games, output_file)

# # 主函数，抓取给定日期的选手和比赛数据
# def main():
    
#     base_url = "https://www.goratings.org/zh"
#     date = "2020-01-01"  # 选择具体日期的页面
#     output_file = f"output/2020-1-1_to_2021-1-1/player_games.txt"  # 输出比赛记录的文件名
#     csv_file = "output/2020-1-1_to_2021-1-1/player_rankings.txt"  # 输出排名的文件名
    
#     # 清空之前的文件内容
#     open(output_file, 'w').close()
    
#     # 设置时间范围
#     start_date = datetime(2020, 1, 1)
#     end_date = datetime(2021, 1, 1)
    
#     # 获取排名页面的选手列表和排名
#     player_data = get_players_and_rankings(base_url, date)
    
#     # 保存选手排名和名字到CSV文件
#     save_rankings_to_csv(player_data, csv_file)
    
#     # 抓取每个选手的比赛记录并存入文件
#     for rank, name, url in player_data:
#         print(f"Scraping games for {name} (Rank: {rank}) from {url}")
#         games = scrape_player_games(name, url, start_date, end_date)
#         save_to_file(games, output_file)

if __name__ == "__main__":
    main()


Scraping games for 加藤正夫 (Rank: 1) from https://www.goratings.org/zh/../../zh/players/193.html
Scraping games for 赵治勋 (Rank: 2) from https://www.goratings.org/zh/../../zh/players/98.html
Scraping games for 大竹英雄 (Rank: 3) from https://www.goratings.org/zh/../../zh/players/158.html
Scraping games for 武宫正树 (Rank: 4) from https://www.goratings.org/zh/../../zh/players/82.html
Scraping games for 桥本昌二 (Rank: 5) from https://www.goratings.org/zh/../../zh/players/216.html
Scraping games for 林海峰 (Rank: 6) from https://www.goratings.org/zh/../../zh/players/36.html
Scraping games for 曹薰铉 (Rank: 7) from https://www.goratings.org/zh/../../zh/players/149.html
Scraping games for 小林光一 (Rank: 8) from https://www.goratings.org/zh/../../zh/players/50.html
Scraping games for 藤泽秀行 (Rank: 9) from https://www.goratings.org/zh/../../zh/players/293.html
Scraping games for 坂田荣男 (Rank: 10) from https://www.goratings.org/zh/../../zh/players/592.html
Scraping games for 淡路修三 (Rank: 11) from https://www.goratings.org/