In [2]:
import pandas as pd

In [None]:
# 区别season和year
# 读取CSV文件
df = pd.read_csv('2026_MCM_Problem_C_Data.csv')

# 定义season到年份的映射函数
def season_to_year(season):
    """
    映射规则：
    - Season 1 = 2005
    - Season 2-27 = 2006-2018 (每年2季)
    - Season 28-34 = 2019-2025 (每年1季)
    """
    if season == 1:
        return 2005
    elif 2 <= season <= 27:
        # 2006-2018年，每年2季
        # Season 2-3 -> 2006, Season 4-5 -> 2007, ...
        return 2005 + (season - 2) // 2 + 1
    elif season >= 28:
        # 2019-2025年，每年1季
        return 2019 + (season - 28)
    else:
        return None

# 添加年份列
df['year'] = df['season'].apply(season_to_year)

# 提取需要的列：celebrity_name, ballroom_partner, season, year
df_filtered = df[['celebrity_name', 'ballroom_partner', 'season', 'year']]

# 保存处理后的数据到新文件
df_filtered.to_csv('name.csv', index=False)

print(f"处理完成！共处理了 {len(df_filtered)} 行数据")
print("处理后的数据已保存到: name.csv")
print("\n前10行数据预览:")
print(df_filtered.head(10))
print("\n各season对应的年份:")
print(df_filtered[['season', 'year']].drop_duplicates().sort_values('season'))



处理完成！共处理了 421 行数据
处理后的数据已保存到: name.csv

前10行数据预览:
      celebrity_name     ballroom_partner  season  year
0      John O'Hurley  Charlotte Jorgensen       1  2005
1       Kelly Monaco            Alec Mazo       1  2005
2  Evander Holyfield      Edyta Sliwinska       1  2005
3      Rachel Hunter     Jonathan Roberts       1  2005
4      Joey McIntyre      Ashly DelGrosso       1  2005
5      Trista Sutter     Louis van Amstel       1  2005
6       Tatum O'Neal        Nick Kosovich       2  2006
7        Tia Carrere   Maksim Chmerkoskiy       2  2006
8    George Hamilton      Edyta Sliwinska       2  2006
9         Lisa Rinna     Louis van Amstel       2  2006

各season对应的年份:
     season  year
0         1  2005
6         2  2006
16        3  2006
27        4  2007
38        5  2007
50        6  2008
62        7  2008
75        8  2009
88        9  2009
104      10  2010
115      11  2010
127      12  2011
138      13  2011
150      14  2012
162      15  2012
175      16  2013
187      17  

In [3]:
# 统计所有ballroom_partner的姓名并去重
# 如果df_filtered存在，直接使用；否则从name.csv读取
if 'df_filtered' in globals():
    partners = df_filtered['ballroom_partner'].unique()
else:
    df_name = pd.read_csv('name.csv')
    partners = df_name['ballroom_partner'].unique()

# 创建DataFrame并保存
df_partners = pd.DataFrame({'partner_name': sorted(partners)})
df_partners.to_csv('partner_name.csv', index=False)

print(f"统计完成！共找到 {len(partners)} 个不同的舞伴")
print("去重后的舞伴姓名已保存到: partner_name.csv")
print("\n前10个舞伴姓名:")
print(df_partners.head(10))



统计完成！共找到 60 个不同的舞伴
去重后的舞伴姓名已保存到: partner_name.csv

前10个舞伴姓名:
                            partner_name
0                           Alan Bersten
1  Alan Bersten (Rashad Jennings week 9)
2                              Alec Mazo
3                         Allison Holker
4                            Andrea Hale
5                          Anna Demidova
6                       Anna Trebunskaya
7                      Artem Chigvintsev
8                        Ashly DelGrosso
9                      Brandon Armstrong


In [None]:
#示例爬取
import wikipedia
import requests
import pandas as pd
import time
from urllib.parse import quote
from datetime import datetime, timedelta

def get_yearly_pageviews(page_title, year):
    # 1. 标题规范化
    page_title = page_title.replace(" ", "_")
    page_title = quote(page_title, safe="")

    url = (
        "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
        f"en.wikipedia.org/all-access/all-agents/"
        f"{page_title}/monthly/"
        f"{year}0101/{year}1231"
    )

    headers = {
        "User-Agent": "Academic-Research-Bot/1.0 (contact: your_email@example.com)"
    }

    r = requests.get(url, headers=headers)

    if r.status_code != 200:
        print("Pageviews API failed:")
        print("URL:", url)
        print("Status:", r.status_code)
        print("Response:", r.text)
        return None

    items = r.json().get("items", [])
    return sum(item["views"] for item in items)

def get_yearly_edit_count(page_title, year):
    """
    获取某 Wikipedia 页面在某一年的编辑次数
    """
    url = "https://en.wikipedia.org/w/api.php"

    start = f"{year}-01-01T00:00:00Z"
    end = f"{year}-12-31T23:59:59Z"

    params = {
        "action": "query",
        "format": "json",
        "prop": "revisions",
        "titles": page_title,
        "rvstart": end,      # 注意：先新后旧
        "rvend": start,
        "rvlimit": "max"
    }

    headers = {
        "User-Agent": "Academic-Research-Bot/1.0 (contact: your_email@example.com)"
    }

    total_edits = 0

    while True:
        r = requests.get(url, params=params, headers=headers)
        data = r.json()

        pages = data["query"]["pages"]
        page_data = next(iter(pages.values()))

        if "revisions" in page_data:
            total_edits += len(page_data["revisions"])

        if "continue" in data:
            params.update(data["continue"])
        else:
            break

    return total_edits


wikipedia.set_lang("en")  # 设置为英文维基，如需中文改为"zh"

# 获取前5个celebrity_name
if 'df_filtered' in globals():
    celebrities = df_filtered['celebrity_name'].head(5).tolist()
else:
    df_name = pd.read_csv('name.csv')
    celebrities = df_name['celebrity_name'].head(5).tolist()

print(f"查询前5个名人: {celebrities}\n")

results = []

for name in celebrities:
    search_results = wikipedia.search(name)
    # print(f"搜索结果列表{search_results}")

    page=wikipedia.page(search_results[0], auto_suggest=False)
    print("---匹配结果---")
    print(f"页面标题：{page.title}")
    print(f"页面URL：{page.url}")
    print(f"页面摘要：{page.summary[:100]}")

    year = 2023
    views = get_yearly_pageviews(page.title, year)
    print(f"{page.title} 在 {year} 年的浏览量：{views}")

    edit_counts=get_yearly_edit_count("Donald Trump", year)
    print(f"{page.title} 在 {year} 年的编辑次数：{edit_counts}")


查询前5个名人: ["John O'Hurley", 'Kelly Monaco', 'Evander Holyfield', 'Rachel Hunter', 'Joey McIntyre']

---匹配结果---
页面标题：John O'Hurley
页面URL：https://en.wikipedia.org/wiki/John_O%27Hurley
页面摘要：John George O'Hurley Jr. (born October 9, 1954) is an American actor and game show host. He played J
John O'Hurley 在 2023 年的浏览量：388916
John O'Hurley 在 2023 年的编辑次数：1863
---匹配结果---
页面标题：Kelly Monaco
页面URL：https://en.wikipedia.org/wiki/Kelly_Monaco
页面摘要：Kelly Marie Monaco (born May 23, 1976) is an American actress, model, and reality television persona
Kelly Monaco 在 2023 年的浏览量：254426
Kelly Monaco 在 2023 年的编辑次数：1863
---匹配结果---
页面标题：Evander Holyfield
页面URL：https://en.wikipedia.org/wiki/Evander_Holyfield
页面摘要：Evander Holyfield (born October 19, 1962) is an American former professional boxer who competed betw
Evander Holyfield 在 2023 年的浏览量：1331834
Evander Holyfield 在 2023 年的编辑次数：1863
---匹配结果---
页面标题：Rachel Hunter
页面URL：https://en.wikipedia.org/wiki/Rachel_Hunter
页面摘要：Rachel Hunter (born 8 September 1969) is a

In [3]:
#整理最终处理数据
import wikipedia
import requests
import pandas as pd
import time
from urllib.parse import quote
from datetime import datetime

# 设置维基百科语言为英文
wikipedia.set_lang("en")

def get_yearly_pageviews(page_title, year):
    """
    获取某 Wikipedia 页面在某一年的总浏览量
    """
    try:
        page_title = page_title.replace(" ", "_")
        page_title = quote(page_title, safe="")
        
        url = (
            "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
            f"en.wikipedia.org/all-access/all-agents/"
            f"{page_title}/monthly/"
            f"{year}0101/{year}1231"
        )
        
        headers = {
            "User-Agent": "Academic-Research-Bot/1.0"
        }
        
        r = requests.get(url, headers=headers, timeout=10)
        
        if r.status_code != 200:
            return 0
        
        items = r.json().get("items", [])
        return sum(item["views"] for item in items)
    except Exception as e:
        return 0

def get_yearly_edit_count(page_title, year):
    """
    获取某 Wikipedia 页面在某一年的编辑次数
    """
    try:
        url = "https://en.wikipedia.org/w/api.php"
        
        start = f"{year}-01-01T00:00:00Z"
        end = f"{year}-12-31T23:59:59Z"
        
        params = {
            "action": "query",
            "format": "json",
            "prop": "revisions",
            "titles": page_title,
            "rvstart": end,
            "rvend": start,
            "rvlimit": "max"
        }
        
        headers = {
            "User-Agent": "Academic-Research-Bot/1.0"
        }
        
        total_edits = 0
        
        while True:
            r = requests.get(url, params=params, headers=headers, timeout=10)
            data = r.json()
            
            if "query" not in data or "pages" not in data["query"]:
                break
                
            pages = data["query"]["pages"]
            if not pages:
                break
                
            page_data = next(iter(pages.values()))
            
            if "revisions" in page_data:
                total_edits += len(page_data["revisions"])
            
            if "continue" in data:
                params.update(data["continue"])
            else:
                break
        
        return total_edits
    except Exception as e:
        return 0

def get_cumulative_stats_for_partner(page_title, target_year):
    """
    获取舞伴页面在target_year及之前所有年份的累计浏览量和编辑次数
    """
    total_pageviews = 0
    total_edits = 0
    
    # 从2005年开始统计（因为第一季是2005年）
    start_year = 2005
    current_year = datetime.now().year
    
    # 只统计到target_year
    end_year = min(target_year, current_year)
    
    # 统计从2005年到target_year的累计数据
    for year in range(start_year, end_year + 1):
        year_views = get_yearly_pageviews(page_title, year)
        year_edits = get_yearly_edit_count(page_title, year)
        
        total_pageviews += year_views
        total_edits += year_edits
        
        # 避免请求过快
        time.sleep(0.3)
    
    return total_pageviews, total_edits

# 读取数据
print("="*60)
print("读取数据文件...")
print("="*60)

df_name = pd.read_csv('name.csv')
df_celebrity_stats = pd.read_csv('celebrity_wikipedia_stats.csv')

print(f"name.csv: {len(df_name)} 行")
print(f"celebrity_wikipedia_stats.csv: {len(df_celebrity_stats)} 行")

# 合并数据（基于celebrity_name, season, year）
df_merged = df_name.merge(
    df_celebrity_stats[['celebrity_name', 'season', 'year', 'total_pageviews', 'total_edits']],
    on=['celebrity_name', 'season', 'year'],
    how='left'
)

print(f"\n合并后数据: {len(df_merged)} 行")

# 重命名列以便区分
df_merged = df_merged.rename(columns={
    'total_pageviews': 'celebrity_total_pageviews',
    'total_edits': 'celebrity_total_edits'
})

# 获取所有唯一的舞伴及其对应的年份
print("\n" + "="*60)
print("分析舞伴数据...")
print("="*60)

# 为每个舞伴找到其最早出现的年份（用于查询累计数据）
partner_years = df_merged.groupby('ballroom_partner')['year'].min().to_dict()
unique_partners = df_merged['ballroom_partner'].unique()

print(f"找到 {len(unique_partners)} 个不同的舞伴")

# 查询每个舞伴的维基百科统计数据
print("\n" + "="*60)
print("开始查询舞伴的维基百科数据...")
print("="*60)
print("注意：每个舞伴需要查询多个年份的数据，请耐心等待...\n")

partner_stats = {}
partner_page_titles = {}

for idx, partner_name in enumerate(unique_partners, 1):
    year = partner_years[partner_name]
    
    print(f"[{idx}/{len(unique_partners)}] 查询舞伴: {partner_name} (最早出现年份: {year})")
    
    try:
        # 搜索维基百科页面
        search_results = wikipedia.search(partner_name, results=5)
        
        if not search_results:
            print(f"  ✗ 未找到页面")
            partner_stats[partner_name] = {
                'wikipedia_title': 'Not Found',
                'total_pageviews': 0,
                'total_edits': 0,
                'status': 'Not Found'
            }
            partner_page_titles[partner_name] = None
            continue
        
        # 尝试使用第一个搜索结果
        page = None
        for search_result in search_results:
            try:
                page = wikipedia.page(search_result, auto_suggest=False)
                # 验证页面标题是否匹配（简单检查）
                if partner_name.lower() in page.title.lower() or page.title.lower() in partner_name.lower():
                    break
            except:
                continue
        
        if page is None:
            # 如果都失败，使用第一个结果
            page = wikipedia.page(search_results[0], auto_suggest=False)
        
        print(f"  ✓ 找到页面: {page.title}")
        partner_page_titles[partner_name] = page.title
        
        # 获取累计统计（到对应年份及之前）
        total_pageviews, total_edits = get_cumulative_stats_for_partner(page.title, year)
        
        print(f"  ✓ 累计浏览量（至{year}年）: {total_pageviews:,}")
        print(f"  ✓ 累计编辑次数（至{year}年）: {total_edits:,}\n")
        
        partner_stats[partner_name] = {
            'wikipedia_title': page.title,
            'total_pageviews': total_pageviews,
            'total_edits': total_edits,
            'status': 'Success'
        }
        
        # 避免请求过快
        time.sleep(1)
        
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"  ✗ 存在歧义，尝试第一个选项: {e.options[0] if e.options else 'N/A'}")
        try:
            if e.options:
                page = wikipedia.page(e.options[0], auto_suggest=False)
                total_pageviews, total_edits = get_cumulative_stats_for_partner(page.title, year)
                partner_stats[partner_name] = {
                    'wikipedia_title': page.title,
                    'total_pageviews': total_pageviews,
                    'total_edits': total_edits,
                    'status': 'Disambiguation'
                }
                partner_page_titles[partner_name] = page.title
                print(f"  ✓ 使用歧义选项: {page.title}\n")
            else:
                partner_stats[partner_name] = {
                    'wikipedia_title': 'Disambiguation Error',
                    'total_pageviews': 0,
                    'total_edits': 0,
                    'status': 'Disambiguation Error'
                }
                partner_page_titles[partner_name] = None
        except Exception as e2:
            print(f"  ✗ 处理歧义失败: {str(e2)}\n")
            partner_stats[partner_name] = {
                'wikipedia_title': 'Error',
                'total_pageviews': 0,
                'total_edits': 0,
                'status': f'Error: {str(e2)}'
            }
            partner_page_titles[partner_name] = None
    except Exception as e:
        print(f"  ✗ 查询失败: {str(e)}\n")
        partner_stats[partner_name] = {
            'wikipedia_title': 'Error',
            'total_pageviews': 0,
            'total_edits': 0,
            'status': f'Error: {str(e)}'
        }
        partner_page_titles[partner_name] = None

# 为每行数据添加舞伴的统计数据
# 注意：每个舞伴的统计数据是基于其最早出现的年份，但我们需要根据每行的年份来查询
# 所以我们需要为每个(partner, year)组合重新查询
print("\n" + "="*60)
print("为每行数据查询舞伴的统计数据...")
print("="*60)

# 获取所有唯一的(partner, year)组合
partner_year_combos = df_merged[['ballroom_partner', 'year']].drop_duplicates()
print(f"需要查询 {len(partner_year_combos)} 个(舞伴, 年份)组合\n")

partner_year_stats = {}

for idx, row in partner_year_combos.iterrows():
    partner_name = row['ballroom_partner']
    year = row['year']
    combo_key = (partner_name, year)
    
    print(f"[{idx+1}/{len(partner_year_combos)}] 查询: {partner_name} (年份: {year})")
    
    # 如果已经有页面标题，使用它；否则先搜索
    if partner_name in partner_page_titles and partner_page_titles[partner_name]:
        page_title = partner_page_titles[partner_name]
    else:
        try:
            search_results = wikipedia.search(partner_name, results=1)
            if search_results:
                page = wikipedia.page(search_results[0], auto_suggest=False)
                page_title = page.title
                partner_page_titles[partner_name] = page_title
            else:
                page_title = None
        except:
            page_title = None
    
    if page_title:
        total_pageviews, total_edits = get_cumulative_stats_for_partner(page_title, year)
        partner_year_stats[combo_key] = {
            'total_pageviews': total_pageviews,
            'total_edits': total_edits
        }
        print(f"  ✓ 累计浏览量（至{year}年）: {total_pageviews:,}")
        print(f"  ✓ 累计编辑次数（至{year}年）: {total_edits:,}\n")
    else:
        partner_year_stats[combo_key] = {
            'total_pageviews': 0,
            'total_edits': 0
        }
        print(f"  ✗ 未找到页面\n")
    
    time.sleep(0.5)

# 将舞伴统计数据添加到合并的数据框中
def get_partner_stats(row):
    combo_key = (row['ballroom_partner'], row['year'])
    if combo_key in partner_year_stats:
        return pd.Series({
            'ballroom_total_pageviews': partner_year_stats[combo_key]['total_pageviews'],
            'ballroom_total_edits': partner_year_stats[combo_key]['total_edits']
        })
    else:
        return pd.Series({
            'ballroom_total_pageviews': 0,
            'ballroom_total_edits': 0
        })

df_merged[['ballroom_total_pageviews', 'ballroom_total_edits']] = df_merged.apply(get_partner_stats, axis=1)

# 重新排列列的顺序
final_columns = [
    'season',
    'year',
    'celebrity_name',
    'celebrity_total_pageviews',
    'celebrity_total_edits',
    'ballroom_partner',
    'ballroom_total_pageviews',
    'ballroom_total_edits'
]

df_final = df_merged[final_columns].copy()

# 保存最终结果
output_file = 'combined_season_stats.csv'
df_final.to_csv(output_file, index=False, encoding='utf-8-sig')

print("="*60)
print(f"完成！结果已保存到: {output_file}")
print("="*60)
print(f"\n最终数据: {len(df_final)} 行")
print("\n前10行预览:")
print(df_final.head(10))
print("\n数据统计:")
print(f"  季节数: {df_final['season'].nunique()}")
print(f"  年份范围: {df_final['year'].min()} - {df_final['year'].max()}")
print(f"  明星数: {df_final['celebrity_name'].nunique()}")
print(f"  舞伴数: {df_final['ballroom_partner'].nunique()}")



读取数据文件...
name.csv: 421 行
celebrity_wikipedia_stats.csv: 421 行

合并后数据: 421 行

分析舞伴数据...
找到 60 个不同的舞伴

开始查询舞伴的维基百科数据...
注意：每个舞伴需要查询多个年份的数据，请耐心等待...

[1/60] 查询舞伴: Charlotte Jorgensen (最早出现年份: 2005)
  ✓ 找到页面: Jorgensen
  ✓ 累计浏览量（至2005年）: 0
  ✓ 累计编辑次数（至2005年）: 0

[2/60] 查询舞伴: Alec Mazo (最早出现年份: 2005)
  ✓ 找到页面: Alec Mazo
  ✓ 累计浏览量（至2005年）: 0
  ✓ 累计编辑次数（至2005年）: 0

[3/60] 查询舞伴: Edyta Sliwinska (最早出现年份: 2005)
  ✓ 找到页面: Dancing with the Stars (American TV series)
  ✓ 累计浏览量（至2005年）: 0
  ✓ 累计编辑次数（至2005年）: 12

[4/60] 查询舞伴: Jonathan Roberts (最早出现年份: 2005)




  lis = BeautifulSoup(html).find_all('li')


  ✓ 找到页面: Jonathan Roberts (writer)
  ✓ 累计浏览量（至2005年）: 0
  ✓ 累计编辑次数（至2005年）: 2

[5/60] 查询舞伴: Ashly DelGrosso (最早出现年份: 2005)
  ✓ 找到页面: Ashly DelGrosso
  ✓ 累计浏览量（至2005年）: 0
  ✓ 累计编辑次数（至2005年）: 0

[6/60] 查询舞伴: Louis van Amstel (最早出现年份: 2005)
  ✓ 找到页面: Louis van Amstel
  ✓ 累计浏览量（至2005年）: 0
  ✓ 累计编辑次数（至2005年）: 0

[7/60] 查询舞伴: Nick Kosovich (最早出现年份: 2006)
  ✓ 找到页面: Nick Kosovich
  ✓ 累计浏览量（至2006年）: 0
  ✓ 累计编辑次数（至2006年）: 17

[8/60] 查询舞伴: Maksim Chmerkoskiy (最早出现年份: 2006)
  ✓ 找到页面: Paige Hemmis
  ✓ 累计浏览量（至2006年）: 0
  ✓ 累计编辑次数（至2006年）: 0

[9/60] 查询舞伴: Tony Dovolani (最早出现年份: 2006)
  ✓ 找到页面: Tony Dovolani
  ✓ 累计浏览量（至2006年）: 0
  ✓ 累计编辑次数（至2006年）: 43

[10/60] 查询舞伴: Anna Trebunskaya (最早出现年份: 2006)
  ✓ 找到页面: Anna Trebunskaya
  ✓ 累计浏览量（至2006年）: 0
  ✓ 累计编辑次数（至2006年）: 0

[11/60] 查询舞伴: Cheryl Burke (最早出现年份: 2006)
  ✓ 找到页面: Cheryl Burke
  ✓ 累计浏览量（至2006年）: 0
  ✓ 累计编辑次数（至2006年）: 123

[12/60] 查询舞伴: Andrea Hale (最早出现年份: 2006)
  ✓ 找到页面: Andrea Botez
  ✓ 累计浏览量（至2006年）: 0
  ✓ 累计编辑次数（至2006年）: 0

[13/60] 查询舞伴: Kari

3777

In [2]:
import wikipedia
import requests
import pandas as pd
import time
from urllib.parse import quote
from datetime import datetime

# 设置维基百科语言为英文
wikipedia.set_lang("en")

def get_yearly_pageviews(page_title, year):
    """
    获取某 Wikipedia 页面在某一年的总浏览量
    """
    try:
        page_title = page_title.replace(" ", "_")
        page_title = quote(page_title, safe="")
        
        url = (
            "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
            f"en.wikipedia.org/all-access/all-agents/"
            f"{page_title}/monthly/"
            f"{year}0101/{year}1231"
        )
        
        headers = {
            "User-Agent": "Academic-Research-Bot/1.0"
        }
        
        r = requests.get(url, headers=headers, timeout=10)
        
        if r.status_code != 200:
            return 0
        
        items = r.json().get("items", [])
        return sum(item["views"] for item in items)
    except Exception as e:
        return 0

def get_yearly_edit_count(page_title, year):
    """
    获取某 Wikipedia 页面在某一年的编辑次数
    """
    try:
        url = "https://en.wikipedia.org/w/api.php"
        
        start = f"{year}-01-01T00:00:00Z"
        end = f"{year}-12-31T23:59:59Z"
        
        params = {
            "action": "query",
            "format": "json",
            "prop": "revisions",
            "titles": page_title,
            "rvstart": end,
            "rvend": start,
            "rvlimit": "max"
        }
        
        headers = {
            "User-Agent": "Academic-Research-Bot/1.0"
        }
        
        total_edits = 0
        
        while True:
            r = requests.get(url, params=params, headers=headers, timeout=10)
            data = r.json()
            
            if "query" not in data or "pages" not in data["query"]:
                break
                
            pages = data["query"]["pages"]
            if not pages:
                break
                
            page_data = next(iter(pages.values()))
            
            if "revisions" in page_data:
                total_edits += len(page_data["revisions"])
            
            if "continue" in data:
                params.update(data["continue"])
            else:
                break
        
        return total_edits
    except Exception as e:
        return 0

def get_cumulative_stats(page_title, target_year):
    """
    获取页面在target_year及之前所有年份的累计浏览量和编辑次数
    """
    total_pageviews = 0
    total_edits = 0
    
    # 从2005年开始统计（因为第一季是2005年）
    start_year = 2005
    current_year = datetime.now().year
    
    # 只统计到target_year
    end_year = min(target_year, current_year)
    
    print(f"    统计年份范围: {start_year} - {end_year}")
    
    #改成只统计一年
    for year in range(start_year, end_year + 1):
        year=end_year
        year_views = get_yearly_pageviews(page_title, year)
        year_edits = get_yearly_edit_count(page_title, year)
            
        total_pageviews += year_views
        total_edits += year_edits
            
        # 避免请求过快
        time.sleep(0.3)
    
    return total_pageviews, total_edits

# 读取name.csv文件
if 'df_filtered' in globals():
    df_celebrities = df_filtered.copy()
else:
    df_celebrities = pd.read_csv('name.csv')

print(f"开始查询 {len(df_celebrities)} 个celebrity的维基百科数据...\n")
print("注意：每个celebrity需要查询多个年份的数据，请耐心等待...\n")

results = []

for idx, row in df_celebrities.iterrows():
    celebrity_name = row['celebrity_name']
    year = row['year']
    season = row['season']
    
    print(f"[{idx+1}/{len(df_celebrities)}] 查询: {celebrity_name} (Season {season}, Year {year})")
    
    try:
        # 搜索维基百科页面
        search_results = wikipedia.search(celebrity_name, results=5)
        
        if not search_results:
            print(f"  ✗ 未找到页面")
            results.append({
                'celebrity_name': celebrity_name,
                'season': season,
                'year': year,
                'wikipedia_title': 'Not Found',
                'wikipedia_url': '',
                'total_pageviews': 0,
                'total_edits': 0,
                'status': 'Not Found'
            })
            continue
        
        # 尝试使用第一个搜索结果
        page = None
        for search_result in search_results:
            try:
                page = wikipedia.page(search_result, auto_suggest=False)
                # 验证页面标题是否匹配（简单检查）
                if celebrity_name.lower() in page.title.lower() or page.title.lower() in celebrity_name.lower():
                    break
            except:
                continue
        
        if page is None:
            # 如果都失败，使用第一个结果
            page = wikipedia.page(search_results[0], auto_suggest=False)
        
        print(f"  ✓ 找到页面: {page.title}")
        
        # 获取累计统计（到对应年份及之前）
        total_pageviews, total_edits = get_cumulative_stats(page.title, year)
        
        print(f"  ✓ 累计浏览量（至{year}年）: {total_pageviews:,}")
        print(f"  ✓ 累计编辑次数（至{year}年）: {total_edits:,}\n")
        
        results.append({
            'celebrity_name': celebrity_name,
            'season': season,
            'year': year,
            'wikipedia_title': page.title,
            'wikipedia_url': page.url,
            'total_pageviews': total_pageviews,
            'total_edits': total_edits,
            'status': 'Success'
        })
        
        # 避免请求过快
        time.sleep(1)
        
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"  ✗ 存在歧义，尝试第一个选项: {e.options[0] if e.options else 'N/A'}")
        try:
            if e.options:
                page = wikipedia.page(e.options[0], auto_suggest=False)
                total_pageviews, total_edits = get_cumulative_stats(page.title, year)
                results.append({
                    'celebrity_name': celebrity_name,
                    'season': season,
                    'year': year,
                    'wikipedia_title': page.title,
                    'wikipedia_url': page.url,
                    'total_pageviews': total_pageviews,
                    'total_edits': total_edits,
                    'status': 'Disambiguation'
                })
                print(f"  ✓ 使用歧义选项: {page.title}\n")
            else:
                results.append({
                    'celebrity_name': celebrity_name,
                    'season': season,
                    'year': year,
                    'wikipedia_title': 'Disambiguation Error',
                    'wikipedia_url': '',
                    'total_pageviews': 0,
                    'total_edits': 0,
                    'status': 'Disambiguation Error'
                })
        except Exception as e2:
            print(f"  ✗ 处理歧义失败: {str(e2)}\n")
            results.append({
                'celebrity_name': celebrity_name,
                'season': season,
                'year': year,
                'wikipedia_title': 'Error',
                'wikipedia_url': '',
                'total_pageviews': 0,
                'total_edits': 0,
                'status': f'Error: {str(e2)}'
            })
    except Exception as e:
        print(f"  ✗ 查询失败: {str(e)}\n")
        results.append({
            'celebrity_name': celebrity_name,
            'season': season,
            'year': year,
            'wikipedia_title': 'Error',
            'wikipedia_url': '',
            'total_pageviews': 0,
            'total_edits': 0,
            'status': f'Error: {str(e)}'
        })

# 保存结果到CSV
df_results = pd.DataFrame(results)
df_results.to_csv('celebrity_wikipedia_stats.csv', index=False, encoding='utf-8-sig')

print("=" * 60)
print("查询完成！结果已保存到: celebrity_wikipedia_stats(one_year).csv")
print("=" * 60)
print(f"\n成功查询: {len(df_results[df_results['status'] == 'Success'])} 个")
print(f"失败查询: {len(df_results[df_results['status'] != 'Success'])} 个")
print("\n结果预览:")
print(df_results[['celebrity_name', 'year', 'wikipedia_title', 'total_pageviews', 'total_edits']].head(10))



开始查询 421 个celebrity的维基百科数据...

注意：每个celebrity需要查询多个年份的数据，请耐心等待...

[1/421] 查询: John O'Hurley (Season 1, Year 2005)
  ✓ 找到页面: John O'Hurley
    统计年份范围: 2005
  ✓ 累计浏览量（至2005年）: 0
  ✓ 累计编辑次数（至2005年）: 49

[2/421] 查询: Kelly Monaco (Season 1, Year 2005)
  ✓ 找到页面: Kelly Monaco
    统计年份范围: 2005
  ✓ 累计浏览量（至2005年）: 0
  ✓ 累计编辑次数（至2005年）: 114

[3/421] 查询: Evander Holyfield (Season 1, Year 2005)
  ✓ 找到页面: Evander Holyfield
    统计年份范围: 2005
  ✓ 累计浏览量（至2005年）: 0
  ✓ 累计编辑次数（至2005年）: 47

[4/421] 查询: Rachel Hunter (Season 1, Year 2005)
  ✓ 找到页面: Rachel Hunter
    统计年份范围: 2005
  ✓ 累计浏览量（至2005年）: 0
  ✓ 累计编辑次数（至2005年）: 24

[5/421] 查询: Joey McIntyre (Season 1, Year 2005)
  ✓ 找到页面: Joey McIntyre
    统计年份范围: 2005
  ✓ 累计浏览量（至2005年）: 0
  ✓ 累计编辑次数（至2005年）: 31

[6/421] 查询: Trista Sutter (Season 1, Year 2005)
  ✓ 找到页面: Trista Sutter
    统计年份范围: 2005
  ✓ 累计浏览量（至2005年）: 0
  ✓ 累计编辑次数（至2005年）: 39

[7/421] 查询: Tatum O'Neal (Season 2, Year 2006)
  ✓ 找到页面: Tatum O'Neal
    统计年份范围: 2006
  ✓ 累计浏览量（至2006年）: 0
  ✓ 累计编辑次数（至20



  lis = BeautifulSoup(html).find_all('li')


  ✓ 找到页面: Albert Reed Jr.
    统计年份范围: 2007
  ✓ 累计浏览量（至2007年）: 0
  ✓ 累计编辑次数（至2007年）: 30

[46/421] 查询: Helio Castroneves (Season 5, Year 2007)
  ✓ 找到页面: 2002 Indy Racing League
    统计年份范围: 2007
  ✓ 累计浏览量（至2007年）: 0
  ✓ 累计编辑次数（至2007年）: 16

[47/421] 查询: Mel B (Season 5, Year 2007)
  ✓ 找到页面: Mel B
    统计年份范围: 2007
  ✓ 累计浏览量（至2007年）: 0
  ✓ 累计编辑次数（至2007年）: 520

[48/421] 查询: Wayne Newton (Season 5, Year 2007)
  ✓ 找到页面: Wayne Newton
    统计年份范围: 2007
  ✓ 累计浏览量（至2007年）: 0
  ✓ 累计编辑次数（至2007年）: 224

[49/421] 查询: Marie Osmond (Season 5, Year 2007)
  ✓ 找到页面: Marie Osmond
    统计年份范围: 2007
  ✓ 累计浏览量（至2007年）: 0
  ✓ 累计编辑次数（至2007年）: 431

[50/421] 查询: Mark Cuban (Season 5, Year 2007)
  ✓ 找到页面: Mark Cuban
    统计年份范围: 2007
  ✓ 累计浏览量（至2007年）: 0
  ✓ 累计编辑次数（至2007年）: 734

[51/421] 查询: Cristian de la Fuente (Season 6, Year 2008)
  ✓ 找到页面: La Jefa (TV series)
    统计年份范围: 2008
  ✓ 累计浏览量（至2008年）: 0
  ✓ 累计编辑次数（至2008年）: 0

[52/421] 查询: Steve Guttenberg (Season 6, Year 2008)
  ✓ 找到页面: Steve Guttenberg
    统计年份范围: 2008
 