In [1]:
import pandas as pd

# Load the medal table (expected columns: Year, NOC, Gold, Silver, Bronze, Total).
# Point the path at the actual data file.
df = pd.read_csv("NEW_MEDAL_TABLE_WITH_PARTICIPANTS.csv")

# 1. Keep every year present in the data except the war years
#    (1916, 1940, 1944 - no Olympics were held), in ascending order.
valid_years = sorted(set(df['Year'].unique()) - {1916, 1940, 1944})

# 2. Sliding windows: three consecutive Games are used to predict the next one.
#    Each entry is ([year_1, year_2, year_3], target_year); zipping the four
#    staggered slices stops as soon as no fourth (target) year is left.
windows = [
    ([first, second, third], following)
    for first, second, third, following in zip(
        valid_years, valid_years[1:], valid_years[2:], valid_years[3:]
    )
]

# 3. Build the feature matrix and the targets.
#    One sample per (NOC, window) pair: nine features (Gold/Silver/Bronze for
#    each of the three window Games, oldest first) plus the NOC's Total at the
#    target Games.
features, targets, nocs, target_years = [], [], [], []
for window, target_year in windows:
    # Slice the table once per window instead of re-scanning the whole frame
    # for every NOC.
    window_rows = df[df['Year'].isin(window)]
    target_rows = df[df['Year'] == target_year]

    # NOCs present in both the window and the target year. Walking them in
    # order of first appearance (instead of iterating a set) keeps the sample
    # order identical from run to run; the iteration order of a set of strings
    # changes with Python's per-process hash randomization.
    countries_in_target = set(target_rows['NOC'].unique())
    valid_nocs = [
        noc for noc in window_rows['NOC'].unique() if noc in countries_in_target
    ]

    for noc in valid_nocs:
        # The NOC's rows inside the window, oldest first (at most three kept).
        window_data = window_rows[window_rows['NOC'] == noc]
        window_data = window_data.sort_values('Year').tail(3)

        # Completeness check: exactly one row for each of the three Games.
        if len(window_data) != 3 or window_data['Year'].nunique() != 3:
            continue

        # Features: the three Games' medal counts flattened into one list.
        medal_rows = window_data[['Gold', 'Silver', 'Bronze']].itertuples(
            index=False, name=None
        )
        feat = [count for medals in medal_rows for count in medals]

        # Target: total medals at the Games being predicted (first matching row).
        target_value = target_rows.loc[target_rows['NOC'] == noc, 'Total'].values[0]

        # Store the sample.
        features.append(feat)
        targets.append(target_value)
        nocs.append(noc)
        target_years.append(target_year)

# 4. Assemble the samples into a DataFrame.
#    Feature columns are named <Medal>_<k>, where k = 1..3 is the position of
#    the Games inside the window (1 = oldest), matching the order in which the
#    features were flattened.
feature_cols = [
    f"{medal}_{slot}"
    for slot in range(1, 4)
    for medal in ['Gold', 'Silver', 'Bronze']
]
final_df = pd.DataFrame(features, columns=feature_cols).assign(
    NOC=nocs,
    Target_Year=target_years,
    Total_Medals_Target=targets,
)

print(final_df.head())

   Gold_1  Silver_1  Bronze_1  Gold_2  Silver_2  Bronze_2  Gold_3  Silver_3  \
0       2         0         0       2         0         3       0         3   
1       2         3         2      15         7         9       1         1   
2       5         4         2      27        39        37       0         1   
3      11         7         2      19        14        15      76        78   
4       2         1         2       0         3         3       1         1   

   Bronze_3  NOC  Target_Year  Total_Medals_Target  
0         1  AUS         1908                    5  
1         0  GBR         1908                  146  
2         0  FRA         1908                   19  
3        77  USA         1908                   47  
4         1  AUT         1908                    1  
