In [33]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist  # 关键导入修复
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [34]:
df = pd.read_csv("../complete_data.csv")

# 划分时间序列数据集
train_df = df[df['Year'] < 2024]  # 训练集：2024年前数据
test_df = df[df['Year'] == 2024]   # 测试集：预测2024年

In [36]:
country_features_train = train_df.groupby('NOC').agg({
    'Total': ['sum', 'mean', 'std'],
    'Participants': 'mean',
    'is_host': 'sum'
}).reset_index()

# 处理列名和类型转换
country_features_train.columns = [
    'NOC', 'Total_sum', 'Total_mean', 'Total_std',
    'Participants_mean', 'is_host_sum'
]
country_features_train = country_features_train.astype({
    'Total_sum': 'float64',
    'Total_mean': 'float64',
    'Total_std': 'float64',
    'Participants_mean': 'float64',
    'is_host_sum': 'int64'  # 明确转为整数
})
country_features_train.fillna(0, inplace=True)

In [37]:
# 标准化聚类特征
scaler_cluster = StandardScaler()
X_cluster_train = scaler_cluster.fit_transform(
    country_features_train.drop('NOC', axis=1)
)

# 自动选择聚类数（示例固定为3，实际可用轮廓系数优化）
Z = linkage(X_cluster_train, method='ward')
clusters = fcluster(Z, t=3, criterion='maxclust')
country_features_train['cluster'] = clusters

# 合并聚类标签到训练集
train_df = train_df.merge(
    country_features_train[['NOC', 'cluster']],
    on='NOC',
    how='left'
)

In [38]:
# 标准化聚类特征
scaler_cluster = StandardScaler()
X_cluster_train = scaler_cluster.fit_transform(
    country_features_train.drop('NOC', axis=1)
)

# 自动选择聚类数（示例固定为3，实际可用轮廓系数优化）
Z = linkage(X_cluster_train, method='ward')
clusters = fcluster(Z, t=3, criterion='maxclust')
country_features_train['cluster'] = clusters

# 合并聚类标签到训练集
train_df = train_df.merge(
    country_features_train[['NOC', 'cluster']],
    on='NOC',
    how='left'
)

In [39]:
# 标准化聚类特征
scaler_cluster = StandardScaler()
X_cluster_train = scaler_cluster.fit_transform(
    country_features_train.drop('NOC', axis=1)
)

# 自动选择聚类数（示例固定为3，实际可用轮廓系数优化）
Z = linkage(X_cluster_train, method='ward')
clusters = fcluster(Z, t=3, criterion='maxclust')
country_features_train['cluster'] = clusters

# 合并聚类标签到训练集
train_df = train_df.merge(
    country_features_train[['NOC', 'cluster']],
    on='NOC',
    how='left'
)

In [40]:
def predict_row(row):
    cluster_id = row['cluster']
    if pd.isna(cluster_id) or cluster_id not in models:
        return 0.0  # 处理未知聚类
    model = models[cluster_id]
    scaler = scalers[cluster_id]
    X_row = scaler.transform([[
        row['Year'],
        row['Participants'],
        row['Events'],
        row['is_host']
    ]])
    return model.predict(X_row)[0]


In [42]:
# 合并聚类标签到测试集
test_df = test_df.merge(
    test_country_features[['NOC', 'cluster']],
    on='NOC',
    how='left'
)

# 检查测试集是否包含cluster列
print("测试集列名:", test_df.columns.tolist())

KeyError: "['cluster'] not in index"