In [43]:
import json
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import TimeSeriesSplit

# 1. 数据加载与预处理
with open('new_data.json') as f:
    raw_data = json.load(f)

# 提取国家级特征用于聚类
country_stats = []
for noc, entries in raw_data.items():
    stats = {
        'NOC': noc,
        'avg_gold': np.mean([e['Feats'][3] for e in entries]) if entries else 0,
        'max_athletes' : max(e['Feats'][4] for e in entries) if entries else 0 , # 历史最大参赛人数
        'total_host': sum(e['Feats'][-1] for e in entries),      # 累计主办次数
        'trend': np.polyfit(                                      # 奖牌数趋势
            [e['TimeOrder'] for e in entries],
            [e['Label'] for e in entries], 1
        )[0]
    }
    country_stats.append(stats)

df_country = pd.DataFrame(country_stats).set_index('NOC')


TypeError: expected non-empty vector for x

In [21]:

# 2. 国家聚类
scaler = StandardScaler()
X_cluster = scaler.fit_transform(df_country)
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(X_cluster)
df_country['Cluster'] = clusters  # 保存聚类标签

# 3. 构建训练数据集
all_samples = []
for noc, entries in raw_data.items():
    cluster_label = df_country.loc[noc, 'Cluster']
    for entry in entries:
        features = entry['Feats'].copy()
        features.append(cluster_label)  # 添加聚类标签
        all_samples.append({
            'features': features,
            'label': entry['Label'],
            'year': entry['Year']
        })

df = pd.DataFrame(all_samples)
X = pd.DataFrame(df['features'].tolist())
y = df['label']

# 4. 特征工程
# 原始特征名称（示例）
original_feat_names = [
    'gold_t3', 'silver_t3', 'bronze_t3', 'total_t3', 'athletes_t3', 'host_t3',
    'gold_t2', 'silver_t2', 'bronze_t2', 'total_t2', 'athletes_t2', 'host_t2',
    'gold_t1', 'silver_t1', 'bronze_t1', 'total_t1', 'athletes_t1', 'host_t1',
    'is_host_current',
    'cluster'  # 新增聚类标签
]

# 生成聚类交互特征
encoder = OneHotEncoder(sparse_output=False)
cluster_encoded = encoder.fit_transform(X[[19]])  # 假设cluster在第19列

# 交互特征：运动员数量 * 聚类类别
athletes_feats = X[[4, 10, 16]]  # t-3, t-2, t-1届的运动员数
interactions = []
for i in range(athletes_feats.shape[1]):
    for j in range(cluster_encoded.shape[1]):
        interactions.append(athletes_feats.iloc[:, i] * cluster_encoded[:, j])

# 合并所有特征
X_processed = pd.concat([
    X.iloc[:, :19],  # 原始特征
    pd.DataFrame(cluster_encoded, columns=[f'Cluster_{i}' for i in range(3)]),
    pd.DataFrame(np.array(interactions).T,
                 columns=[f'Athletes_t{3-i}_c{j}' for i in range(3) for j in range(3)])
], axis=1)

# 5. 时空交叉验证
tscv = TimeSeriesSplit(n_splits=5)
model = LassoCV(cv=tscv, alphas=np.logspace(-3, 1, 50),
                max_iter=10000, random_state=42)

# 6. 训练与评估
scaler_x = StandardScaler()
X_scaled = scaler_x.fit_transform(X_processed)
model.fit(X_scaled, y)

# 输出重要特征
important_feats = X_processed.columns[np.abs(model.coef_) > 0]
print("重要特征:", important_feats.tolist())

NameError: name 'pm' is not defined