In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Lift pandas' display truncation so printed frames show every row and column.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Load the table, then order it by country (NOC) and by year.
data = pd.read_csv("Final_table.csv").sort_values(by=['NOC', 'Year'])

# Attach the medal counts of the preceding Games as lag features.
def get_previous_medals(group, n_lags=3, medal_cols=('Gold', 'Silver', 'Bronze')):
    """Return ``group`` ordered by ``Year`` with lagged medal columns appended.

    For every lag ``k`` in ``1..n_lags`` and every column ``c`` in
    ``medal_cols`` a column named ``f"{c}_{k}"`` is added, holding the value of
    ``c`` found ``k`` rows earlier in the year-ordered frame. The first ``k``
    rows of each such column are NaN because there is nothing to look back to.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process; must contain ``Year`` and every name in
        ``medal_cols``. It is not modified.
    n_lags : int, default 3
        How many previous rows to look back. The default reproduces the
        original ``*_1`` .. ``*_3`` columns.
    medal_cols : sequence of str, default ('Gold', 'Silver', 'Bronze')
        Columns to lag.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``Year`` with the lag columns added in the order
        lag 1 (all medal columns), lag 2, ... lag ``n_lags``.

    Notes
    -----
    The lag is positional: "k rows earlier", not "k Games earlier".
    NOTE(review): if a group has gaps in its ``Year`` sequence, the lagged
    values come from the previous *available* row - confirm that this is the
    intended meaning for the input table.
    """
    ordered = group.sort_values(by='Year')
    # Outer loop over lags, inner loop over medals keeps the column order
    # Gold_1, Silver_1, Bronze_1, Gold_2, ... expected by the feature list.
    lagged = {
        f'{medal}_{lag}': ordered[medal].shift(lag)
        for lag in range(1, n_lags + 1)
        for medal in medal_cols
    }
    return ordered.assign(**lagged)

# Build the lag features one country (NOC) at a time. As the printed
# predictions show, the result is indexed by NOC plus the original row label.
# NOTE(review): the captured notebook output echoes this line as a pandas
# warning; check the installed pandas version's guidance for groupby.apply
# before relying on the exact shape of the result.
data = data.groupby('NOC').apply(get_previous_medals)

# Model inputs and target are named once so that the missing-value filter and
# the design matrix always refer to the same columns.
feature_cols = [
    'Gold_1', 'Silver_1', 'Bronze_1',
    'Gold_2', 'Silver_2', 'Bronze_2',
    'Gold_3', 'Silver_3', 'Bronze_3',
    'host_flag', 'Participants',
]
target_col = 'Gold'

# Drop only the rows that lack a model input or the target (e.g. rows whose
# lag columns are NaN because fewer than three earlier rows exist). A bare
# dropna() would also throw away rows that merely have a NaN in some column
# the model never uses.
data = data.dropna(subset=feature_cols + [target_col])

# Features and label.
X = data[feature_cols]
y = data[target_col]

# Random 80/20 train/test split with a fixed seed.
# NOTE(review): the rows are per-country, per-year observations with lagged
# features, so a random split lets later years inform predictions for earlier
# ones - consider a split by Year if the goal is forecasting.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Cross-validated Lasso: 100 candidate alphas, log-spaced from 1e-4 to 1e1,
# evaluated with 5-fold cross-validation.
# NOTE(review): the features are not standardised before the L1 penalty is
# applied; confirm that this is intended.
alphas = np.logspace(-4, 1, 100)
lasso_cv = LassoCV(alphas=alphas, cv=5, random_state=42)

# Fit on the training portion.
lasso_cv.fit(X_train, y_train)

# Report the alpha selected by cross-validation.
print(f"最优的 alpha 值: {lasso_cv.alpha_}")

# Predict the held-out rows with the selected model.
y_pred = lasso_cv.predict(X_test)

# Evaluate on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"均方误差 (MSE): {mse:.2f}")
print(f"R² 分数: {r2:.2f}")

# Fitted coefficient for each feature.
coefficients = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lasso_cv.coef_
})
print("模型系数:")
print(coefficients)

# Actual versus predicted gold counts for the held-out rows.
predictions = pd.DataFrame({
    'Actual_Gold': y_test,
    'Predicted_Gold': y_pred
})
print("预测结果:")
print(predictions)


最优的 alpha 值: 0.026560877829466867
均方误差 (MSE): 19.87
R² 分数: 0.78
模型系数:
         Feature  Coefficient
0         Gold_1     0.536846
1       Silver_1    -0.033021
2       Bronze_1    -0.103179
3         Gold_2     0.232177
4       Silver_2    -0.076160
5       Bronze_2     0.136612
6         Gold_3     0.024676
7       Silver_3     0.184173
8       Bronze_3     0.009908
9      host_flag     9.766482
10  Participants     0.001362
预测结果:
          Actual_Gold  Predicted_Gold
NOC                                  
GEO 1167            2        1.645002
IRN 1055            7        2.089447
FRA 579             5        4.638210
BGR 535             8        4.573719
GBR 1131           27       21.439548
    893             9        6.401452
CAN 387             1        1.484244
CZE 981             3        2.061226
MEX 481             0        1.701905
ITA 83             13        2.378254
NOR 104             5        8.003640
AUS 291            13       13.559582
FRA 966             7       10.3

  data = data.groupby('NOC').apply(get_previous_medals)  # 按国家分组处理


金银铜 总 参赛人数 host 2012 2016 2020 2024host  (gold/silver/bronze, total, number of participants, host — 2012, 2016, 2020; 2024 host)

金银铜 总 人  (gold/silver/bronze, total, participants)



In [5]:
import json
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Read the JSON dataset from disk, decoding the file as UTF-8.
with open("data.json", mode="r", encoding="utf-8") as fh:
    data = json.load(fh)

# Flatten the per-country records into a feature array and a label array.
def extract_features_and_labels(data):
    """Collect ``Feats`` and the first ``label`` value of every entry.

    ``data`` maps a key to a list of entries; each entry provides a ``"Feats"``
    value and a ``"label"`` sequence. Entries are visited in mapping order,
    then in list order.

    Returns a ``(features, labels)`` pair of NumPy arrays, where ``labels``
    holds only element 0 of each entry's ``"label"``.
    """
    # One pass over every entry; "Feats" is read before "label", as before.
    records = [
        (entry["Feats"], entry["label"][0])
        for entries in data.values()
        for entry in entries
    ]
    feature_rows = [feats for feats, _ in records]
    first_labels = [first for _, first in records]
    return np.array(feature_rows), np.array(first_labels)

# Turn the loaded JSON mapping into a feature matrix and a label vector.
X, y = extract_features_and_labels(data)

# Hold out 20% of the samples for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Lasso regression with a fixed regularisation strength (alpha is tunable),
# fitted on the training portion. `fit` returns the estimator itself.
lasso = Lasso(alpha=0.1).fit(X_train, y_train)

# Predictions for the held-out samples.
y_pred = lasso.predict(X_test)

# Held-out mean squared error.
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Fitted coefficients, followed by the raw predictions.
print("Lasso Coefficients:", lasso.coef_)

print(y_pred)


Mean Squared Error: 17.7892457091735
Lasso Coefficients: [ 0.0684999   0.06287674 -0.05721343  0.01567484 -0.00098386 -0.
  0.28294612  0.1407167  -0.01981939  0.00203828 -0.00069868 -0.02993828
  0.1735463   0.11865959 -0.          0.09368548  0.00033583 -0.        ]
[14.58801627  1.66045098  1.77285078  2.82904664  6.24217056  0.94905879
  6.33149856  1.2450672   1.41080724  5.63489384 34.57733695 18.1524473
  4.92711592  3.26673143  3.5552251   4.02309881  2.05991326  0.2794022
  3.82009333  4.28981272  7.52021527  2.1007313   0.49541101  2.35166345
  1.93721746  4.92217116  8.8878608   3.39518918  1.37001874  0.73883849
  7.46031621 38.70064311  7.91875428  9.91810959  1.89085206  5.17072777
  5.17340736  6.86989366 15.45252709  7.31529355  5.08436779  0.877897
  1.15082767  3.42620928 11.60134038  3.7525035   1.1221389   2.62319563
  4.05043044  3.39382442  3.82698305  1.81387089  1.24738387  2.3003925
  0.47408581  2.60885471  1.66702182  0.98899651  2.19504587  3.58421655
  1.74

In [2]:
import pandas as pd
# Remove the rows whose NOC is 'URS' or 'RUS' and write the result back to the
# same CSV file (the input file is overwritten).
df = pd.read_csv("complete_data.csv")
df_filtered = df[~df['NOC'].isin(['URS', 'RUS'])]
# index=False: by default to_csv also writes the row index as an extra unnamed
# first column, so every read/filter/write round trip on this file would add
# another junk column to it.
df_filtered.to_csv("complete_data.csv", index=False)