In [67]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error

# from Lasso.New_total import results


# 数据加载和预处理
def load_and_preprocess(filepath):
    """Read the medal table CSV and add per-Games gold totals and gold share.

    Args:
        filepath: Path of a CSV file that has at least 'Year' and 'Gold' columns.

    Returns:
        The loaded DataFrame with two extra columns:
        'Total_Gold' (sum of 'Gold' over all rows sharing the same 'Year') and
        'Gold_Pct' ('Gold' divided by 'Total_Gold', with NaN replaced by 0).
    """
    medals = pd.read_csv(filepath)

    # Broadcast each Games' gold total back onto every row of that Games.
    yearly_gold = medals.groupby('Year')['Gold'].transform('sum')
    medals['Total_Gold'] = yearly_gold

    # A Games whose total is 0 yields 0/0 = NaN; report that share as 0.
    share = medals['Gold'] / medals['Total_Gold']
    medals['Gold_Pct'] = share.fillna(0)

    return medals

# 特征工程
def create_features(df):
    """Build one lag-feature row per distinct (NOC, Year) pair in ``df``.

    For every pair the row contains:
      * 'NOC', 'Year'
      * 'is_host_current' - the pair's own 'is_host' value
      * '<col>_prev1', '<col>_prev2', '<col>_prev3' for each of Gold, Silver,
        Bronze, Total, Participants, Events and is_host, taken from the same
        NOC at Year-12, Year-8 and Year-4 respectively (0 when that NOC has
        no row for that year)
      * 'Gold_Pct' - the pair's own target value

    When several rows share the same (NOC, Year), the first one in ``df``
    order supplies the values.

    Args:
        df: DataFrame with 'NOC', 'Year', 'Gold_Pct' and the seven lag
            source columns listed above.

    Returns:
        A new DataFrame with one row per distinct (NOC, Year) pair.
    """
    lag_cols = ['Gold', 'Silver', 'Bronze', 'Total', 'Participants', 'Events', 'is_host']

    # Index the first row of every (NOC, Year) pair once, so each lag lookup
    # is a dict access instead of a boolean-mask scan over the whole frame.
    first_rows = df.drop_duplicates(subset=['NOC', 'Year'])
    keys = list(zip(first_rows['NOC'].values, first_rows['Year'].values))
    lookup = {
        col: dict(zip(keys, first_rows[col].values))
        for col in lag_cols + ['Gold_Pct']
    }

    features = []
    for (noc, year) in df[['NOC', 'Year']].drop_duplicates().values:
        row = {'NOC': noc, 'Year': year}

        # Host flag of the Games being predicted.
        row['is_host_current'] = lookup['is_host'].get((noc, year), 0)

        # prev1 is the oldest lag (12 years back), prev3 the most recent (4 years back).
        prev_years = [year - 12, year - 8, year - 4]
        for i, pyear in enumerate(prev_years, 1):
            for col in lag_cols:
                row[f'{col}_prev{i}'] = lookup[col].get((noc, pyear), 0)

        # Target variable.
        row['Gold_Pct'] = lookup['Gold_Pct'].get((noc, year), 0)

        features.append(row)

    return pd.DataFrame(features)

# 主程序
def main():
    """Fit a LassoCV model on pre-2024 rows and predict the 2024 gold share.

    Steps:
      1. Load '../complete_data.csv' and derive lag features.
      2. Train a StandardScaler + LassoCV pipeline on rows with Year < 2024.
      3. Predict 'Gold_Pct' for rows with Year == 2024, print the top rows
         and write all predictions to '2024_predicted_Gold.csv'.
      4. Print the mean absolute error over the 2012/2016/2020 rows.
    """
    # Load data
    df = load_and_preprocess('../complete_data.csv')

    # Feature engineering
    feature_df = create_features(df)

    # Train/test split. test_df is copied because a prediction column is
    # added to it below; assigning into a plain slice of feature_df would
    # trigger pandas' SettingWithCopyWarning.
    train_df = feature_df[feature_df['Year'] < 2024]
    test_df = feature_df[feature_df['Year'] == 2024].copy()

    # Separate features from identifiers and the target
    X_cols = [col for col in feature_df.columns if col not in ['NOC', 'Year', 'Gold_Pct']]
    X_train = train_df[X_cols].fillna(0)
    y_train = train_df['Gold_Pct'].fillna(0)
    X_test = test_df[X_cols].fillna(0)

    # Build the model
    pipeline = make_pipeline(
        StandardScaler(),
        LassoCV(cv=5, max_iter=10000, random_state=42)
    )

    # Train
    pipeline.fit(X_train, y_train)

    # Predict. A linear model can return negative shares; they are written
    # out as-is here.
    test_df['Predicted_Gold_Pct'] = pipeline.predict(X_test)

    # Report and persist the predictions
    results = test_df[['NOC', 'Year', 'Predicted_Gold_Pct']].sort_values('Predicted_Gold_Pct', ascending=False)
    print("2024年金牌占比预测结果：")
    print(results.head(10))  # the header above was previously printed with nothing after it
    results.to_csv('2024_predicted_Gold.csv', index=False)

    # Model evaluation on the three most recent Games.
    # NOTE(review): 2012/2016/2020 all satisfy Year < 2024, so these rows were
    # part of the training data and this MAE is an in-sample error, not a
    # held-out validation score.
    valid_years = [2012, 2016, 2020]
    valid_df = feature_df[feature_df['Year'].isin(valid_years)]
    X_valid = valid_df[X_cols].fillna(0)
    y_valid = valid_df['Gold_Pct'].fillna(0)
    y_pred = pipeline.predict(X_valid)
    mae = mean_absolute_error(y_valid, y_pred)
    print(f"\n验证集MAE: {mae:.4f}")

if __name__ == "__main__":
    main()

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

2024年奥运会奖牌预测结果：
    NOC  Predicted_Pct  Predicted_Total
59  USA     105.057369           115248
9   CHN      91.004113            99832
19  FRA      59.676528            65465
2   AUS      51.964551            57005
32  JPN      39.695571            43546
20  GBR      34.217391            37536
22  GER      32.205084            35329
30  ITA      31.953160            35053
35  KOR      29.995709            32905
41  NED      24.666006            27059


In [53]:
# tmp: reference table; later cells read its 'NOC' and 'Gold' columns.
tmp = pd.read_csv('final_output.csv')
# results: the prediction file written by main(). The last cell of this
# notebook overwrites the same path, so re-running from here after that cell
# loads the already post-processed values rather than the raw predictions.
results = pd.read_csv('2024_predicted_Gold.csv')

In [54]:
# Display the predictions as loaded from '2024_predicted_Gold.csv'.
results

Unnamed: 0,NOC,Year,Predicted_Gold_Pct
0,FRA,2024,0.179058
1,USA,2024,0.163247
2,CHN,2024,0.133218
3,GBR,2024,0.097783
4,JPN,2024,0.077657
...,...,...,...
227,LIB,2024,-0.003933
228,MAS,2024,-0.004014
229,GUA,2024,-0.004127
230,FIN,2024,-0.005608


In [55]:
# Remove the 'Year' column. errors='ignore' keeps this cell re-runnable:
# the CSV this frame is loaded from is later rewritten without 'Year', so on
# a second pass the column may already be gone.
results = results.drop(columns='Year', errors='ignore')

In [56]:
# Keep only the NOCs that also appear in tmp. The index is reset so row
# labels are contiguous (0..n-1) and agree with row positions; otherwise the
# filter leaves gaps and label-based and position-based access disagree.
results = results[results['NOC'].isin(tmp['NOC'])].reset_index(drop=True)

In [57]:
# Display the predictions after dropping 'Year' and filtering by NOC.
results

Unnamed: 0,NOC,Predicted_Gold_Pct
0,FRA,0.179058
1,USA,0.163247
2,CHN,0.133218
3,GBR,0.097783
4,JPN,0.077657
...,...,...
227,LIB,-0.003933
228,MAS,-0.004014
229,GUA,-0.004127
230,FIN,-0.005608


In [58]:
# Rescale the predictions so that they sum to the total of tmp['Gold'].
#
# Negative predictions are clipped to 0 first, and the normalising sum is
# taken AFTER clipping; normalising by the pre-clip sum would leave the
# rescaled column summing to more than `summ`.
# The update is vectorised rather than done row by row with mixed
# .loc[i] / .iloc[i] access, which targets the wrong rows (and appends new
# ones) whenever the index labels are not exactly 0..n-1.
# Note: the column keeps the name 'Predicted_Gold_Pct' although after this
# cell it is on the scale of tmp['Gold'], not a share.
summ = tmp['Gold'].sum()
clipped_pred = results['Predicted_Gold_Pct'].clip(lower=0)
summ_now = clipped_pred.sum()
if summ_now <= 0:
    raise ValueError("Cannot rescale predictions: no positive predicted values")
results = results.assign(Predicted_Gold_Pct=clipped_pred * summ / summ_now)

In [63]:
# Remove a 'Gold' helper column if one is present. No visible cell adds it
# (the assignment that would is commented out), so errors='ignore' prevents a
# KeyError on a fresh top-to-bottom run.
results = results.drop(columns='Gold', errors='ignore')

In [65]:
# Persist the post-processed predictions.
# NOTE(review): this is the same path main() writes and the load cell reads,
# so the raw model output is overwritten and the load -> rescale -> save
# cells are not idempotent; consider a separate output filename.
results.to_csv('2024_predicted_Gold.csv', index=False)