In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error

from Independent_lasso.Independent_model_Total import results


# 数据加载和预处理
def load_and_preprocess(filepath):
    df = pd.read_csv(filepath)

    # 计算当届总奖牌数
    df['Total_Medals'] = df.groupby('Year')['Total'].transform('sum')

    # 计算奖牌占比
    df['Medal_Pct'] = df['Total'] / df['Total_Medals']
    df['Medal_Pct'] = df['Medal_Pct'].fillna(0)  # 处理无奖牌情况

    return df

# 特征工程
def create_features(df):
    features = []
    years = sorted(df['Year'].unique())

    for (noc, year) in df[['NOC', 'Year']].drop_duplicates().values:
        # 获取前三届年份
        prev_years = [year-12, year-8, year-4]  # 过去三届奥运会年份

        # 收集特征
        row = {'NOC': noc, 'Year': year}

        # 获取当前届主办信息
        current_host = df[(df['NOC'] == noc) & (df['Year'] == year)]['is_host'].values
        row['is_host_current'] = current_host[0] if len(current_host) > 0 else 0

        # 获取前三届数据
        for i, pyear in enumerate(prev_years, 1):
            prev_data = df[(df['NOC'] == noc) & (df['Year'] == pyear)]

            if not prev_data.empty:
                for col in ['Gold', 'Silver', 'Bronze', 'Total', 'Participants', 'Events', 'is_host']:
                    row[f'{col}_prev{i}'] = prev_data[col].values[0]
            else:
                for col in ['Gold', 'Silver', 'Bronze', 'Total', 'Participants', 'Events', 'is_host']:
                    row[f'{col}_prev{i}'] = 0

        # 获取目标变量
        current_medal = df[(df['NOC'] == noc) & (df['Year'] == year)]['Medal_Pct'].values
        row['Medal_Pct'] = current_medal[0] if len(current_medal) > 0 else 0

        features.append(row)

    return pd.DataFrame(features)

# 主程序
def main():
    # 数据加载
    df = load_and_preprocess('../complete_data.csv')

    # 特征工程
    feature_df = create_features(df)

    # 划分数据集
    train_df = feature_df[feature_df['Year'] < 2024]
    test_df = feature_df[feature_df['Year'] == 2024]

    # 特征/目标分离
    X_cols = [col for col in feature_df.columns if col not in ['NOC', 'Year', 'Medal_Pct']]
    X_train = train_df[X_cols].fillna(0)
    y_train = train_df['Medal_Pct'].fillna(0)
    X_test = test_df[X_cols].fillna(0)

    # 构建模型
    pipeline = make_pipeline(
        StandardScaler(),
        LassoCV(cv=5, max_iter=10000, random_state=42)
    )

    # 训练模型
    pipeline.fit(X_train, y_train)

    # 预测
    test_df['Predicted_Pct'] = pipeline.predict(X_test)

    # 输出结果
    results = test_df[['NOC', 'Year', 'Predicted_Pct']].sort_values('Predicted_Pct', ascending=False)
    print("2024年奖牌占比预测结果：")
    results.to_csv('2024_predicted_total.csv',index=false)

    # 模型评估（使用最近三届验证）
    valid_years = [2012, 2016, 2020]
    valid_df = feature_df[feature_df['Year'].isin(valid_years)]
    X_valid = valid_df[X_cols].fillna(0)
    y_valid = valid_df['Medal_Pct'].fillna(0)
    y_pred = pipeline.predict(X_valid)
    mae = mean_absolute_error(y_valid, y_pred)
    print(f"\n验证集MAE: {mae:.4f}")

if __name__ == "__main__":
    main()

2024年奖牌占比预测结果：

验证集MAE: 0.0030


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Predicted_Pct'] = pipeline.predict(X_test)


In [5]:
tmp =  pd.read_csv('2024_Total_lasso.csv')
results = pd.read_csv('2024_predicted_total.csv')
results.drop('Year', axis=1, inplace=True)

KeyError: "['Year'] not found in axis"