In [36]:
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import complete
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error

# from Lasso.New_total import results


# 数据加载和预处理
def load_and_preprocess(filepath):
    df = pd.read_csv(filepath)

    # 计算当届总金牌数
    df['Total_Gold'] = df.groupby('Year')['Gold'].transform('sum')

    # 计算金牌占比
    df['Gold_Pct'] = df['Gold'] / df['Total_Gold']
    df['Gold_Pct'] = df['Gold_Pct'].fillna(0)  # 处理无金牌情况

    return df

# 特征工程
def create_features(df):
    features = []
    years = sorted(df['Year'].unique())

    for (noc, year) in df[['NOC', 'Year']].drop_duplicates().values:
        # 获取前三届年份
        prev_years = [year-12, year-8, year-4]  # 过去三届奥运会年份

        # 收集特征
        row = {'NOC': noc, 'Year': year}

        # 获取当前届主办信息
        current_host = df[(df['NOC'] == noc) & (df['Year'] == year)]['is_host'].values
        row['is_host_current'] = current_host[0] if len(current_host) > 0 else 0

        # 获取前三届数据
        for i, pyear in enumerate(prev_years, 1):
            prev_data = df[(df['NOC'] == noc) & (df['Year'] == pyear)]

            if not prev_data.empty:
                for col in ['Gold', 'Silver', 'Bronze', 'Total', 'Participants', 'Events', 'is_host']:
                    row[f'{col}_prev{i}'] = prev_data[col].values[0]
            else:
                for col in ['Gold', 'Silver', 'Bronze', 'Total', 'Participants', 'Events', 'is_host']:
                    row[f'{col}_prev{i}'] = 0

        # 获取目标变量
        current_gold = df[(df['NOC'] == noc) & (df['Year'] == year)]['Gold_Pct'].values
        row['Gold_Pct'] = current_gold[0] if len(current_gold) > 0 else 0

        features.append(row)

    return pd.DataFrame(features)

# 主程序

    # 数据加载
df = load_and_preprocess('../complete_data.csv')

# 特征工程
feature_df = create_features(df)

# 划分数据集
train_df = feature_df[feature_df['Year'] < 2024]
test_df = feature_df[feature_df['Year'] == 2024]

# 特征/目标分离
X_cols = [col for col in feature_df.columns if col not in ['NOC', 'Year', 'Gold_Pct']]
X_train = train_df[X_cols].fillna(0)
y_train = train_df['Gold_Pct'].fillna(0)
X_test = test_df[X_cols].fillna(0)
# print
# 构建模型
pipeline = make_pipeline(
    StandardScaler(),
    LassoCV(cv=5, max_iter=10000, random_state=42)
)

# 训练模型
pipeline.fit(X_train, y_train)

# 预测
test_df['Predicted_Gold_Pct'] = pipeline.predict(X_test)

# 输出结果
results = test_df[['NOC', 'Year', 'Predicted_Gold_Pct']].sort_values('Predicted_Gold_Pct', ascending=False)
print("2024年金牌占比预测结果：")
results.to_csv('2024_predicted_Gold.csv', index=False)

2024年金牌占比预测结果：

验证集MAE: 0.0034


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['Predicted_Gold_Pct'] = pipeline.predict(X_test)


In [40]:

# 获取LassoCV模型
lasso_model = pipeline.named_steps['lassocv']

# 输出模型参数
print("Lasso回归模型的参数：")
for feature, coef in zip(X_cols, lasso_model.coef_):
    print(f"\"{feature}\": {coef:.4f}")

Lasso回归模型的参数：
"is_host_current": 0.0068
"Gold_prev1": 0.0030
"Silver_prev1": -0.0029
"Bronze_prev1": -0.0000
"Total_prev1": 0.0000
"Participants_prev1": 0.0029
"Events_prev1": -0.0007
"is_host_prev1": -0.0012
"Gold_prev2": 0.0015
"Silver_prev2": -0.0022
"Bronze_prev2": 0.0029
"Total_prev2": 0.0000
"Participants_prev2": 0.0058
"Events_prev2": -0.0040
"is_host_prev2": -0.0009
"Gold_prev3": 0.0076
"Silver_prev3": 0.0016
"Bronze_prev3": 0.0000
"Total_prev3": 0.0008
"Participants_prev3": -0.0028
"Events_prev3": 0.0015
"is_host_prev3": 0.0000


In [43]:
df = pd.read_csv('../complete_data.csv')
df = df[df['Year'].isin([2024, 2020, 2016])]

In [44]:
df_sorted = df.sort_values(by=['NOC', 'Year'])

# 合并数据
def merge_rows(group):
    return pd.Series({
        'NOC': group['NOC'].iloc[0],
        'Bronze_prev1': group['Bronze'].iloc[2] if len(group) > 2 else None,
        'Gold_prev1': group['Gold'].iloc[2] if len(group) > 2 else None,
        'Silver_prev1': group['Silver'].iloc[2] if len(group) > 2 else None,
        'Participants_prev1': group['Participants'].iloc[2] if len(group) > 2 else None,
        'Events_prev1': group['Events'].iloc[2] if len(group) > 2 else None,
        'is_host_prev1': group['is_host'].iloc[2] if len(group) > 2 else None,
        'Total_prev1': group['Total'].iloc[2] if len(group) > 2 else None,
        'Bronze_prev2': group['Bronze'].iloc[1] if len(group) > 1 else None,
        'Gold_prev2': group['Gold'].iloc[1] if len(group) > 1 else None,
        'Silver_prev2': group['Silver'].iloc[1] if len(group) > 1 else None,
        'Participants_prev2': group['Participants'].iloc[1] if len(group) > 1 else None,
        'Events_prev2': group['Events'].iloc[1] if len(group) > 1 else None,
        'is_host_prev2': group['is_host'].iloc[1] if len(group) > 1 else None,
        'Total_prev2': group['Total'].iloc[1] if len(group) > 1 else None,
        'Bronze_prev3': group['Bronze'].iloc[0] if len(group) > 0 else None,
        'Gold_prev3': group['Gold'].iloc[0] if len(group) > 0 else None,
        'Silver_prev3': group['Silver'].iloc[0] if len(group) > 0 else None,
        'Participants_prev3': group['Participants'].iloc[0] if len(group) > 0 else None,
        'Events_prev3': group['Events'].iloc[0] if len(group) > 0 else None,
        'is_host_prev3': group['is_host'].iloc[0] if len(group) > 0 else None,
        'Total_prev3': group['Total'].iloc[0] if len(group) > 0 else None
    })

# 按 NOC 分组并应用合并函数
merged_df = df_sorted.groupby('NOC').apply(merge_rows).reset_index(drop=True)

# 打印合并后的数据
print(merged_df)

     NOC  Bronze_prev1  Gold_prev1  Silver_prev1  Participants_prev1  \
0    AFG             0           0             0                   7   
1    AHO             0           0             0                   0   
2    AIN             1           1             4                  46   
3    ALB             2           0             0                   9   
4    ALG             1           2             0                  55   
..   ...           ...         ...           ...                 ...   
227  YEM             0           0             0                   4   
228  YMD             0           0             0                   0   
229  YUG             0           0             0                   0   
230  ZAM             1           0             0                  32   
231  ZIM             0           0             0                   7   

     Events_prev1  is_host_prev1  Total_prev1  Bronze_prev2  Gold_prev2  ...  \
0               4              0            0          

  merged_df = df_sorted.groupby('NOC').apply(merge_rows).reset_index(drop=True)


In [45]:
merged_df['Year'] = 2028
merged_df['is_host_current'] = 0
for i in range(merged_df.shape[0]):
    if merged_df.iloc[i]['NOC'] == 'USA':
        merged_df.loc[i,'is_host_current'] = 1

In [46]:
fict = {"is_host_current": 0.0068,
"Gold_prev1": 0.0030,
"Silver_prev1": -0.0029,
"Bronze_prev1": -0.0000,
"Total_prev1": 0.0000,
"Participants_prev1": 0.0029,
"Events_prev1": -0.0007,
"is_host_prev1": -0.0012,
"Gold_prev2": 0.0015,
"Silver_prev2": -0.0022,
"Bronze_prev2": 0.0029,
"Total_prev2": 0.0000,
"Participants_prev2": 0.0058,
"Events_prev2": -0.0040,
"is_host_prev2": -0.0009,
"Gold_prev3": 0.0076,
"Silver_prev3": 0.0016,
"Bronze_prev3": 0.0000,
"Total_prev3": 0.0008,
"Participants_prev3": -0.0028,
"Events_prev3": 0.0015,
"is_host_prev3": 0.0000}

In [55]:
merged_df

Unnamed: 0,NOC,Bronze_prev1,Gold_prev1,Silver_prev1,Participants_prev1,Events_prev1,is_host_prev1,Total_prev1,Bronze_prev2,Gold_prev2,...,Total_prev2,Bronze_prev3,Gold_prev3,Silver_prev3,Participants_prev3,Events_prev3,is_host_prev3,Total_prev3,Year,is_host_current
0,AFG,0,0,0,7,4,0,0,0,0,...,0,0,0,0,3,2,0,0,2028,0
1,AHO,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2028,0
2,AIN,1,1,4,46,10,0,6,0,0,...,0,0,0,0,0,0,0,0,2028,0
3,ALB,2,0,0,9,4,0,2,0,0,...,0,0,0,0,6,3,0,0,2028,0
4,ALG,1,2,0,55,15,0,3,0,0,...,0,0,0,2,74,13,0,2,2028,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227,YEM,0,0,0,4,4,0,0,0,0,...,0,0,0,0,3,3,0,0,2028,0
228,YMD,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2028,0
229,YUG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2028,0
230,ZAM,1,0,0,32,5,0,1,0,0,...,0,0,0,0,7,4,0,0,2028,0


In [58]:
columns = merged_df.columns
for i in range(merged_df.shape[0]):
    noc = merged_df.iloc[i]['NOC']
    ans = 0
    for j in range(0,merged_df.shape[1]):
        if columns[j] == 'NOC' or columns[j] == 'Year':
            continue

        beta = fict[columns[j]]
        ans += beta*merged_df.iloc[i][columns[j]]
    print(f'{noc},{ans},')

AFG,0.0251,
AHO,0.0,
AIN,0.11779999999999999,
ALB,0.045,
ALG,0.17309999999999995,
AND,0.0144,
ANG,0.1042,
ANT,0.008999999999999998,
ANZ,0.0,
ARG,0.9321999999999997,
ARM,0.047699999999999985,
ARU,0.0191,
ASA,0.0165,
AUS,4.1582,
AUT,0.6512,
AZE,0.2623,
BAH,0.0894,
BAN,0.03,
BAR,0.024799999999999996,
BDI,0.02859999999999999,
BEL,1.3338999999999999,
BEN,0.0262,
BER,0.009899999999999996,
BHU,0.011199999999999998,
BIH,0.0184,
BIZ,0.012,
BLR,0.3748,
BOH,0.0,
BOL,0.0011000000000000003,
BOT,0.09880000000000001,
BRA,1.9228999999999987,
BRN,0.1367,
BRU,0.0055000000000000005,
BUL,0.31529999999999997,
BUR,0.03949999999999999,
CAF,0.0023000000000000017,
CAM,0.005900000000000001,
CAN,3.3204999999999996,
CAY,0.0431,
CGO,-0.0016000000000000007,
CHA,0.0079,
CHI,0.29329999999999995,
CHN,3.8547000000000002,
CIV,0.1901,
CMR,0.003599999999999999,
COD,0.0325,
COK,0.022899999999999997,
COL,0.26400000000000007,
COM,0.013600000000000001,
CPV,0.02359999999999999,
CRC,0.06169999999999999,
CRO,0.4178000000000001,


In [63]:
results.drop('Gold', axis=1, inplace=True)

In [65]:
results.to_csv('2024_predicted_Gold.csv', index=False)