In [16]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

def prepare_time_window_data(df, target_year, noc):
    """准备时间窗口数据"""
    # 获取目标年份前三届奥运会的年份
    previous_years = [target_year - 12, target_year - 8, target_year - 4]

    # 提取特征
    features = []
    for year in previous_years:
        year_data = df[(df['Year'] == year) & (df['NOC'] == noc)].iloc[0]
        # 计算该年的奖牌占比
        total_medals_ratio = year_data['Total'] / year_data['Events']
        gold_medals_ratio = year_data['Gold'] / year_data['Events']
        silver_medals_ratio = year_data['Silver'] / year_data['Events']
        bronze_medals_ratio = year_data['Bronze'] / year_data['Events']
        participants_ratio = year_data['Participants'] / year_data['Events']

        features.extend([
            total_medals_ratio, gold_medals_ratio, silver_medals_ratio,
            bronze_medals_ratio, participants_ratio, year_data['is_host']
        ])

    # 添加目标年份的is_host
    target_data = df[(df['Year'] == target_year) & (df['NOC'] == noc)].iloc[0]
    features.append(target_data['is_host'])

    return features

def train_lasso_model(df, noc, min_participations=8):
    """为特定国家训练Lasso模型"""
    # 检查参与次数
    participations = len(df[df['NOC'] == noc])
    if participations < min_participations:
        return None

    # 准备训练数据
    X = []
    y = []

    # 从2008年开始训练（因为需要前12年的数据）
    for year in range(1986, 2024, 4):
        try:
            X.append(prepare_time_window_data(df, year, noc))
            year_data = df[(df['Year'] == year) & (df['NOC'] == noc)].iloc[0]
            y.append(year_data['Total'] / year_data['Events'])
        except:
            continue

    if len(X) < 3:  # 确保有足够的训练数据
        return None

    X = np.array(X)
    y = np.array(y)

    # 标准化特征
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # 训练Lasso模型
    model = Lasso(alpha=0.01)
    model.fit(X_scaled, y)

    return model, scaler

def predict_2024(df, noc, model, scaler):
    """预测2024年的奖牌比例"""
    features = prepare_time_window_data(df, 2024, noc)
    features = np.array(features).reshape(1, -1)
    features_scaled = scaler.transform(features)
    return model.predict(features_scaled)[0]

# 主程序
def main():
    # 假设数据已经读入DataFrame df
    df = pd.read_csv('../complete_data.csv')

    # 获取所有unique的NOC
    nocs = df['NOC'].unique()

    predictions = {}

    for noc in nocs:
        result = train_lasso_model(df, noc)
        if result is not None:
            model, scaler = result
            pred = predict_2024(df, noc, model, scaler)
            predictions[noc] = pred

    # 将预测结果转换为DataFrame
    predictions_df = pd.DataFrame.from_dict(predictions, orient='index', columns=['Predicted_Medal_Ratio'])
    predictions_df.sort_values('Predicted_Medal_Ratio', ascending=False, inplace=True)

    return predictions_df

# 运行预测
if __name__ == "__main__":
    predictions = main()
    print("2024年奥运会奖牌比例预测：")
    print(predictions)


  total_medals_ratio = year_data['Total'] / year_data['Events']
  gold_medals_ratio = year_data['Gold'] / year_data['Events']
  silver_medals_ratio = year_data['Silver'] / year_data['Events']
  bronze_medals_ratio = year_data['Bronze'] / year_data['Events']
  participants_ratio = year_data['Participants'] / year_data['Events']


ValueError: Input X contains NaN.
Lasso does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [2]:
cnt

NameError: name 'cnt' is not defined

In [6]:

# 计算预测奖牌数（假设2024年总奖牌数与2020年相同）
total_2020 = 1019
results['Predicted_Total'] = results['Predicted_2024_Pct'] * total_2020

# 保存结果
results.sort_values('Predicted_Total', ascending=False, inplace=True)
print(results.head(10))

# 可视化示例
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
top10 = results.head(10)
plt.barh(top10['Country'], top10['Predicted_Total'])
plt.xlabel('Predicted Medal Count')
plt.title('2024 Olympic Medal Predictions (Top 10)')
plt.gca().invert_yaxis()
plt.show()

KeyError: 'Predicted_2024_Pct'

In [4]:
import pandas as pd

# 将结果转换为DataFrame
results_list = []
for noc, res in results.items():
    results_list.append({
        'NOC': noc,
        'Actual_Total_2024': res['Actual'],
        'Predicted_Total_2024': res['Predicted']
    })

# 创建DataFrame
results_df = pd.DataFrame(results_list)

# 保存到CSV文件
results_df.to_csv("Total.csv", index=False)
print("Results saved to 'Total.csv'.")

Results saved to 'Total.csv'.
