# In [None]:
import os
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score


def load_and_merge_data(data_dir="Data"):
    """
    Read the five input CSV files and join them into one athlete-level frame.

    Args:
        data_dir: Directory containing the CSV files.

    Returns:
        The athlete table left-joined, in order, with the moving averages
        (on Athlete_ID/Year/Sport), the sports codes (on Sport), the host
        table and the country medal counts (both on Year/Country_Code).
        ``Is_Host`` is an integer column with 0 where no host row matched.
    """

    def _read(filename):
        # Every input lives directly under data_dir.
        return pd.read_csv(os.path.join(data_dir, filename))

    athletes = _read("summerOly_athletes.csv")
    moving_avgs = _read("moving_averages.csv")
    sport_lookup = _read("sports_codes.csv")
    country_medals = _read("summerOly_medal_counts_with_codes.csv")
    host_table = _read("summerOly_hosts_with_codes.csv")

    country_year_keys = ["Year", "Country_Code"]

    # Left joins throughout, so every athlete row is kept.
    combined = (
        athletes.merge(moving_avgs, how="left", on=["Athlete_ID", "Year", "Sport"])
        .merge(sport_lookup, how="left", on="Sport")
        .merge(host_table, how="left", on=country_year_keys)
    )

    # Rows with no matching host record are treated as "not host".
    host_flag = combined["Is_Host"].fillna(0)
    combined["Is_Host"] = host_flag.astype(int)

    # Country-level columns that collide with existing names get "_Country".
    return combined.merge(
        country_medals,
        how="left",
        on=country_year_keys,
        suffixes=("", "_Country"),
    )


def preprocess_and_engineer(merged_df):
    """
    Collapse the athlete-level frame into one row per country and year.

    Note that ``merged_df`` is modified in place: a 0/1 ``Medal_Won`` column
    is added to it.

    Args:
        merged_df: Frame with at least the columns Medal, Country_Code, Year,
            Athlete_ID, MA_Probability and Is_Host.

    Returns:
        A frame with Country_Code, Year, Total_Medals, Athlete_Count,
        Avg_Probability, Is_Host and Lag_Total_Medals (the Total_Medals of
        the same country's previous row; NaN for its first row).
    """
    # Any value other than the three medal names (including NaN) counts as 0.
    podium_labels = ["Gold", "Silver", "Bronze"]
    merged_df["Medal_Won"] = merged_df["Medal"].isin(podium_labels).astype(int)

    aggregations = {
        "Total_Medals": pd.NamedAgg(column="Medal_Won", aggfunc="sum"),
        "Athlete_Count": pd.NamedAgg(column="Athlete_ID", aggfunc="nunique"),
        "Avg_Probability": pd.NamedAgg(column="MA_Probability", aggfunc="mean"),
        "Is_Host": pd.NamedAgg(column="Is_Host", aggfunc="max"),
    }
    by_country_year = merged_df.groupby(["Country_Code", "Year"])
    summary = by_country_year.agg(**aggregations).reset_index()

    # The groupby above sorts by (Country_Code, Year), so shifting within a
    # country yields that country's preceding row.
    per_country_totals = summary.groupby("Country_Code")["Total_Medals"]
    summary["Lag_Total_Medals"] = per_country_totals.shift()

    return summary


def train_models(country_year_df):
    """
    Fit a medal-count regressor and an any-medal classifier.

    Args:
        country_year_df: Frame with the feature columns Lag_Total_Medals,
            Athlete_Count, Avg_Probability, Is_Host and the target column
            Total_Medals.

    Returns:
        Tuple ``(rf, clf)``: the fitted RandomForestRegressor predicting
        Total_Medals and the fitted LogisticRegression predicting whether
        Total_Medals > 0.

    Side effects:
        Prints the hold-out RMSE of the regressor and the hold-out AUC of the
        classifier (or a notice when AUC is undefined).
    """
    features = ["Lag_Total_Medals", "Athlete_Count", "Avg_Probability", "Is_Host"]

    # Missing features (e.g. the lag on a country's first row) are treated as 0.
    X = country_year_df[features].fillna(0)
    y_reg = country_year_df["Total_Medals"]
    y_clf = (country_year_df["Total_Medals"] > 0).astype(int)

    # One split for both targets keeps the train/test rows aligned.
    # NOTE(review): with shuffle=False the hold-out is simply the last 20% of
    # rows in the caller's order; if the frame is sorted by country rather
    # than by year this is not a chronological split -- confirm intent.
    (
        X_train,
        X_test,
        y_reg_train,
        y_reg_test,
        y_clf_train,
        y_clf_test,
    ) = train_test_split(X, y_reg, y_clf, test_size=0.2, shuffle=False)

    # Train RandomForestRegressor
    rf = RandomForestRegressor(n_estimators=100, random_state=42)
    rf.fit(X_train, y_reg_train)
    reg_preds = rf.predict(X_test)
    # The `squared=False` keyword was deprecated and later removed from
    # mean_squared_error; taking the root explicitly works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_reg_test, reg_preds)))
    print(f"Regressor RMSE: {rmse:.2f}")

    # Train LogisticRegression
    clf = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)
    clf.fit(X_train, y_clf_train)
    clf_preds = clf.predict_proba(X_test)[:, 1]
    # ROC AUC is undefined when the hold-out labels contain a single class.
    if y_clf_test.nunique() < 2:
        print("Classifier AUC: undefined (hold-out set contains a single class)")
    else:
        auc = roc_auc_score(y_clf_test, clf_preds)
        print(f"Classifier AUC: {auc:.3f}")

    return rf, clf


def predict_for_2028(country_year_df, rf, clf, total_medals=1000):
    """
    Predict medal counts for 2028 and allocate medals.

    Args:
        country_year_df: Country-year frame containing a Year column and the
            feature columns Lag_Total_Medals, Athlete_Count, Avg_Probability
            and Is_Host.
        rf: Fitted regressor exposing ``predict(X)``.
        clf: Fitted classifier exposing ``predict_proba(X)``; column 1 is
            used as the probability of winning any medal.
        total_medals: Number of medals the scaled predictions should sum to.

    Returns:
        A copy of the Year == 2028 rows with three added columns:
        Predicted_Medals (regressor output clipped at 0), Prob_Any_Medal and
        Scaled_Medals (Predicted_Medals rescaled to sum to ``total_medals``,
        or 0.0 for every row when the predicted total is 0). When there are
        no 2028 rows the models are not called and the returned frame is
        empty but still carries the three columns.
    """
    features = ["Lag_Total_Medals", "Athlete_Count", "Avg_Probability", "Is_Host"]
    df_2028 = country_year_df[country_year_df["Year"] == 2028].copy()

    if df_2028.empty:
        # Estimators reject zero-row input; return a well-formed empty result
        # so callers can still select the output columns.
        for column in ("Predicted_Medals", "Prob_Any_Medal", "Scaled_Medals"):
            df_2028[column] = np.zeros(0, dtype=float)
        return df_2028

    # Apply the same missing-value treatment that was used when training.
    X_2028 = df_2028[features].fillna(0)

    # Predict total medals; negative counts are meaningless, so floor at 0.
    df_2028["Predicted_Medals"] = np.clip(rf.predict(X_2028), 0, None)

    # Predict probability of winning any medal
    df_2028["Prob_Any_Medal"] = clf.predict_proba(X_2028)[:, 1]

    # Scale medals to total_medals
    total_predicted = df_2028["Predicted_Medals"].sum()
    if total_predicted > 0:
        df_2028["Scaled_Medals"] = (
            df_2028["Predicted_Medals"] / total_predicted
        ) * total_medals
    else:
        # Nothing to distribute proportionally; keep the column present.
        df_2028["Scaled_Medals"] = 0.0

    return df_2028


def main():
    """
    Run the whole pipeline: load, aggregate, train, then print the 2028 table.

    Reads the CSV inputs from the "Data" directory and prints the country
    code, scaled medal count and any-medal probability for each 2028 row.
    """
    source_dir = "Data"

    # Athlete-level data -> country-year panel.
    panel = preprocess_and_engineer(load_and_merge_data(source_dir))

    # Fit both models on the panel, then score the 2028 rows with them.
    regressor, classifier = train_models(panel)
    forecast = predict_for_2028(panel, regressor, classifier)

    report_columns = ["Country_Code", "Scaled_Medals", "Prob_Any_Medal"]
    print(forecast[report_columns])


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
