In [1]:
import os

import pandas as pd

# Locate the input CSV under the current user's Downloads folder instead of a
# hard-coded absolute path, so the notebook is not tied to one machine/account.
# This mirrors how the export cells further down build their output paths.
data_path = os.path.join(os.path.expanduser("~"), "Downloads", "swimming_data_no_outliers.csv")
df_no_outliers = pd.read_csv(data_path)

In [2]:


# Tally how many rows each event contributes and show the full table.
event_counts = df_no_outliers['Event'].value_counts()
print("Event Counts:", event_counts, sep="\n")

# Minimum number of rows an event needs before it is kept in `sufficient_events`.
min_records = 50
sufficient_events = [
    event_name
    for event_name, n_rows in event_counts.items()
    if n_rows >= min_records
]
print("Events with sufficient data:", sufficient_events)


Event Counts:
4 x 100 m medley relay       337
4 x 100 m freestyle relay    322
4 x 200 m freestyle relay    248
200 m breaststroke           133
400 m freestyle              131
200 m backstroke             116
800 m freestyle              111
100 m freestyle              107
200 m freestyle              106
100 m breaststroke            96
200 m individual medley       88
1500 m freestyle              88
200 m butterfly               85
400 m individual medley       82
100 m butterfly               76
100 m backstroke              73
50 m freestyle                56
50 m backstroke               24
50 m breaststroke             20
50 m butterfly                17
Name: Event, dtype: int64
Events with sufficient data: ['4 x 100 m medley relay', '4 x 100 m freestyle relay', '4 x 200 m freestyle relay', '200 m breaststroke', '400 m freestyle', '200 m backstroke', '800 m freestyle', '100 m freestyle', '200 m freestyle', '100 m breaststroke', '200 m individual medley', '1500 m freestyle',

In [3]:
import numpy as np
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


# Per-event outcome: event name -> {'rmse': ..., 'r2': ..., 'model': fitted pipeline}
event_results = {}

# Column roles are the same for every event, so they are declared once up front.
numeric_features = ['Ranking_numeric']
categorical_features = ['Sex']
features = ['Ranking_numeric', 'Sex']
target = 'Time_seconds'

# Train and evaluate one independent model per retained event.
for event in sufficient_events:
    # Keep only the rows that belong to the current event
    is_event = df_no_outliers['Event'] == event
    df_event = df_no_outliers.loc[is_event].copy()

    X = df_event[features]
    y = df_event[target]

    # New, unfitted transformers on every pass: each pipeline stored in
    # `event_results` is built from its own transformer objects.
    numeric_transformer = StandardScaler()
    categorical_transformer = OneHotEncoder(handle_unknown='ignore')
    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])

    # Preprocessing followed by the XGBoost regressor
    pipeline_xgb = Pipeline([
        ('preprocessor', preprocessor),
        ('xgb', XGBRegressor(objective='reg:squarederror', random_state=42)),
    ])

    # 80/20 hold-out split with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit on the training part, predict the held-out part
    y_pred = pipeline_xgb.fit(X_train, y_train).predict(X_test)

    # Hold-out metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    event_results[event] = dict(rmse=rmse, r2=r2, model=pipeline_xgb)

    print(f"Event: {event}")
    print(f"  XGBoost Test RMSE: {rmse:.2f}")
    print(f"  XGBoost Test R²: {r2:.4f}")
    print("-----")


Event: 4 x 100 m medley relay
  XGBoost Test RMSE: 0.01
  XGBoost Test R²: 1.0000
-----
Event: 4 x 100 m freestyle relay
  XGBoost Test RMSE: 0.91
  XGBoost Test R²: 0.9983
-----
Event: 4 x 200 m freestyle relay
  XGBoost Test RMSE: 0.00
  XGBoost Test R²: 1.0000
-----
Event: 200 m breaststroke
  XGBoost Test RMSE: 2.00
  XGBoost Test R²: 0.9885
-----
Event: 400 m freestyle
  XGBoost Test RMSE: 3.62
  XGBoost Test R²: 0.9876
-----
Event: 200 m backstroke
  XGBoost Test RMSE: 6.74
  XGBoost Test R²: 0.8607
-----
Event: 800 m freestyle
  XGBoost Test RMSE: 11.72
  XGBoost Test R²: 0.9745
-----
Event: 100 m freestyle
  XGBoost Test RMSE: 0.55
  XGBoost Test R²: 0.9918
-----
Event: 200 m freestyle
  XGBoost Test RMSE: 2.82
  XGBoost Test R²: 0.9533
-----
Event: 100 m breaststroke
  XGBoost Test RMSE: 0.33
  XGBoost Test R²: 0.9956
-----
Event: 200 m individual medley
  XGBoost Test RMSE: 1.28
  XGBoost Test R²: 0.9826
-----
Event: 1500 m freestyle
  XGBoost Test RMSE: 17.40
  XGBoost Test 

In [4]:

# Show the XGBoost feature importances of the model fitted for one event.
event_to_inspect = "100 m freestyle"
if event_to_inspect not in event_results:
    print(f"Event '{event_to_inspect}' not found in sufficient events.")
else:
    model_pipeline = event_results[event_to_inspect]['model']
    preprocessor = model_pipeline.named_steps['preprocessor']
    xgb_model = model_pipeline.named_steps['xgb']

    # Rebuild the column labels in the order the preprocessor was declared:
    # the numeric column first, then the one-hot columns for 'Sex'.
    numeric_features = ['Ranking_numeric']
    categorical_features = ['Sex']
    ohe = preprocessor.named_transformers_['cat']
    ohe_feature_names = ohe.get_feature_names_out(categorical_features)
    all_feature_names = np.concatenate((numeric_features, ohe_feature_names))

    # Pair each label with its importance and list the largest first
    importances = xgb_model.feature_importances_
    importance_df = pd.DataFrame(
        {'Feature': all_feature_names, 'Importance': importances}
    ).sort_values(by='Importance', ascending=False)
    print(f"Feature Importances for {event_to_inspect}:")
    print(importance_df)


Feature Importances for 100 m freestyle:
           Feature  Importance
1       Sex_Female    0.775194
0  Ranking_numeric    0.224806
2         Sex_Male    0.000000


In [5]:

# Show the XGBoost feature importances of the model fitted for one event.
event_to_inspect = "100 m backstroke"
if event_to_inspect not in event_results:
    print(f"Event '{event_to_inspect}' not found in sufficient events.")
else:
    model_pipeline = event_results[event_to_inspect]['model']
    preprocessor = model_pipeline.named_steps['preprocessor']
    xgb_model = model_pipeline.named_steps['xgb']

    # Rebuild the column labels in the order the preprocessor was declared:
    # the numeric column first, then the one-hot columns for 'Sex'.
    numeric_features = ['Ranking_numeric']
    categorical_features = ['Sex']
    ohe = preprocessor.named_transformers_['cat']
    ohe_feature_names = ohe.get_feature_names_out(categorical_features)
    all_feature_names = np.concatenate((numeric_features, ohe_feature_names))

    # Pair each label with its importance and list the largest first
    importances = xgb_model.feature_importances_
    importance_df = pd.DataFrame(
        {'Feature': all_feature_names, 'Importance': importances}
    ).sort_values(by='Importance', ascending=False)
    print(f"Feature Importances for {event_to_inspect}:")
    print(importance_df)


Feature Importances for 100 m backstroke:
           Feature  Importance
1       Sex_Female    0.873653
0  Ranking_numeric    0.126347
2         Sex_Male    0.000000


In [6]:

# Show the XGBoost feature importances of the model fitted for one event.
event_to_inspect = "100 m butterfly"
if event_to_inspect not in event_results:
    print(f"Event '{event_to_inspect}' not found in sufficient events.")
else:
    model_pipeline = event_results[event_to_inspect]['model']
    preprocessor = model_pipeline.named_steps['preprocessor']
    xgb_model = model_pipeline.named_steps['xgb']

    # Rebuild the column labels in the order the preprocessor was declared:
    # the numeric column first, then the one-hot columns for 'Sex'.
    numeric_features = ['Ranking_numeric']
    categorical_features = ['Sex']
    ohe = preprocessor.named_transformers_['cat']
    ohe_feature_names = ohe.get_feature_names_out(categorical_features)
    all_feature_names = np.concatenate((numeric_features, ohe_feature_names))

    # Pair each label with its importance and list the largest first
    importances = xgb_model.feature_importances_
    importance_df = pd.DataFrame(
        {'Feature': all_feature_names, 'Importance': importances}
    ).sort_values(by='Importance', ascending=False)
    print(f"Feature Importances for {event_to_inspect}:")
    print(importance_df)


Feature Importances for 100 m butterfly:
           Feature  Importance
1       Sex_Female    0.735266
0  Ranking_numeric    0.264734
2         Sex_Male    0.000000


In [10]:

# Show the XGBoost feature importances of the model fitted for one event.
event_to_inspect = "1500 m freestyle"
if event_to_inspect not in event_results:
    print(f"Event '{event_to_inspect}' not found in sufficient events.")
else:
    model_pipeline = event_results[event_to_inspect]['model']
    preprocessor = model_pipeline.named_steps['preprocessor']
    xgb_model = model_pipeline.named_steps['xgb']

    # Rebuild the column labels in the order the preprocessor was declared:
    # the numeric column first, then the one-hot columns for 'Sex'.
    numeric_features = ['Ranking_numeric']
    categorical_features = ['Sex']
    ohe = preprocessor.named_transformers_['cat']
    ohe_feature_names = ohe.get_feature_names_out(categorical_features)
    all_feature_names = np.concatenate((numeric_features, ohe_feature_names))

    # Pair each label with its importance and list the largest first
    importances = xgb_model.feature_importances_
    importance_df = pd.DataFrame(
        {'Feature': all_feature_names, 'Importance': importances}
    ).sort_values(by='Importance', ascending=False)
    print(f"Feature Importances for {event_to_inspect}:")
    print(importance_df)


Feature Importances for 1500 m freestyle:
           Feature  Importance
0  Ranking_numeric    0.841388
1       Sex_Female    0.158612
2         Sex_Male    0.000000


In [15]:

# Show the XGBoost feature importances of the model fitted for one event.
event_to_inspect = "800 m freestyle"
if event_to_inspect not in event_results:
    print(f"Event '{event_to_inspect}' not found in sufficient events.")
else:
    model_pipeline = event_results[event_to_inspect]['model']
    preprocessor = model_pipeline.named_steps['preprocessor']
    xgb_model = model_pipeline.named_steps['xgb']

    # Rebuild the column labels in the order the preprocessor was declared:
    # the numeric column first, then the one-hot columns for 'Sex'.
    numeric_features = ['Ranking_numeric']
    categorical_features = ['Sex']
    ohe = preprocessor.named_transformers_['cat']
    ohe_feature_names = ohe.get_feature_names_out(categorical_features)
    all_feature_names = np.concatenate((numeric_features, ohe_feature_names))

    # Pair each label with its importance and list the largest first
    importances = xgb_model.feature_importances_
    importance_df = pd.DataFrame(
        {'Feature': all_feature_names, 'Importance': importances}
    ).sort_values(by='Importance', ascending=False)
    print(f"Feature Importances for {event_to_inspect}:")
    print(importance_df)


Feature Importances for 800 m freestyle:
           Feature  Importance
0  Ranking_numeric    0.892667
1       Sex_Female    0.107333
2         Sex_Male    0.000000


In [20]:


# Show the XGBoost feature importances of the model fitted for one event.
event_to_inspect = "4 x 100 m freestyle relay"
if event_to_inspect not in event_results:
    print(f"Event '{event_to_inspect}' not found in sufficient events.")
else:
    model_pipeline = event_results[event_to_inspect]['model']
    preprocessor = model_pipeline.named_steps['preprocessor']
    xgb_model = model_pipeline.named_steps['xgb']

    # Rebuild the column labels in the order the preprocessor was declared:
    # the numeric column first, then the one-hot columns for 'Sex'.
    numeric_features = ['Ranking_numeric']
    categorical_features = ['Sex']
    ohe = preprocessor.named_transformers_['cat']
    ohe_feature_names = ohe.get_feature_names_out(categorical_features)
    all_feature_names = np.concatenate((numeric_features, ohe_feature_names))

    # Pair each label with its importance and list the largest first
    importances = xgb_model.feature_importances_
    importance_df = pd.DataFrame(
        {'Feature': all_feature_names, 'Importance': importances}
    ).sort_values(by='Importance', ascending=False)
    print(f"Feature Importances for {event_to_inspect}:")
    print(importance_df)


Feature Importances for 4 x 100 m freestyle relay:
           Feature  Importance
1       Sex_Female    0.675725
0  Ranking_numeric    0.324275
2         Sex_Male    0.000000


In [22]:
import os


# Export the 1500 m freestyle feature importances.
# The values are read from the fitted pipeline stored in `event_results`
# rather than re-typed from an earlier cell's printed output, so the CSV
# cannot drift from the model when the notebook is re-run.
event_to_export = "1500 m freestyle"
export_pipeline = event_results[event_to_export]['model']  # KeyError if this event was not modelled
export_ohe = export_pipeline.named_steps['preprocessor'].named_transformers_['cat']

# Labels follow the preprocessor's declared order: numeric column, then one-hot 'Sex' columns.
export_feature_names = np.concatenate(
    [['Ranking_numeric'], export_ohe.get_feature_names_out(['Sex'])]
)
# Rounded to 6 decimals, the precision previously written to this file.
export_importances = (
    export_pipeline.named_steps['xgb'].feature_importances_.astype(float).round(6)
)

importance_df = (
    pd.DataFrame({'Feature': export_feature_names, 'Importance': export_importances})
    .sort_values(by='Importance', ascending=False, kind='stable')  # largest first
    .reset_index(drop=True)
)


downloads_path = os.path.join(os.path.expanduser("~"), "Downloads", "1500m_freestyle_feature_importances.csv")


importance_df.to_csv(downloads_path, index=False)

print("Results saved to:", downloads_path)


Results saved to: C:\Users\Zainab\Downloads\1500m_freestyle_feature_importances.csv


In [23]:
import os
import pandas as pd

# Summary of the XGBoost results for every modelled event.
# Built from `event_results` (filled by the training loop) instead of numbers
# copied by hand from printed output, so the exported table always matches the
# models currently in memory. Rounding keeps the precision previously written
# to the file: 2 decimals for RMSE, 4 for R².
data = {
    'Event': list(event_results),
    'Test_RMSE': [round(float(res['rmse']), 2) for res in event_results.values()],
    'Test_R2': [round(float(res['r2']), 4) for res in event_results.values()],
}

results_df = pd.DataFrame(data)


# Feature importances of the 100 m freestyle model, read from the fitted pipeline.
freestyle_pipeline = event_results['100 m freestyle']['model']  # KeyError if this event was not modelled
freestyle_ohe = freestyle_pipeline.named_steps['preprocessor'].named_transformers_['cat']
importance_data = {
    # Labels follow the preprocessor's declared order: numeric column, then one-hot 'Sex' columns.
    'Feature': np.concatenate([['Ranking_numeric'], freestyle_ohe.get_feature_names_out(['Sex'])]),
    # Rounded to 6 decimals, the precision previously written to this file.
    'Importance': freestyle_pipeline.named_steps['xgb'].feature_importances_.astype(float).round(6),
}
importance_df = (
    pd.DataFrame(importance_data)
    .sort_values(by='Importance', ascending=False, kind='stable')  # largest first
    .reset_index(drop=True)
)


downloads_path = os.path.join(os.path.expanduser("~"), "Downloads")
results_file = os.path.join(downloads_path, "xgb_event_results.csv")
results_df.to_csv(results_file, index=False)
print("Event results saved to:", results_file)


importance_file = os.path.join(downloads_path, "xgb_feature_importances_100m_freestyle.csv")
importance_df.to_csv(importance_file, index=False)
print("Feature importances saved to:", importance_file)


Event results saved to: C:\Users\Zainab\Downloads\xgb_event_results.csv
Feature importances saved to: C:\Users\Zainab\Downloads\xgb_feature_importances_100m_freestyle.csv
