In [6]:
import pandas as pd
df_no_outliers = pd.read_csv("C:/Users/Zainab/Downloads/swimming_data_no_outliers.csv")


In [7]:
# Define a function to categorize swimmers based on their ranking
def categorize_swimmer(ranking):
    if ranking <= 3:
        return 'Elite'
    elif ranking <= 8:
        return 'Competitive'
    else:
        return 'Developing'

# Ensure 'Ranking_numeric' exists and drop rows with missing values
df_no_outliers['Ranking_numeric'] = pd.to_numeric(df_no_outliers['Ranking'], errors='coerce')
df_no_outliers = df_no_outliers.dropna(subset=['Ranking_numeric'])

# Create a new column for the performance category
df_no_outliers['Performance_Category'] = df_no_outliers['Ranking_numeric'].apply(categorize_swimmer)


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Define the features and target for classification
features_class = ['Ranking_numeric', 'Distance', 'Sex', 'Event']
target_class = 'Performance_Category'
X_class = df_no_outliers[features_class]
y_class = df_no_outliers[target_class]

# Define which features are numeric and which are categorical
numeric_features = ['Ranking_numeric', 'Distance']
categorical_features = ['Sex', 'Event']

# Create transformers for numeric and categorical data
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine transformers into a preprocessor
preprocessor_class = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Build the Random Forest pipeline for classification
pipeline_rf = Pipeline(steps=[
    ('preprocessor', preprocessor_class),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Split the data into training and testing sets (80/20 split)
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42
)

# Train the classifier
pipeline_rf.fit(X_train_class, y_train_class)

# Make predictions on the test set
y_pred_class = pipeline_rf.predict(X_test_class)

# Evaluate the classifier's performance
print("Classification Report:")
print(classification_report(y_test_class, y_pred_class))
print("Confusion Matrix:")
print(confusion_matrix(y_test_class, y_pred_class))
print("Accuracy:", accuracy_score(y_test_class, y_pred_class))


Classification Report:
              precision    recall  f1-score   support

 Competitive       1.00      1.00      1.00        56
  Developing       1.00      1.00      1.00       387
       Elite       1.00      1.00      1.00        21

    accuracy                           1.00       464
   macro avg       1.00      1.00      1.00       464
weighted avg       1.00      1.00      1.00       464

Confusion Matrix:
[[ 56   0   0]
 [  0 387   0]
 [  0   0  21]]
Accuracy: 1.0


In [9]:
import numpy as np
import pandas as pd

# Function to assign performance category based on time percentiles within each event.
def assign_category(group):
    # Compute the percentile rank of Time_seconds in each event.

    group['percentile'] = group['Time_seconds'].rank(pct=True, method='min')
    conditions = [
        (group['percentile'] <= 0.2),                    # Fastest 20%
        (group['percentile'] > 0.2) & (group['percentile'] <= 0.8),  # Middle 60%
        (group['percentile'] > 0.8)                      # Slowest 20%
    ]
    choices = ['Elite', 'Competitive', 'Developing']
    group['Performance_Category_New'] = np.select(conditions, choices, default='Competitive')
    return group

# Apply the function per event
df_no_outliers = df_no_outliers.groupby('Event').apply(assign_category)

# Drop the temporary 'percentile'  don't need it further:
df_no_outliers = df_no_outliers.drop(columns=['percentile'])


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Define the features and new target for classification
features_class = ['Ranking_numeric', 'Distance', 'Sex', 'Event']
target_class_new = 'Performance_Category_New'

# Ensure 'Ranking_numeric' exists if not
df_no_outliers['Ranking_numeric'] = pd.to_numeric(df_no_outliers['Ranking'], errors='coerce')
df_no_outliers = df_no_outliers.dropna(subset=['Ranking_numeric'])

X_class_new = df_no_outliers[features_class]
y_class_new = df_no_outliers[target_class_new]

# Define which features are numeric and which are categorical
numeric_features = ['Ranking_numeric', 'Distance']
categorical_features = ['Sex', 'Event']

# Create transformers
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine transformers into a preprocessor for classification
preprocessor_class = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Build the Random Forest pipeline for classification
pipeline_rf_new = Pipeline(steps=[
    ('preprocessor', preprocessor_class),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Split data into training and testing sets (80/20 split)
X_train_class_new, X_test_class_new, y_train_class_new, y_test_class_new = train_test_split(
    X_class_new, y_class_new, test_size=0.2, random_state=42
)

# Train the classifier with the new target
pipeline_rf_new.fit(X_train_class_new, y_train_class_new)

# Make predictions on the test set
y_pred_class_new = pipeline_rf_new.predict(X_test_class_new)


print("Modified Classification Report:")
print(classification_report(y_test_class_new, y_pred_class_new))
print("Modified Confusion Matrix:")
print(confusion_matrix(y_test_class_new, y_pred_class_new))
print("Modified Accuracy:", accuracy_score(y_test_class_new, y_pred_class_new))


Modified Classification Report:
              precision    recall  f1-score   support

 Competitive       0.94      0.99      0.96       273
  Developing       0.97      0.95      0.96        87
       Elite       1.00      0.88      0.93       104

    accuracy                           0.96       464
   macro avg       0.97      0.94      0.95       464
weighted avg       0.96      0.96      0.96       464

Modified Confusion Matrix:
[[270   3   0]
 [  4  83   0]
 [ 13   0  91]]
Modified Accuracy: 0.9568965517241379


In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import numpy as np
import pandas as pd



# ensure 'Ranking_numeric' exists:
df_no_outliers['Ranking_numeric'] = pd.to_numeric(df_no_outliers['Ranking'], errors='coerce')
df_no_outliers = df_no_outliers.dropna(subset=['Ranking_numeric'])

# Use the modified target:
target_class_new = 'Performance_Category_New'
features_class = ['Ranking_numeric', 'Distance', 'Sex', 'Event']

X_class_new = df_no_outliers[features_class]
y_class_new = df_no_outliers[target_class_new]

# Define numeric and categorical features
numeric_features = ['Ranking_numeric', 'Distance']
categorical_features = ['Sex', 'Event']

# Create transformers
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Combine into a preprocessor
preprocessor_class = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Build the Random Forest pipeline for classification 
pipeline_rf_new = Pipeline(steps=[
    ('preprocessor', preprocessor_class),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split data into training and test sets (80/20 split)
X_train_class_new, X_test_class_new, y_train_class_new, y_test_class_new = train_test_split(
    X_class_new, y_class_new, test_size=0.2, random_state=42
)

# Define a hyperparameter grid for tuning
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__class_weight': [None, 'balanced']
}

# Set up the grid search with 5-fold cross-validation and use accuracy as scoring
grid_search_rf = GridSearchCV(pipeline_rf_new, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search on the training set
grid_search_rf.fit(X_train_class_new, y_train_class_new)

# Print the best hyperparameters and best cross-validated accuracy
print("Best Parameters:", grid_search_rf.best_params_)
print("Best CV Accuracy:", grid_search_rf.best_score_)


best_model = grid_search_rf.best_estimator_
y_pred_class_new_tuned = best_model.predict(X_test_class_new)

print("Tuned Model Test Accuracy:", accuracy_score(y_test_class_new, y_pred_class_new_tuned))
print("Tuned Classification Report:")
print(classification_report(y_test_class_new, y_pred_class_new_tuned))
print("Tuned Confusion Matrix:")
print(confusion_matrix(y_test_class_new, y_pred_class_new_tuned))


Best Parameters: {'classifier__class_weight': None, 'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 100}
Best CV Accuracy: 0.958419173890872
Tuned Model Test Accuracy: 0.9568965517241379
Tuned Classification Report:
              precision    recall  f1-score   support

 Competitive       0.94      0.99      0.96       273
  Developing       0.97      0.95      0.96        87
       Elite       1.00      0.88      0.93       104

    accuracy                           0.96       464
   macro avg       0.97      0.94      0.95       464
weighted avg       0.96      0.96      0.96       464

Tuned Confusion Matrix:
[[270   3   0]
 [  4  83   0]
 [ 13   0  91]]


In [12]:

df_no_outliers['Predicted_Category'] = best_model.predict(df_no_outliers[features_class])

# Now, group the dataframe by the predicted category and list the swimmer names in each group.
grouped_names = df_no_outliers.groupby('Predicted_Category')['Name'].apply(list)

print("Swimmers by Predicted Category:")
print(grouped_names)


Swimmers by Predicted Category:
Predicted_Category
Competitive    [David Theile, John Monckton, Bob Bennett, Tom...
Developing     [Cathy Ferguson, Ann Farlie, Elaine Tanner, El...
Elite          [Igor Polyansky, David Berkoff, David Berkoff,...
Name: Name, dtype: object


In [13]:
import os


downloads_path = os.path.join(os.path.expanduser("~"), "Downloads", "classified_swimming_data.csv")


df_no_outliers.to_csv(downloads_path, index=False)
print("File saved to:", downloads_path)


File saved to: C:\Users\Zainab\Downloads\classified_swimming_data.csv


In [1]:


# First, sort the DataFrame by Ranking_numeric in ascending order
df_sorted = df_no_outliers.sort_values(by='Ranking_numeric', ascending=True)

# Then, group by the performance category and take the top 3 swimmers for each group
top3_per_category = df_sorted.groupby('Performance_Category_New').head(3)


print(top3_per_category[['Name', 'Ranking_numeric', 'Time_seconds', 'Performance_Category_New']])


NameError: name 'df_no_outliers' is not defined