In [None]:
#library installation
!pip install category_encoders==2.*
!pip install eli5
!pip install xgboost


In [None]:
# file import for google colab
from google.colab import files

In [None]:

#import relevant libraries

import pandas as pd
import numpy as np
import sklearn
import xgboost
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import eli5
from eli5.sklearn import PermutationImportance
from xgboost import XGBRegressor

import joblib
from joblib import dump

In [None]:
#import OPL dataset if in colab
#files.upload() comes from google.colab; the cell that follows reads 'openpowerlifting.csv' from the working directory.

upload = files.upload()

In [None]:
#begin EDA
#Load the OpenPowerlifting CSV into df and preview the first rows.

df = pd.read_csv('openpowerlifting.csv')
df.head()



In [None]:
#EDA
#Preview the first rows of the unfiltered data.
df.head()



In [None]:
#Scope Restriction
#Looking through the data it's clear that the observations are both very numerous and diverse in ways that probably don't contribute to the predictive accuracy of our modeling.
#We can reduce the scope of the data to something more manageable and increase accuracy by eliminating confounding variables and focusing on those of interest in a comparatively homogeneous subset of the data.

#All row filters are combined into one boolean mask; a row is kept only if it satisfies every condition.
in_scope = (
    (df['Sex'] == 'M')              #Keep only observations of males. This avoids sex differences obscuring the relationships between more useful variables, and enables us to use Wilks coefficients.
                                    #Females will get their own model.
    & (df['Equipment'] == 'Raw')    #Keep only observations of raw lifters. The nuances single and multi-ply equipment add to these observations aren't our concern here.
    & (df['Division'] == 'Open')    #Similar to our motivation for eliminating sex as a variable, the noise introduced by various age categories is unlikely to be helpful.
    & (df['Event'] == 'SBD')        #We're interested in full PL competition and its participants, not specialists.
    & (df['Place'] != 'DQ')         #The principal means whereby one is disqualified from a powerlifting meet is by "bombing out," i.e. failing to produce a passing attempt on one or more of the lifts.
                                    #These we reject for similar reasons to the specialists.
    & (df['Wilks'] >= 150)          #We restrict our interest to lifters with a Wilks score of at least 150 as a very gentle quality control measure.
                                    #150 Wilks is very easy to achieve, and results lower than this are unlikely to say anything useful about training (because very little training went into them).
)

#We additionally drop observations with NaNs in important columns not appropriate for imputation. These drops are mostly redundant after the filters above; we're mainly catching errors and inconsistency in data entry here.
required_columns = ['Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg', 'Wilks']

#Here we drop the remaining columns that are of no use to us.
#Name, Date, Place, Tested, Country, Federation, MeetCountry, MeetState, and MeetName aren't predictively useful.
#Sex, Event, Equipment, and Division are all redundant single-category categoricals as a result of the decisions we made above.
#The columns devoted to each individual attempt are dropped because the relevant information from them (both for our purpose and competition) is summarized in the corresponding "best" columns for each lift.
#TotalKg, Wilks, McCulloch, Glossbrenner, and IPFPoints are dropped to prevent leakage; these values are each functions of groups of our other columns that include the target.
#Leaving any of them in would reduce our "prediction" to simple algebra.
unused_columns = [
    'Name', 'Sex', 'Event', 'Equipment', 'Division',
    'Squat1Kg', 'Squat2Kg', 'Squat3Kg', 'Squat4Kg',
    'Bench1Kg', 'Bench2Kg', 'Bench3Kg', 'Bench4Kg',
    'Deadlift1Kg', 'Deadlift2Kg', 'Deadlift3Kg', 'Deadlift4Kg',
    'TotalKg', 'Place', 'Wilks', 'McCulloch', 'Glossbrenner', 'IPFPoints',
    'Tested', 'Country', 'Federation', 'Date', 'MeetCountry', 'MeetState', 'MeetName',
]

df = df[in_scope].dropna(subset=required_columns).drop(columns=unused_columns)

In [None]:
#Formatting
#Categorical features are cast to strings so that they'll play nicely with the encoders,
#and the index is reset so that it is no longer full of gaps from our dropped observations.

df = (
    df.astype({'AgeClass': str, 'WeightClassKg': str})
      .reset_index(drop=True)
)




In [None]:
#EDA post wrangle
#Mean of the target column (Best3DeadliftKg) in the filtered male subset.

df['Best3DeadliftKg'].mean()


In [None]:
#Feature Engineering

#Wilks coefficients
#The Wilks formula (and corresponding coefficients) is one of the metrics used to compare performance across bodyweights.
#We removed the Wilks value from our data because the contribution of a lifter's deadlift to that value creates a leakage problem, but the formula only relates bodyweight to lifted weight (regardless of where that weight came from).
#This means we can restore some of the information to our dataset by calculating the contribution of the other lifts to a lifter's Wilks sans deadlift, and possibly add more by doing so for each individual lift (to compare their relative contribution).
#These "[lift] Wilks" features are simply Wilks calculations with individual lifts instead of totals. The coefficients are unchanged.

#Male coefficients of the fifth-degree bodyweight polynomial in the Wilks denominator
a, b, c, d, e, f = (
    -216.0475144,
    16.2606339,
    -0.002388645,
    -0.00113732,
    0.00000701863,
    -0.00000001291,
)

#The denominator depends only on bodyweight, so it is evaluated once and shared by both lifts.
bodyweight_kg = df['BodyweightKg']
wilks_denominator = a+(b*bodyweight_kg)+(c*bodyweight_kg**2)+(d*bodyweight_kg**3)+(e*bodyweight_kg**4)+(f*bodyweight_kg**5)

df['Squat Wilks'] = df['Best3SquatKg'] * 500 / wilks_denominator
df['Bench Wilks'] = df['Best3BenchKg'] * 500 / wilks_denominator

#Analogy to anthropometry
#Folk wisdom says that some amount of the difference between lifters' relative ability in the lifts is a result of differing anthropometry (specifically limb lengths and the resulting leverages).
#While there isn't any anthropometric data in the dataset, we can attempt to test these assumptions by making some features of our own to approximate it.
#If we assume that relatively longer limbed men tend to be heavier (simply by virtue of being larger), and that longer arms are beneficial in deadlift while being detrimental in the bench press, the ratio between bench and bodyweight might be useful to our model.
#We can similarly use the ratio between bench and squat to see if short arms (manifested by a relatively more powerful bench) translate to a relatively less powerful deadlift.

df['Bench:Bodyweight Ratio'] = df['Best3BenchKg'] / bodyweight_kg
df['Bench:Squat Ratio'] = df['Best3BenchKg'] / df['Best3SquatKg']



In [None]:
#EDA post feature engineering
#Preview the frame with the four engineered columns appended.

df.head()

In [None]:
#Exploratory Visualization
#Scatterplots to get a rough picture of the relationships between each of the individual features and the target.
#One figure is drawn per feature; each gets a title so it can be read on its own when skimming the notebook.

scatter_features = [
    'BodyweightKg',
    'Best3SquatKg',
    'Best3BenchKg',
    'Bench:Squat Ratio',
    'Bench:Bodyweight Ratio',
    'Squat Wilks',
    'Bench Wilks',
    'Age',
    'WeightClassKg',
    'AgeClass',
]

for feature in scatter_features:
    df.plot.scatter(x=feature, y='Best3DeadliftKg', title=f'{feature} vs Best3DeadliftKg')






In [None]:
#Split
#Data split into train, val, and test sets:
#15% of the rows are held out as the test set, then 20% of the remainder becomes the validation set.

y = target = df['Best3DeadliftKg']
X = features = df.drop(columns='Best3DeadliftKg')

X_remain, X_test, y_remain, y_test = train_test_split(
    X, y, test_size=.15, random_state=666
)
X_train, X_val, y_train, y_val = train_test_split(
    X_remain, y_remain, test_size=.2, random_state=666
)


In [None]:
#Subset size comparison
#Row counts of the train, validation, and test feature matrices.

len(X_train), len(X_val), len(X_test)

In [None]:
#Baseline Metrics

#For baseline metrics we're using the mean absolute and percentage errors that we would see if we just predicted the mean target value in the validation set.

guess = y_val.mean()
#A constant prediction: the same guess repeated once per validation row
guess_prediction = [guess for _ in range(len(y_val))]
baseline_mae, baseline_mape = (
    metric(y_val, guess_prediction)
    for metric in (mean_absolute_error, mean_absolute_percentage_error)
)

guess, baseline_mae, baseline_mape


In [None]:
# Simple Linear Regression
# Preprocessing and estimator are listed in execution order:
# ordinal-encode the categoricals, mean-impute missing values, standardize, then fit ordinary least squares.

linear_steps = (
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    LinearRegression(),
)
slr = make_pipeline(*linear_steps)
slr.fit(X_train, y_train)



In [None]:
#testing simple linear regression metrics
#error less than half of baseline
#MAE and MAPE of the linear pipeline's predictions on the validation set

y_pred1 = slr.predict(X_val)
linear_mae = mean_absolute_error(y_val, y_pred1)
linear_mape = mean_absolute_percentage_error(y_val, y_pred1)

linear_mae, linear_mape





In [None]:
#Examining how different features contribute to the model
#Permutation importances

#The permuter works on plain arrays, so the preprocessing is run outside of the model pipeline:
#the transformers are fitted on the training set only and then applied to the validation set.
preprocessing_steps = (
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='mean'),
    StandardScaler(),
)
transformers = make_pipeline(*preprocessing_steps)

X_train_transformed = transformers.fit_transform(X_train)
X_val_transformed = transformers.transform(X_val)

#A bare linear model fitted on the already-transformed training matrix
slr_perm = LinearRegression()
slr_perm.fit(X_train_transformed, y_train)

In [None]:
#fitting permuter
#Permutation importance of the male linear model, measured on the transformed validation set (5 shuffles per feature, fixed seed).

permuter_ml = PermutationImportance(
    slr_perm,
    n_iter=5,
    random_state=666
)

permuter_ml.fit(X_val_transformed, y_val)

In [None]:
#View and Interpret Results
#Looking at the ordering of these features it seems as though there may be something to the anthropometric explanation of relative deadlift strength difference after all;
#both of the features that relate bench press to bodyweight score high. I'm somewhat surprised that Bench:Squat ratio isn't more predictive than it is.
#What isn't surprising in hindsight is how unimportant Age is. Excluding non-open categories from the data seems to have excluded competitors for whom age would be a significant factor in their performance.

#Importances are labelled with the validation frame's column names and sorted from most to least important.
feature_names = X_val.columns.tolist()
pd.Series(permuter_ml.feature_importances_, feature_names).sort_values(ascending=False)

In [None]:
#Gradient Boosting
#Same preprocessing as the linear pipeline, with an XGBoost regressor as the final estimator.

boosting_steps = (
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    XGBRegressor(n_estimators=100, max_depth=5, random_state=666, n_jobs=-1),
)
gb = make_pipeline(*boosting_steps)
gb.fit(X_train, y_train)


In [None]:
#testing gradient boosting metrics
#Gradient Boosting model performs roughly as well as the linear model.
#MAE and MAPE of the male gradient boosting pipeline on the validation set


y_pred2 = gb.predict(X_val)
gb_mae = mean_absolute_error(y_val, y_pred2)
gb_mape = mean_absolute_percentage_error(y_val, y_pred2)

gb_mae, gb_mape

In [None]:
#Examining Feature Interaction in Gradient Boosting Model

#Fitting a bare model on the already-transformed training matrix (reusing the transformed sets built for the linear permuter).
#The hyperparameters mirror the XGBRegressor inside the gb pipeline so that the permutation importances
#describe the same model configuration whose error metrics are reported.
gb_perm = XGBRegressor(n_estimators=100, max_depth=5, random_state=666, n_jobs=-1)
gb_perm.fit(X_train_transformed, y_train)

In [None]:
#fitting permuter for gradient boosting model
#Permutation importance of gb_perm, measured on the transformed validation set (5 shuffles per feature, fixed seed).

gb_permuter = PermutationImportance(
    gb_perm,
    n_iter=5,
    random_state=666
)

gb_permuter.fit(X_val_transformed, y_val)

In [None]:
#View and Interpret Results
#This ordering of features is somewhat different from the one we saw with the linear model (mainly in the relative importance of Best3SquatKg).
#Given how much redundant information there is across features we should probably take their permutation importances with a grain of salt.

#Importances are labelled with the validation frame's column names and sorted from most to least important.
feature_names = X_val.columns.tolist()
pd.Series(gb_permuter.feature_importances_, feature_names).sort_values(ascending=False)

In [None]:

#Reload the full dataset into df2 for the female models (df has already been filtered to male lifters).
df2 = pd.read_csv('openpowerlifting.csv')



In [None]:
#The same wrangling procedures mirrored to predict Best3DeadliftKg for female lifters

#Row filters combined into one boolean mask: female, raw, open division, full SBD meets, not disqualified, Wilks of at least 150.
in_scope_f = (
    (df2['Sex'] == 'F')
    & (df2['Equipment'] == 'Raw')
    & (df2['Division'] == 'Open')
    & (df2['Event'] == 'SBD')
    & (df2['Place'] != 'DQ')
    & (df2['Wilks'] >= 150)
)

#Rows with NaNs in these columns are dropped rather than imputed.
required_columns_f = ['Best3SquatKg', 'Best3BenchKg', 'Best3DeadliftKg', 'Wilks']

#Identifiers, single-valued categoricals, per-attempt columns, and target-derived scores are removed.
unused_columns_f = [
    'Name', 'Sex', 'Event', 'Equipment', 'Division',
    'Squat1Kg', 'Squat2Kg', 'Squat3Kg', 'Squat4Kg',
    'Bench1Kg', 'Bench2Kg', 'Bench3Kg', 'Bench4Kg',
    'Deadlift1Kg', 'Deadlift2Kg', 'Deadlift3Kg', 'Deadlift4Kg',
    'TotalKg', 'Place', 'Wilks', 'McCulloch', 'Glossbrenner', 'IPFPoints',
    'Tested', 'Country', 'Federation', 'Date', 'MeetCountry', 'MeetState', 'MeetName',
]

df2 = df2[in_scope_f].dropna(subset=required_columns_f).drop(columns=unused_columns_f)

In [None]:
#Formatting
#Cast the categorical columns to strings for the encoders and reset the index to close the gaps left by the dropped rows.
df2 = (
    df2.astype({'AgeClass': str, 'WeightClassKg': str})
       .reset_index(drop=True)
)

In [None]:
#Feature Engineering

#Wilks coefficients
#The Wilks coefficients for females are different from those in the male formula, which is one of the motivating reasons to model the two subsets separately.

a, b, c, d, e, f = (
    594.31747775582,
    -27.23842536447,
    0.82112226871,
    -0.00930733913,
    0.00004731582,
    -0.00000009054,
)

#The denominator depends only on bodyweight, so it is evaluated once and shared by both lifts.
bodyweight_kg_f = df2['BodyweightKg']
wilks_denominator_f = a+(b*bodyweight_kg_f)+(c*bodyweight_kg_f**2)+(d*bodyweight_kg_f**3)+(e*bodyweight_kg_f**4)+(f*bodyweight_kg_f**5)

df2['Squat Wilks'] = df2['Best3SquatKg'] * 500 / wilks_denominator_f
df2['Bench Wilks'] = df2['Best3BenchKg'] * 500 / wilks_denominator_f

#Ratios
df2['Bench:Bodyweight Ratio'] = df2['Best3BenchKg'] / bodyweight_kg_f
df2['Bench:Squat Ratio'] = df2['Best3BenchKg'] / df2['Best3SquatKg']

In [None]:
#Preview the female frame after wrangling and feature engineering.
df2.head()

In [None]:
#Split
#Data split into train, val, and test sets:
#15% of the rows are held out as the test set, then 20% of the remainder becomes the validation set.

yf = target = df2['Best3DeadliftKg']
Xf = features = df2.drop(columns='Best3DeadliftKg')

Xf_remain, Xf_test, yf_remain, yf_test = train_test_split(
    Xf, yf, test_size=.15, random_state=666
)
Xf_train, Xf_val, yf_train, yf_val = train_test_split(
    Xf_remain, yf_remain, test_size=.2, random_state=666
)

In [None]:
#Baseline Metrics

#baseline errors for females are predictably smaller in absolute terms but about the same in relative terms

guess = yf_val.mean()
#A constant prediction: the same guess repeated once per validation row
guess_prediction = [guess for _ in range(len(yf_val))]
baseline_maef, baseline_mapef = (
    metric(yf_val, guess_prediction)
    for metric in (mean_absolute_error, mean_absolute_percentage_error)
)

guess, baseline_maef, baseline_mapef

In [None]:
# Simple Linear Regression
# Female counterpart of the male linear pipeline: encode, impute, standardize, then ordinary least squares.

linear_steps_f = (
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    LinearRegression(),
)
slrf = make_pipeline(*linear_steps_f)
slrf.fit(Xf_train, yf_train)

In [None]:
#testing simple linear regression metrics
#errors for females also about half of baseline
#MAE and MAPE of the female linear pipeline on the female validation set

yf_pred1 = slrf.predict(Xf_val)
linear_maef = mean_absolute_error(yf_val, yf_pred1)
linear_mapef = mean_absolute_percentage_error(yf_val, yf_pred1)

linear_maef, linear_mapef

In [None]:

#Permutation importances for female linear model

#The permuter works on plain arrays, so the preprocessing is run outside of the model pipeline:
#the transformers are fitted on the female training set only and then applied to the female validation set.
preprocessing_steps_f = (
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='mean'),
    StandardScaler(),
)
transformersf = make_pipeline(*preprocessing_steps_f)

Xf_train_transformed = transformersf.fit_transform(Xf_train)
Xf_val_transformed = transformersf.transform(Xf_val)

#A bare linear model fitted on the already-transformed training matrix
slrf_perm = LinearRegression()
slrf_perm.fit(Xf_train_transformed, yf_train)

In [None]:
#fitting permuter
#Permutation importance of the female linear model, measured on the transformed female validation set (5 shuffles per feature, fixed seed).

permuter_fl = PermutationImportance(
    slrf_perm,
    n_iter=5,
    random_state=666
)

permuter_fl.fit(Xf_val_transformed, yf_val)

In [None]:
#View and Interpret Results
#feature importances for the female linear model are similar to those for the male version

#Importances are labelled with the female validation frame's column names and sorted from most to least important.
feature_namesf = Xf_val.columns.tolist()
pd.Series(permuter_fl.feature_importances_, feature_namesf).sort_values(ascending=False)

In [None]:
#Gradient Boosting
#Female counterpart of the male gradient boosting pipeline, with identical preprocessing and hyperparameters.

boosting_steps_f = (
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    XGBRegressor(n_estimators=100, max_depth=5, random_state=666, n_jobs=-1),
)
gbf = make_pipeline(*boosting_steps_f)
gbf.fit(Xf_train, yf_train)

In [None]:
#testing gradient boosting metrics
#Predictions come from the female pipeline (gbf). This cell previously scored the male pipeline (gb) on the
#female validation set, so the earlier note that gradient boosting "underperforms slightly relative to the
#linear model" in the female case was based on the wrong model and should be re-checked against this output.


yf_pred2 = gbf.predict(Xf_val)
gbf_maef = mean_absolute_error(yf_val, yf_pred2)
gbf_mapef = mean_absolute_percentage_error(yf_val, yf_pred2)

gbf_maef, gbf_mapef

In [None]:
#Examining Feature Interaction in Gradient Boosting Model

#Fitting a bare model on the already-transformed female training matrix (reusing the transformed sets built for the linear permuter).
#The hyperparameters mirror the XGBRegressor inside the gbf pipeline so that the permutation importances
#describe the same model configuration whose error metrics are reported.
gbf_perm = XGBRegressor(n_estimators=100, max_depth=5, random_state=666, n_jobs=-1)
gbf_perm.fit(Xf_train_transformed, yf_train)

In [None]:
#fitting permuter
#Permutation importance of gbf_perm, measured on the transformed female validation set (5 shuffles per feature, fixed seed).

permuter_gbf = PermutationImportance(
    gbf_perm,
    n_iter=5,
    random_state=666
)

permuter_gbf.fit(Xf_val_transformed, yf_val)

In [None]:
#View and Interpret Results
#Feature importances for the female gradient boosting model are somewhat more balanced than their male counterparts, albeit still dominated by Best3SquatKg

#Importances are labelled with the female validation frame's column names and sorted from most to least important.
feature_namesf = Xf_val.columns.tolist()
pd.Series(permuter_gbf.feature_importances_, feature_namesf).sort_values(ascending=False)

In [None]:
#Permutation Importances Formatted
#eli5's formatted table of the importances computed above; top=None lists every feature.

#male linear model
eli5.show_weights(permuter_ml, top=None, feature_names=feature_names)

In [None]:
#male gradient boosting model
#eli5's formatted table for the male gradient boosting permuter; top=None lists every feature.
eli5.show_weights(gb_permuter, top=None, feature_names=feature_names)

In [None]:
#female linear model
#Labelled with the female feature list (feature_namesf) rather than the male one, so the table follows the female frame's columns.
eli5.show_weights(permuter_fl, top=None, feature_names=feature_namesf)

In [None]:
#female gradient boosting model
#Labelled with the female feature list (feature_namesf) rather than the male one, so the table follows the female frame's columns.
eli5.show_weights(permuter_gbf, top=None, feature_names=feature_namesf)

In [None]:
#Final Error Metrics with Test Sets
#male linear model
#MAE and MAPE of the male linear pipeline on the held-out male test set

y_pred1_test = slr.predict(X_test)
linear_mae_test = mean_absolute_error(y_test, y_pred1_test)
linear_mape_test = mean_absolute_percentage_error(y_test, y_pred1_test)

linear_mae_test, linear_mape_test


In [None]:
#male gradient boosting model
#MAE and MAPE of the male gradient boosting pipeline on the held-out male test set
y_pred2_test = gb.predict(X_test)
gb_mae_test = mean_absolute_error(y_test, y_pred2_test)
gb_mape_test = mean_absolute_percentage_error(y_test, y_pred2_test)

gb_mae_test, gb_mape_test

In [None]:
#female linear model
#MAE and MAPE of the female linear pipeline on the held-out female test set
yf_pred1_test = slrf.predict(Xf_test)
linear_maef_test = mean_absolute_error(yf_test, yf_pred1_test)
linear_mapef_test = mean_absolute_percentage_error(yf_test, yf_pred1_test)

linear_maef_test, linear_mapef_test

In [None]:
#female gradient boosting model
#Scored with the female pipeline (gbf); the male pipeline (gb) was used here by mistake before,
#which reported the male model's error on female lifters instead of the female model's test error.
yf_pred2_test = gbf.predict(Xf_test)
gbf_maef_test = mean_absolute_error(yf_test, yf_pred2_test)
gbf_mapef_test = mean_absolute_percentage_error(yf_test, yf_pred2_test)

gbf_maef_test, gbf_mapef_test

In [None]:
#getting coefs for linear models
#male model
#The coefficients apply to the standardized, ordinal-encoded features produced by the earlier pipeline steps,
#in the column order of X_train.

male_linear = slr.named_steps['linearregression']
print(male_linear.coef_)
print(male_linear.intercept_)

In [None]:
#female model
#The coefficients apply to the standardized, ordinal-encoded features produced by the earlier pipeline steps,
#in the column order of Xf_train.

female_linear = slrf.named_steps['linearregression']
print(female_linear.coef_)
print(female_linear.intercept_)

In [None]:
# pipelines are slr, gb, slrf, gbf

In [None]:
#getting versions for libraries
#Printed in requirements.txt form (name==version) for the packages needed to load the dumped pipelines.

versioned_packages = (
    ('joblib', joblib),
    ('scikit-learn', sklearn),
    ('category_encoders', ce),
    ('xgboost', xgboost),
)
for package_name, package_module in versioned_packages:
    print(f'{package_name}=={package_module.__version__}')

In [None]:
#pickling pipelines
#Each fitted pipeline (preprocessing + model) is written to its own compressed joblib file in the working directory.

dump(slr, 'slr.joblib', compress=True)
dump(gb, 'gb.joblib', compress=True)
dump(slrf, 'slrf.joblib', compress=True)
dump(gbf, 'gbf.joblib', compress=True)

In [None]:
#male linear predictor function

def mlpredict(age, bodyweight, squat, bench):
    """Predict a male lifter's best deadlift (kg) with the fitted linear pipeline ``slr``.

    Rebuilds the engineered training features (Squat/Bench Wilks and the two
    bench ratios), derives the AgeClass and WeightClassKg labels from the raw
    inputs, and passes a single-row DataFrame to ``slr.predict``.

    Parameters
    ----------
    age : number
        Age in years. Non-integer ages (e.g. 23.5) are assigned to the band
        whose lower bound they have reached; ages outside 5-999 fall back to
        the '24-34' label.
    bodyweight : number
        Bodyweight in kg; must be non-zero (it is used as a divisor).
    squat, bench : number
        Best squat and bench press in kg; ``squat`` must be non-zero.

    Returns
    -------
    The first element of ``slr.predict`` for the constructed row.
    """
    # Male coefficients of the Wilks bodyweight polynomial
    a = -216.0475144
    b = 16.2606339
    c = -0.002388645
    d = -0.00113732
    e = 0.00000701863
    f = -0.00000001291

    # The denominator depends only on bodyweight, so compute it once for both lifts.
    wilks_denominator = a+(b*bodyweight)+(c*bodyweight**2)+(d*bodyweight**3)+(e*bodyweight**4)+(f*bodyweight**5)
    mswilks = squat * 500 / wilks_denominator
    mbwilks = bench * 500 / wilks_denominator
    mbbr = bench/bodyweight
    mbsr = bench/squat

    # (exclusive upper bound, label) pairs. Contiguous half-open bands ensure that
    # fractional ages such as 12.5 or 23.5 land in a real band instead of slipping
    # between two integer ranges and silently receiving the fallback label.
    age_bands = (
        (13, '5-12'), (16, '13-15'), (18, '16-17'), (20, '18-19'), (24, '20-23'),
        (35, '24-34'), (40, '35-39'), (45, '40-44'), (50, '45-49'), (55, '50-54'),
        (60, '55-59'), (65, '60-64'), (70, '65-69'), (75, '70-74'), (80, '75-79'),
    )
    age_class = '24-34'  # fallback for ages outside 5-999 (or NaN)
    if 5 <= age <= 999:
        age_class = next((label for upper, label in age_bands if age < upper), '80-999')

    # (inclusive upper bound, label) pairs for bodyweights from 50 kg up to 140 kg
    weight_bands = (
        (66, '66'), (75, '75'), (83, '83'), (93, '93'),
        (100, '100'), (120, '120'), (140, '140'),
    )
    weight_class = '93'  # fallback, only reached when no comparison holds (e.g. NaN)
    if bodyweight < 50:
        weight_class = '48'
    elif bodyweight <= 140:
        weight_class = next(label for upper, label in weight_bands if bodyweight <= upper)
    elif bodyweight > 140:
        weight_class = '140+'

    # Column order matches the feature frame the pipeline was fitted on.
    temp = pd.DataFrame(
        columns=['Age', 'AgeClass', 'BodyweightKg', 'WeightClassKg', 'Best3SquatKg', 'Best3BenchKg', 'Squat Wilks', 'Bench Wilks', 'Bench:Bodyweight Ratio', 'Bench:Squat Ratio'],
        data=[[age, age_class, bodyweight, weight_class, squat, bench, mswilks, mbwilks, mbbr, mbsr]]
    )
    y_pred = slr.predict(temp)[0]
    return y_pred

In [None]:
#male gradient boosting predictor function

def mgbpredict(age, bodyweight, squat, bench):
    """Predict a male lifter's best deadlift (kg) with the fitted gradient boosting pipeline ``gb``.

    Rebuilds the engineered training features (Squat/Bench Wilks and the two
    bench ratios), derives the AgeClass and WeightClassKg labels from the raw
    inputs, and passes a single-row DataFrame to ``gb.predict``.

    Parameters
    ----------
    age : number
        Age in years. Non-integer ages (e.g. 23.5) are assigned to the band
        whose lower bound they have reached; ages outside 5-999 fall back to
        the '24-34' label.
    bodyweight : number
        Bodyweight in kg; must be non-zero (it is used as a divisor).
    squat, bench : number
        Best squat and bench press in kg; ``squat`` must be non-zero.

    Returns
    -------
    The first element of ``gb.predict`` for the constructed row.
    """
    # Male coefficients of the Wilks bodyweight polynomial
    a = -216.0475144
    b = 16.2606339
    c = -0.002388645
    d = -0.00113732
    e = 0.00000701863
    f = -0.00000001291

    # The denominator depends only on bodyweight, so compute it once for both lifts.
    wilks_denominator = a+(b*bodyweight)+(c*bodyweight**2)+(d*bodyweight**3)+(e*bodyweight**4)+(f*bodyweight**5)
    mswilks = squat * 500 / wilks_denominator
    mbwilks = bench * 500 / wilks_denominator
    mbbr = bench/bodyweight
    mbsr = bench/squat

    # (exclusive upper bound, label) pairs. Contiguous half-open bands ensure that
    # fractional ages such as 12.5 or 23.5 land in a real band instead of slipping
    # between two integer ranges and silently receiving the fallback label.
    age_bands = (
        (13, '5-12'), (16, '13-15'), (18, '16-17'), (20, '18-19'), (24, '20-23'),
        (35, '24-34'), (40, '35-39'), (45, '40-44'), (50, '45-49'), (55, '50-54'),
        (60, '55-59'), (65, '60-64'), (70, '65-69'), (75, '70-74'), (80, '75-79'),
    )
    age_class = '24-34'  # fallback for ages outside 5-999 (or NaN)
    if 5 <= age <= 999:
        age_class = next((label for upper, label in age_bands if age < upper), '80-999')

    # (inclusive upper bound, label) pairs for bodyweights from 50 kg up to 140 kg
    weight_bands = (
        (66, '66'), (75, '75'), (83, '83'), (93, '93'),
        (100, '100'), (120, '120'), (140, '140'),
    )
    weight_class = '93'  # fallback, only reached when no comparison holds (e.g. NaN)
    if bodyweight < 50:
        weight_class = '48'
    elif bodyweight <= 140:
        weight_class = next(label for upper, label in weight_bands if bodyweight <= upper)
    elif bodyweight > 140:
        weight_class = '140+'

    # Column order matches the feature frame the pipeline was fitted on.
    temp = pd.DataFrame(
        columns=['Age', 'AgeClass', 'BodyweightKg', 'WeightClassKg', 'Best3SquatKg', 'Best3BenchKg', 'Squat Wilks', 'Bench Wilks', 'Bench:Bodyweight Ratio', 'Bench:Squat Ratio'],
        data=[[age, age_class, bodyweight, weight_class, squat, bench, mswilks, mbwilks, mbbr, mbsr]]
    )
    y_pred = gb.predict(temp)[0]
    return y_pred

In [None]:
#Frequency of each WeightClassKg label in the female training set.
Xf_train['WeightClassKg'].value_counts()

In [None]:
#female linear predictor function

def flpredict(age, bodyweight, squat, bench):
    """Predict a female lifter's best deadlift (kg) with the fitted linear pipeline ``slrf``.

    Rebuilds the engineered training features (Squat/Bench Wilks and the two
    bench ratios) with the female Wilks coefficients, derives the AgeClass and
    WeightClassKg labels from the raw inputs, and passes a single-row
    DataFrame to ``slrf.predict``.

    Parameters
    ----------
    age : number
        Age in years. Non-integer ages (e.g. 23.5) are assigned to the band
        whose lower bound they have reached; ages below 5 fall back to the
        '24-34' label and ages of 75 and above use '75-79'.
    bodyweight : number
        Bodyweight in kg; must be non-zero (it is used as a divisor).
    squat, bench : number
        Best squat and bench press in kg; ``squat`` must be non-zero.

    Returns
    -------
    The first element of ``slrf.predict`` for the constructed row.
    """
    # Female coefficients of the Wilks bodyweight polynomial
    a = 594.31747775582
    b = -27.23842536447
    c = 0.82112226871
    d = -0.00930733913
    e = 0.00004731582
    f = -0.00000009054

    # The denominator depends only on bodyweight, so compute it once for both lifts.
    wilks_denominator = a+(b*bodyweight)+(c*bodyweight**2)+(d*bodyweight**3)+(e*bodyweight**4)+(f*bodyweight**5)
    fswilks = squat * 500 / wilks_denominator
    fbwilks = bench * 500 / wilks_denominator
    fbbr = bench/bodyweight
    fbsr = bench/squat

    # (exclusive upper bound, label) pairs. Contiguous half-open bands ensure that
    # fractional ages such as 12.5 or 23.5 land in a real band instead of slipping
    # between two integer ranges and silently receiving the fallback label.
    age_bands = (
        (13, '5-12'), (16, '13-15'), (18, '16-17'), (20, '18-19'), (24, '20-23'),
        (35, '24-34'), (40, '35-39'), (45, '40-44'), (50, '45-49'), (55, '50-54'),
        (60, '55-59'), (65, '60-64'), (70, '65-69'), (75, '70-74'),
    )
    age_class = '24-34'  # fallback for ages below 5 (or NaN)
    if age >= 5:
        # Everything from 75 upward shares the '75-79' label.
        age_class = next((label for upper, label in age_bands if age < upper), '75-79')

    # (inclusive upper bound, label) pairs for bodyweights up to 100 kg
    weight_bands = (
        (50, '44'), (52, '52'), (63, '63'), (72, '72'),
        (84, '84'), (90, '90'), (100, '100'),
    )
    # Fallback, only reached when no comparison holds (e.g. NaN); note that '93'
    # is not one of the labels assigned below.
    weight_class = '93'
    if bodyweight <= 100:
        weight_class = next(label for upper, label in weight_bands if bodyweight <= upper)
    elif bodyweight > 100:
        weight_class = '100+'

    # Column order matches the feature frame the pipeline was fitted on.
    temp = pd.DataFrame(
        columns=['Age', 'AgeClass', 'BodyweightKg', 'WeightClassKg', 'Best3SquatKg', 'Best3BenchKg', 'Squat Wilks', 'Bench Wilks', 'Bench:Bodyweight Ratio', 'Bench:Squat Ratio'],
        data=[[age, age_class, bodyweight, weight_class, squat, bench, fswilks, fbwilks, fbbr, fbsr]]
    )
    y_pred = slrf.predict(temp)[0]
    return y_pred



In [None]:
#Example call: age 32, 100 kg bodyweight, 215 kg squat, 152 kg bench
flpredict(32, 100, 215, 152)

In [None]:
#female gradient boosting predictor function

def fgbpredict(age, bodyweight, squat, bench):
    """Predict a female lifter's best deadlift (kg) with the fitted gradient boosting pipeline ``gbf``.

    Rebuilds the engineered training features (Squat/Bench Wilks and the two
    bench ratios) with the female Wilks coefficients, derives the AgeClass and
    WeightClassKg labels from the raw inputs, and passes a single-row
    DataFrame to ``gbf.predict``.

    Parameters
    ----------
    age : number
        Age in years. Non-integer ages (e.g. 23.5) are assigned to the band
        whose lower bound they have reached; ages below 5 fall back to the
        '24-34' label and ages of 75 and above use '75-79'.
    bodyweight : number
        Bodyweight in kg; must be non-zero (it is used as a divisor).
    squat, bench : number
        Best squat and bench press in kg; ``squat`` must be non-zero.

    Returns
    -------
    The first element of ``gbf.predict`` for the constructed row.
    """
    # Female coefficients of the Wilks bodyweight polynomial
    a = 594.31747775582
    b = -27.23842536447
    c = 0.82112226871
    d = -0.00930733913
    e = 0.00004731582
    f = -0.00000009054

    # The denominator depends only on bodyweight, so compute it once for both lifts.
    wilks_denominator = a+(b*bodyweight)+(c*bodyweight**2)+(d*bodyweight**3)+(e*bodyweight**4)+(f*bodyweight**5)
    fswilks = squat * 500 / wilks_denominator
    fbwilks = bench * 500 / wilks_denominator
    fbbr = bench/bodyweight
    fbsr = bench/squat

    # (exclusive upper bound, label) pairs. Contiguous half-open bands ensure that
    # fractional ages such as 12.5 or 23.5 land in a real band instead of slipping
    # between two integer ranges and silently receiving the fallback label.
    age_bands = (
        (13, '5-12'), (16, '13-15'), (18, '16-17'), (20, '18-19'), (24, '20-23'),
        (35, '24-34'), (40, '35-39'), (45, '40-44'), (50, '45-49'), (55, '50-54'),
        (60, '55-59'), (65, '60-64'), (70, '65-69'), (75, '70-74'),
    )
    age_class = '24-34'  # fallback for ages below 5 (or NaN)
    if age >= 5:
        # Everything from 75 upward shares the '75-79' label.
        age_class = next((label for upper, label in age_bands if age < upper), '75-79')

    # (inclusive upper bound, label) pairs for bodyweights up to 100 kg
    weight_bands = (
        (50, '44'), (52, '52'), (63, '63'), (72, '72'),
        (84, '84'), (90, '90'), (100, '100'),
    )
    # Fallback, only reached when no comparison holds (e.g. NaN); note that '93'
    # is not one of the labels assigned below.
    weight_class = '93'
    if bodyweight <= 100:
        weight_class = next(label for upper, label in weight_bands if bodyweight <= upper)
    elif bodyweight > 100:
        weight_class = '100+'

    # Column order matches the feature frame the pipeline was fitted on.
    temp = pd.DataFrame(
        columns=['Age', 'AgeClass', 'BodyweightKg', 'WeightClassKg', 'Best3SquatKg', 'Best3BenchKg', 'Squat Wilks', 'Bench Wilks', 'Bench:Bodyweight Ratio', 'Bench:Squat Ratio'],
        data=[[age, age_class, bodyweight, weight_class, squat, bench, fswilks, fbwilks, fbbr, fbsr]]
    )
    y_pred = gbf.predict(temp)[0]
    return y_pred

In [None]:
#Example call: age 32, 100 kg bodyweight, 215 kg squat, 152 kg bench
fgbpredict(32, 100, 215, 152)