In [1]:
# 1 - DATA MANIPULATION
import pandas as pd
import numpy as np

# 2 - DATA VISUALISATION
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# 3 - STATISTICS
from statsmodels.graphics.gofplots import qqplot
from scipy import stats

# 4 - MACHINE LEARNING

from sklearn.linear_model import LinearRegression,SGDRegressor
from sklearn.ensemble import RandomForestRegressor
import itertools
from xgboost import XGBRegressor

## 4.1 - Preprocessing

### 4.1.1 - Scalers
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor

### 4.1.2 - Encoders
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

### 4.1.3 - Crossvalidation, Training, Model
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, ElasticNet

### 4.1.4 - Evaluation
from sklearn.metrics import confusion_matrix, r2_score, accuracy_score
from sklearn.metrics  import ConfusionMatrixDisplay

In [2]:
df_males = pd.read_csv('../data_csv/data_model_males.csv')
df_females = pd.read_csv('../data_csv/data_model_females.csv')

In [3]:
df_males.head()

Unnamed: 0,Name,Exercise,Metric,Height,Weight,0,1,2,3,4,...,90,91,92,93,94,95,96,97,98,99
0,Robert,Chest,Chest_Weight,185,80,35.0,35.0,36.0,36.0,36.0,...,78.0,79.0,80.0,80.0,80.0,81.0,81.0,81.0,81.0,82.0
1,Robert,Chest,Chest_Repetitions,185,80,8.0,10.0,8.0,10.0,12.0,...,8.0,8.0,8.0,10.0,12.0,8.0,10.0,12.0,14.0,8.0
2,Robert,Shoulders,Shoulders_Weight,185,80,7.0,7.0,7.0,7.0,7.0,...,10.5,10.5,10.5,10.5,10.5,10.5,10.5,10.5,10.5,10.5
3,Robert,Shoulders,Shoulders_Repetitions,185,80,8.0,8.0,8.0,8.0,8.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
4,Robert,Legs,Legs_Weight,185,80,60.0,60.0,60.0,62.0,62.0,...,116.0,117.0,117.0,117.0,118.0,118.0,119.0,119.0,119.0,119.0


In [4]:
df_females.head()

Unnamed: 0,Name,Exercise,Metric,Height,Weight,0,1,2,3,4,...,90,91,92,93,94,95,96,97,98,99
0,Julia,Chest,Chest_Weight,160,52,21.0,21.5,22.0,22.5,22.5,...,47.5,48.5,48.5,48.5,49.0,49.5,49.5,49.5,50.0,50.5
1,Julia,Chest,Chest_Repetitions,160,52,8.0,8.0,8.0,8.0,10.0,...,8.0,8.0,10.0,12.0,8.0,8.0,10.0,12.0,8.0,8.0
2,Julia,Shoulders,Shoulders_Weight,160,52,4.0,4.0,4.0,4.0,4.0,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
3,Julia,Shoulders,Shoulders_Repetitions,160,52,8.0,9.0,10.0,11.0,12.0,...,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,15.0,15.0
4,Julia,Legs,Legs_Weight,160,52,30.5,30.5,31.0,31.5,31.5,...,66.5,67.0,68.0,68.5,69.0,69.0,69.5,69.5,70.0,70.5


In [5]:
df_males.describe()

Unnamed: 0,Height,Weight,0,1,2,3,4,5,6,7,...,90,91,92,93,94,95,96,97,98,99
count,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,...,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
mean,176.25,71.75,18.375,19.03125,19.25,19.75,20.1875,20.390625,20.421875,20.953125,...,38.671875,38.953125,38.953125,39.09375,39.5625,39.921875,40.03125,39.96875,40.359375,40.5
std,6.753733,11.878416,17.12525,17.001868,17.305602,17.454087,17.620438,17.946486,18.408058,18.242258,...,40.146026,40.430418,40.686718,41.028007,41.046541,41.199184,41.528447,41.851107,42.007533,42.227113
min,168.0,55.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
25%,171.0,64.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.75,...,9.0,8.75,9.0,8.75,9.75,9.0,9.0,9.875,9.875,9.875
50%,176.0,73.5,8.0,10.0,10.0,11.0,12.0,12.5,11.0,11.0,...,13.0,13.0,12.25,11.5,12.25,13.25,12.25,12.25,13.25,13.25
75%,181.25,81.25,20.25,21.0,21.25,21.25,22.0,22.0,22.25,22.75,...,68.75,69.75,69.75,70.5,70.75,71.0,71.0,72.0,72.25,72.5
max,185.0,85.0,62.0,63.0,64.0,65.0,65.0,66.0,68.0,68.0,...,134.0,135.0,135.0,136.0,137.0,137.0,137.0,139.0,139.0,140.0


In [6]:
df_m = df_males.drop(columns = ['Name'])
df_f = df_females.drop(columns = ['Name'])

In [7]:
m_chest = df_m[df_m['Exercise'] == 'Chest']
f_chest = df_f[df_f['Exercise'] == 'Chest']
m_shoulders = df_m[df_m['Exercise'] == 'Shoulders']
f_shoulders = df_f[df_f['Exercise'] == 'Shoulders']
m_legs = df_m[df_m['Exercise'] == 'Legs']
f_legs = df_f[df_f['Exercise'] == 'Legs']
m_back = df_m[df_m['Exercise'] == 'Back']
f_back = df_f[df_f['Exercise'] == 'Back']

In [8]:
m_chest = m_chest.drop(columns=['Exercise'])
f_chest = f_chest.drop(columns=['Exercise'])
m_shoulders = m_shoulders.drop(columns=['Exercise'])
f_shoulders = f_shoulders.drop(columns=['Exercise'])
m_legs = m_legs.drop(columns=['Exercise'])
f_legs = f_legs.drop(columns=['Exercise'])
m_back = m_back.drop(columns=['Exercise'])
f_back = f_back.drop(columns=['Exercise'])

In [9]:
dfs = [m_chest, f_chest, m_shoulders, f_shoulders, m_legs, f_legs, m_back, f_back]
for df in dfs:
    for row  in df['Metric']:
        words_metric = row.split('_')
    df['Metric'] = df['Metric'].map({f'{words_metric[0]}_Weight': 0, f'{words_metric[0]}_Repetitions': 1})

In [10]:
o_dfs = [df_m, df_f]
for df in o_dfs:
    df['Metric'] = df['Metric'].apply(lambda row: 0 if row.split('_')[1] == 'Weight' else 1)
    df['Exercise'] = df['Exercise'].map({'Chest': 0, 'Shoulders': 1, 'Legs': 2, 'Back': 3})

In [11]:
x = df_m.drop(columns = ['99'])
y = df_m['99']
x

Unnamed: 0,Exercise,Metric,Height,Weight,0,1,2,3,4,5,...,89,90,91,92,93,94,95,96,97,98
0,0,0,185,80,35.0,35.0,36.0,36.0,36.0,37.0,...,77.0,78.0,79.0,80.0,80.0,80.0,81.0,81.0,81.0,81.0
1,0,1,185,80,8.0,10.0,8.0,10.0,12.0,8.0,...,12.0,8.0,8.0,8.0,10.0,12.0,8.0,10.0,12.0,14.0
2,1,0,185,80,7.0,7.0,7.0,7.0,7.0,7.0,...,10.5,10.5,10.5,10.5,10.5,10.5,10.5,10.5,10.5,10.5
3,1,1,185,80,8.0,8.0,8.0,8.0,8.0,8.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
4,2,0,185,80,60.0,60.0,60.0,62.0,62.0,63.0,...,116.0,116.0,117.0,117.0,117.0,118.0,118.0,119.0,119.0,119.0
5,2,1,185,80,8.0,9.0,10.0,8.0,9.0,8.0,...,8.0,9.0,8.0,9.0,10.0,8.0,9.0,8.0,9.0,10.0
6,3,0,185,80,20.0,20.0,21.0,21.0,21.0,21.0,...,71.0,74.0,75.0,75.0,75.0,76.0,77.0,77.0,78.0,79.0
7,3,1,185,80,8.0,9.0,8.0,9.0,10.0,11.0,...,8.0,8.0,8.0,9.0,10.0,8.0,8.0,9.0,8.0,8.0
8,0,0,168,55,21.0,21.0,22.0,22.0,22.0,22.0,...,61.0,61.0,63.0,63.0,64.0,65.0,65.0,65.0,65.0,65.0
9,0,1,168,55,8.0,10.0,8.0,10.0,12.0,14.0,...,12.0,14.0,8.0,10.0,8.0,8.0,10.0,12.0,14.0,16.0


In [12]:
sts = StandardScaler()

In [13]:
sts.fit(x)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [14]:
preprocessed_df_m = sts.transform(x)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [15]:
preprocessed_df_m = pd.DataFrame(preprocessed_df_m, columns = x.columns)

In [16]:
preprocessed_df_m

Unnamed: 0,Exercise,Metric,Height,Weight,0,1,2,3,4,5,...,89,90,91,92,93,94,95,96,97,98
0,-1.341641,-1.0,1.31631,0.70565,0.986322,0.954264,0.983382,0.945911,0.911755,0.940303,...,0.984544,0.995302,1.006363,1.024995,1.012986,1.000926,1.013016,1.002308,0.996098,0.982941
1,-1.341641,1.0,1.31631,0.70565,-0.615524,-0.539691,-0.66048,-0.567547,-0.472094,-0.701468,...,-0.670285,-0.776233,-0.77784,-0.77294,-0.720466,-0.682238,-0.787216,-0.73472,-0.678985,-0.637532
2,-0.447214,-1.0,1.31631,0.70565,-0.674852,-0.718966,-0.71919,-0.742176,-0.760396,-0.758081,...,-0.708474,-0.712964,-0.715016,-0.710512,-0.708084,-0.719367,-0.725564,-0.722487,-0.7154,-0.722184
3,-0.447214,1.0,1.31631,0.70565,-0.615524,-0.659208,-0.66048,-0.683967,-0.702736,-0.701468,...,-0.772121,-0.776233,-0.77784,-0.77294,-0.769993,-0.781248,-0.787216,-0.78365,-0.776091,-0.782649
4,0.447214,-1.0,1.31631,0.70565,2.469514,2.448219,2.392407,2.459369,2.410924,2.412236,...,1.977442,1.956992,1.961288,1.948933,1.929239,1.941518,1.925462,1.931985,1.918607,1.902015
5,0.447214,1.0,1.31631,0.70565,-0.615524,-0.599449,-0.543062,-0.683967,-0.645075,-0.701468,...,-0.772121,-0.750925,-0.77784,-0.747969,-0.720466,-0.781248,-0.762555,-0.78365,-0.751815,-0.734277
6,1.341641,-1.0,1.31631,0.70565,0.096407,0.057891,0.102741,0.072762,0.046849,0.034498,...,0.83179,0.894071,0.905844,0.900138,0.889168,0.901916,0.914373,0.904447,0.923268,0.934568
7,1.341641,1.0,1.31631,0.70565,-0.615524,-0.599449,-0.66048,-0.625757,-0.587415,-0.53163,...,-0.772121,-0.776233,-0.77784,-0.747969,-0.720466,-0.781248,-0.787216,-0.759185,-0.776091,-0.782649
8,-1.341641,-1.0,-1.241093,-1.432684,0.155735,0.117649,0.161451,0.130972,0.104509,0.091111,...,0.577201,0.565072,0.604289,0.600482,0.616768,0.62964,0.618444,0.610865,0.607673,0.595962
9,-1.341641,1.0,-1.241093,-1.432684,-0.615524,-0.539691,-0.66048,-0.567547,-0.472094,-0.361791,...,-0.670285,-0.624387,-0.77784,-0.722998,-0.769993,-0.781248,-0.737895,-0.68579,-0.630432,-0.58916


In [17]:
x_train, x_test, y_train, y_test = train_test_split(preprocessed_df_m, y, test_size=0.2)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test)

In [18]:
lr = XGBRegressor()
lr.fit(x_train, y_train)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):


In [19]:
yhat = lr.predict(x_test)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:


In [20]:
y_test

11    11.0
0     82.0
14    50.0
31    12.0
26    12.5
Name: 99, dtype: float64

In [21]:
pd.DataFrame(yhat)

Unnamed: 0,0
0,18.602711
1,96.453827
2,58.286316
3,9.487957
4,10.540812


In [24]:
x_c = m_chest.drop(columns = ['99'])
y_c = m_chest['99']
sts.fit(x_c)
m_c_p = sts.transform(x_c)
m_c_p = pd.DataFrame(m_c_p, columns = x_c.columns)
m_c_p

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,Metric,Height,Weight,0,1,2,3,4,5,6,...,89,90,91,92,93,94,95,96,97,98
0,-1.0,1.31631,0.70565,0.622662,0.588801,0.621841,0.589152,0.548821,0.579865,0.578049,...,0.795977,0.812254,0.822719,0.833685,0.817099,0.808491,0.822502,0.804579,0.798783,0.787973
1,1.0,1.31631,0.70565,-0.871727,-0.829996,-0.875926,-0.835773,-0.76835,-0.984421,-0.794023,...,-0.92864,-1.007194,-0.974605,-0.999149,-0.936063,-0.92308,-1.010861,-0.946382,-0.944017,-0.915065
2,-1.0,-1.241093,-1.432684,-0.152206,-0.205726,-0.127043,-0.178116,-0.219529,-0.229249,-0.133396,...,0.371456,0.370388,0.417688,0.400932,0.416376,0.426527,0.420669,0.409996,0.394656,0.381277
3,1.0,-1.241093,-1.432684,-0.871727,-0.829996,-0.875926,-0.835773,-0.76835,-0.660776,-0.895658,...,-0.92864,-0.851242,-0.974605,-0.948237,-0.986153,-1.024937,-0.960632,-0.897059,-0.893501,-0.864228
4,-1.0,0.564133,1.133317,1.508226,1.49683,1.477707,1.466029,1.5367,1.550801,1.543581,...,1.485824,1.488049,1.45558,1.44463,1.468273,1.470562,1.47548,1.495099,1.506006,1.525109
5,1.0,0.564133,1.133317,-0.871727,-0.829996,-0.875926,-0.835773,-0.987878,-0.984421,-0.895658,...,-0.92864,-1.007194,-0.923977,-0.897325,-0.986153,-0.974009,-1.010861,-0.995705,-0.994532,-1.067576
6,-1.0,-0.639351,-0.406284,1.508226,1.553582,1.531199,1.575638,1.5367,1.49686,1.492763,...,1.167433,1.150151,1.151806,1.164613,1.142686,1.139527,1.123876,1.125177,1.127137,1.118413
7,1.0,-0.639351,-0.406284,-0.871727,-0.9435,-0.875926,-0.945383,-0.878114,-0.768658,-0.895658,...,-1.03477,-0.95521,-0.974605,-0.999149,-0.936063,-0.92308,-0.860173,-0.995705,-0.994532,-0.965902


In [25]:
x_c_train, x_c_test, y_c_train, y_c_test = train_test_split(m_c_p, y_c, test_size=0.2)

In [26]:
xr = XGBRegressor()
xr.fit(x_c_train, y_c_train)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):


In [27]:
y_c_hat = xr.predict(x_c_test)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:


In [28]:
pd.DataFrame(y_c_hat)

Unnamed: 0,0
0,65.830589
1,32.904449


In [29]:
y_c_test

0     82.0
17    10.0
Name: 99, dtype: float64