In [1]:
# 1 - DATA MANIPULATION
import pandas as pd
import numpy as np

# 2 - DATA VISUALISATION
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# 3 - STATISTICS
from statsmodels.graphics.gofplots import qqplot
from scipy import stats

# 4 - MACHINE LEARNING

from sklearn.linear_model import LinearRegression,SGDRegressor
from sklearn.ensemble import RandomForestRegressor
import itertools
from xgboost import XGBRegressor

## 4.1 - Preprocessing

### 4.1.1 - Scalers
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor

### 4.1.2 - Encoders
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

### 4.1.3 - Crossvalidation, Training, Model
from sklearn.model_selection import cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression, ElasticNet

### 4.1.4 - Evaluation
from sklearn.metrics import confusion_matrix, r2_score, accuracy_score
from sklearn.metrics  import ConfusionMatrixDisplay

In [2]:
df_males = pd.read_csv('../data_csv/data_model_males.csv')
df_females = pd.read_csv('../data_csv/data_model_females.csv')

In [3]:
df_males.head()

Unnamed: 0,Name,Exercise,Metric,Height,Weight,0,1,2,3,4,...,90,91,92,93,94,95,96,97,98,99
0,Robert,Chest,Chest_Weight,185,80,35.0,35.0,36.0,36.0,36.0,...,78.0,79.0,80.0,80.0,80.0,81.0,81.0,81.0,81.0,82.0
1,Robert,Chest,Chest_Repetitions,185,80,8.0,10.0,8.0,10.0,12.0,...,8.0,8.0,8.0,10.0,12.0,8.0,10.0,12.0,14.0,8.0
2,Robert,Shoulders,Shoulders_Weight,185,80,7.0,7.0,7.0,7.0,7.0,...,10.5,10.5,10.5,10.5,10.5,10.5,10.5,10.5,10.5,10.5
3,Robert,Shoulders,Shoulders_Repetitions,185,80,8.0,8.0,8.0,8.0,8.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
4,Robert,Legs,Legs_Weight,185,80,60.0,60.0,60.0,62.0,62.0,...,116.0,117.0,117.0,117.0,118.0,118.0,119.0,119.0,119.0,119.0


In [4]:
df_females.head()

Unnamed: 0,Name,Exercise,Metric,Height,Weight,0,1,2,3,4,...,90,91,92,93,94,95,96,97,98,99
0,Julia,Chest,Chest_Weight,160,52,21.0,21.5,22.0,22.5,22.5,...,47.5,48.5,48.5,48.5,49.0,49.5,49.5,49.5,50.0,50.5
1,Julia,Chest,Chest_Repetitions,160,52,8.0,8.0,8.0,8.0,10.0,...,8.0,8.0,10.0,12.0,8.0,8.0,10.0,12.0,8.0,8.0
2,Julia,Shoulders,Shoulders_Weight,160,52,4.0,4.0,4.0,4.0,4.0,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
3,Julia,Shoulders,Shoulders_Repetitions,160,52,8.0,9.0,10.0,11.0,12.0,...,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,15.0,15.0
4,Julia,Legs,Legs_Weight,160,52,30.5,30.5,31.0,31.5,31.5,...,66.5,67.0,68.0,68.5,69.0,69.0,69.5,69.5,70.0,70.5


In [5]:
df_males.describe()

Unnamed: 0,Height,Weight,0,1,2,3,4,5,6,7,...,90,91,92,93,94,95,96,97,98,99
count,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,...,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0,32.0
mean,176.25,71.75,18.375,19.03125,19.25,19.75,20.1875,20.390625,20.421875,20.953125,...,38.671875,38.953125,38.953125,39.09375,39.5625,39.921875,40.03125,39.96875,40.359375,40.5
std,6.753733,11.878416,17.12525,17.001868,17.305602,17.454087,17.620438,17.946486,18.408058,18.242258,...,40.146026,40.430418,40.686718,41.028007,41.046541,41.199184,41.528447,41.851107,42.007533,42.227113
min,168.0,55.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
25%,171.0,64.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.75,...,9.0,8.75,9.0,8.75,9.75,9.0,9.0,9.875,9.875,9.875
50%,176.0,73.5,8.0,10.0,10.0,11.0,12.0,12.5,11.0,11.0,...,13.0,13.0,12.25,11.5,12.25,13.25,12.25,12.25,13.25,13.25
75%,181.25,81.25,20.25,21.0,21.25,21.25,22.0,22.0,22.25,22.75,...,68.75,69.75,69.75,70.5,70.75,71.0,71.0,72.0,72.25,72.5
max,185.0,85.0,62.0,63.0,64.0,65.0,65.0,66.0,68.0,68.0,...,134.0,135.0,135.0,136.0,137.0,137.0,137.0,139.0,139.0,140.0


In [6]:
df_m = df_males.drop(columns = ['Name'])
df_f = df_females.drop(columns = ['Name'])

In [7]:
m_chest = df_m[df_m['Exercise'] == 'Chest']
f_chest = df_f[df_f['Exercise'] == 'Chest']
m_shoulders = df_m[df_m['Exercise'] == 'Shoulders']
f_shoulders = df_f[df_f['Exercise'] == 'Shoulders']
m_legs = df_m[df_m['Exercise'] == 'Legs']
f_legs = df_f[df_f['Exercise'] == 'Legs']
m_back = df_m[df_m['Exercise'] == 'Back']
f_back = df_f[df_f['Exercise'] == 'Back']

In [8]:
m_chest = m_chest.drop(columns=['Exercise'])
f_chest = f_chest.drop(columns=['Exercise'])
m_shoulders = m_shoulders.drop(columns=['Exercise'])
f_shoulders = f_shoulders.drop(columns=['Exercise'])
m_legs = m_legs.drop(columns=['Exercise'])
f_legs = f_legs.drop(columns=['Exercise'])
m_back = m_back.drop(columns=['Exercise'])
f_back = f_back.drop(columns=['Exercise'])

In [9]:
dfs = [m_chest, f_chest, m_shoulders, f_shoulders, m_legs, f_legs, m_back, f_back]
for df in dfs:
    for row  in df['Metric']:
        words_metric = row.split('_')
    df['Metric'] = df['Metric'].map({f'{words_metric[0]}_Weight': 0, f'{words_metric[0]}_Repetitions': 1})

In [10]:
o_dfs = [df_m, df_f]
for df in o_dfs:
    df['Metric'] = df['Metric'].apply(lambda row: 0 if row.split('_')[1] == 'Weight' else 1)
    df['Exercise'] = df['Exercise'].map({'Chest': 0, 'Shoulders': 1, 'Legs': 2, 'Back': 3})

In [12]:
x = df_m.drop(columns = ['99'])
y = df_m['99']
x

Unnamed: 0,Exercise,Metric,Height,Weight,0,1,2,3,4,5,...,89,90,91,92,93,94,95,96,97,98
0,0,0,185,80,35.0,35.0,36.0,36.0,36.0,37.0,...,77.0,78.0,79.0,80.0,80.0,80.0,81.0,81.0,81.0,81.0
1,0,1,185,80,8.0,10.0,8.0,10.0,12.0,8.0,...,12.0,8.0,8.0,8.0,10.0,12.0,8.0,10.0,12.0,14.0
2,1,0,185,80,7.0,7.0,7.0,7.0,7.0,7.0,...,10.5,10.5,10.5,10.5,10.5,10.5,10.5,10.5,10.5,10.5
3,1,1,185,80,8.0,8.0,8.0,8.0,8.0,8.0,...,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
4,2,0,185,80,60.0,60.0,60.0,62.0,62.0,63.0,...,116.0,116.0,117.0,117.0,117.0,118.0,118.0,119.0,119.0,119.0
5,2,1,185,80,8.0,9.0,10.0,8.0,9.0,8.0,...,8.0,9.0,8.0,9.0,10.0,8.0,9.0,8.0,9.0,10.0
6,3,0,185,80,20.0,20.0,21.0,21.0,21.0,21.0,...,71.0,74.0,75.0,75.0,75.0,76.0,77.0,77.0,78.0,79.0
7,3,1,185,80,8.0,9.0,8.0,9.0,10.0,11.0,...,8.0,8.0,8.0,9.0,10.0,8.0,8.0,9.0,8.0,8.0
8,0,0,168,55,21.0,21.0,22.0,22.0,22.0,22.0,...,61.0,61.0,63.0,63.0,64.0,65.0,65.0,65.0,65.0,65.0
9,0,1,168,55,8.0,10.0,8.0,10.0,12.0,14.0,...,12.0,14.0,8.0,10.0,8.0,8.0,10.0,12.0,14.0,16.0


In [13]:
mms = MinMaxScaler()

In [14]:
mms.fit(x)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [15]:
preprocessed_df_m = mms.transform(x)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [17]:
preprocessed_df_m = pd.DataFrame(preprocessed_df_m, columns = x.columns)

In [18]:
preprocessed_df_m

Unnamed: 0,Exercise,Metric,Height,Weight,0,1,2,3,4,5,...,89,90,91,92,93,94,95,96,97,98
0,0.0,0.0,1.0,0.833333,0.526316,0.517241,0.525424,0.516667,0.516667,0.52459,...,0.552,0.555556,0.559055,0.566929,0.5625,0.55814,0.565891,0.565891,0.557252,0.557252
1,0.0,1.0,1.0,0.833333,0.052632,0.086207,0.050847,0.083333,0.116667,0.04918,...,0.032,0.0,0.0,0.0,0.015625,0.031008,0.0,0.015504,0.030534,0.045802
2,0.333333,0.0,1.0,0.833333,0.035088,0.034483,0.033898,0.033333,0.033333,0.032787,...,0.02,0.019841,0.019685,0.019685,0.019531,0.01938,0.01938,0.01938,0.019084,0.019084
3,0.333333,1.0,1.0,0.833333,0.052632,0.051724,0.050847,0.05,0.05,0.04918,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.666667,0.0,1.0,0.833333,0.964912,0.948276,0.932203,0.95,0.95,0.95082,...,0.864,0.857143,0.858268,0.858268,0.851562,0.852713,0.852713,0.860465,0.847328,0.847328
5,0.666667,1.0,1.0,0.833333,0.052632,0.068966,0.084746,0.05,0.066667,0.04918,...,0.0,0.007937,0.0,0.007874,0.015625,0.0,0.007752,0.0,0.007634,0.015267
6,1.0,0.0,1.0,0.833333,0.263158,0.258621,0.271186,0.266667,0.266667,0.262295,...,0.504,0.52381,0.527559,0.527559,0.523438,0.527132,0.534884,0.534884,0.534351,0.541985
7,1.0,1.0,1.0,0.833333,0.052632,0.068966,0.050847,0.066667,0.083333,0.098361,...,0.0,0.0,0.0,0.007874,0.015625,0.0,0.0,0.007752,0.0,0.0
8,0.0,0.0,0.0,0.0,0.280702,0.275862,0.288136,0.283333,0.283333,0.278689,...,0.424,0.420635,0.433071,0.433071,0.4375,0.44186,0.44186,0.44186,0.435115,0.435115
9,0.0,1.0,0.0,0.0,0.052632,0.086207,0.050847,0.083333,0.116667,0.147541,...,0.032,0.047619,0.0,0.015748,0.0,0.0,0.015504,0.031008,0.045802,0.061069


In [19]:
x_train, x_test, y_train, y_test = train_test_split(preprocessed_df_m, y, test_size=0.4)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test)

In [20]:
lr = XGBRegressor()
lr.fit(x_train, y_train)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:
  if is_sparse(data):


In [21]:
yhat = lr.predict(x_test)

  if is_sparse(dtype):
  elif is_categorical_dtype(dtype) and enable_categorical:


In [22]:
y_test

22    110.0
19      8.0
16    110.0
0      82.0
11     11.0
31     12.0
13     10.0
25     14.0
29     10.0
Name: 99, dtype: float64

In [23]:
pd.DataFrame(yhat)

Unnamed: 0,0
0,99.130547
1,9.532391
2,99.130547
3,94.871162
4,14.865205
5,10.036439
6,10.093981
7,10.234595
8,11.707162
