In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neural_network import MLPRegressor
import numpy as np
from sklearn.model_selection import train_test_split

class Regressao:
  def mostrar_metricas(self, model, X_train, X_test, y_train, y_test):
    y_pred = model.predict(X_test)
    print('='*20)
    print('Teste')
    print('='*20)
    print('MSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
    print('R2:', r2_score(y_test, y_pred))
    print('='*20)
    print('Treino')
    print('='*20)
    y_pred = model.predict(X_train)
    print('MSE:', np.sqrt(mean_squared_error(y_train, y_pred)))
    print('R2:', r2_score(y_train, y_pred))

  def regressaoLinear(self, X_train, X_test, y_train, y_test, **kwargs):
    """
    param_linear = {
      'fit_intercept': True,
      'copy_X': True,
      'positive': False
    }
    """
    reg = LinearRegression(**kwargs)
    reg.fit(X_train, y_train)
    self.mostrar_metricas(reg, X_train, X_test, y_train, y_test)
    return reg

  def svr(self, X_train, X_test, y_train, y_test, **kwargs):
    """
      param_svr = {
        'kernel': 'rbf', # 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
        'degree': 3,
        'gamma': 'scale', # 'scale', 'auto'
        'coef0': 0.0,
        'tol': 0.001,
        'C': 1.0,
        'epsilon': 0.1,
        'shrinking': True,
        'cache_size': 200,
        'max_iter': -1
      }

      kernel: {linear, poly, rbf, sigmoid, precomputed}
      gamma: {scale, auto} or float
    """
    reg = SVR(**kwargs)
    reg.fit(X_train, y_train)
    self.mostrar_metricas(reg, X_train, X_test, y_train, y_test)
    return reg

  def sgd(self, X_train, X_test, y_train, y_test, **kwargs):
    """
      params_sgd = {
        'loss': 'squared_error', # 'huber', 'epsilon_insensitive' ou 'squared_epsilon_insensitive', 'squared_error'
        'penalty': 'l2', # 'l1', 'l2', 'elasticnet', None
        'alpha': 0.0001,
        'l1_ratio': 0.15,
        'fit_intercept': True,
        'max_iter': 1000,
        'tol': 0.001,
        'shuffle': True,
        'epsilon': 0.1,
        'learning_rate': 'invscaling', # 'constant', 'optimal', 'invscaling', 'adaptive'
        'eta0': 0.01,
        'power_t': 0.25,
        'early_stopping': False,
        'validation_fraction': 0.1,
        'n_iter_no_change': 5,
        'warm_start': False,
        'average': False
      }

      loss: 'squared_error', 'huber', 'epsilon_insensitive' ou 'squared_epsilon_insensitive'
      penalty: {l2, l1, elasticnet, None}
      learning_rate: {constant, optimal, invscaling, adaptive}
    """
    reg = SGDRegressor(**kwargs, random_state=42)
    reg.fit(X_train, y_train)
    self.mostrar_metricas(reg, X_train, X_test, y_train, y_test)
    return reg

  def knn(self, X_train, X_test, y_train, y_test, **kwargs):
    """
    params_knn = {
      'n_neighbors': 5,
      'weights': 'uniform', # 'distance', 'uniform'
      'algorithm': 'auto', # 'ball_tree', 'kd_tree', 'brute', 'auto'
      'leaf_size': 30,
      'p': 2,
      'n_jobs': None
    }

    weights: {uniform, distance}
    algorithm: {auto, ball_tree, kd_tree, brute}
    """
    reg = KNeighborsRegressor(**kwargs)
    reg.fit(X_train, y_train)
    self.mostrar_metricas(reg, X_train, X_test, y_train, y_test)
    return reg

  def arvore(self, X_train, X_test, y_train, y_test, **kwargs):
    """
    params_arvore = {
      'criterion': 'squared_error', # 'squared_error', 'friedman_mse', 'absolute_error', 'poisson'
      'splitter': 'best', # 'best', 'random'
      'max_depth': None,
      'min_samples_split': 2,
      'min_samples_leaf': 1,
      'min_weight_fraction_leaf': 0.0,
      'max_features': None, # int, float or {sqrt, log2}
      'max_leaf_nodes': None,
      'min_impurity_decrease': 0.0,
      'ccp_alpha': 0.0,
    }

    criterion: {squared_error, friedman_mse, absolute_error, poisson}
    splitter: {best, random}
    max_features: int, float or {sqrt, log2}
    """

    reg = DecisionTreeRegressor(**kwargs, random_state=42)
    reg.fit(X_train, y_train)
    self.mostrar_metricas(reg, X_train, X_test, y_train, y_test)
    return reg

  def floresta(self, X_train, X_test, y_train, y_test, **kwargs):
    """
    params_floresta = {
      'n_estimators': 100,
      'criterion': 'squared_error', # 'squared_error', 'friedman_mse', 'absolute_error', 'poisson'
      'max_depth': None,
      'min_samples_split': 2,
      'min_samples_leaf': 1,
      'min_weight_fraction_leaf': 0.0,
      'max_features':1.0, # 'sqrt', 'log2', None, int or float
      'max_leaf_nodes': None,
      'min_impurity_decrease': 0.0,
      'bootstrap': True,
      'oob_score': False,
      'warm_start': False,
      'ccp_alpha': 0.0,
      'max_samples': None,
      'n_jobs': None
    }
    criterion: {squared_error, absolute_error, friedman_mse, poisson}
    max_features: {sqrt, log2, None}, int or floa
    """
    reg = RandomForestRegressor(**kwargs, random_state=42)
    reg.fit(X_train, y_train)
    self.mostrar_metricas(reg, X_train, X_test, y_train, y_test)
    return reg

  def mlp(self, X_train, X_test, y_train, y_test, **kwargs):
    """
    params_mlp = {
      'hidden_layer_sizes': (100,),
      'activation': 'relu' # identity, logistic, tanh, relu
      'solver': 'adam', # 'lbfgs', 'sgd', 'adam'
      'alpha': 0.0001,
      'batch_size': 'auto',
      'learning_rate': 'constant', # 'constant', 'invscaling', 'adaptive'
      'learning_rate_init': 0.001,
      'power_t': 0.5,
      'max_iter': 200,
      'shuffle': True,
      'tol': 0.0001,
      'warm_start': False,
      'momentum': 0.9,
      'nesterovs_momentum': True,
      'early_stopping': False,
      'validation_fraction': 0.1,
      'beta_1': 0.9,
      'beta_2': 0.999,
      'epsilon': 1e-08,
      'n_iter_no_change': 10,
      'max_fun': 15000
    }
    """
    reg = MLPRegressor(**kwargs, random_state=42)
    reg.fit(X_train, y_train)
    self.mostrar_metricas(reg, X_train, X_test, y_train, y_test)
    return reg

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('gym_members_exercise_tracking.csv')
df.head()

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI
0,56,Male,88.3,1.71,180,157,60,1.69,1313.0,Yoga,12.6,3.5,4,3,30.2
1,46,Female,74.9,1.53,179,151,66,1.3,883.0,HIIT,33.9,2.1,4,2,32.0
2,32,Female,68.1,1.66,167,122,54,1.11,677.0,Cardio,33.4,2.3,4,2,24.71
3,25,Male,53.2,1.7,190,164,56,0.59,532.0,Strength,28.8,2.1,3,1,18.41
4,38,Male,46.1,1.79,188,158,68,0.64,556.0,Strength,29.2,2.8,3,1,14.39


In [4]:
X = df[['Weight (kg)']]
y = df[['BMI']]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
regressao = Regressao()

In [7]:
param_linear = {
  'fit_intercept': True,
  'copy_X': True,
  'positive': False
}
regressao.regressaoLinear(X_train, X_test, y_train, y_test, **param_linear)

Teste
MSE: 3.5364116060621895
R2: 0.7433650387352468
Treino
MSE: 3.4594667126016176
R2: 0.7221566416647341


In [8]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

In [9]:
df.select_dtypes(include='object').columns

Index(['Gender', 'Workout_Type'], dtype='object')

In [10]:
for coluna in df.select_dtypes(include='object').columns:
    df[f'{coluna}_num'] = label_encoder.fit_transform(df[coluna])

df.head(2)

Unnamed: 0,Age,Gender,Weight (kg),Height (m),Max_BPM,Avg_BPM,Resting_BPM,Session_Duration (hours),Calories_Burned,Workout_Type,Fat_Percentage,Water_Intake (liters),Workout_Frequency (days/week),Experience_Level,BMI,Gender_num,Workout_Type_num
0,56,Male,88.3,1.71,180,157,60,1.69,1313.0,Yoga,12.6,3.5,4,3,30.2,1,3
1,46,Female,74.9,1.53,179,151,66,1.3,883.0,HIIT,33.9,2.1,4,2,32.0,0,1


In [11]:
df.drop(['Gender', 'Workout_Type'], axis=1, inplace=True)

In [12]:
X = df.drop('BMI', axis=1)
y = df['BMI']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
param_linear = {
  'fit_intercept': True,
  'copy_X': True,
  'positive': False
}
regressao.regressaoLinear(X_train, X_test, y_train, y_test, **param_linear)

Teste
MSE: 0.7884523344784875
R2: 0.9872432192269877
Treino
MSE: 0.8210669729525237
R2: 0.984349115955126


In [15]:
param_svr = {
    'kernel': 'linear', # 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'
    'degree': 3,
    'gamma': 'scale', # 'scale', 'auto'
    'coef0': 0.0,
    'tol': 0.001,
    'C': 1.0,
    'epsilon': 0.1,
    'shrinking': True,
    'cache_size': 200,
    'max_iter': -1
}

regressao.svr(X_train, X_test, y_train, y_test, **param_svr)

Teste
MSE: 1.0039801286356995
R2: 0.9793157185018483
Treino
MSE: 0.979140254274284
R2: 0.9777427492472677


In [16]:
params_sgd = {
    'loss': 'squared_error', # 'huber', 'epsilon_insensitive' ou 'squared_epsilon_insensitive', 'squared_error'
    'penalty': 'l2', # 'l1', 'l2', 'elasticnet', None
    'alpha': 0.0001,
    'l1_ratio': 0.15,
    'fit_intercept': True,
    'max_iter': 1000,
    'tol': 0.001,
    'shuffle': True,
    'epsilon': 0.1,
    'learning_rate': 'invscaling', # 'constant', 'optimal', 'invscaling', 'adaptive'
    'eta0': 0.01,
    'power_t': 0.25,
    'early_stopping': True,
    'validation_fraction': 0.1,
    'n_iter_no_change': 5,
    'warm_start': False,
    'average': False
}

regressao.sgd(X_train, X_test, y_train, y_test, **params_sgd)

Teste
MSE: 172567913076311.4
R2: -6.1109720803518454e+26
Treino
MSE: 170293691052906.34
R2: -6.732538180303217e+26


In [17]:
params_knn = {
  'n_neighbors': 7,
  'weights': 'uniform', # 'distance', 'uniform'
  'algorithm': 'auto', # 'ball_tree', 'kd_tree', 'brute', 'auto'
  'leaf_size': 30,
  'p': 2,
}

regressao.knn(X_train, X_test, y_train, y_test, **params_knn)

Teste
MSE: 4.61703860656433
R2: 0.5625612846495472
Treino
MSE: 3.6203723601194384
R2: 0.6957096437506423


In [18]:
params_arvore = {
  'criterion': 'squared_error', # 'squared_error', 'friedman_mse', 'absolute_error', 'poisson'
  'splitter': 'best', # 'best', 'random'
  'max_depth': 13,
  'min_samples_split': 5,
  'min_samples_leaf': 1,
  'min_weight_fraction_leaf': 0.0,
  'max_features': None, # int, float or {sqrt, log2}
  'max_leaf_nodes': None,
  'min_impurity_decrease': 0.0,
  'ccp_alpha': 0.0,
}

regressao.arvore(X_train, X_test, y_train, y_test, **params_arvore)

Teste
MSE: 0.945067260076433
R2: 0.981671976130999
Treino
MSE: 0.2298239192239836
R2: 0.9987737693693237


In [19]:
params_floresta = {
  'n_estimators': 300,
  'criterion': 'poisson', # 'squared_error', 'friedman_mse', 'absolute_error', 'poisson'
  'max_depth': 6,
  'min_samples_split': 2,
  'min_samples_leaf': 1,
  'min_weight_fraction_leaf': 0.0,
  'max_features':1.0, # 'sqrt', 'log2', None, int or float
  'max_leaf_nodes': None,
  'min_impurity_decrease': 0.0,
  'bootstrap': True,
  'oob_score': False,
  'warm_start': False,
  'ccp_alpha': 0.0,
  'max_samples': None,
  'n_jobs': None
}

regressao.floresta(X_train, X_test, y_train, y_test, **params_floresta)

Teste
MSE: 0.8961596930459989
R2: 0.98351985537775
Treino
MSE: 0.4904961440609731
R2: 0.9944146177223004


In [20]:
params_mlp = {
  'hidden_layer_sizes': (64, 32),
  'activation': 'relu', # identity, logistic, tanh, relu
  'solver': 'adam', # 'lbfgs', 'sgd', 'adam'
  'alpha': 0.0001,
  'batch_size': 'auto',
  'learning_rate': 'constant', # 'constant', 'invscaling', 'adaptive'
  'learning_rate_init': 0.004,
  'power_t': 0.5,
  'max_iter': 200,
  'shuffle': True,
  'tol': 0.0001,
  'warm_start': False,
  'momentum': 0.9,
  'nesterovs_momentum': True,
  'early_stopping': False,
  'validation_fraction': 0.1,
  'beta_1': 0.9,
  'beta_2': 0.999,
  'epsilon': 1e-08,
  'n_iter_no_change': 10,
  'max_fun': 15000
}

regressao.mlp(X_train, X_test, y_train, y_test, **params_mlp)

Teste
MSE: 3.4342708106405193
R2: 0.7579755272702701
Treino
MSE: 3.4399228937539506
R2: 0.7252870558730374


In [21]:
import joblib

In [22]:
X = df.drop('BMI', axis=1)
y = df['BMI']

In [23]:
model = LinearRegression().fit(X, y)

In [24]:
joblib.dump(model, 'modelo_imc.pkl')

['modelo_imc.pkl']