# Importação de Bibliotecas

```python
# Importa as bibliotecas necessárias para o nosso projeto
```

In [16]:
import pandas as pd

import mlflow
from mlflow.models import infer_signature

from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.model_selection import GridSearchCV, train_test_split

### Lendo o dataset pré-processado

```python
# Printando as 10 primeiras linhas para relembrar a estrutura dos dados
```


In [17]:
# Load the pre-processed dataset from a CSV file given by a relative path.
csv_path = 'pre_processed_dataframe.csv'
dataset = pd.read_csv(csv_path)

# Preview the first 10 rows; as the last expression it is rendered by the notebook.
dataset.head(n=10)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,...,Daily Steps,Sleep Disorder,Age Norm,Sleep Norm,Quality Norm,Physical Norm,Stress Norm,Heart Norm,Steps Norm,Sleep Discretized
0,1,Male,27,Software Engineer,6.1,6,42.0,6,Overweight,126/83,...,4200,,-1.75075,-1.29633,-1.096811,-0.826915,0.346556,1.652505,-1.617417,"(6.07, 6.34]"
1,2,Male,28,Doctor,6.2,6,60.0,8,Normal,125/80,...,10000,,-1.635452,-1.17067,-1.096811,0.03852,1.473618,1.168908,1.967442,"(6.07, 6.34]"
2,3,Male,28,Doctor,6.199996,6,60.0,8,Normal,125/80,...,10000,,-1.635452,-1.170675,-1.096811,0.03852,1.473618,1.168908,1.967442,"(6.07, 6.34]"
3,4,Male,28,Sales Representative,5.9,4,30.0,8,Obese,140/90,...,3000,Sleep Apnea,-1.635452,-1.54765,-2.767716,-1.403873,1.473618,3.586893,-2.359112,"(5.797, 6.07]"
4,5,Male,28,Sales Representative,5.9,4,30.0,8,Obese,140/90,...,3000,Sleep Apnea,-1.635452,-1.54765,-2.767716,-1.403873,1.473618,3.586893,-2.359112,"(5.797, 6.07]"
5,6,Male,28,Software Engineer,5.9,4,30.0,8,Obese,140/90,...,3000,Insomnia,-1.635452,-1.54765,-2.767716,-1.403873,1.473618,3.586893,-2.359112,"(5.797, 6.07]"
6,7,Male,29,Teacher,6.3,6,40.0,7,Obese,140/90,...,3500,Insomnia,-1.520153,-1.04501,-1.096811,-0.923075,0.910087,2.861497,-2.050073,"(6.07, 6.34]"
7,8,Male,29,Doctor,7.8,7,75.0,6,Normal,120/80,...,8000,,-1.520153,0.839891,-0.261358,0.759717,0.346556,-0.040084,0.731284,"(7.69, 7.96]"
8,9,Male,29,Doctor,7.8,7,75.0,6,Normal,120/80,...,8000,,-1.520153,0.839891,-0.261358,0.759717,0.346556,-0.040084,0.731284,"(7.69, 7.96]"
9,10,Male,29,Doctor,7.8,7,75.0,6,Normal,120/80,...,8000,,-1.520153,0.839891,-0.261358,0.759717,0.346556,-0.040084,0.731284,"(7.69, 7.96]"


### Printando os tipos de cada coluna

```python
# É interessante relembrar, desta vez, como ficaram os tipos após a normalização
```

In [18]:
# Display the dtype of every column in the loaded dataset.
dataset.dtypes

Person ID                    int64
Gender                      object
Age                          int64
Occupation                  object
Sleep Duration             float64
Quality of Sleep             int64
Physical Activity Level    float64
Stress Level                 int64
BMI Category                object
Blood Pressure              object
Heart Rate                   int64
Daily Steps                  int64
Sleep Disorder              object
Age Norm                   float64
Sleep Norm                 float64
Quality Norm               float64
Physical Norm              float64
Stress Norm                float64
Heart Norm                 float64
Steps Norm                 float64
Sleep Discretized           object
dtype: object

### Trocando o tipo dos dados "object" para "category"

In [19]:
# Cast each of these columns to the pandas "category" dtype, in place on `dataset`.
for column_name in (
    "Gender",
    "Occupation",
    "BMI Category",
    "Blood Pressure",
    "Sleep Disorder",
):
    dataset[column_name] = dataset[column_name].astype("category")

### Dropando colunas

```python
# Aqui precisamos "dropar" as colunas desnecessárias ou redundantes para o treinamento do modelo: várias colunas das quais já temos versões normalizadas, além de outras como "Person ID", "Sleep Discretized" e "Quality Norm"
```

In [20]:
# Columns excluded from the modelling frame: the identifier, the discretized
# sleep column, the raw measurements that also exist as "* Norm" columns, and
# "Quality Norm" (presumably the normalized form of the target - confirm).
columns_to_drop = [
    'Sleep Discretized',
    'Person ID',
    'Age',
    'Sleep Duration',
    'Physical Activity Level',
    'Stress Level',
    'Heart Rate',
    'Daily Steps',
    'Quality Norm',
]
ml_dataset = dataset.drop(columns=columns_to_drop)

# Preview the first 10 rows of the reduced frame.
ml_dataset.head(n=10)

Unnamed: 0,Gender,Occupation,Quality of Sleep,BMI Category,Blood Pressure,Sleep Disorder,Age Norm,Sleep Norm,Physical Norm,Stress Norm,Heart Norm,Steps Norm
0,Male,Software Engineer,6,Overweight,126/83,,-1.75075,-1.29633,-0.826915,0.346556,1.652505,-1.617417
1,Male,Doctor,6,Normal,125/80,,-1.635452,-1.17067,0.03852,1.473618,1.168908,1.967442
2,Male,Doctor,6,Normal,125/80,,-1.635452,-1.170675,0.03852,1.473618,1.168908,1.967442
3,Male,Sales Representative,4,Obese,140/90,Sleep Apnea,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
4,Male,Sales Representative,4,Obese,140/90,Sleep Apnea,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
5,Male,Software Engineer,4,Obese,140/90,Insomnia,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
6,Male,Teacher,6,Obese,140/90,Insomnia,-1.520153,-1.04501,-0.923075,0.910087,2.861497,-2.050073
7,Male,Doctor,7,Normal,120/80,,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284
8,Male,Doctor,7,Normal,120/80,,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284
9,Male,Doctor,7,Normal,120/80,,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284


In [21]:
# Display the dtype of every column kept for modelling.
ml_dataset.dtypes

Gender              category
Occupation          category
Quality of Sleep       int64
BMI Category        category
Blood Pressure      category
Sleep Disorder      category
Age Norm             float64
Sleep Norm           float64
Physical Norm        float64
Stress Norm          float64
Heart Norm           float64
Steps Norm           float64
dtype: object

Como alguns algoritmos de machine learning (não todos) exigem que todas as colunas sejam numéricas, vamos converter as colunas categóricas agora.

In [22]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

# Categorical columns that are expanded into one indicator column per category.
category_columns = ['Gender', 'Occupation', 'BMI Category', 'Blood Pressure', 'Sleep Disorder']

# One-hot encode the categorical columns; every other column is passed through
# unchanged. Categories unseen during fit are ignored at transform time.
one_hot_encoder = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'),
    category_columns),
    remainder='passthrough')

data = one_hot_encoder.fit_transform(ml_dataset)

# The transformer may return a SciPy sparse matrix or an already-dense NumPy
# array, depending on the density of the encoded output. Only the sparse result
# has `.toarray()`, so densify only in that case; calling it unconditionally
# raises AttributeError on a dense array.
if not isinstance(data, pd.DataFrame) and hasattr(data, "toarray"):
    data = data.toarray()

# Column names generated by the transformer ("onehotencoder__*" / "remainder__*").
feature_names = one_hot_encoder.get_feature_names_out()

# Rebuild a DataFrame so that later cells can select columns by name.
data_df = pd.DataFrame(data, columns=feature_names)

# Display the first 10 rows of the encoded DataFrame.
data_df.head(10)

Unnamed: 0,onehotencoder__Gender_Female,onehotencoder__Gender_Male,onehotencoder__Occupation_Accountant,onehotencoder__Occupation_Doctor,onehotencoder__Occupation_Engineer,onehotencoder__Occupation_Lawyer,onehotencoder__Occupation_Manager,onehotencoder__Occupation_Nurse,onehotencoder__Occupation_Sales Representative,onehotencoder__Occupation_Salesperson,...,onehotencoder__Sleep Disorder_Insomnia,onehotencoder__Sleep Disorder_Sleep Apnea,onehotencoder__Sleep Disorder_nan,remainder__Quality of Sleep,remainder__Age Norm,remainder__Sleep Norm,remainder__Physical Norm,remainder__Stress Norm,remainder__Heart Norm,remainder__Steps Norm
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,6.0,-1.75075,-1.29633,-0.826915,0.346556,1.652505,-1.617417
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,6.0,-1.635452,-1.17067,0.03852,1.473618,1.168908,1.967442
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,6.0,-1.635452,-1.170675,0.03852,1.473618,1.168908,1.967442
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,4.0,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,4.0,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,4.0,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,6.0,-1.520153,-1.04501,-0.923075,0.910087,2.861497,-2.050073
7,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,7.0,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284
8,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,7.0,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284
9,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,7.0,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284


In [23]:
# Display the dtype of every column after one-hot encoding.
data_df.dtypes

onehotencoder__Gender_Female                      float64
onehotencoder__Gender_Male                        float64
onehotencoder__Occupation_Accountant              float64
onehotencoder__Occupation_Doctor                  float64
onehotencoder__Occupation_Engineer                float64
onehotencoder__Occupation_Lawyer                  float64
onehotencoder__Occupation_Manager                 float64
onehotencoder__Occupation_Nurse                   float64
onehotencoder__Occupation_Sales Representative    float64
onehotencoder__Occupation_Salesperson             float64
onehotencoder__Occupation_Scientist               float64
onehotencoder__Occupation_Software Engineer       float64
onehotencoder__Occupation_Teacher                 float64
onehotencoder__BMI Category_Normal                float64
onehotencoder__BMI Category_Normal Weight         float64
onehotencoder__BMI Category_Obese                 float64
onehotencoder__BMI Category_Overweight            float64
onehotencoder_

#### Separando os dados
Usamos a seguinte distribuição:
70% treino, 15% validação, 15% teste

In [24]:
# Name of the column to predict, as produced by the column transformer.
target = 'remainder__Quality of Sleep'

# Separate the target vector from the feature matrix.
y = data_df[target]
X = data_df.drop(columns=[target])

# Stage 1: keep 70% of the rows for training and hold out the remaining 30%.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Stage 2: split the hold-out set in half into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

In [25]:
# Fraction of all rows that ended up in the train, validation and test sets.
len_X = len(X)
tuple(len(subset) / len_X for subset in (X_train, X_val, X_test))

(0.6978609625668449, 0.1497326203208556, 0.15240641711229946)

In [26]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,onehotencoder__Gender_Female,onehotencoder__Gender_Male,onehotencoder__Occupation_Accountant,onehotencoder__Occupation_Doctor,onehotencoder__Occupation_Engineer,onehotencoder__Occupation_Lawyer,onehotencoder__Occupation_Manager,onehotencoder__Occupation_Nurse,onehotencoder__Occupation_Sales Representative,onehotencoder__Occupation_Salesperson,...,onehotencoder__Blood Pressure_142/92,onehotencoder__Sleep Disorder_Insomnia,onehotencoder__Sleep Disorder_Sleep Apnea,onehotencoder__Sleep Disorder_nan,remainder__Age Norm,remainder__Sleep Norm,remainder__Physical Norm,remainder__Stress Norm,remainder__Heart Norm,remainder__Steps Norm
19,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,-1.404855,0.588571,0.759717,0.346556,-0.040084,0.731284
357,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.823506,1.091211,0.759717,-1.344036,-0.523681,0.113204
79,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,-1.058959,-1.42199,-1.403873,1.473618,0.443513,-1.122954
167,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,-0.13657,-0.039729,-0.201879,0.346556,0.443513,-0.504875
18,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,-1.520153,-0.79369,-0.923075,0.910087,2.3779,-1.741033


In [27]:
# Preview the first rows of the training target.
y_train.head()

19     7.0
357    9.0
79     6.0
167    7.0
18     5.0
Name: remainder__Quality of Sleep, dtype: float64

Utilizaremos os seguintes algoritmos para a nossa tarefa de regressão:
- KNN Regressor
- Decision Tree Regressor
- Random Forest Regressor
- Support Vector Regressor (SVR)

#### Testes com KNN

In [41]:
# KNN regressor created with scikit-learn's default hyperparameters.
knn_regressor = KNeighborsRegressor()

# Candidate hyperparameters for a grid search over the KNN regressor.
param_knn = {
    'n_neighbors': range(1, 31),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
}

# 5-fold cross-validated grid search over `param_knn`.
# NOTE(review): this cell only constructs the search object; it never calls
# `grid_search.fit`, so no hyperparameter tuning happens here.
grid_search = GridSearchCV(
    estimator=knn_regressor,
    param_grid=param_knn,
    cv=5,
)

# Show the (default) hyperparameters of the untuned estimator.
print(knn_regressor.get_params())

{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}


In [42]:
# Train the KNN regressor on the training split and predict the test split
# (`fit` returns the estimator itself, so the two calls can be chained).
y_pred = knn_regressor.fit(X_train, y_train).predict(X_test)

# Regression metrics computed on the test split.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics.
print(
    f"Mean Absolute Error (MAE): {mae}",
    f"Mean Squared Error (MSE): {mse}",
    f"R-squared (R²): {r2}",
    sep="\n",
)

Mean Absolute Error (MAE): 0.04210526315789474
Mean Squared Error (MSE): 0.023859649122807018
R-squared (R²): 0.9833146792940164


In [43]:
import mlflow
from mlflow.models import infer_signature

In [45]:
# Point the MLflow client at the tracking server used for logging.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

# Select the experiment under which the run is recorded.
mlflow.set_experiment("MLflow Quickstart")

# Record the KNN regressor's parameters, test metrics and fitted model in one run.
# NOTE(review): the run description looks like a placeholder - consider
# replacing it with a meaningful one.
with mlflow.start_run(run_name='teste_knn', description="Luciano lindo"):
    # Log the hyperparameters of the estimator that was actually fitted.
    mlflow.log_params(knn_regressor.get_params())

    # Log the evaluation metrics computed in the previous cell.
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("R2", r2)

    # Tag describing what this run was for. The previous value said
    # "Basic LR model", which does not match the KNN regressor logged here.
    mlflow.set_tag("Training Info", "Basic KNN regressor for our data")

    # Infer the model signature from the training inputs and predictions.
    signature = infer_signature(X_train, knn_regressor.predict(X_train))

    # Log the fitted model and register it as a new version.
    # NOTE(review): the whole training frame is stored as the input example;
    # a few rows would probably suffice - confirm before changing.
    model_info = mlflow.sklearn.log_model(
        sk_model=knn_regressor,
        artifact_path="Sleep_health",
        signature=signature,
        input_example=X_train,
        registered_model_name="tracking-quickstart",
    )


Registered model 'tracking-quickstart' already exists. Creating a new version of this model...
2024/07/05 14:14:25 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-quickstart, version 9
Created version '9' of model 'tracking-quickstart'.


In [46]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import mlflow
import mlflow.sklearn
from mlflow.models.signature import infer_signature

# Hyperparameter grid explored by the search; the seed is pinned to one value.
param_grid = dict(
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
    random_state=[42],
)

# Base estimator whose hyperparameters are tuned.
dtr = DecisionTreeRegressor()

# 3-fold cross-validated grid search scored by negative mean squared error.
grid_search = GridSearchCV(
    estimator=dtr,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
)

# Run the search and record its outcome inside a single MLflow run.
with mlflow.start_run(run_name="teste"):
    # Search the grid using the training split only.
    grid_search.fit(X_train, y_train)

    # Winning hyperparameter combination and the estimator refitted with it.
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Evaluate the selected model on the test split.
    y_pred = best_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Record the chosen hyperparameters and the test metrics.
    mlflow.log_params(best_params)
    mlflow.log_metrics({"MAE": mae, "MSE": mse, "R2": r2})

    # Tag describing what this run was for.
    mlflow.set_tag("Model", "Decision Tree Regressor with GridSearchCV")

    # Infer the model signature from the training inputs and predictions.
    signature = infer_signature(X_train, best_model.predict(X_train))

    # Log the selected model and register it as a new version.
    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="DecisionTreeRegressorModel",
        registered_model_name="DecisionTreeRegressorModel",
        signature=signature,
        input_example=X_train,
    )

    print(f"Logged best model with MAE: {mae}, MSE: {mse}, R2: {r2}")


Registered model 'DecisionTreeRegressorModel' already exists. Creating a new version of this model...
2024/07/05 14:15:05 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeRegressorModel, version 8


Logged best model with MAE: 0.03508771929824561, MSE: 0.02850877192982456, R2: 0.9800634954799827


Created version '8' of model 'DecisionTreeRegressorModel'.


In [None]:
# Fixed hyperparameters for this decision-tree experiment.
params = dict(max_depth=10, min_samples_split=2, random_state=42)

dtr = DecisionTreeRegressor(**params)
print(dtr.get_params())

# Train on the training split and predict the test split
# (`fit` returns the estimator itself, so the two calls can be chained).
y_pred = dtr.fit(X_train, y_train).predict(X_test)

# Regression metrics computed on the test split.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Point the MLflow client at the tracking server used for logging.
# NOTE(review): this cell targets port 8080 while the KNN logging cell uses
# port 5000 - confirm which server is intended.
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Select the experiment under which the run is recorded.
mlflow.set_experiment("MLflow Quickstart")

# Record parameters, metrics and the fitted model in a single MLflow run.
with mlflow.start_run():
    mlflow.log_params(params)
    mlflow.log_metrics({"MAE": mae, "MSE": mse, "R2": r2})

    # Tag describing what this run was for.
    mlflow.set_tag("Training Info", "Decision Tree Regressor for our data")

    # Infer the model signature from the training inputs and predictions.
    signature = infer_signature(X_train, dtr.predict(X_train))

    # Log the fitted model and register it as a new version.
    model_info = mlflow.sklearn.log_model(
        sk_model=dtr,
        artifact_path="DecisionTreeRegressorModel",
        registered_model_name="DecisionTreeRegressorModel",
        signature=signature,
        input_example=X_train,
    )

    print(f"Logged model with MAE: {mae}, MSE: {mse}, R2: {r2}")

{'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 10, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 42, 'splitter': 'best'}


Registered model 'DecisionTreeRegressorModel' already exists. Creating a new version of this model...
2024/07/04 22:51:55 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeRegressorModel, version 3


Logged model with MAE: 0.017543859649122806, MSE: 0.017543859649122806, R2: 0.9877313818338356


Created version '3' of model 'DecisionTreeRegressorModel'.


In [None]:
# Second decision-tree configuration: deeper tree, larger minimum split size.
new_params = dict(max_depth=20, min_samples_split=5, random_state=42)

updated_dtr = DecisionTreeRegressor(**new_params)
print(updated_dtr.get_params())

# Train on the training split and predict the test split
# (`fit` returns the estimator itself, so the two calls can be chained).
y_pred_updated = updated_dtr.fit(X_train, y_train).predict(X_test)

# Regression metrics computed on the test split.
mae_updated = mean_absolute_error(y_test, y_pred_updated)
mse_updated = mean_squared_error(y_test, y_pred_updated)
r2_updated = r2_score(y_test, y_pred_updated)

# Point the MLflow client at the tracking server used for logging.
# NOTE(review): this cell targets port 8080 while the KNN logging cell uses
# port 5000 - confirm which server is intended.
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Record parameters, metrics and the fitted model in a new MLflow run.
with mlflow.start_run():
    mlflow.log_params(new_params)
    mlflow.log_metrics(
        {"MAE": mae_updated, "MSE": mse_updated, "R2": r2_updated}
    )

    # Tag describing what this run was for.
    mlflow.set_tag("Training Info", "Updated Decision Tree Regressor for our data")

    # Infer the model signature from the training inputs and predictions.
    signature = infer_signature(X_train, updated_dtr.predict(X_train))

    # Log the fitted model and register it as a new version.
    model_info = mlflow.sklearn.log_model(
        sk_model=updated_dtr,
        artifact_path="DecisionTreeRegressorModel",
        registered_model_name="DecisionTreeRegressorModel",
        signature=signature,
        input_example=X_train,
    )

    print(f"Logged updated model with MAE: {mae_updated}, MSE: {mse_updated}, R2: {r2_updated}")


{'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 20, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'random_state': 42, 'splitter': 'best'}


Registered model 'DecisionTreeRegressorModel' already exists. Creating a new version of this model...
2024/07/03 19:36:17 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeRegressorModel, version 2


Logged updated model with MAE: 0.03508771929824561, MSE: 0.02850877192982456, R2: 0.9800634954799827


Created version '2' of model 'DecisionTreeRegressorModel'.
