# Importação de Bibliotecas

```python
# importa as bibliotecas necessárias para nosso projeto

In [2]:
import pandas as pd

import mlflow
from mlflow.models import infer_signature

from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.model_selection import GridSearchCV, train_test_split

#import mlflow.sklearn
#from mlflow.models.signature import infer_signature

### Lendo o dataset pré-processado

```python

# Printando as 10 primeiras linhas para relembrar a estrutura dos dados


In [3]:
dataset = pd.read_csv('pre_processed_dataframe.csv')
dataset.head(10)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,...,Daily Steps,Sleep Disorder,Age Norm,Sleep Norm,Quality Norm,Physical Norm,Stress Norm,Heart Norm,Steps Norm,Sleep Discretized
0,1,Male,27,Software Engineer,6.1,6,42.0,6,Overweight,126/83,...,4200,,-1.75075,-1.29633,-1.096811,-0.826915,0.346556,1.652505,-1.617417,"(6.07, 6.34]"
1,2,Male,28,Doctor,6.2,6,60.0,8,Normal,125/80,...,10000,,-1.635452,-1.17067,-1.096811,0.03852,1.473618,1.168908,1.967442,"(6.07, 6.34]"
2,3,Male,28,Doctor,6.199996,6,60.0,8,Normal,125/80,...,10000,,-1.635452,-1.170675,-1.096811,0.03852,1.473618,1.168908,1.967442,"(6.07, 6.34]"
3,4,Male,28,Sales Representative,5.9,4,30.0,8,Obese,140/90,...,3000,Sleep Apnea,-1.635452,-1.54765,-2.767716,-1.403873,1.473618,3.586893,-2.359112,"(5.797, 6.07]"
4,5,Male,28,Sales Representative,5.9,4,30.0,8,Obese,140/90,...,3000,Sleep Apnea,-1.635452,-1.54765,-2.767716,-1.403873,1.473618,3.586893,-2.359112,"(5.797, 6.07]"
5,6,Male,28,Software Engineer,5.9,4,30.0,8,Obese,140/90,...,3000,Insomnia,-1.635452,-1.54765,-2.767716,-1.403873,1.473618,3.586893,-2.359112,"(5.797, 6.07]"
6,7,Male,29,Teacher,6.3,6,40.0,7,Obese,140/90,...,3500,Insomnia,-1.520153,-1.04501,-1.096811,-0.923075,0.910087,2.861497,-2.050073,"(6.07, 6.34]"
7,8,Male,29,Doctor,7.8,7,75.0,6,Normal,120/80,...,8000,,-1.520153,0.839891,-0.261358,0.759717,0.346556,-0.040084,0.731284,"(7.69, 7.96]"
8,9,Male,29,Doctor,7.8,7,75.0,6,Normal,120/80,...,8000,,-1.520153,0.839891,-0.261358,0.759717,0.346556,-0.040084,0.731284,"(7.69, 7.96]"
9,10,Male,29,Doctor,7.8,7,75.0,6,Normal,120/80,...,8000,,-1.520153,0.839891,-0.261358,0.759717,0.346556,-0.040084,0.731284,"(7.69, 7.96]"


### Printando os tipos de cada coluna

```python
# Interessante relembrar dessa vez como os tipos pós normalização ficaram

In [4]:
dataset.dtypes

Person ID                    int64
Gender                      object
Age                          int64
Occupation                  object
Sleep Duration             float64
Quality of Sleep             int64
Physical Activity Level    float64
Stress Level                 int64
BMI Category                object
Blood Pressure              object
Heart Rate                   int64
Daily Steps                  int64
Sleep Disorder              object
Age Norm                   float64
Sleep Norm                 float64
Quality Norm               float64
Physical Norm              float64
Stress Norm                float64
Heart Norm                 float64
Steps Norm                 float64
Sleep Discretized           object
dtype: object

### Trocando o tipo dos dados "object" para "category"

In [21]:
dataset["Gender"] = dataset["Gender"].astype("category")
dataset["Occupation"] = dataset["Occupation"].astype("category")
dataset["BMI Category"] = dataset["BMI Category"].astype("category")
dataset["Blood Pressure"] = dataset["Blood Pressure"].astype("category")
dataset["Sleep Disorder"] = dataset["Sleep Disorder"].astype("category")

### Dropando colunas

```python

# Aqui precisamos "dropar" as colunas desnecessárias ou redundantes para o treinamento do modelo, no caso várias colunas a quais já temos suas versões normalizadas e outras como "Person ID"/"Sleep Discretized"/"Quality norm"

In [5]:
ml_dataset = dataset.drop(columns=['Sleep Discretized', 'Person ID', 'Age', 'Sleep Duration', 'Physical Activity Level', 'Stress Level', 'Heart Rate', 'Daily Steps', 'Quality Norm'])
ml_dataset.head(10)

Unnamed: 0,Gender,Occupation,Quality of Sleep,BMI Category,Blood Pressure,Sleep Disorder,Age Norm,Sleep Norm,Physical Norm,Stress Norm,Heart Norm,Steps Norm
0,Male,Software Engineer,6,Overweight,126/83,,-1.75075,-1.29633,-0.826915,0.346556,1.652505,-1.617417
1,Male,Doctor,6,Normal,125/80,,-1.635452,-1.17067,0.03852,1.473618,1.168908,1.967442
2,Male,Doctor,6,Normal,125/80,,-1.635452,-1.170675,0.03852,1.473618,1.168908,1.967442
3,Male,Sales Representative,4,Obese,140/90,Sleep Apnea,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
4,Male,Sales Representative,4,Obese,140/90,Sleep Apnea,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
5,Male,Software Engineer,4,Obese,140/90,Insomnia,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
6,Male,Teacher,6,Obese,140/90,Insomnia,-1.520153,-1.04501,-0.923075,0.910087,2.861497,-2.050073
7,Male,Doctor,7,Normal,120/80,,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284
8,Male,Doctor,7,Normal,120/80,,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284
9,Male,Doctor,7,Normal,120/80,,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284


### Tipos finais do dataset

```python
ml_dataset.dtypes
# aqui temos como ficaram os tipos do dataset

In [23]:
ml_dataset.dtypes

Gender              category
Occupation          category
Quality of Sleep       int64
BMI Category        category
Blood Pressure      category
Sleep Disorder      category
Age Norm             float64
Sleep Norm           float64
Physical Norm        float64
Stress Norm          float64
Heart Norm           float64
Steps Norm           float64
dtype: object

#### categóricas -> numéricas

```python
# Como para alguns algoritmos de machine learning (não todos), precisamos que todas as colunas sejam numéricas, então iremos converter logo.

In [6]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

category_columns = ['Gender', 'Occupation', 'BMI Category', 'Blood Pressure', 'Sleep Disorder']

one_hot_encoder = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'),
    category_columns),
    remainder='passthrough')

data = one_hot_encoder.fit_transform(ml_dataset)

if not isinstance(data, pd.DataFrame):
    data = data.toarray()

# Get the feature names
feature_names = one_hot_encoder.get_feature_names_out()

# Create the DataFrame with the correct column names
data_df = pd.DataFrame(data, columns=feature_names)

# Display the first few rows of the new DataFrame
data_df.head(10)

Unnamed: 0,onehotencoder__Gender_Female,onehotencoder__Gender_Male,onehotencoder__Occupation_Accountant,onehotencoder__Occupation_Doctor,onehotencoder__Occupation_Engineer,onehotencoder__Occupation_Lawyer,onehotencoder__Occupation_Manager,onehotencoder__Occupation_Nurse,onehotencoder__Occupation_Sales Representative,onehotencoder__Occupation_Salesperson,...,onehotencoder__Sleep Disorder_Insomnia,onehotencoder__Sleep Disorder_Sleep Apnea,onehotencoder__Sleep Disorder_nan,remainder__Quality of Sleep,remainder__Age Norm,remainder__Sleep Norm,remainder__Physical Norm,remainder__Stress Norm,remainder__Heart Norm,remainder__Steps Norm
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,6.0,-1.75075,-1.29633,-0.826915,0.346556,1.652505,-1.617417
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,6.0,-1.635452,-1.17067,0.03852,1.473618,1.168908,1.967442
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,6.0,-1.635452,-1.170675,0.03852,1.473618,1.168908,1.967442
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,4.0,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,4.0,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,4.0,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,6.0,-1.520153,-1.04501,-0.923075,0.910087,2.861497,-2.050073
7,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,7.0,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284
8,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,7.0,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284
9,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,7.0,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284


### Resultado do One Hot Encoder

```python
# Observe a mudança na dimensionalidade acarretada pelo algoritmo, utilizamos o One Hot Encoder para evitar problemas resultantes de ordem explícitas que possivelmente afetam as operações matemáticas

In [7]:
data_df.dtypes

onehotencoder__Gender_Female                      float64
onehotencoder__Gender_Male                        float64
onehotencoder__Occupation_Accountant              float64
onehotencoder__Occupation_Doctor                  float64
onehotencoder__Occupation_Engineer                float64
onehotencoder__Occupation_Lawyer                  float64
onehotencoder__Occupation_Manager                 float64
onehotencoder__Occupation_Nurse                   float64
onehotencoder__Occupation_Sales Representative    float64
onehotencoder__Occupation_Salesperson             float64
onehotencoder__Occupation_Scientist               float64
onehotencoder__Occupation_Software Engineer       float64
onehotencoder__Occupation_Teacher                 float64
onehotencoder__BMI Category_Normal                float64
onehotencoder__BMI Category_Normal Weight         float64
onehotencoder__BMI Category_Obese                 float64
onehotencoder__BMI Category_Overweight            float64
onehotencoder_

### Separando os dados
Usamos a seguinte distribuição:
`70% treino`, `15% validação`, `15% teste`

In [8]:
target = 'remainder__Quality of Sleep'

X = data_df.drop(columns=[target])
y = data_df[target]

# treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# agora dividindo treino/validação
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [9]:
len_X = len(X)
len(X_train)/len_X, len(X_val)/len_X, len(X_test)/len_X

(0.6978609625668449, 0.1497326203208556, 0.15240641711229946)

In [28]:
X_train.head()

Unnamed: 0,onehotencoder__Gender_Female,onehotencoder__Gender_Male,onehotencoder__Occupation_Accountant,onehotencoder__Occupation_Doctor,onehotencoder__Occupation_Engineer,onehotencoder__Occupation_Lawyer,onehotencoder__Occupation_Manager,onehotencoder__Occupation_Nurse,onehotencoder__Occupation_Sales Representative,onehotencoder__Occupation_Salesperson,...,onehotencoder__Blood Pressure_142/92,onehotencoder__Sleep Disorder_Insomnia,onehotencoder__Sleep Disorder_Sleep Apnea,onehotencoder__Sleep Disorder_nan,remainder__Age Norm,remainder__Sleep Norm,remainder__Physical Norm,remainder__Stress Norm,remainder__Heart Norm,remainder__Steps Norm
19,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,-1.404855,0.588571,0.759717,0.346556,-0.040084,0.731284
357,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.823506,1.091211,0.759717,-1.344036,-0.523681,0.113204
79,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,-1.058959,-1.42199,-1.403873,1.473618,0.443513,-1.122954
167,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,-0.13657,-0.039729,-0.201879,0.346556,0.443513,-0.504875
18,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,-1.520153,-0.79369,-0.923075,0.910087,2.3779,-1.741033


In [29]:
y_train.head()

19     7.0
357    9.0
79     6.0
167    7.0
18     5.0
Name: remainder__Quality of Sleep, dtype: float64

Utilizaremos os seguintes algoritmos para a nossa tarefa de regressão:
- `KNN Regressor`
- ``Decision Tree Regressor``
- ``Random Forest Regressor``
- ``Support Vector Regressor (SVR)``

#### Setup do mlflow

In [32]:
host = "127.0.0.1"
port = "8080"

def setup_mlflow(host, port, name):
    # endereço local no qual o mlflow está rodando
    mlflow.set_tracking_uri(uri=f"http://{host}:{port}")

    # Setando o experimento onde serão publicados os dados
    mlflow.set_experiment(f"{name}")

#### Funções para mlflow e treinamento

In [19]:
from sklearn.metrics import make_scorer

In [21]:
# usaremos essas metricas no gridsearch
scores = {
    'MSE': make_scorer(mean_squared_error),
    'MAE': make_scorer(mean_absolute_error),
    'r2': make_scorer(r2_score)
}

In [39]:
def best_params_grid(model, param_grid, scores, refit):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scores, cv=5, refit=refit)
    grid_search.fit(X_train, y_train)
    
    top_5_results = pd.DataFrame(grid_search.cv_results_)[['params','rank_test_r2', 'mean_test_r2']].sort_values(by='rank_test_r2').head()
    
    return top_5_results['params']

In [47]:
def log_data(model_func, top_params, run_name, model_name):
    i = 0
    for param_set in top_params:
        i += 1
        current_run_name = run_name + ' ' + str(i)
        model = model_func(**param_set)

        print(f"using model {model.get_params()}")

        # Start an MLFlow run
        with mlflow.start_run(run_name=current_run_name):
            # Fit the grid search
            model.fit(X_train, y_train)
            
            # Predict on the test set
            y_pred = model.predict(X_val)
            
            # Calculate performance metrics
            mae = mean_absolute_error(y_val, y_pred)
            mse = mean_squared_error(y_val, y_pred)
            r2 = r2_score(y_val, y_pred)
            
            # Log the hyperparameters and metrics
            mlflow.log_params(param_set)
            mlflow.log_metric("MAE", mae)
            mlflow.log_metric("MSE", mse)
            mlflow.log_metric("R2", r2)
            
            # Set a tag to remind ourselves what this run was for
            mlflow.set_tag("Model", f"{model_name}")
            
            # Infer the model signature
            signature = infer_signature(X_train, model.predict(X_train))
            
            # Log the best model
            mlflow.sklearn.log_model(
                sk_model=model,
                artifact_path=f"{model_name}",
                signature=signature,
                input_example=X_train,
                registered_model_name=f"{model_name}",
            )

In [48]:
# Define the range of hyperparameters for the Grid Search
param_grid = {
    "max_depth": list(range(10,100,10)),
    "min_samples_split": [2, 5, 10],
    #"min_s
    # fazer quando acordar
}

setup_mlflow(host, port, 'First Models: DecisionTreeRegressor')

# Initialize the Decision Tree Regressor
dtr = DecisionTreeRegressor()
best_params = best_params_grid(DecisionTreeRegressor(), param_grid, scores, 'r2')
log_data(DecisionTreeRegressor, best_params, 'DecisionTreeRegressor', 'DecisionTreeRegressor')

using model {'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 50, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}


Registered model 'DecisionTreeRegressor' already exists. Creating a new version of this model...
2024/07/08 14:50:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeRegressor, version 21
Created version '21' of model 'DecisionTreeRegressor'.


using model {'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 20, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}


Registered model 'DecisionTreeRegressor' already exists. Creating a new version of this model...
2024/07/08 14:50:54 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeRegressor, version 22
Created version '22' of model 'DecisionTreeRegressor'.


using model {'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 50, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}


Registered model 'DecisionTreeRegressor' already exists. Creating a new version of this model...
2024/07/08 14:50:55 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeRegressor, version 23
Created version '23' of model 'DecisionTreeRegressor'.


using model {'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 70, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}


Registered model 'DecisionTreeRegressor' already exists. Creating a new version of this model...
2024/07/08 14:50:57 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeRegressor, version 24
Created version '24' of model 'DecisionTreeRegressor'.


using model {'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 80, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}


Registered model 'DecisionTreeRegressor' already exists. Creating a new version of this model...
2024/07/08 14:50:58 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeRegressor, version 25
Created version '25' of model 'DecisionTreeRegressor'.


In [24]:
# Define the range of hyperparameters for the Grid Search
param_grid = {
    "max_depth": list(range(10,100,10)),
    "min_samples_split": [2, 5, 10],
    #"min_s
    # fazer quando acordar
}

# Initialize the Decision Tree Regressor
dtr = DecisionTreeRegressor()

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=dtr, param_grid=param_grid, scoring=scores, cv=5, refit='r2')
grid_search.fit(X_train, y_train)

In [25]:
pd.DataFrame(grid_search.cv_results_).columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_max_depth', 'param_min_samples_split', 'params',
       'split0_test_MSE', 'split1_test_MSE', 'split2_test_MSE',
       'split3_test_MSE', 'split4_test_MSE', 'mean_test_MSE', 'std_test_MSE',
       'rank_test_MSE', 'split0_test_MAE', 'split1_test_MAE',
       'split2_test_MAE', 'split3_test_MAE', 'split4_test_MAE',
       'mean_test_MAE', 'std_test_MAE', 'rank_test_MAE', 'split0_test_r2',
       'split1_test_r2', 'split2_test_r2', 'split3_test_r2', 'split4_test_r2',
       'mean_test_r2', 'std_test_r2', 'rank_test_r2'],
      dtype='object')

In [31]:
pd.DataFrame(grid_search.cv_results_)[['params','rank_test_r2', 'mean_test_r2']].sort_values(by='rank_test_r2').head()

Unnamed: 0,params,rank_test_r2,mean_test_r2
14,"{'max_depth': 50, 'min_samples_split': 10}",1,0.953733
20,"{'max_depth': 70, 'min_samples_split': 10}",2,0.953188
17,"{'max_depth': 60, 'min_samples_split': 10}",2,0.953188
21,"{'max_depth': 80, 'min_samples_split': 2}",4,0.952899
10,"{'max_depth': 40, 'min_samples_split': 5}",5,0.952206


In [None]:
# Define the range of hyperparameters for the Grid Search
param_grid = {
    "max_depth": list(range(10,100,10)),
    "min_samples_split": [2, 5, 10],
    #"min_s
    # fazer quando acordar
}

# Initialize the Decision Tree Regressor
dtr = DecisionTreeRegressor()

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=dtr, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3)
def train_model(model, param_grid, ):

def log_data():
    # Start an MLFlow run
    with mlflow.start_run(run_name="teste_dtr"):
        # Fit the grid search
        grid_search.fit(X_train, y_train)
        
        # Log the best model's hyperparameters and metrics
        best_params = grid_search.best_params_
        best_model = grid_search.best_estimator_
        
        # Predict on the test set
        y_pred = best_model.predict(X_test)
        
        # Calculate performance metrics
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        # Log the hyperparameters and metrics
        mlflow.log_params(best_params)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("R2", r2)
        
        # Set a tag to remind ourselves what this run was for
        mlflow.set_tag("Model", "Decision Tree Regressor with GridSearchCV")
        
        # Infer the model signature
        signature = infer_signature(X_train, best_model.predict(X_train))
        
        # Log the best model
        mlflow.sklearn.log_model(
            sk_model=best_model,
            artifact_path="DecisionTreeRegressorModel",
            signature=signature,
            input_example=X_train,
            registered_model_name="DecisionTreeRegressorModel",
        )
        
        print(f"Logged best model with MAE: {mae}, MSE: {mse}, R2: {r2}")


#### Testes com `KNN`

In [30]:
# Escolhemos aqui os parâmetros que serão estudados/testados pelo GridSearch

param_knn = {
    'n_neighbors': range(1, 31),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
}

knn_regressor = KNeighborsRegressor()

# Vamos utilizar GridSearch para encontrar os melhores parâmetros

grid_search = GridSearchCV(estimator=knn_regressor, param_grid=param_knn, cv=5)

# Printando os valores encontrados
print(knn_regressor.get_params())

{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}


#### Resultados GridSearch
```python
# portanto encontramos os parâmetros teoricamente ideais para o KNN

print(knn_regressor.get_params())
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}


In [31]:
knn_regressor.fit(X_train, y_train)
y_pred = knn_regressor.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R²): {r2}")

Mean Absolute Error (MAE): 0.04210526315789474
Mean Squared Error (MSE): 0.023859649122807018
R-squared (R²): 0.9833146792940164


In [32]:
# Endereço obtido ao rodar o mlflow
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")

# Setando o experimento onde serão publicados os dados
mlflow.set_experiment("MLflow Quickstart")

# Rodando Mlflow
with mlflow.start_run(run_name='teste_knn', description="Luciano lindo"):
    # fazendo o log dos hiperparâmetros
    #mlflow.log_params(param_knn)
    mlflow.log_params(knn_regressor.get_params())

    # Realizando o log dos hiperparâmetros
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("R2", r2)

    
    mlflow.set_tag("Training KNN Regressor", "KNN for Regression")

    signature = infer_signature(X_train, knn_regressor.predict(X_train))

    # Logando o modelo
    model_info = mlflow.sklearn.log_model(
        sk_model=knn_regressor,
        artifact_path="Sleep_health",
        signature=signature,
        input_example=X_train,
        registered_model_name="tracking-quickstart",
    )


Registered model 'tracking-quickstart' already exists. Creating a new version of this model...
2024/07/08 02:29:20 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking-quickstart, version 14
Created version '14' of model 'tracking-quickstart'.


In [33]:
# Define the range of hyperparameters for the Grid Search
param_grid = {
    "max_depth": list(range(10,100,10)),
    "min_samples_split": [2, 5, 10],
    #"min_s
    # fazer quando acordar
}

# Initialize the Decision Tree Regressor
dtr = DecisionTreeRegressor()

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=dtr, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3)

# Start an MLFlow run
with mlflow.start_run(run_name="teste_dtr"):
    # Fit the grid search
    grid_search.fit(X_train, y_train)
    
    # Log the best model's hyperparameters and metrics
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    
    # Predict on the test set
    y_pred = best_model.predict(X_test)
    
    # Calculate performance metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Log the hyperparameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("R2", r2)
    
    # Set a tag to remind ourselves what this run was for
    mlflow.set_tag("Model", "Decision Tree Regressor with GridSearchCV")
    
    # Infer the model signature
    signature = infer_signature(X_train, best_model.predict(X_train))
    
    # Log the best model
    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="DecisionTreeRegressorModel",
        signature=signature,
        input_example=X_train,
        registered_model_name="DecisionTreeRegressorModel",
    )
    
    print(f"Logged best model with MAE: {mae}, MSE: {mse}, R2: {r2}")


Registered model 'DecisionTreeRegressorModel' already exists. Creating a new version of this model...
2024/07/08 02:29:22 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeRegressorModel, version 14


Logged best model with MAE: 0.03508771929824561, MSE: 0.02850877192982456, R2: 0.9800634954799827


Created version '14' of model 'DecisionTreeRegressorModel'.


In [34]:



params_rfr = {
    'n_estimators': range(1,20),
    'max_depth': [None] + list(range(10,100,10)), # testando de 1 a 100 tava demorando demais
    'max_features': ['sqrt', 'log2']
}
rfr = RandomForestRegressor(**params_rfr)

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=rfr, param_grid=params_rfr, scoring='neg_mean_squared_error', cv=3)

# Start an MLFlow run
with mlflow.start_run(run_name="teste_rfr"):
    # Fit the grid search
    grid_search.fit(X_train, y_train)
    
    # Log the best model's hyperparameters and metrics
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    
    # Predict on the test set
    y_pred = best_model.predict(X_test)
    
    # Calculate performance metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Log the hyperparameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("R2", r2)
    
    # Set a tag to remind ourselves what this run was for
    mlflow.set_tag("Model", "Random Forest Regressor with GridSearchCV")
    
    # Infer the model signature
    signature = infer_signature(X_train, best_model.predict(X_train))
    
    # Log the best model
    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="DecisionTreeRegressorModel",
        signature=signature,
        input_example=X_train,
        registered_model_name="RandomForestRegressorModel",
    )
    
    print(f"Logged best model with MAE: {mae}, MSE: {mse}, R2: {r2}")


Registered model 'RandomForestRegressorModel' already exists. Creating a new version of this model...
2024/07/08 02:29:39 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestRegressorModel, version 8


Logged best model with MAE: 0.05056759545923629, MSE: 0.019486432343835357, R2: 0.9863729189227032


Created version '8' of model 'RandomForestRegressorModel'.


In [35]:
from sklearn.svm import SVR


params_svr = {
    'C': [0.1, 1, 10, 100, 1000],
    'epsilon': list(range(0,5)),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

svr = SVR(**params_svr)

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=svr, param_grid=params_svr, scoring='neg_mean_squared_error', cv=3)

# Start an MLFlow run
with mlflow.start_run(run_name="teste_svr", description="bom dia frederick almeida!!!"):
    # Fit the grid search
    grid_search.fit(X_train, y_train)
    
    # Log the best model's hyperparameters and metrics
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_
    
    # Predict on the test set
    y_pred = best_model.predict(X_test)
    
    # Calculate performance metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Log the hyperparameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("R2", r2)
    
    # Set a tag to remind ourselves what this run was for
    mlflow.set_tag("Model", "Random Forest Regressor with GridSearchCV")
    
    # Infer the model signature
    signature = infer_signature(X_train, best_model.predict(X_train))
    
    # Log the best model
    mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="DecisionTreeRegressorModel",
        signature=signature,
        input_example=X_train,
        registered_model_name="RandomForestRegressorModel",
    )
    
    print(f"Logged best model with MAE: {mae}, MSE: {mse}, R2: {r2}")


Registered model 'RandomForestRegressorModel' already exists. Creating a new version of this model...
2024/07/08 02:30:51 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestRegressorModel, version 9


Logged best model with MAE: 0.04075354901808306, MSE: 0.031559783050316356, R2: 0.9779298891238748


Created version '9' of model 'RandomForestRegressorModel'.
