# Importação de Bibliotecas

```python
# importa as bibliotecas necessárias para nosso projeto

In [2]:
import pandas as pd

import mlflow
from mlflow.models import infer_signature

from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.model_selection import GridSearchCV, train_test_split

#import mlflow.sklearn
#from mlflow.models.signature import infer_signature

In [3]:
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

# Alternatively, suppress specific warnings, for example, UserWarnings
warnings.filterwarnings('ignore', category=UserWarning)

### Lendo o dataset pré-processado

```python

# Printando as 10 primeiras linhas para relembrar a estrutura dos dados


In [4]:
dataset = pd.read_csv('pre_processed_dataframe.csv')
dataset.head(10)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,...,Daily Steps,Sleep Disorder,Age Norm,Sleep Norm,Quality Norm,Physical Norm,Stress Norm,Heart Norm,Steps Norm,Sleep Discretized
0,1,Male,27,Software Engineer,6.1,6,42.0,6,Overweight,126/83,...,4200,,-1.75075,-1.29633,-1.096811,-0.826915,0.346556,1.652505,-1.617417,"(6.07, 6.34]"
1,2,Male,28,Doctor,6.2,6,60.0,8,Normal,125/80,...,10000,,-1.635452,-1.17067,-1.096811,0.03852,1.473618,1.168908,1.967442,"(6.07, 6.34]"
2,3,Male,28,Doctor,6.199996,6,60.0,8,Normal,125/80,...,10000,,-1.635452,-1.170675,-1.096811,0.03852,1.473618,1.168908,1.967442,"(6.07, 6.34]"
3,4,Male,28,Sales Representative,5.9,4,30.0,8,Obese,140/90,...,3000,Sleep Apnea,-1.635452,-1.54765,-2.767716,-1.403873,1.473618,3.586893,-2.359112,"(5.797, 6.07]"
4,5,Male,28,Sales Representative,5.9,4,30.0,8,Obese,140/90,...,3000,Sleep Apnea,-1.635452,-1.54765,-2.767716,-1.403873,1.473618,3.586893,-2.359112,"(5.797, 6.07]"
5,6,Male,28,Software Engineer,5.9,4,30.0,8,Obese,140/90,...,3000,Insomnia,-1.635452,-1.54765,-2.767716,-1.403873,1.473618,3.586893,-2.359112,"(5.797, 6.07]"
6,7,Male,29,Teacher,6.3,6,40.0,7,Obese,140/90,...,3500,Insomnia,-1.520153,-1.04501,-1.096811,-0.923075,0.910087,2.861497,-2.050073,"(6.07, 6.34]"
7,8,Male,29,Doctor,7.8,7,75.0,6,Normal,120/80,...,8000,,-1.520153,0.839891,-0.261358,0.759717,0.346556,-0.040084,0.731284,"(7.69, 7.96]"
8,9,Male,29,Doctor,7.8,7,75.0,6,Normal,120/80,...,8000,,-1.520153,0.839891,-0.261358,0.759717,0.346556,-0.040084,0.731284,"(7.69, 7.96]"
9,10,Male,29,Doctor,7.8,7,75.0,6,Normal,120/80,...,8000,,-1.520153,0.839891,-0.261358,0.759717,0.346556,-0.040084,0.731284,"(7.69, 7.96]"


### Printando os tipos de cada coluna

```python
# Interessante relembrar dessa vez como os tipos pós normalização ficaram

In [4]:
dataset.dtypes

Person ID                    int64
Gender                      object
Age                          int64
Occupation                  object
Sleep Duration             float64
Quality of Sleep             int64
Physical Activity Level    float64
Stress Level                 int64
BMI Category                object
Blood Pressure              object
Heart Rate                   int64
Daily Steps                  int64
Sleep Disorder              object
Age Norm                   float64
Sleep Norm                 float64
Quality Norm               float64
Physical Norm              float64
Stress Norm                float64
Heart Norm                 float64
Steps Norm                 float64
Sleep Discretized           object
dtype: object

### Trocando o tipo dos dados "object" para "category"

In [5]:
dataset["Gender"] = dataset["Gender"].astype("category")
dataset["Occupation"] = dataset["Occupation"].astype("category")
dataset["BMI Category"] = dataset["BMI Category"].astype("category")
dataset["Blood Pressure"] = dataset["Blood Pressure"].astype("category")
dataset["Sleep Disorder"] = dataset["Sleep Disorder"].astype("category")

### Dropando colunas

```python

# Aqui precisamos "dropar" as colunas desnecessárias ou redundantes para o treinamento do modelo, no caso várias colunas a quais já temos suas versões normalizadas e outras como "Person ID"/"Sleep Discretized"/"Quality norm"

In [6]:
ml_dataset = dataset.drop(columns=['Sleep Discretized', 'Person ID', 'Age', 'Sleep Duration', 'Physical Activity Level', 'Stress Level', 'Heart Rate', 'Daily Steps', 'Quality Norm'])
ml_dataset.head(10)

Unnamed: 0,Gender,Occupation,Quality of Sleep,BMI Category,Blood Pressure,Sleep Disorder,Age Norm,Sleep Norm,Physical Norm,Stress Norm,Heart Norm,Steps Norm
0,Male,Software Engineer,6,Overweight,126/83,,-1.75075,-1.29633,-0.826915,0.346556,1.652505,-1.617417
1,Male,Doctor,6,Normal,125/80,,-1.635452,-1.17067,0.03852,1.473618,1.168908,1.967442
2,Male,Doctor,6,Normal,125/80,,-1.635452,-1.170675,0.03852,1.473618,1.168908,1.967442
3,Male,Sales Representative,4,Obese,140/90,Sleep Apnea,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
4,Male,Sales Representative,4,Obese,140/90,Sleep Apnea,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
5,Male,Software Engineer,4,Obese,140/90,Insomnia,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
6,Male,Teacher,6,Obese,140/90,Insomnia,-1.520153,-1.04501,-0.923075,0.910087,2.861497,-2.050073
7,Male,Doctor,7,Normal,120/80,,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284
8,Male,Doctor,7,Normal,120/80,,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284
9,Male,Doctor,7,Normal,120/80,,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284


### Tipos finais do dataset

```python
ml_dataset.dtypes
# aqui temos como ficaram os tipos do dataset

In [23]:
ml_dataset.dtypes

Gender              category
Occupation          category
Quality of Sleep       int64
BMI Category        category
Blood Pressure      category
Sleep Disorder      category
Age Norm             float64
Sleep Norm           float64
Physical Norm        float64
Stress Norm          float64
Heart Norm           float64
Steps Norm           float64
dtype: object

#### categóricas -> numéricas

```python
# Como para alguns algoritmos de machine learning (não todos), precisamos que todas as colunas sejam numéricas, então iremos converter logo.

In [7]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

category_columns = ['Gender', 'Occupation', 'BMI Category', 'Blood Pressure', 'Sleep Disorder']

one_hot_encoder = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'),
    category_columns),
    remainder='passthrough')

data = one_hot_encoder.fit_transform(ml_dataset)

if not isinstance(data, pd.DataFrame):
    data = data.toarray()

# Get the feature names
feature_names = one_hot_encoder.get_feature_names_out()

# Create the DataFrame with the correct column names
data_df = pd.DataFrame(data, columns=feature_names)

# Display the first few rows of the new DataFrame
data_df.head(10)

Unnamed: 0,onehotencoder__Gender_Female,onehotencoder__Gender_Male,onehotencoder__Occupation_Accountant,onehotencoder__Occupation_Doctor,onehotencoder__Occupation_Engineer,onehotencoder__Occupation_Lawyer,onehotencoder__Occupation_Manager,onehotencoder__Occupation_Nurse,onehotencoder__Occupation_Sales Representative,onehotencoder__Occupation_Salesperson,...,onehotencoder__Sleep Disorder_Insomnia,onehotencoder__Sleep Disorder_Sleep Apnea,onehotencoder__Sleep Disorder_nan,remainder__Quality of Sleep,remainder__Age Norm,remainder__Sleep Norm,remainder__Physical Norm,remainder__Stress Norm,remainder__Heart Norm,remainder__Steps Norm
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,6.0,-1.75075,-1.29633,-0.826915,0.346556,1.652505,-1.617417
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,6.0,-1.635452,-1.17067,0.03852,1.473618,1.168908,1.967442
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,6.0,-1.635452,-1.170675,0.03852,1.473618,1.168908,1.967442
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,4.0,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,4.0,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,4.0,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,6.0,-1.520153,-1.04501,-0.923075,0.910087,2.861497,-2.050073
7,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,7.0,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284
8,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,7.0,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284
9,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,7.0,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284


### Resultado do One Hot Encoder

```python
# Observe a mudança na dimensionalidade acarretada pelo algoritmo, utilizamos o One Hot Encoder para evitar problemas resultantes de ordem explícitas que possivelmente afetam as operações matemáticas

In [7]:
data_df.dtypes

onehotencoder__Gender_Female                      float64
onehotencoder__Gender_Male                        float64
onehotencoder__Occupation_Accountant              float64
onehotencoder__Occupation_Doctor                  float64
onehotencoder__Occupation_Engineer                float64
onehotencoder__Occupation_Lawyer                  float64
onehotencoder__Occupation_Manager                 float64
onehotencoder__Occupation_Nurse                   float64
onehotencoder__Occupation_Sales Representative    float64
onehotencoder__Occupation_Salesperson             float64
onehotencoder__Occupation_Scientist               float64
onehotencoder__Occupation_Software Engineer       float64
onehotencoder__Occupation_Teacher                 float64
onehotencoder__BMI Category_Normal                float64
onehotencoder__BMI Category_Normal Weight         float64
onehotencoder__BMI Category_Obese                 float64
onehotencoder__BMI Category_Overweight            float64
onehotencoder_

### Separando os dados
Usamos a seguinte distribuição:
`70% treino`, `15% validação`, `15% teste`

In [8]:
target = 'remainder__Quality of Sleep'

X = data_df.drop(columns=[target])
y = data_df[target]

# treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# agora dividindo treino/validação
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [9]:
len_X = len(X)
len(X_train)/len_X, len(X_val)/len_X, len(X_test)/len_X

(0.6978609625668449, 0.1497326203208556, 0.15240641711229946)

In [28]:
X_train.head()

Unnamed: 0,onehotencoder__Gender_Female,onehotencoder__Gender_Male,onehotencoder__Occupation_Accountant,onehotencoder__Occupation_Doctor,onehotencoder__Occupation_Engineer,onehotencoder__Occupation_Lawyer,onehotencoder__Occupation_Manager,onehotencoder__Occupation_Nurse,onehotencoder__Occupation_Sales Representative,onehotencoder__Occupation_Salesperson,...,onehotencoder__Blood Pressure_142/92,onehotencoder__Sleep Disorder_Insomnia,onehotencoder__Sleep Disorder_Sleep Apnea,onehotencoder__Sleep Disorder_nan,remainder__Age Norm,remainder__Sleep Norm,remainder__Physical Norm,remainder__Stress Norm,remainder__Heart Norm,remainder__Steps Norm
19,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,-1.404855,0.588571,0.759717,0.346556,-0.040084,0.731284
357,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.823506,1.091211,0.759717,-1.344036,-0.523681,0.113204
79,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,-1.058959,-1.42199,-1.403873,1.473618,0.443513,-1.122954
167,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,-0.13657,-0.039729,-0.201879,0.346556,0.443513,-0.504875
18,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,-1.520153,-0.79369,-0.923075,0.910087,2.3779,-1.741033


In [29]:
y_train.head()

19     7.0
357    9.0
79     6.0
167    7.0
18     5.0
Name: remainder__Quality of Sleep, dtype: float64

Utilizaremos os seguintes algoritmos para a nossa tarefa de regressão:
- `KNN Regressor`
- ``Decision Tree Regressor``
- ``Random Forest Regressor``
- ``Support Vector Regressor (SVR)``

#### Setup do mlflow

In [10]:
host = "127.0.0.1"
port = "8080"

def setup_mlflow(host, port, name):
    # endereço local no qual o mlflow está rodando
    mlflow.set_tracking_uri(uri=f"http://{host}:{port}")

    # Setando o experimento onde serão publicados os dados
    mlflow.set_experiment(f"{name}")

#### Funções para mlflow e treinamento

In [11]:
from sklearn.metrics import make_scorer

In [12]:
# usaremos essas metricas no gridsearch
scores = {
    'MSE': make_scorer(mean_squared_error),
    'MAE': make_scorer(mean_absolute_error),
    'r2': make_scorer(r2_score)
}

In [13]:
def best_params_grid(model, param_grid, scores, refit):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scores, cv=5, refit=refit)
    grid_search.fit(X_train, y_train)
    
    top_5_results = pd.DataFrame(grid_search.cv_results_)[['params','rank_test_r2', 'mean_test_r2']].sort_values(by='rank_test_r2').head()
    
    return top_5_results

In [14]:
def log_data(model_func, top_params, run_name, model_name):
    i = 0
    for param_set in top_params:
        i += 1
        current_run_name = run_name + ' ' + str(i)
        model = model_func(**param_set)

        print(f"using model {model.get_params()}")

        # Start an MLFlow run
        with mlflow.start_run(run_name=current_run_name):
            # Fit the grid search
            model.fit(X_train, y_train)
            
            # Predict on the test set
            y_pred = model.predict(X_val)
            
            # Calculate performance metrics
            mae = mean_absolute_error(y_val, y_pred)
            mse = mean_squared_error(y_val, y_pred)
            r2 = r2_score(y_val, y_pred)
            
            # Log the hyperparameters and metrics
            mlflow.log_params(param_set)
            mlflow.log_metric("MAE", mae)
            mlflow.log_metric("MSE", mse)
            mlflow.log_metric("R2", r2)
            
            # Set a tag to remind ourselves what this run was for
            mlflow.set_tag("Model", f"{model_name}")
            
            # Infer the model signature
            signature = infer_signature(X_train, model.predict(X_train))
            
            # Log the best model
            mlflow.sklearn.log_model(
                sk_model=model,
                artifact_path=f"{model_name}",
                signature=signature,
                input_example=X_train,
                registered_model_name=f"{model_name}",
            )

#### Decision Tree Regressor

In [67]:
# Define the range of hyperparameters for the Grid Search
param_grid = {
    "max_depth": list(range(10,110,20)),
    "min_samples_split": [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

setup_mlflow(host, port, 'First Models: DecisionTreeRegressor')

# Initialize the Decision Tree Regressor
best_params = best_params_grid(DecisionTreeRegressor(), param_grid, scores, 'r2')
log_data(DecisionTreeRegressor, best_params['params'], 'DecisionTreeRegressor', 'DecisionTreeRegressor')

2024/07/08 15:14:43 INFO mlflow.tracking.fluent: Experiment with name 'First Models: DecisionTreeRegressor' does not exist. Creating a new experiment.


using model {'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 90, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}


Registered model 'DecisionTreeRegressor' already exists. Creating a new version of this model...
2024/07/08 15:14:46 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeRegressor, version 56
Created version '56' of model 'DecisionTreeRegressor'.


using model {'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 70, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}


Registered model 'DecisionTreeRegressor' already exists. Creating a new version of this model...
2024/07/08 15:14:47 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeRegressor, version 57
Created version '57' of model 'DecisionTreeRegressor'.


using model {'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 10, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}


Registered model 'DecisionTreeRegressor' already exists. Creating a new version of this model...
2024/07/08 15:14:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeRegressor, version 58
Created version '58' of model 'DecisionTreeRegressor'.


using model {'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 90, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}


Registered model 'DecisionTreeRegressor' already exists. Creating a new version of this model...
2024/07/08 15:14:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeRegressor, version 59
Created version '59' of model 'DecisionTreeRegressor'.


using model {'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 30, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}


Registered model 'DecisionTreeRegressor' already exists. Creating a new version of this model...
2024/07/08 15:14:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeRegressor, version 60
Created version '60' of model 'DecisionTreeRegressor'.


Melhor conjunto de parametros:
- max_depth = 70
- min_samples_leaf = 4
- min_samples_split = 5

Métricas:
- MAE = 0.048
- MSE = 0.027
- R2 = 0.984

#### Testes com `KNN`

In [71]:
# Escolhemos aqui os parâmetros que serão estudados/testados pelo GridSearch
param_grid = {
    'n_neighbors': range(1, 31),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
}

setup_mlflow(host, port, 'First Models: KNN regressor')

best_params = best_params_grid(KNeighborsRegressor(), param_grid, scores, 'r2')
log_data(KNeighborsRegressor, best_params['params'], 'KNN regressor', 'KNN regressor')

2024/07/08 15:17:42 INFO mlflow.tracking.fluent: Experiment with name 'First Models: KNN regressor' does not exist. Creating a new experiment.


using model {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 7, 'p': 2, 'weights': 'distance'}


Registered model 'KNN regressor' already exists. Creating a new version of this model...
2024/07/08 15:17:48 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: KNN regressor, version 2
Created version '2' of model 'KNN regressor'.


using model {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'euclidean', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 7, 'p': 2, 'weights': 'distance'}


Registered model 'KNN regressor' already exists. Creating a new version of this model...
2024/07/08 15:17:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: KNN regressor, version 3
Created version '3' of model 'KNN regressor'.


using model {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 3, 'p': 2, 'weights': 'distance'}


Registered model 'KNN regressor' already exists. Creating a new version of this model...
2024/07/08 15:17:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: KNN regressor, version 4
Created version '4' of model 'KNN regressor'.


using model {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'euclidean', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 3, 'p': 2, 'weights': 'distance'}


Registered model 'KNN regressor' already exists. Creating a new version of this model...
2024/07/08 15:17:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: KNN regressor, version 5
Created version '5' of model 'KNN regressor'.


using model {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'euclidean', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 8, 'p': 2, 'weights': 'distance'}


Registered model 'KNN regressor' already exists. Creating a new version of this model...
2024/07/08 15:17:55 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: KNN regressor, version 6
Created version '6' of model 'KNN regressor'.


Melhor conjunto de parametros:
- weights = distance
- n_neighbors = 7
- metric = minkowski

Métricas:
- MAE = 0.019
- MSE = 0.012
- R2 = 0.993

#### Resultados GridSearch
```python
# portanto encontramos os parâmetros teoricamente ideais para o KNN

print(knn_regressor.get_params())
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}


In [31]:
knn_regressor.fit(X_train, y_train)
y_pred = knn_regressor.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R²): {r2}")

Mean Absolute Error (MAE): 0.04210526315789474
Mean Squared Error (MSE): 0.023859649122807018
R-squared (R²): 0.9833146792940164


#### Testes com Random Forest Regressor

In [72]:
# Escolhemos aqui os parâmetros que serão estudados/testados pelo GridSearch
param_grid = {
    'n_estimators': range(1,20),
    'max_depth': [None] + list(range(10,100,10)), # testando de 1 a 100 tava demorando demais
    'max_features': ['sqrt', 'log2']
}

setup_mlflow(host, port, 'First Models: Random Forest Regressor')

best_params = best_params_grid(RandomForestRegressor(), param_grid, scores, 'r2')
log_data(RandomForestRegressor, best_params['params'], 'Random Forest Regressor', 'Random Forest Regressor')

2024/07/08 15:20:56 INFO mlflow.tracking.fluent: Experiment with name 'First Models: Random Forest Regressor' does not exist. Creating a new experiment.


using model {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 90, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 11, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


Successfully registered model 'Random Forest Regressor'.
2024/07/08 15:21:18 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random Forest Regressor, version 1
Created version '1' of model 'Random Forest Regressor'.


using model {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 90, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 15, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


Registered model 'Random Forest Regressor' already exists. Creating a new version of this model...
2024/07/08 15:21:19 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random Forest Regressor, version 2
Created version '2' of model 'Random Forest Regressor'.


using model {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 80, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 10, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


Registered model 'Random Forest Regressor' already exists. Creating a new version of this model...
2024/07/08 15:21:21 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random Forest Regressor, version 3
Created version '3' of model 'Random Forest Regressor'.


using model {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 90, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 18, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


Registered model 'Random Forest Regressor' already exists. Creating a new version of this model...
2024/07/08 15:21:23 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random Forest Regressor, version 4
Created version '4' of model 'Random Forest Regressor'.


using model {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 20, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 15, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


Registered model 'Random Forest Regressor' already exists. Creating a new version of this model...
2024/07/08 15:21:25 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random Forest Regressor, version 5
Created version '5' of model 'Random Forest Regressor'.


Melhor conjunto de parametros:
- max_depth = 90
- n_estimators = 18
- max_features = sqrt

Métricas:
- MAE = 0.04
- MSE = 0.024
- R2 = 0.986

#### Testes com Support Vector Regressor

In [16]:
# Escolhemos aqui os parâmetros que serão estudados/testados pelo GridSearch
param_grid = {
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.01, 0.1, 1],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

setup_mlflow(host, port, 'First Models: Support Vector Regressor')

best_params = best_params_grid(SVR(), param_grid, scores, 'r2')
log_data(SVR, best_params['params'], 'Support Vector Regressor', 'Support Vector Regressor')

2024/07/08 15:47:19 INFO mlflow.tracking.fluent: Experiment with name 'First Models: Support Vector Regressor' does not exist. Creating a new experiment.


using model {'C': 1, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.01, 'gamma': 'scale', 'kernel': 'linear', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}


Successfully registered model 'Support Vector Regressor'.
2024/07/08 15:47:25 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Support Vector Regressor, version 1
Created version '1' of model 'Support Vector Regressor'.


using model {'C': 1, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.01, 'gamma': 'scale', 'kernel': 'poly', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}


Registered model 'Support Vector Regressor' already exists. Creating a new version of this model...
2024/07/08 15:47:27 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Support Vector Regressor, version 2
Created version '2' of model 'Support Vector Regressor'.


using model {'C': 10, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.01, 'gamma': 'scale', 'kernel': 'poly', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}


Registered model 'Support Vector Regressor' already exists. Creating a new version of this model...
2024/07/08 15:47:28 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Support Vector Regressor, version 3
Created version '3' of model 'Support Vector Regressor'.


using model {'C': 100, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.01, 'gamma': 'scale', 'kernel': 'poly', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}


Registered model 'Support Vector Regressor' already exists. Creating a new version of this model...
2024/07/08 15:47:30 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Support Vector Regressor, version 4
Created version '4' of model 'Support Vector Regressor'.


using model {'C': 10, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'poly', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}


Registered model 'Support Vector Regressor' already exists. Creating a new version of this model...
2024/07/08 15:47:31 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Support Vector Regressor, version 5
Created version '5' of model 'Support Vector Regressor'.


Melhor conjunto de parametros:
- epsilon = 0.01
- C = 1
- kernel = poly

Métricas:
- MAE = 0.031
- MSE = 0.005
- R2 = 0.997