# Importação de Bibliotecas

Importamos as bibliotecas necessárias para o nosso projeto.

In [1]:
import pandas as pd

import mlflow
from mlflow.models import infer_signature

from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.model_selection import GridSearchCV, train_test_split

In [2]:
import warnings

# Install a catch-all "ignore" filter so no warning is shown from here on
warnings.simplefilter('ignore')

### Lendo o dataset pré-processado

Exibimos as 10 primeiras linhas para relembrar a estrutura dos dados.


In [9]:
# Load the previously saved, pre-processed dataset from disk
dataset = pd.read_csv(filepath_or_buffer='pre_processed_dataframe.csv')
# Preview the first 10 rows to recall the structure of the data
dataset.head(n=10)

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,...,Daily Steps,Sleep Disorder,Age Norm,Sleep Norm,Quality Norm,Physical Norm,Stress Norm,Heart Norm,Steps Norm,Sleep Discretized
0,1,Male,27,Software Engineer,6.1,6,42.0,6,Overweight,126/83,...,4200,,-1.75075,-1.29633,-1.096811,-0.826915,0.346556,1.652505,-1.617417,"(6.07, 6.34]"
1,2,Male,28,Doctor,6.2,6,60.0,8,Normal,125/80,...,10000,,-1.635452,-1.17067,-1.096811,0.03852,1.473618,1.168908,1.967442,"(6.07, 6.34]"
2,3,Male,28,Doctor,6.199996,6,60.0,8,Normal,125/80,...,10000,,-1.635452,-1.170675,-1.096811,0.03852,1.473618,1.168908,1.967442,"(6.07, 6.34]"
3,4,Male,28,Sales Representative,5.9,4,30.0,8,Obese,140/90,...,3000,Sleep Apnea,-1.635452,-1.54765,-2.767716,-1.403873,1.473618,3.586893,-2.359112,"(5.797, 6.07]"
4,5,Male,28,Sales Representative,5.9,4,30.0,8,Obese,140/90,...,3000,Sleep Apnea,-1.635452,-1.54765,-2.767716,-1.403873,1.473618,3.586893,-2.359112,"(5.797, 6.07]"
5,6,Male,28,Software Engineer,5.9,4,30.0,8,Obese,140/90,...,3000,Insomnia,-1.635452,-1.54765,-2.767716,-1.403873,1.473618,3.586893,-2.359112,"(5.797, 6.07]"
6,7,Male,29,Teacher,6.3,6,40.0,7,Obese,140/90,...,3500,Insomnia,-1.520153,-1.04501,-1.096811,-0.923075,0.910087,2.861497,-2.050073,"(6.07, 6.34]"
7,8,Male,29,Doctor,7.8,7,75.0,6,Normal,120/80,...,8000,,-1.520153,0.839891,-0.261358,0.759717,0.346556,-0.040084,0.731284,"(7.69, 7.96]"
8,9,Male,29,Doctor,7.8,7,75.0,6,Normal,120/80,...,8000,,-1.520153,0.839891,-0.261358,0.759717,0.346556,-0.040084,0.731284,"(7.69, 7.96]"
9,10,Male,29,Doctor,7.8,7,75.0,6,Normal,120/80,...,8000,,-1.520153,0.839891,-0.261358,0.759717,0.346556,-0.040084,0.731284,"(7.69, 7.96]"


### Printando os tipos de cada coluna

É interessante relembrar como ficaram os tipos das colunas após a normalização.

In [10]:
# Display the dtype of every column as it was loaded from the CSV file
dataset.dtypes

Person ID                    int64
Gender                      object
Age                          int64
Occupation                  object
Sleep Duration             float64
Quality of Sleep             int64
Physical Activity Level    float64
Stress Level                 int64
BMI Category                object
Blood Pressure              object
Heart Rate                   int64
Daily Steps                  int64
Sleep Disorder              object
Age Norm                   float64
Sleep Norm                 float64
Quality Norm               float64
Physical Norm              float64
Stress Norm                float64
Heart Norm                 float64
Steps Norm                 float64
Sleep Discretized           object
dtype: object

Após salvarmos o dataset processado em um arquivo .csv e importá-lo novamente, observamos que as variáveis categóricas voltaram a ser "object", então iremos transformá-las em categóricas novamente.

### Trocando o tipo dos dados "object" para "category"

In [11]:
# Cast the columns that read_csv loaded as "object" to the "category" dtype
for column_name in ("Gender", "Occupation", "BMI Category", "Blood Pressure", "Sleep Disorder"):
    dataset[column_name] = dataset[column_name].astype("category")

### Dropando colunas

Aqui precisamos "dropar" as colunas desnecessárias ou redundantes para o treinamento do modelo: as colunas originais das quais já temos versões normalizadas e outras, como "Person ID" e "Sleep Discretized", que não agregam informação relevante para as previsões (Person ID, por exemplo, é um valor arbitrário). Iremos tentar prever a coluna **Quality of Sleep**; por isso removemos a sua versão normalizada ("Quality Norm") e mantemos a versão original.

In [12]:
# Columns left out of training: the discretized/ID columns, the raw columns
# that already have a normalized counterpart, and the normalized target
columns_to_drop = [
    'Sleep Discretized',
    'Person ID',
    'Age',
    'Sleep Duration',
    'Physical Activity Level',
    'Stress Level',
    'Heart Rate',
    'Daily Steps',
    'Quality Norm',
]
ml_dataset = dataset.drop(columns=columns_to_drop)
# Preview the first 10 rows of the reduced dataset
ml_dataset.head(n=10)

Unnamed: 0,Gender,Occupation,Quality of Sleep,BMI Category,Blood Pressure,Sleep Disorder,Age Norm,Sleep Norm,Physical Norm,Stress Norm,Heart Norm,Steps Norm
0,Male,Software Engineer,6,Overweight,126/83,,-1.75075,-1.29633,-0.826915,0.346556,1.652505,-1.617417
1,Male,Doctor,6,Normal,125/80,,-1.635452,-1.17067,0.03852,1.473618,1.168908,1.967442
2,Male,Doctor,6,Normal,125/80,,-1.635452,-1.170675,0.03852,1.473618,1.168908,1.967442
3,Male,Sales Representative,4,Obese,140/90,Sleep Apnea,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
4,Male,Sales Representative,4,Obese,140/90,Sleep Apnea,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
5,Male,Software Engineer,4,Obese,140/90,Insomnia,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
6,Male,Teacher,6,Obese,140/90,Insomnia,-1.520153,-1.04501,-0.923075,0.910087,2.861497,-2.050073
7,Male,Doctor,7,Normal,120/80,,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284
8,Male,Doctor,7,Normal,120/80,,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284
9,Male,Doctor,7,Normal,120/80,,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284


In [13]:
# Display the dtypes of the columns kept for training
ml_dataset.dtypes

Gender              category
Occupation          category
Quality of Sleep       int64
BMI Category        category
Blood Pressure      category
Sleep Disorder      category
Age Norm             float64
Sleep Norm           float64
Physical Norm        float64
Stress Norm          float64
Heart Norm           float64
Steps Norm           float64
dtype: object

#### categóricas -> numéricas

Alguns algoritmos de machine learning (não todos) exigem que todas as colunas sejam numéricas, então faremos essa conversão desde já.

Utilizamos o one hot encoder para transformar as colunas 'Gender', 'Occupation', 'BMI Category', 'Blood Pressure', 'Sleep Disorder' em colunas numéricas.

In [14]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder

# Categorical columns to be one-hot encoded
category_columns = ['Gender', 'Occupation', 'BMI Category', 'Blood Pressure', 'Sleep Disorder']

# Build the column transformer: one-hot encode the categorical columns and
# pass every other column through unchanged
one_hot_encoder = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'),
    category_columns),
    remainder='passthrough')

# Fit the transformer on the dataset and transform it
data = one_hot_encoder.fit_transform(ml_dataset)

# The transformer may return a SciPy sparse matrix, a dense ndarray or a
# DataFrame. Only the sparse matrix has (and needs) .toarray(); calling it on
# a dense ndarray would raise AttributeError.
if hasattr(data, "toarray"):
    data = data.toarray()

feature_names = one_hot_encoder.get_feature_names_out()

# Wrap the encoded matrix back into a DataFrame with readable column names
data_df = pd.DataFrame(data, columns=feature_names)

data_df.head(10)

Unnamed: 0,onehotencoder__Gender_Female,onehotencoder__Gender_Male,onehotencoder__Occupation_Accountant,onehotencoder__Occupation_Doctor,onehotencoder__Occupation_Engineer,onehotencoder__Occupation_Lawyer,onehotencoder__Occupation_Manager,onehotencoder__Occupation_Nurse,onehotencoder__Occupation_Sales Representative,onehotencoder__Occupation_Salesperson,...,onehotencoder__Sleep Disorder_Insomnia,onehotencoder__Sleep Disorder_Sleep Apnea,onehotencoder__Sleep Disorder_nan,remainder__Quality of Sleep,remainder__Age Norm,remainder__Sleep Norm,remainder__Physical Norm,remainder__Stress Norm,remainder__Heart Norm,remainder__Steps Norm
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,6.0,-1.75075,-1.29633,-0.826915,0.346556,1.652505,-1.617417
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,6.0,-1.635452,-1.17067,0.03852,1.473618,1.168908,1.967442
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,6.0,-1.635452,-1.170675,0.03852,1.473618,1.168908,1.967442
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,4.0,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,4.0,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,4.0,-1.635452,-1.54765,-1.403873,1.473618,3.586893,-2.359112
6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,6.0,-1.520153,-1.04501,-0.923075,0.910087,2.861497,-2.050073
7,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,7.0,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284
8,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,7.0,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284
9,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,7.0,-1.520153,0.839891,0.759717,0.346556,-0.040084,0.731284


### Resultado do One Hot Encoder

Observe a mudança na dimensionalidade acarretada pelo algoritmo. Utilizamos o One Hot Encoder para evitar que uma ordem explícita (e artificial) entre as categorias afete as operações matemáticas dos modelos.

In [15]:
# Display the dtypes of the one-hot encoded DataFrame
data_df.dtypes

onehotencoder__Gender_Female                      float64
onehotencoder__Gender_Male                        float64
onehotencoder__Occupation_Accountant              float64
onehotencoder__Occupation_Doctor                  float64
onehotencoder__Occupation_Engineer                float64
onehotencoder__Occupation_Lawyer                  float64
onehotencoder__Occupation_Manager                 float64
onehotencoder__Occupation_Nurse                   float64
onehotencoder__Occupation_Sales Representative    float64
onehotencoder__Occupation_Salesperson             float64
onehotencoder__Occupation_Scientist               float64
onehotencoder__Occupation_Software Engineer       float64
onehotencoder__Occupation_Teacher                 float64
onehotencoder__BMI Category_Normal                float64
onehotencoder__BMI Category_Normal Weight         float64
onehotencoder__BMI Category_Obese                 float64
onehotencoder__BMI Category_Overweight            float64
onehotencoder_

### Separando os dados
Usamos a seguinte distribuição:

`70% treino`, `15% validação`, `15% teste`

Tentaremos prever a qualidade de sono (problema de regressão), de acordo com os outros dados, então na divisão dos dados, colocamos essa coluna na variável "y", enquanto as outras permanecem em "X".

In [16]:
# Name of the target column after the one-hot encoding step
target = 'remainder__Quality of Sleep'

# The target goes to y; every other column is a feature in X
y = data_df[target]
X = data_df.drop(columns=[target])

# First split: 70% for training, 30% held out
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Second split: the held-out part is halved into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

In [17]:
# Check the share of rows that ended up in train, validation and test
len_X = len(X)
tuple(len(subset) / len_X for subset in (X_train, X_val, X_test))

(0.6978609625668449, 0.1497326203208556, 0.15240641711229946)

In [18]:
# Preview the first rows of the training features
X_train.head()

Unnamed: 0,onehotencoder__Gender_Female,onehotencoder__Gender_Male,onehotencoder__Occupation_Accountant,onehotencoder__Occupation_Doctor,onehotencoder__Occupation_Engineer,onehotencoder__Occupation_Lawyer,onehotencoder__Occupation_Manager,onehotencoder__Occupation_Nurse,onehotencoder__Occupation_Sales Representative,onehotencoder__Occupation_Salesperson,...,onehotencoder__Blood Pressure_142/92,onehotencoder__Sleep Disorder_Insomnia,onehotencoder__Sleep Disorder_Sleep Apnea,onehotencoder__Sleep Disorder_nan,remainder__Age Norm,remainder__Sleep Norm,remainder__Physical Norm,remainder__Stress Norm,remainder__Heart Norm,remainder__Steps Norm
19,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,-1.404855,0.588571,0.759717,0.346556,-0.040084,0.731284
357,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.823506,1.091211,0.759717,-1.344036,-0.523681,0.113204
79,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,-1.058959,-1.42199,-1.403873,1.473618,0.443513,-1.122954
167,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,-0.13657,-0.039729,-0.201879,0.346556,0.443513,-0.504875
18,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,-1.520153,-0.79369,-0.923075,0.910087,2.3779,-1.741033


In [19]:
# Preview the first rows of the training target
y_train.head()

19     7.0
357    9.0
79     6.0
167    7.0
18     5.0
Name: remainder__Quality of Sleep, dtype: float64

### Testando algoritmos de machine learning
Utilizaremos os seguintes algoritmos para a nossa tarefa de regressão:
- `KNN Regressor`
- ``Decision Tree Regressor``
- ``Random Forest Regressor``
- ``Support Vector Regressor (SVR)``

#### Setup do mlflow

In [20]:
# Address of the local MLflow tracking server used by setup_mlflow below.
# NOTE(review): nothing here starts the server; presumably it has to be
# launched separately before running the cells that log data — confirm.
host = "127.0.0.1"
port = "8080"

def setup_mlflow(host, port, name):
    """Point MLflow at a tracking server and select the active experiment.

    Args:
        host: Host of the MLflow tracking server.
        port: Port of the MLflow tracking server.
        name: Name of the experiment that subsequent runs are logged under.
    """
    # Tracking server address, in the form http://<host>:<port>
    tracking_uri = "http://{}:{}".format(host, port)
    mlflow.set_tracking_uri(uri=tracking_uri)

    # Experiment that will receive the logged runs
    experiment_name = "{}".format(name)
    mlflow.set_experiment(experiment_name)

#### Funções para mlflow e treinamento

In [21]:
from sklearn.metrics import make_scorer

Utilizaremos as seguintes métricas durante o treinamento:
- MSE: mede a média dos quadrados dos erros, onde o erro é a diferença entre o valor real e o valor previsto.
- MAE: mede a média das diferenças absolutas entre os valores reais e os valores previstos.
- R2: mede a proporção da variância dos valores reais que é explicada pelo modelo.

In [None]:
# Scorers handed to GridSearchCV.
# GridSearchCV treats a larger score as better, so the error metrics (MSE and
# MAE) are declared with greater_is_better=False. scikit-learn then negates
# them, which keeps their rank_test_* columns meaningful (lowest error first).
scores = {
    'MSE': make_scorer(mean_squared_error, greater_is_better=False),
    'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
    'r2': make_scorer(r2_score)
}

Passamos para essa função o modelo, os hiperparâmetros que queremos testar e as métricas de avaliação (o argumento `refit` indica qual métrica define o ranking quando usamos mais de uma). O GridSearchCV testa todas as combinações de hiperparâmetros com validação cruzada, e a função retorna as 5 melhores, que depois serão avaliadas no conjunto de validação.

In [5]:
def best_params_grid(model, param_grid, scores, refit, cv=5, top_n=5):
    """Grid-search ``model`` and return its best hyperparameter combinations.

    The search is fitted on the module-level ``X_train`` / ``y_train``.

    Args:
        model: Unfitted estimator to tune.
        param_grid: Mapping of hyperparameter name to the values to try.
        scores: Scoring passed to ``GridSearchCV`` (here a dict of scorers).
        refit: Metric used by ``GridSearchCV`` to refit the best estimator.
            When it is a metric name, it also selects the ranking columns.
        cv: Number of cross-validation folds (defaults to 5).
        top_n: Number of best combinations to return (defaults to 5).

    Returns:
        DataFrame with the columns ``params``, ``rank_test_<metric>`` and
        ``mean_test_<metric>``, sorted by rank and limited to ``top_n`` rows.
    """
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scores, cv=cv, refit=refit)
    grid_search.fit(X_train, y_train)

    # Rank by the metric named in `refit`; if `refit` is not a metric name
    # (e.g. a callable), fall back to the r2 columns used previously.
    metric = refit if isinstance(refit, str) else 'r2'
    rank_column = f'rank_test_{metric}'
    mean_column = f'mean_test_{metric}'

    cv_results = pd.DataFrame(grid_search.cv_results_)
    top_results = cv_results[['params', rank_column, mean_column]].sort_values(by=rank_column).head(top_n)

    return top_results

Aqui, testamos as combinações de hiperparametros retornadas na função acima, no conjunto de validação, registrando tudo no mlflow.

In [4]:
def log_data(model_func, top_params, run_name, model_name):
    """Train, validate and log to MLflow one model per hyperparameter set.

    For each parameter set a fresh model is built with
    ``model_func(**param_set)``, fitted on the module-level ``X_train`` /
    ``y_train`` and evaluated on the module-level ``X_val`` / ``y_val``.
    Parameters, metrics (MAE, MSE, R2), a "Model" tag and the fitted model
    itself are logged in a dedicated MLflow run.

    Args:
        model_func: Callable that returns an unfitted estimator when called
            with the hyperparameters as keyword arguments.
        top_params: Iterable of hyperparameter dicts to evaluate.
        run_name: Prefix of each run name; the 1-based position of the
            parameter set is appended to it.
        model_name: Used as the "Model" tag, the artifact path and the
            registered model name.
    """
    for position, param_set in enumerate(top_params, start=1):
        current_run_name = f"{run_name} {position}"
        model = model_func(**param_set)

        print(f"using model {model.get_params()}")

        # Start an MLflow run
        with mlflow.start_run(run_name=current_run_name):
            # Train the model and predict on the validation set
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)

            # Compute the validation metrics
            mae = mean_absolute_error(y_val, y_pred)
            mse = mean_squared_error(y_val, y_pred)
            r2 = r2_score(y_val, y_pred)

            # Log the hyperparameters and the metrics
            mlflow.log_params(param_set)
            mlflow.log_metric("MAE", mae)
            mlflow.log_metric("MSE", mse)
            mlflow.log_metric("R2", r2)

            # Tag the run with the algorithm name so it can be found later
            mlflow.set_tag("Model", f"{model_name}")

            # Infer the model signature from the training inputs and outputs
            signature = infer_signature(X_train, model.predict(X_train))

            # Log and register the model. Only a few rows are stored as the
            # input example: an example is meant to be a small sample, and
            # passing the full training set stores a copy of it in every run.
            mlflow.sklearn.log_model(
                sk_model=model,
                artifact_path=f"{model_name}",
                signature=signature,
                input_example=X_train.head(5),
                registered_model_name=f"{model_name}",
            )

#### Decision Tree Regressor

Decision Tree Regressor é um modelo de aprendizado de máquina usado para prever valores contínuos. Ele funciona particionando os dados em subconjuntos baseados em características, de forma hierárquica, e calculando previsões nas folhas da árvore.

![Decision Tree](https://www.mastersindatascience.org/wp-content/uploads/sites/54/2022/05/tree-graphic.jpg)

Escolhemos os seguintes hiperparâmetros:

max_depth:
- Define a profundidade máxima da árvore de decisão.
- Controla o crescimento da árvore para evitar que ela se torne muito complexa (overfitting) ou muito simples (underfitting).

min_samples_split
- Define o número mínimo de amostras necessárias para que um nó interno possa ser dividido.
- Previne a divisão de nós que contêm muito poucas amostras, o que poderia levar a uma árvore excessivamente complexa e overfitting.

min_samples_leaf
- Define o número mínimo de amostras que uma folha deve conter.
- Garante que as folhas não sejam muito pequenas, o que pode ajudar a regularizar a árvore e melhorar a generalização.

In [67]:
# Hyperparameter values explored by the grid search
param_grid = {
    "max_depth": [10, 30, 50, 70, 90],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

# Log everything under a dedicated MLflow experiment
setup_mlflow(host, port, 'First Models: DecisionTreeRegressor')

# Grid-search the Decision Tree Regressor, then evaluate the returned
# parameter sets on the validation set and log them
best_params = best_params_grid(DecisionTreeRegressor(), param_grid, scores, 'r2')
log_data(
    DecisionTreeRegressor,
    best_params['params'],
    'DecisionTreeRegressor',
    'DecisionTreeRegressor',
)

2024/07/08 15:14:43 INFO mlflow.tracking.fluent: Experiment with name 'First Models: DecisionTreeRegressor' does not exist. Creating a new experiment.


using model {'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 90, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}


Registered model 'DecisionTreeRegressor' already exists. Creating a new version of this model...
2024/07/08 15:14:46 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeRegressor, version 56
Created version '56' of model 'DecisionTreeRegressor'.


using model {'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 70, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 5, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}


Registered model 'DecisionTreeRegressor' already exists. Creating a new version of this model...
2024/07/08 15:14:47 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeRegressor, version 57
Created version '57' of model 'DecisionTreeRegressor'.


using model {'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 10, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}


Registered model 'DecisionTreeRegressor' already exists. Creating a new version of this model...
2024/07/08 15:14:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeRegressor, version 58
Created version '58' of model 'DecisionTreeRegressor'.


using model {'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 90, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}


Registered model 'DecisionTreeRegressor' already exists. Creating a new version of this model...
2024/07/08 15:14:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeRegressor, version 59
Created version '59' of model 'DecisionTreeRegressor'.


using model {'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 30, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 10, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}


Registered model 'DecisionTreeRegressor' already exists. Creating a new version of this model...
2024/07/08 15:14:52 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DecisionTreeRegressor, version 60
Created version '60' of model 'DecisionTreeRegressor'.


Melhor conjunto de parametros:
- max_depth = 70
- min_samples_leaf = 4
- min_samples_split = 5

Métricas:
- MAE = 0.048
- MSE = 0.027
- R2 = 0.984

#### Testes com `KNN`

KNN calcula a distância entre o ponto de dados a ser previsto e todos os pontos de dados no conjunto de treinamento. Ele identifica os K pontos de dados no conjunto de treinamento que estão mais próximos do ponto de dados a ser previsto e, para o caso de regressão, o algoritmo calcula a média (ou a mediana) dos valores dos K vizinhos mais próximos e usa essa média como a previsão para o ponto de dados de teste.

Escolhemos os seguintes hiperparâmetros:

n_neighbors
- Define o número de vizinhos mais próximos a serem considerados para fazer uma previsão.
- Controla o número de pontos de dados do conjunto de treinamento que influenciam a previsão para um novo ponto de dados.

weights
- Define a função de ponderação usada para determinar a influência dos vizinhos na previsão.
- Controla como a proximidade dos vizinhos influencia a previsão.

metric
- Define a métrica de distância usada para calcular a proximidade entre os pontos de dados.
- Determina como a similaridade entre os pontos de dados é medida.

In [None]:
# Hyperparameter values explored by the grid search
# NOTE(review): 'minkowski' with the default p=2 should be equivalent to
# 'euclidean', so those candidates look redundant — confirm before pruning.
param_grid = dict(
    n_neighbors=list(range(1, 31)),
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan', 'minkowski'],
)

# Log everything under a dedicated MLflow experiment
setup_mlflow(host, port, 'First Models: KNN regressor')

# Grid-search the KNN regressor, then evaluate the returned parameter sets
# on the validation set and log them
best_params = best_params_grid(KNeighborsRegressor(), param_grid, scores, 'r2')
log_data(
    KNeighborsRegressor,
    best_params['params'],
    'KNN regressor',
    'KNN regressor',
)

Melhor conjunto de parametros:
- weights = distance
- n_neighbors = 7
- metric = minkowski

Métricas:
- MAE = 0.019
- MSE = 0.012
- R2 = 0.993

#### Testes com Random Forest Regressor

É uma coleção (floresta) de árvores de decisão que são treinadas de forma independente e cujas previsões são combinadas para produzir a previsão final. No caso de regressão, a previsão final é geralmente a média das previsões de todas as árvores individuais.

Escolhemos os seguintes hiperparâmetros:

n_estimators
 - Define o número de árvores na floresta.
 - Controla o número de modelos de árvores de decisão individuais que serão combinados para formar a previsão final.

max_depth
 - Define a profundidade máxima de cada árvore individual na floresta.
 - Controla o crescimento de cada árvore para evitar overfitting.

max_features
 - Define o número de características a serem consideradas ao procurar a melhor divisão em cada nó.
 - Introduz aleatoriedade adicional no modelo para reduzir a correlação entre as árvores individuais e melhorar a robustez do modelo.

In [72]:
# Hyperparameter values explored by the grid search
param_grid = dict(
    n_estimators=list(range(1, 20)),
    max_depth=[None, *range(10, 100, 10)],
    max_features=['sqrt', 'log2'],
)

# Log everything under a dedicated MLflow experiment
setup_mlflow(host, port, 'First Models: Random Forest Regressor')

# Grid-search the Random Forest Regressor, then evaluate the returned
# parameter sets on the validation set and log them
best_params = best_params_grid(RandomForestRegressor(), param_grid, scores, 'r2')
log_data(
    RandomForestRegressor,
    best_params['params'],
    'Random Forest Regressor',
    'Random Forest Regressor',
)

2024/07/08 15:20:56 INFO mlflow.tracking.fluent: Experiment with name 'First Models: Random Forest Regressor' does not exist. Creating a new experiment.


using model {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 90, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 11, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


Successfully registered model 'Random Forest Regressor'.
2024/07/08 15:21:18 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random Forest Regressor, version 1
Created version '1' of model 'Random Forest Regressor'.


using model {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 90, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 15, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


Registered model 'Random Forest Regressor' already exists. Creating a new version of this model...
2024/07/08 15:21:19 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random Forest Regressor, version 2
Created version '2' of model 'Random Forest Regressor'.


using model {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 80, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 10, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


Registered model 'Random Forest Regressor' already exists. Creating a new version of this model...
2024/07/08 15:21:21 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random Forest Regressor, version 3
Created version '3' of model 'Random Forest Regressor'.


using model {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 90, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 18, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


Registered model 'Random Forest Regressor' already exists. Creating a new version of this model...
2024/07/08 15:21:23 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random Forest Regressor, version 4
Created version '4' of model 'Random Forest Regressor'.


using model {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 20, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 15, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}


Registered model 'Random Forest Regressor' already exists. Creating a new version of this model...
2024/07/08 15:21:25 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random Forest Regressor, version 5
Created version '5' of model 'Random Forest Regressor'.


Melhor conjunto de parametros:
- max_depth = 90
- n_estimators = 18
- max_features = sqrt

Métricas:
- MAE = 0.04
- MSE = 0.024
- R2 = 0.986

#### Testes com Support Vector Regressor

SVR é uma extensão do algoritmo de Support Vector Machines (SVM) para problemas de regressão. Em vez de apenas encontrar uma margem que separa classes, o SVR tenta encontrar uma função que desvia o mínimo possível dos pontos de dados observados, com uma margem de tolerância especificada.

Escolhemos os seguintes hiperparâmetros:

C (Parâmetro de Regularização)
 - Controla a penalização dos erros de treinamento.
 - Balancear a maximização da margem e a minimização do erro de treinamento.

epsilon (Margem de Tolerância)
 - Define uma faixa dentro da qual os erros não são penalizados.
 - Criar uma margem de tolerância ao redor da função de regressão onde os erros são considerados aceitáveis e não influenciam a função de custo.

kernel (Função do Kernel)
 - Define o tipo de função do kernel a ser usada para transformar os dados.
 - Permitir que o SVR lide com problemas de regressão linear e não linear.

In [16]:
# Hyperparameter values explored by the grid search
param_grid = dict(
    C=[0.1, 1, 10, 100],
    epsilon=[0.01, 0.1, 1],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
)

# Log everything under a dedicated MLflow experiment
setup_mlflow(host, port, 'First Models: Support Vector Regressor')

# Grid-search the Support Vector Regressor, then evaluate the returned
# parameter sets on the validation set and log them
best_params = best_params_grid(SVR(), param_grid, scores, 'r2')
log_data(
    SVR,
    best_params['params'],
    'Support Vector Regressor',
    'Support Vector Regressor',
)

2024/07/08 15:47:19 INFO mlflow.tracking.fluent: Experiment with name 'First Models: Support Vector Regressor' does not exist. Creating a new experiment.


using model {'C': 1, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.01, 'gamma': 'scale', 'kernel': 'linear', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}


Successfully registered model 'Support Vector Regressor'.
2024/07/08 15:47:25 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Support Vector Regressor, version 1
Created version '1' of model 'Support Vector Regressor'.


using model {'C': 1, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.01, 'gamma': 'scale', 'kernel': 'poly', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}


Registered model 'Support Vector Regressor' already exists. Creating a new version of this model...
2024/07/08 15:47:27 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Support Vector Regressor, version 2
Created version '2' of model 'Support Vector Regressor'.


using model {'C': 10, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.01, 'gamma': 'scale', 'kernel': 'poly', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}


Registered model 'Support Vector Regressor' already exists. Creating a new version of this model...
2024/07/08 15:47:28 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Support Vector Regressor, version 3
Created version '3' of model 'Support Vector Regressor'.


using model {'C': 100, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.01, 'gamma': 'scale', 'kernel': 'poly', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}


Registered model 'Support Vector Regressor' already exists. Creating a new version of this model...
2024/07/08 15:47:30 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Support Vector Regressor, version 4
Created version '4' of model 'Support Vector Regressor'.


using model {'C': 10, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'poly', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}


Registered model 'Support Vector Regressor' already exists. Creating a new version of this model...
2024/07/08 15:47:31 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Support Vector Regressor, version 5
Created version '5' of model 'Support Vector Regressor'.


Melhor conjunto de parâmetros:
- epsilon = 0.01
- C = 1
- kernel = poly

Métricas:
- MAE = 0.031
- MSE = 0.005
- R2 = 0.997

### Testando o melhor modelo de cada algoritmo, com o conjunto de teste

In [29]:
# Hyperparameters of the model selected for each algorithm; every dict is
# later unpacked as keyword arguments into the matching estimator.
decision_tree_params = dict(
    max_depth=70,
    min_samples_split=5,
    min_samples_leaf=4,
)

knn_params = dict(
    n_neighbors=7,
    weights='distance',
    metric='minkowski',
)

random_forest_params = dict(
    n_estimators=18,
    max_depth=80,
    max_features='sqrt',
)

svr_params = dict(
    C=1,
    epsilon=0.01,
    kernel='poly',
)

In [34]:
# Instantiate one estimator per algorithm from its hyperparameter dict.
dt = DecisionTreeRegressor(**decision_tree_params)
knn = KNeighborsRegressor(**knn_params)
rf = RandomForestRegressor(**random_forest_params)
svr = SVR(**svr_params)

# Each entry is an [estimator, label] pair; the label is used downstream as
# the MLflow run name and registered-model name, so it is kept verbatim.
models = [
    [estimator, label]
    for estimator, label in (
        (dt, "decistion tree"),
        (knn, "knn"),
        (rf, "random forest"),
        (svr, "support vector"),
    )
]

In [39]:
# Create (or switch to) a dedicated MLflow experiment for the comparison.
setup_mlflow(host, port, "Comparing Algorithms")

for model in models:
    # Every entry of `models` is an [estimator, label] pair.
    estimator, run_label = model

    # Fit with the selected hyperparameters and predict on the test set.
    estimator.fit(X_train, y_train)
    y_preds = estimator.predict(X_test)

    # Test-set metrics.
    mae = mean_absolute_error(y_test, y_preds)
    mse = mean_squared_error(y_test, y_preds)
    r2 = r2_score(y_test, y_preds)

    # Record everything in MLflow, one run per algorithm.
    with mlflow.start_run(run_name=run_label):
        # Hyperparameters and metrics.
        mlflow.log_params(estimator.get_params())
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("MSE", mse)
        mlflow.log_metric("R2", r2)

        # Tag the run with the algorithm label.
        mlflow.set_tag("Model", run_label)

        # Model signature inferred from the training inputs/outputs.
        signature = infer_signature(X_train, estimator.predict(X_train))

        # Register the fitted estimator itself. Passing the whole
        # [estimator, label] pair would log a plain list that cannot be
        # served as a model.
        mlflow.sklearn.log_model(
            sk_model=estimator,
            artifact_path=run_label,
            signature=signature,
            # A handful of rows is enough as an example; storing the full
            # training set would bloat the artifact.
            # Assumes X_train supports row slicing (DataFrame or ndarray).
            input_example=X_train[:5],
            registered_model_name=run_label,
        )



Registered model 'decistion tree' already exists. Creating a new version of this model...
2024/07/10 14:43:01 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: decistion tree, version 2
Created version '2' of model 'decistion tree'.
Registered model 'knn' already exists. Creating a new version of this model...
2024/07/10 14:43:03 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: knn, version 2
Created version '2' of model 'knn'.
Registered model 'random forest' already exists. Creating a new version of this model...
2024/07/10 14:43:04 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random forest, version 2
Created version '2' of model 'random forest'.
Registered model 'support vector' already exists. Creating a new version of this model...
2024/07/10 14:43:06 IN

Resultados:
- Decision Tree:
  - MAE: 0.044
  - MSE: 0.033
  - R2: 0.977
- KNN:
  - MAE: 0.016
  - MSE: 0.004
  - R2: 0.997
- Random Forest:
  - MAE: 0.026
  - MSE: 0.006
  - R2: 0.995
- Support Vector:
  - MAE: 0.053
  - MSE: 0.010
  - R2: 0.993

Dessa forma, o melhor algoritmo para esse dataset em particular acabou sendo o KNN, com todas as métricas melhores que as métricas dos outros modelos.

## Diagnóstico

```python
# Aqui então realizaremos testes/diagnósticos com nosso melhor modelo encontrado, no caso o KNN, abaixo temos os 3 hiperparâmetros tunados

In [30]:
# Display the hyperparameter dictionary used to build the KNN model.
print(knn_params)

{'n_neighbors': 7, 'weights': 'distance', 'metric': 'minkowski'}


```python
# Aqui faremos a análise de alguns pontos chave do modelo, levantando a possibilidade de problemas como Underfit/Overfit

In [43]:
from sklearn.metrics import mean_squared_error
import numpy as np

# Predictions of the fitted KNN on both splits.
predTrain = knn.predict(X_train)
predTest = knn.predict(X_test)

# RMSE computed explicitly as sqrt(MSE). The `squared=False` shortcut of
# mean_squared_error is deprecated and removed in newer scikit-learn
# releases, whereas this form works on every version.
rmseTrain = np.sqrt(mean_squared_error(y_train, predTrain))
rmseTest = np.sqrt(mean_squared_error(y_test, predTest))

print(f'RMSE Treino: {rmseTrain}')
print(f'RMSE Teste: {rmseTest}')

# Bias: mean signed error on the test set.
bias = np.mean(y_test - predTest)
# NOTE: despite its name, this is the train/test RMSE gap (a generalization
# gap), not a statistical variance. The name and printed label are kept so
# the surrounding notebook stays consistent.
variance = rmseTest - rmseTrain

print(f"Viés: {bias}")
print(f"Variância: {variance}")


RMSE Treino: 1.602082143499235e-08
RMSE Teste: 0.06181305894166579
Viés: 0.011512772806170895
Variância: 0.06181304292084436


### Resultados do Modelo KNN

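- **RMSE Treino:** 1.602082143499235e-08
  - O valor extremamente baixo indica que o modelo está perfeitamente ajustado aos dados de treino, sugerindo possível ``overfitting``. Vale notar que, com ``weights='distance'``, esse resultado é esperado: cada ponto de treino é seu próprio vizinho mais próximo (distância zero), de modo que o RMSE de treino, isoladamente, diz pouco sobre a qualidade do modelo.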

- **RMSE Teste:** 0.06181305894166579
  - O valor baixo indica que o modelo tem um bom desempenho nos dados de teste, sugerindo boa capacidade de generalização.

- **Viés:** 0.011512772806170895
  - O viés próximo de zero indica que as predições do modelo estão muito próximas dos valores reais em média, sinalizando boas previsões e não ``underfit``.

- **Variância:** 0.06181304292084436
  - A pequena diferença entre o RMSE de teste e o de treino (aqui chamada de "variância") sugere que o modelo não está sofrendo ``overfitting`` e está generalizando bem.


Esses resultados indicam que o modelo KNN está bem ajustado e generalizando bem para novos dados. Irei testar mais coisas sobre o modelo.


In [46]:
from sklearn.model_selection import cross_val_score
# Reminder of how the KNN was defined earlier:
# knn = KNeighborsRegressor(n_neighbors=7, weights='distance', metric='minkowski')

# 10-fold cross-validation over the full X / y, scored with negated MSE.
# NOTE(review): an integer `cv` presumably yields contiguous, unshuffled
# folds; if the rows are ordered, confirm whether a shuffled KFold would be
# more appropriate.
cv_scores = cross_val_score(
    knn,
    X,
    y,
    cv=10,
    scoring='neg_mean_squared_error',
)

# Flip the sign back to a positive MSE, then take the root to get RMSE.
mse_scores = np.negative(cv_scores)
rmse_scores = np.sqrt(mse_scores)

# Per-fold values followed by their mean and standard deviation.
print(f"RMSE scores para cada fold:  {rmse_scores}")
print(f"Média do RMSE:  {np.mean(rmse_scores)}")
print(f"Desvio Padrão do RMSE:  {np.std(rmse_scores)}")


RMSE scores para cada fold:  [0.49109208 0.07092867 0.47831867 0.26433819 0.18547952 0.1936721
 0.19713423 0.44453803 0.40979522 0.88968121]
Média do RMSE:  0.3624977907530027
Desvio Padrão do RMSE:  0.22330954204064926


### Interpretação dos Resultados

1. **Variabilidade dos RMSEs:**
   - Há uma certa variabilidade nos RMSEs das diferentes folds, variando de aproximadamente 0.07 a 0.89. Isso indica que o desempenho do modelo pode ser sensível a diferentes subconjuntos dos dados.

2. **Média do RMSE:**
   - A média do RMSE é 0.3624977907530027. Este é um valor relativamente baixo, sugerindo que o modelo tem um desempenho médio bom nos diferentes subconjuntos de dados, embora seja bem maior que o RMSE de aproximadamente 0.062 obtido no conjunto de teste.

3. **Desvio Padrão do RMSE:**
   - O desvio padrão de 0.22330954204064926 sugere que há uma moderada variabilidade no desempenho do modelo. Idealmente, gostaríamos de um desvio padrão mais baixo para garantir que o modelo é consistentemente bom em todos os subconjuntos dos dados.
4. **Testar mais!**

### Tunar outros hiperparâmetros do modelo

In [48]:
# Extended KNN search space: 30 * 2 * 3 * 4 * 5 = 3600 combinations.
param_grid = {
    'n_neighbors': range(1, 31),
    'weights': ['uniform', 'distance'],
    # NOTE(review): 'minkowski' with the default p presumably duplicates
    # 'euclidean' -- confirm before keeping both.
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    # NOTE(review): 'algorithm' and 'leaf_size' look like neighbor-search
    # settings that affect speed rather than predictions -- confirm whether
    # they need to be part of the grid.
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30, 40, 50]
}

# Search the grid with the notebook's helper, passing 'r2' as its last
# argument; the result is indexed with 'params' below.
best_params = best_params_grid(KNeighborsRegressor(), param_grid, scores, 'r2')

# Hand the selected parameters to the notebook's logging helper under the
# 'KNN regressor' label.
log_data(KNeighborsRegressor, best_params['params'], 'KNN regressor', 'KNN regressor')