## Importando bibliotecas python

In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRFRegressor

import mlflow
from mlflow.models.signature import infer_signature

## Lendo os dados

In [35]:
csv_path = "../data/processed/casas.csv"

df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,tamanho,ano,garagem,preco
0,159.0,2003,2,208500
1,117.0,1976,2,181500
2,166.0,2001,2,223500
3,160.0,1915,3,140000
4,204.0,2000,3,250000


In [40]:
X = df.drop(columns="preco")
y = df.preco

print(f"O shape dos dados inicialmente: \n - X: {X.shape} \n - y: {y.shape}")
print("\n")

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.2)
print(f"O shape dos dados de treino: \n - X: {X_train.shape} \n - y: {y_train.shape}")
print("\n")
print(f"O shape dos dados de teste : \n - X: {X_test.shape} \n - y: {y_test.shape}")

O shape dos dados inicialmente: 
 - X: (1460, 3) 
 - y: (1460,)


O shape dos dados de treino: 
 - X: (1168, 3) 
 - y: (1168,)


O shape dos dados de teste : 
 - X: (292, 3) 
 - y: (292,)


## Aplicando Modelo de Linear Regression

O objetivo aqui será criar um modelo simples para termos um base line.

In [41]:
mlflow.set_experiment("preco-casas-eda")

<Experiment: artifact_location='file:///home/carlos/Documentos/programas/Alura_/CD4ML/mlflow/notebooks/mlruns/767021303406655053', creation_time=1746796522623, experiment_id='767021303406655053', last_update_time=1746796522623, lifecycle_stage='active', name='preco-casas-eda', tags={}>

#### Definindo uma funcao para calcular as metricas de avaliacao do modelo

In [42]:
def metricas_regressao(y_true,y_pred):
    """retorna e printa as metricas de avaliacao para um modelo de regressao

    Args:
        y_true: Valores reais de y
        y_pred: Valores preditos de y

    Returns:
        mae, mse, rmse, r_2: mean_absolute_error, mean_squared_error, root_mean_squared_error, r2_score
    """
    mae = mean_absolute_error(y_true,y_pred)
    mse = mean_squared_error(y_true,y_pred)
    rmse = (mse)**(1/2)
    r_2 = r2_score(y_true,y_pred)
    
    print(f"Erro Medio Absoluto: {mae:.3f}")
    print(f"Erro Quadratico Medio: {mse:.3f}")
    print(f"Raiz do Erro Quadratico Medio: {rmse:.3f}")
    print(f"R quadrado: {r_2:.3f}")
    return mae, mse, rmse, r_2

In [43]:
import mlflow.sklearn


with mlflow.start_run():
    lin_reg = LinearRegression()
    lin_reg.fit(X_train, y_train)
    y_pred = lin_reg.predict(X_test)
    
    sig_pred = lin_reg.predict(X_train[:5])
    signature = infer_signature(X_train[:5], sig_pred)
    
    mlflow.sklearn.log_model(lin_reg, 
                             'lin_reg',
                             signature=signature,
                             input_example=X_train[:5])
    
    mae, mse, rmse, r_2 = metricas_regressao(y_test,y_pred)
    mlflow.log_metric('mae',mae)
    mlflow.log_metric('mse',mse)
    mlflow.log_metric('rmse',rmse)
    mlflow.log_metric('r_2',r_2)
    




Erro Medio Absoluto: 30303.704
Erro Quadratico Medio: 1791302335.476
Raiz do Erro Quadratico Medio: 42323.780
R quadrado: 0.680


#### Aplicando o XGBoost

In [44]:
xgb_params = {
    "learning_rate":0.2,
    "n_estimators": 150,
    "random_state": 42
}

with mlflow.start_run():    
    xgb = XGBRFRegressor(**xgb_params)
    xgb.fit(X_train, y_train)
    xgb_pred = xgb.predict(X_test)
    
    sig_pred = xgb.predict(X_train[:5])
    signature = infer_signature(X_train[:5],sig_pred)
    
    mlflow.xgboost.log_model(xgb,
                             'xgboost',
                             signature=signature,
                             input_example=X_train[:5])
    
    mae, mse, rmse, r_2 = metricas_regressao(y_test, xgb_pred)
    mlflow.log_metric('mae',mae)
    mlflow.log_metric('mse',mse)
    mlflow.log_metric('rmse',rmse)
    mlflow.log_metric('r_2',r_2)
    



Erro Medio Absoluto: 47098.590
Erro Quadratico Medio: 4046461952.000
Raiz do Erro Quadratico Medio: 63611.807
R quadrado: 0.276


In [48]:
type(X_train)

pandas.core.frame.DataFrame

In [23]:
xgb_pred = xgb.predict(X_test)

In [24]:
metricas_regressao(y_test,xgb_pred)

Erro Medio Absoluto: 27754.246
Erro Quadratico Medio: 1738928128.000
Raiz do Erro Quadratico Medio: 41700.457
R quadrado: 0.760


(27754.24609375, 1738928128.0, 41700.45716775776, 0.759798526763916)

### Mlflow tracking

Obeservando alguns dos modelos que foram treinados

In [23]:
experiment = mlflow.get_experiment_by_name('preco-casas-eda')

runs = mlflow.search_runs(experiment_ids=experiment.experiment_id)

In [26]:
top_models = runs.sort_values(by='metrics.rmse', ascending=True).head()

In [33]:
mlflow.get_run(top_models.run_id.values[0])

<Run: data=<RunData: metrics={'mae': 25877.19921875,
 'mse': 1436647296.0,
 'r_2': 0.7861969470977783,
 'rmse': 37903.130424807925}, params={}, tags={'mlflow.log-model.history': '[{"run_id": "be81a7483717428498b651672e4a0d55", '
                             '"artifact_path": "xgboost", "utc_time_created": '
                             '"2025-05-09 13:46:35.713058", "model_uuid": '
                             '"192b9d736fca4e84ad5dd6a4218309b2", "flavors": '
                             '{"python_function": {"loader_module": '
                             '"mlflow.xgboost", "python_version": "3.9.21", '
                             '"data": "model.xgb", "env": {"conda": '
                             '"conda.yaml", "virtualenv": "python_env.yaml"}}, '
                             '"xgboost": {"xgb_version": "2.1.4", "data": '
                             '"model.xgb", "model_class": '
                             '"xgboost.sklearn.XGBRFRegressor", '
                             '"mode