In [1]:
import pandas as pd
import numpy as np

# Machine learning management
import mlflow
from mlflow.models.signature import infer_signature

# Data visualization
import matplotlib.pyplot as plt
from yellowbrick.regressor import residuals_plot, prediction_error

# Bibliotecas para usar Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

# Seleção e validação de modelos
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV

# Pré-processamento para incluir no pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler

# Modelos
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


# Keep the output of scikit-learn transformers as pandas DataFrames.
# This is a global scikit-learn setting, applied here once for the whole notebook.
from sklearn import set_config
set_config(transform_output="pandas")



In [6]:
def get_metrics(y_true, y_pred):
    """Compute the regression evaluation metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Values predicted by the model for the same samples.

    Returns
    -------
    dict
        Mapping with the keys ``'R2'``, ``'MAE'``, ``'MAPE'`` and ``'RMSE'``,
        each holding the corresponding score computed by ``sklearn.metrics``
        (RMSE is the square root of ``mean_squared_error``).
    """
    r2 = metrics.r2_score(y_true, y_pred)
    mae = metrics.mean_absolute_error(y_true, y_pred)
    mape = metrics.mean_absolute_percentage_error(y_true, y_pred)
    # RMSE is derived from the MSE so the error is in the target's own units
    rmse = np.sqrt(metrics.mean_squared_error(y_true, y_pred))

    return {'R2': r2, 'MAE': mae, 'MAPE': mape, 'RMSE': rmse}

In [7]:
# Load the insurance dataset from the project's data folder
df = pd.read_csv('../data/insurance.csv')

# Preview the first ten rows
df.head(n=10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


# Modeling

In [8]:
# Split the model inputs by type so each group can get its own preprocessing
numerical_features = ['age', 'bmi', 'children']
categorical_features = ['smoker']

# Full list of input columns (numerical first, then categorical) and the column to predict
features = [*numerical_features, *categorical_features]
target = 'charges'

In [9]:
# Build the feature matrix and the target vector from the loaded frame
x, y = df[features], df[target]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2024,
)

In [11]:
# Numerical columns: fill missing values with the column mean, then rescale with MinMaxScaler
numerical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', MinMaxScaler()),
    ]
)

# Categorical columns: dense one-hot encoding, using the 'if_binary' drop policy
categorical_transformer = OneHotEncoder(sparse_output=False, drop='if_binary')

# Route each feature group to its own transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Display the assembled preprocessing step as the cell output
preprocessor