In [1]:
# EDA
import pandas as pd
import numpy as np
import plotly.express as px


# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import root_mean_squared_error, r2_score

### Carregar os dados

In [2]:
# Carregar os dados já tratados
df_costs = pd.read_csv('./datasets/healthcosts_cleaned.csv')

In [3]:
# Mostrar as primeiras linhas do DataFrame
print(df_costs.head(10))

   age     sex     bmi  children  smoker     region  medical charges
0   19  female  27.900         0       1  southwest      16884.92400
1   18    male  33.770         1       0  southeast       1725.55230
2   28    male  33.000         3       0  southeast       4449.46200
3   33    male  22.705         0       0  northwest      21984.47061
4   32    male  28.880         0       0  northwest       3866.85520
5   31  female  25.740         0       0  southeast       3756.62160
6   46  female  33.440         1       0  southeast       8240.58960
7   37  female  27.740         3       0  northwest       7281.50560
8   37    male  29.830         2       0  northeast       6406.41070
9   60  female  25.840         0       0  northwest      28923.13692


In [4]:
# Mostrar as ultimas linhas do DataFrame
print(df_costs.tail(10))

      age     sex     bmi  children  smoker     region  medical charges
1328   23  female  24.225         2       0  northeast      22395.74424
1329   52    male  38.600         2       0  southwest      10325.20600
1330   57  female  25.740         2       0  southeast      12629.16560
1331   23  female  33.400         0       0  southwest      10795.93733
1332   52  female  44.700         3       0  southwest      11411.68500
1333   50    male  30.970         3       0  northwest      10600.54830
1334   18  female  31.920         0       0  northeast       2205.98080
1335   18  female  36.850         0       0  southeast       1629.83350
1336   21  female  25.800         0       0  southwest       2007.94500
1337   61  female  29.070         0       1  northwest      29141.36030


In [5]:
# Mostrar a estrutura do DataFrame
df_costs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              1338 non-null   int64  
 1   sex              1338 non-null   object 
 2   bmi              1338 non-null   float64
 3   children         1338 non-null   int64  
 4   smoker           1338 non-null   int64  
 5   region           1338 non-null   object 
 6   medical charges  1338 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 73.3+ KB


### Preparação dos dados

In [6]:
# Preparar os dados para o modelo
X = df_costs.drop(columns=['medical charges'])
y = df_costs['medical charges']

In [7]:
# Carregar o preprocessor
import joblib
preprocessor = joblib.load('./preprocessor_dataset_healthcosts.pkl')

In [8]:
# Dividir os dados em conjuntos de treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)


In [9]:
# Aplicar o preprocessor nos dados de treino e teste
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [10]:
# Mostrar os conjuntos de treino e teste
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

X_train shape: (1070, 10)
X_test shape: (268, 10)


### Treinamento do modelo

In [11]:
# Criar o objeto de VotingRegressor com os modelos

lr_model = LinearRegression()
elastic_model = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=51)
tree_model = DecisionTreeRegressor(random_state=51)

voting_model = VotingRegressor(
    estimators=[
        ('linear regression', lr_model),
        ('elastic', elastic_model),
        ('tree', tree_model)
        ]
    )

In [12]:
# Treinar o modelo
voting_model.fit(X_train, y_train)

### Analise dos resultados

In [13]:
# Realizar predição
y_pred = voting_model.predict(X_test)

In [14]:
# Mostrar y_pred
print("Predições:", y_pred)

Predições: [ 8.70298366e+03  3.92616140e+04  1.03199686e+04  1.63330046e+04
  3.55685270e+04  1.08521141e+04  1.00002749e+04  1.37369099e+04
  5.84233422e+03  1.04088577e+04  9.45267433e+03  1.19137710e+04
  8.91146096e+03  3.58422703e+03  5.42039922e+03  1.27670445e+04
  4.66868395e+03  5.82905352e+03  2.25029762e+04  2.52467177e+04
  8.28512431e+03  7.93711136e+03  4.08881126e+04  1.24583875e+04
  6.15701494e+03  1.56687448e+04  1.09067860e+04  2.44015636e+03
  2.64587032e+04  1.32800145e+04  3.23803328e+03  2.71628725e+04
  4.43514694e+03  4.18960688e+03  7.67108797e+03  1.70238183e+04
  1.09868511e+04  2.18562345e+03  1.19929111e+04  7.96264873e+03
  1.06412920e+04  1.37320100e+03  5.28883689e+03  2.28571057e+03
  7.80236533e+03  1.41458732e+04  1.38237880e+04  3.66664046e+04
  8.59226277e+03  1.26776265e+04  5.58626766e+03  3.33192484e+04
  7.97877022e+03  4.24557096e+04  1.06814501e+04  2.97544713e+04
  9.23585085e+03  1.12360639e+04  1.25058183e+04  7.54110546e+03
  8.98530672e+

In [15]:
# Avaliar metricas do modelo
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


In [16]:
# Mostrar o Erro e o R2
print(f"RMSE: {rmse:.2f}")
print(f"R2: {r2:.2f}")

RMSE: 6058.57
R2: 0.79


In [17]:
# Calcular a importancia das variáveis

importances = []

for estimador in voting_model.estimators_:
    if hasattr(estimador, 'coef_'):
        importances.append(np.abs(estimador.coef_))
    elif hasattr(estimador, 'feature_importances_'):
        importances.append(estimador.feature_importances_)
    else:
        print(f"Modelo {type(estimador)._name_} não possui coeficientes ou importâncias de recursos.")

In [18]:
importances[2].shape

(10,)

In [19]:
# Calccular a media das importancias
importancia_media = np.mean(importances, axis=0)

In [20]:
# Normalizar as importâncias
feature_importances = importancia_media / np.sum(importancia_media)

In [21]:
# Obter os nomes das features
feature_names = preprocessor.get_feature_names_out()

In [22]:
# Criar um DataFrame com as importâncias
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importances
})

In [23]:
# Ordenar o DataFrame por importância
importance_df = importance_df.sort_values(by='importance', ascending=True)

In [24]:
# Criar um grafico para mostrar as importâncias
fig = px.bar(
    importance_df,
    x='importance',
    y='feature',
    orientation='h',
    title='Importância das Variáveis no Modelo de Regressão'
)

fig.show()

In [25]:
# Mostrar evidencia do Hard Voting 

# Selecionar um registro para predição
X_sample = X_test[7].reshape(1, -1)

In [26]:
# Predições individuais dos modelos
linear_pred = voting_model.named_estimators_['linear regression'].predict(X_sample)
elastic_pred = voting_model.named_estimators_['elastic'].predict(X_sample)
tree_pred = voting_model.named_estimators_['tree'].predict(X_sample)
# Predição do modelo de VotingRegressor
voting_pred = voting_model.predict(X_sample)

In [27]:
# Media das predições dos estimadores
mean_pred = np.mean([linear_pred[0], elastic_pred[0], tree_pred[0]])

In [30]:
# Exibir os resultados
print(f"Predição do modelo de Regressão Linear: {linear_pred[0]}")
print(f"Predição do modelo ElasticNet: {elastic_pred[0]}")
print(f"Predição do modelo de Árvore de Decisão: {tree_pred[0]}")
print(f"Média das predições dos estimadores: {mean_pred}")
print(f'Predição final do Hard Voting: {voting_pred[0]}')

Predição do modelo de Regressão Linear: 14793.194288532948
Predição do modelo ElasticNet: 14561.124012679351
Predição do modelo de Árvore de Decisão: 11856.4115
Média das predições dos estimadores: 13736.909933737434
Predição final do Hard Voting: 13736.909933737434
