In [1]:
import joblib
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression, ElasticNet, HuberRegressor
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

### Carregar Dados

In [2]:
# Load the cleaned health-cost dataset.
# Forward slashes are used so the relative path resolves on Windows, Linux and macOS;
# a backslash-separated path is only understood on Windows.
df_costs = pd.read_csv('./datasets/healthcosts_cleaned.csv')

In [3]:
# Preview the first ten rows to sanity-check the load.
df_costs.head(n=10)

Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
0,19,female,27.9,0,1,southwest,16884.924
1,18,male,33.77,1,0,southeast,1725.5523
2,28,male,33.0,3,0,southeast,4449.462
3,33,male,22.705,0,0,northwest,21984.47061
4,32,male,28.88,0,0,northwest,3866.8552
5,31,female,25.74,0,0,southeast,3756.6216
6,46,female,33.44,1,0,southeast,8240.5896
7,37,female,27.74,3,0,northwest,7281.5056
8,37,male,29.83,2,0,northeast,6406.4107
9,60,female,25.84,0,0,northwest,28923.13692


In [4]:
# Summarize column dtypes and non-null counts.
df_costs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              1338 non-null   int64  
 1   sex              1338 non-null   object 
 2   bmi              1338 non-null   float64
 3   children         1338 non-null   int64  
 4   smoker           1338 non-null   int64  
 5   region           1338 non-null   object 
 6   medical charges  1338 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 73.3+ KB


### Preparação dos dados

In [5]:
# Separate the regression target from the predictor columns.
target_column = 'medical charges'
y = df_costs[target_column]
X = df_costs.drop(columns=[target_column])

In [6]:
# Load the persisted preprocessing object (joblib is imported in the top import cell).
# NOTE: joblib.load unpickles arbitrary objects, so only load artifacts from a trusted source.
# NOTE(review): the loaded object is re-fit on the training split a few cells below, so any
# fitted state stored in the file is discarded; confirm that is intended.
preprocessor = joblib.load('preprocessor_dataset_healthcosts.pkl')

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=51,
)

In [8]:
# Fit the preprocessor on the training split only, then reuse that fit for the test split.
# NOTE: this overwrites X_train / X_test with their transformed versions, so the cell
# must not be re-run without re-running the split cell first.
X_train, X_test = (
    preprocessor.fit_transform(X_train),
    preprocessor.transform(X_test),
)

In [9]:
# Shapes of the transformed train and test matrices, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(1070, 10)
(268, 10)


### Treinamento do Modelo Stacking

In [26]:
# Build the stacking regressor.

# Base learners
lr_model = LinearRegression()
elastic_model = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=51)
tree_model = DecisionTreeRegressor(random_state=51)

# Meta-learner that combines the base learners' outputs
huber_model = HuberRegressor()

# The names given here are the keys later used to look the fitted models up.
base_estimators = [
    ('linear regression', lr_model),
    ('elastic', elastic_model),
    ('decision tree', tree_model),
]

# passthrough=False -> the meta-learner sees only the base learners' predictions
# passthrough=True  -> it additionally receives the original training features
stacking_model = StackingRegressor(
    estimators=base_estimators,
    final_estimator=huber_model,
    passthrough=True,
)

In [27]:
# Train the base learners and the meta-learner on the preprocessed training split.
stacking_model.fit(X=X_train, y=y_train)

0,1,2
,estimators,"[('linear regression', ...), ('elastic', ...), ...]"
,final_estimator,HuberRegressor()
,cv,
,n_jobs,
,passthrough,True
,verbose,0

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False

0,1,2
,alpha,1.0
,l1_ratio,0.5
,fit_intercept,True
,precompute,False
,max_iter,1000
,copy_X,True
,tol,0.0001
,warm_start,False
,positive,False
,random_state,51

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,51
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,epsilon,1.35
,max_iter,100
,alpha,0.0001
,warm_start,False
,fit_intercept,True
,tol,1e-05


### Análise dos resultados

In [28]:
# Predict on the held-out test split.
y_pred = stacking_model.predict(X=X_test)

In [29]:
# Evaluate the predictions: RMSE (in target units) and the coefficient of determination.
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [30]:
# Report both evaluation metrics, one per line.
metrics_report = f'RMSE: {rmse}\nR2_Score: {r2}'
print(metrics_report)

RMSE: 6641.236391850827
R2_Score: 0.7463459307916287


In [31]:
# Collect one importance vector per fitted base estimator.
# Linear models contribute |coef_| (expressed in target units) while trees contribute
# feature_importances_ (which sum to 1). Each vector is therefore rescaled to sum to 1
# before being stored; otherwise a later average across estimators would be dominated
# by the coefficient magnitudes.
importances = []

for estimator in stacking_model.estimators_:
    if hasattr(estimator, 'coef_'):
        raw_importance = np.abs(estimator.coef_)
        print(f'Coeficiente do modelo {type(estimator).__name__}')
    elif hasattr(estimator, 'feature_importances_'):
        raw_importance = np.abs(estimator.feature_importances_)
        print(f'Feature importances do modelo {type(estimator).__name__}')
    else:
        # No importance-like attribute available: report it and skip this estimator.
        print(f'Não foi possivel calcular a importancia para {type(estimator).__name__}')
        continue

    total = raw_importance.sum()
    # Guard against an all-zero vector (e.g. every coefficient shrunk to zero).
    importances.append(raw_importance / total if total > 0 else raw_importance)

Coeficiente do modelo LinearRegression
Coeficiente do modelo ElasticNet
Feature importances do modelo DecisionTreeRegressor


In [32]:
# Average the per-estimator importance vectors feature by feature.
importance_media = np.asarray(importances).mean(axis=0)

In [33]:
# Rescale the averaged importances so that they sum to 1.
feature_impotance = importance_media / importance_media.sum()

In [34]:
# Names of the columns produced by the preprocessor, used to label the importances.
feature_name = preprocessor.get_feature_names_out()

In [35]:
# Pair each feature name with its normalized importance (shares summing to 1),
# not the raw averaged values.
importance_df = pd.DataFrame({'Feature': feature_name, 'Importance': feature_impotance})

In [36]:
# Order rows from least to most important (ascending is the default sort order).
importance_df = importance_df.sort_values('Importance')

In [37]:
# Horizontal bar chart with one bar per feature.
fig = px.bar(
    importance_df,
    x='Importance',
    y='Feature',
    orientation='h',
    title='Importancia das Features',
)
fig.show()


### Propriedades do modelo

In [38]:
# Illustrate how the stacked prediction relates to each base learner's prediction.

# Take a single test row, reshaped to 2-D (one row, all features) for predict().
X_sample = X_test[7].reshape(1, -1)

# Fitted base learners, keyed by the names given when the stack was defined.
fitted_base_models = stacking_model.named_estimators_
linear_pred = fitted_base_models['linear regression'].predict(X_sample)
elastic_pred = fitted_base_models['elastic'].predict(X_sample)
tree_pred = fitted_base_models['decision tree'].predict(X_sample)

# Prediction of the full stack for the same row.
stacking_pred = stacking_model.predict(X_sample)

In [39]:
# Print each base learner's prediction followed by the stacked prediction, one per line.
print(
    f"Predição da Regressao Linear: {linear_pred[0]}",
    f"Predição do ElasticNET: {elastic_pred[0]}",
    f"Predição da Decision Tree: {tree_pred[0]}",
    f"Predição final da Stacking: {stacking_pred[0]}",
    sep="\n",
)

Predição da Regressao Linear: 14793.194288532948
Predição do ElasticNET: 13782.690606341423
Predição da Decision Tree: 11856.4115
Predição final da Stacking: 12191.715401547359
