In [42]:
#Data wrangling
import numpy as np
import pandas as pd

#seleção e validação de modelos
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Pré-processamento para incluir no pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Pipelines para tratar dados
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

# Modelos
from sklearn.linear_model import LinearRegression

# Keep the output of scikit-learn transformers as pandas DataFrames
from sklearn import set_config
set_config(transform_output="pandas")



In [5]:
# Location of the insurance CSV (raw file hosted on GitHub).
URL_DATA = 'https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/insurance.csv'

# Download and parse the file; the bare name on the last line renders the frame.
df = pd.read_csv(filepath_or_buffer=URL_DATA)
df


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [50]:
# Hand-made "future" rows used to exercise the preprocessing: they contain
# missing values (age, sex, smoker) and a region value ('north') that the
# later error message reports as unseen at fit time.
df_futuro = pd.DataFrame(
    data=dict(
        age=[37, np.nan, 58, 21, 43],
        sex=['male', 'female', np.nan, 'female', 'female'],
        bmi=[46.53, 32.395, 28.595, 21.89, 24.7],
        children=[3, 1, 0, 2, 2],
        smoker=['no', np.nan, 'no', 'no', 'yes'],
        region=['southeast', 'northeast', 'northwest', 'southeast', 'north'],
    )
)

df_futuro

Unnamed: 0,age,sex,bmi,children,smoker,region
0,37.0,male,46.53,3,no,southeast
1,,female,32.395,1,,northeast
2,58.0,,28.595,0,no,northwest
3,21.0,female,21.89,2,no,southeast
4,43.0,female,24.7,2,yes,north


In [9]:
# Plain linear regression with default settings; this same estimator object is
# fitted (and later refitted) by the cells below.
lr = LinearRegression()

In [12]:
# The target is the `charges` column; every remaining column is a feature.
y = df['charges']
X = df.drop(columns='charges')

# Hold-out split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2023,
)

In [17]:
# First attempt at encoding: let pandas expand the categorical columns into
# dummy indicator columns, then preview the first rows.
X_train_transformed = pd.get_dummies(data=X_train)
X_train_transformed.head(n=5)

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
632,29,35.53,0,1,0,1,0,0,0,1,0
895,61,44.0,0,1,0,1,0,0,0,0,1
1173,38,29.26,2,0,1,1,0,0,1,0,0
1131,27,45.9,2,0,1,1,0,0,0,0,1
363,21,26.4,1,1,0,1,0,0,0,0,1


In [18]:
# Fit the linear regression on the dummy-encoded training features.
lr.fit(X_train_transformed, y_train)

In [19]:
# In-sample predictions: the model is applied to the same rows it was fitted on.
lr.predict(X_train_transformed)

array([ 6297.81640554, 17604.53144584,  8564.16097047, ...,
        8171.64062054, 11728.69287826,  1928.89483764])

In [20]:
# Naive approach: dummy-encode the future rows independently of the training
# data. The resulting columns depend on whichever categories happen to appear
# in df_futuro, so they do not line up with the columns the model was fitted on.
df_futuro_transformed = pd.get_dummies(df_futuro)

try:
    future_predictions = lr.predict(df_futuro_transformed)
except ValueError as exc:
    # The failure is the point of this cell: show the message without raising,
    # so the notebook still survives "Restart Kernel -> Run All".
    print(f"{type(exc).__name__}: {exc}")
else:
    print(future_predictions)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- region_north
Feature names seen at fit time, yet now missing:
- region_southwest


In [21]:
# Look at the raw (un-encoded) training features again before trying OneHotEncoder.
X_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
632,29,female,35.53,0,no,southeast
895,61,female,44.0,0,no,southwest
1173,38,male,29.26,2,no,northwest
1131,27,male,45.9,2,no,southwest
363,21,female,26.4,1,no,southwest


In [22]:
# Encoder settings: dense output, binary features collapsed to one column,
# and categories unseen during fit ignored rather than raising.
onehot = OneHotEncoder(
    sparse_output=False,
    handle_unknown='ignore',
    drop='if_binary',
)
# Fitted on every column of X_train here, numeric columns included.
onehot.fit(X=X_train)

In [24]:
# Encode the future rows with the encoder fitted on all columns. As the output
# shows, numeric columns such as age and children are expanded per distinct
# value as well, which motivates restricting the encoder to categorical columns.
onehot.transform(df_futuro)



Unnamed: 0,age_18,age_19,age_20,age_21,age_22,age_23,age_24,age_25,age_26,age_27,...,children_1,children_2,children_3,children_4,children_5,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [27]:
# Apply one-hot encoding to the categorical columns only
categorical_columns = ['sex', 'smoker', 'region']

# Refit the encoder on the categorical subset, encode it, then attach the
# untouched remaining columns back (join aligns on the row index).
X_train_transformed = (
    onehot
    .fit(X_train[categorical_columns])
    .transform(X_train[categorical_columns])
    .join(X_train.drop(columns=categorical_columns))
)
X_train_transformed

Unnamed: 0,sex_male,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest,age,bmi,children
632,0.0,0.0,0.0,0.0,1.0,0.0,29,35.530,0
895,0.0,0.0,0.0,0.0,0.0,1.0,61,44.000,0
1173,1.0,0.0,0.0,1.0,0.0,0.0,38,29.260,2
1131,1.0,0.0,0.0,0.0,0.0,1.0,27,45.900,2
363,0.0,0.0,0.0,0.0,0.0,1.0,21,26.400,1
...,...,...,...,...,...,...,...,...,...
884,1.0,0.0,0.0,1.0,0.0,0.0,25,26.695,4
515,1.0,0.0,0.0,0.0,0.0,1.0,58,35.700,0
695,0.0,0.0,0.0,1.0,0.0,0.0,26,40.185,0
454,1.0,0.0,0.0,0.0,1.0,0.0,32,46.530,2


In [34]:
# Same recipe for the future rows, reusing the encoder already fitted on the
# training data: encode the categorical columns and attach the remaining ones.
df_futuro_transformed = (
    onehot
    .transform(df_futuro[categorical_columns])
    .join(df_futuro.drop(columns=categorical_columns))
)
df_futuro_transformed



Unnamed: 0,sex_male,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest,age,bmi,children
0,1.0,0.0,0.0,0.0,1.0,0.0,37.0,46.53,3
1,0.0,0.0,1.0,0.0,0.0,0.0,,32.395,1
2,0.0,0.0,0.0,1.0,0.0,0.0,58.0,28.595,0
3,0.0,0.0,0.0,0.0,1.0,0.0,21.0,21.89,2
4,0.0,1.0,0.0,0.0,0.0,0.0,43.0,24.7,2


In [35]:
# Refit the same estimator, now on the OneHotEncoder-based training features.
lr.fit(X_train_transformed, y_train)

In [36]:
# In-sample predictions from the refitted model.
lr.predict(X_train_transformed)

array([ 6297.81640554, 17604.53144584,  8564.16097047, ...,
        8171.64062054, 11728.69287826,  1928.89483764])

In [40]:
# Discard every future row that still has a missing value in any column.
# NOTE: this overwrites the frame displayed by the previous cell.
df_futuro_transformed = df_futuro_transformed.dropna(axis=0, how='any')


In [41]:
# Predict charges for the future rows that remain after dropping rows with NaN.
lr.predict(df_futuro_transformed)

array([13453.33806874, 12689.80988463,   391.53496421, 31451.96072786])

# Criando Pipeline

In [43]:
# Fresh, unfitted encoder with the same settings as before.
onehot = OneHotEncoder(
    drop='if_binary',
    handle_unknown='ignore',
    sparse_output=False,
)

# Route each group of columns to its own transformer: one-hot encoding for the
# categorical columns, standard scaling for the numeric ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', onehot, ['sex', 'smoker', 'region']),
        ('scale', StandardScaler(), ['age', 'bmi', 'children']),
    ],
)

preprocessor

In [44]:
# Fit the column-wise preprocessing on the training split.
preprocessor.fit(X_train, y_train)

In [47]:
# Apply the fitted preprocessing to the future rows. No imputation happens at
# this stage, so a missing age stays NaN in the output (see scale__age).
preprocessor.transform(df_futuro)



Unnamed: 0,onehot__sex_male,onehot__smoker_yes,onehot__region_northeast,onehot__region_northwest,onehot__region_southeast,onehot__region_southwest,scale__age,scale__bmi,scale__children
0,1.0,0.0,0.0,0.0,1.0,0.0,-0.163187,2.571937,1.629218
1,0.0,0.0,1.0,0.0,0.0,0.0,,0.274302,-0.055427
2,0.0,0.0,0.0,1.0,0.0,0.0,1.336468,-0.343386,-0.89775
3,0.0,0.0,0.0,0.0,1.0,0.0,-1.305781,-1.433279,0.786896
4,0.0,1.0,0.0,0.0,0.0,0.0,0.265286,-0.976516,0.786896


# Pipeline Pré-Processamento

In [51]:
# Numeric branch: fill missing values with the column mean, then standardize.
numeric_processor = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
)

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode.
onehot = OneHotEncoder(
    drop='if_binary',
    handle_unknown='ignore',
    sparse_output=False,
)
categorical_processor = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    onehot,
)

# Send each group of columns through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', categorical_processor, ['sex', 'smoker', 'region']),
        ('scale', numeric_processor, ['age', 'bmi', 'children']),
    ],
)

# End-to-end model: preprocessing followed by the linear regression.
model_pipiline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('lr_model', LinearRegression()),
    ]
)

model_pipiline.fit(X_train, y_train)

In [49]:
# Predict on the held-out split; the pipeline applies its preprocessing to the
# raw feature frame before the regression step.
model_pipiline.predict(X_test)

array([ 1.1744e+04,  1.1520e+04,  2.6912e+04,  6.8480e+03,  7.9040e+03,
        4.8000e+03,  7.0720e+03,  2.1760e+03,  1.1968e+04,  3.3344e+04,
        1.2224e+04,  3.3568e+04,  4.0320e+04,  3.2224e+04,  7.2000e+03,
        4.0032e+04,  1.0208e+04,  1.1328e+04,  4.2240e+03,  3.0400e+03,
        2.7200e+04,  2.4960e+03,  5.9840e+03,  2.2080e+03,  2.3360e+03,
        9.1520e+03,  2.9376e+04,  1.1360e+04,  3.7504e+04,  1.3984e+04,
        8.6400e+03,  9.6000e+01,  2.6112e+04,  4.8320e+03,  1.1328e+04,
        3.4112e+04,  9.8240e+03,  3.0496e+04,  1.1904e+04,  8.2880e+03,
        8.8320e+03,  8.0000e+03,  2.1120e+03,  7.5840e+03,  3.0880e+04,
        1.2544e+04,  8.6080e+03,  1.1008e+04,  5.7280e+03,  1.2800e+04,
        4.0640e+03,  2.5856e+04,  3.7280e+04,  1.2960e+04,  1.0496e+04,
        2.6624e+04,  5.6640e+03,  1.3920e+04, -3.2000e+02,  9.8560e+03,
        3.5520e+03,  1.0336e+04,  1.1200e+04,  1.3856e+04,  7.2640e+03,
        3.7888e+04,  1.1168e+04,  2.6784e+04,  7.6480e+03,  1.30