In [3]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.model_selection import train_test_split

# Modelos
from sklearn.linear_model import LinearRegression

# Pré-processamento para incluir no pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

#Manter saída do processamento do sklearn como DataFrame pandas
from sklearn import set_config
set_config(transform_output="pandas")


# Bibliotecas para usar Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline


In [4]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Deploy/Pipelines/data/insurance.csv')

In [5]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
# Informações que não temos, mas que queremos prever
# Nesse exemplo temos tb dados faltantes o que pode atrapalhar, mas vamos tratar
df_futuro = pd.DataFrame ({
    'age': [37, np.nan, 58,21,43],
    'sex': ['male', 'female', np.nan, 'female', 'female'],
    'bmi': [46.53, 32.395, 28.595, 21.89, 24.7],
    'children': [3, 1, 0, 2 ,2],
    'smoker': ['no', np.nan, 'no', 'no', 'yes'],
    'region': ['southeast', 'northeast', 'northwest', 'southeast', 'north']
})

df_futuro

Unnamed: 0,age,sex,bmi,children,smoker,region
0,37.0,male,46.53,3,no,southeast
1,,female,32.395,1,,northeast
2,58.0,,28.595,0,no,northwest
3,21.0,female,21.89,2,no,southeast
4,43.0,female,24.7,2,yes,north


In [7]:
lr = LinearRegression()

In [8]:
x =  df.drop('charges', axis=1)
y = df['charges']

x_train, x_test, y_train, y_test = train_test_split(x,y, random_state=2024)
x_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
149,19,male,28.4,1,no,southwest
98,56,male,19.95,0,yes,northeast
739,29,male,35.5,2,yes,southwest
707,49,male,28.69,3,no,northwest
572,30,female,43.12,2,no,southeast


In [9]:
x_train_transformed = pd.get_dummies(x_train)

In [10]:
lr.fit(x_train_transformed, y_train)

In [11]:
lr.predict(x_train_transformed)

array([ 1975.11614994, 33040.88767453, 31489.06063699, ...,
       10807.85904867, 12607.05732245,  5369.03507359])

In [12]:
df_futuro_transformed = pd.get_dummies(df_futuro)
lr.predict(df_futuro_transformed)

# Aqui foi forçado o erro em que ocorreu pois tivemos um valor de uma coluna que no treino não existia
# Dentro de região o norte não existia em treino e quando colocamos um df futuro o modelo não consegue fazer o processo pois aquilo não tinha sido visto antes

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- region_north
Feature names seen at fit time, yet now missing:
- region_southwest


In [13]:
# Ajustamdo para esse tipo de cenário - Dummies com o OneHotEncoder
# if_binary - Ignora colunas que tem apenas duas informações como sexo
onehot = OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
onehot.fit(x_train)

In [14]:
onehot.transform(df_futuro)
# Conseguimos lidar com as caracteristicas do futuro, mas isso deve ser apenas aplicadas nas colunas categoricas



Unnamed: 0,age_18,age_19,age_20,age_21,age_22,age_23,age_24,age_25,age_26,age_27,...,children_1,children_2,children_3,children_4,children_5,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [15]:
categorical_columns = ['sex', 'smoker', 'region']

onehot.fit(x_train[categorical_columns])
x_train_transformed = onehot.fit(x_train[categorical_columns])

x_train_transformed = x_train_transformed.transform(x_train[categorical_columns])

# Recuperando as numéricas - dando um concat com as núm
x_train_transformed.join(x_train.drop(columns=categorical_columns))

Unnamed: 0,sex_male,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest,age,bmi,children
149,1.0,0.0,0.0,0.0,0.0,1.0,19,28.40,1
98,1.0,1.0,1.0,0.0,0.0,0.0,56,19.95,0
739,1.0,1.0,0.0,0.0,0.0,1.0,29,35.50,2
707,1.0,0.0,0.0,1.0,0.0,0.0,49,28.69,3
572,0.0,0.0,0.0,0.0,1.0,0.0,30,43.12,2
...,...,...,...,...,...,...,...,...,...
183,0.0,0.0,0.0,1.0,0.0,0.0,44,26.41,0
446,1.0,0.0,1.0,0.0,0.0,0.0,60,29.64,0
539,1.0,0.0,0.0,0.0,1.0,0.0,53,31.35,0
640,1.0,0.0,0.0,0.0,0.0,1.0,33,42.40,5


In [16]:
df_futuro_transformed = onehot.transform(df_futuro[categorical_columns])
df_futuro_transformed = df_futuro_transformed.join(df_futuro.drop(columns=categorical_columns))
df_futuro_transformed




Unnamed: 0,sex_male,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest,age,bmi,children
0,1.0,0.0,0.0,0.0,1.0,0.0,37.0,46.53,3
1,0.0,0.0,1.0,0.0,0.0,0.0,,32.395,1
2,0.0,0.0,0.0,1.0,0.0,0.0,58.0,28.595,0
3,0.0,0.0,0.0,0.0,1.0,0.0,21.0,21.89,2
4,0.0,1.0,0.0,0.0,0.0,0.0,43.0,24.7,2


In [17]:
lr.fit(x_train_transformed, y_train)

In [18]:
lr.predict(x_train_transformed)

array([ 8512., 32768., 32064., ...,  8192.,  8512.,  9216.])

## Aplicando Pipeline

In [19]:
onehot = OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)


# Recebe uma lista de tuplas, que vai definir a ação
preprocessor = ColumnTransformer([
    ('onehot', onehot, ['sex', 'smoker', 'region']),
    ('scale', StandardScaler(), ['age', 'bmi', 'children'])
])

preprocessor

In [20]:
preprocessor.fit(x_train, y_train)

In [21]:
preprocessor.transform(df_futuro)



Unnamed: 0,onehot__sex_male,onehot__smoker_yes,onehot__region_northeast,onehot__region_northwest,onehot__region_southeast,onehot__region_southwest,scale__age,scale__bmi,scale__children
0,1.0,0.0,0.0,0.0,1.0,0.0,-0.163166,2.616058,1.580462
1,0.0,0.0,1.0,0.0,0.0,0.0,,0.293494,-0.08292
2,0.0,0.0,0.0,1.0,0.0,0.0,1.329137,-0.330896,-0.914611
3,0.0,0.0,0.0,0.0,1.0,0.0,-1.300159,-1.432615,0.748771
4,0.0,1.0,0.0,0.0,0.0,0.0,0.263206,-0.970895,0.748771


## Criando o Pipeline

In [29]:
# Recebe uma lista de tuplas, que vai definir a ação

# Com esses ajustes os valores vazios do df_futuro ou de qualquer outro dataset não vão interferir
numeric_processor = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
categorical_processor = make_pipeline(SimpleImputer(strategy='most_frequent'), onehot)

preprocessor = ColumnTransformer([
    ('onehot', categorical_processor, ['sex', 'smoker', 'region']),
    ('scale', numeric_processor, ['age', 'bmi', 'children'])
])

model_pipeline = Pipeline(steps=[
    ('preprocessor',preprocessor ),#dando nome
    ('lr_model', LinearRegression()) #Colocando o modelo preditivo
])


model_pipeline.fit(x_train, y_train)

In [30]:
model_pipeline.predict(x_test)

array([26296.,  2400., 15008., 13880., 15424., 14544.,  2616., 29264.,
       30744., 10016., 32528.,  7952.,  5960.,  7464.,  5448.,  8728.,
       14872., 32416.,  8544., 31776.,   256., 12112., 10848.,  4216.,
       11576.,  2952., 10312.,  4440.,  9128.,  8240., 11424.,  5736.,
        3496., 10136.,  4696.,  7552.,  6064.,  9552., 11480., 25456.,
        3136., 13464., -1384.,  9128., 16304., 11744., 16480., 10272.,
       17960., 40096.,  9040.,  5976.,  1888.,  5672., 30384.,  4944.,
         864., 13560., 27896.,  7304.,  9624.,  3328.,  8320.,   216.,
        9168.,  6696.,  8432., 12896., 29888.,  5336., 10968.,  7960.,
       34552.,  1096.,  7624., 36440.,  4040.,  9128.,  3016., 12592.,
       10936.,  3256.,  6336.,  4776.,  3560.,  6224.,  8968., 14928.,
       13568.,  4248., 11928., 10464., 11328., 18224.,  2608., 13568.,
       34648.,  1024.,  3984., 39472.,  4392.,  3720., 40616.,  1744.,
        4472., 11448.,  3072., 10560., 28144.,  5832., 11200.,  6800.,
      

In [31]:
model_pipeline.predict(df_futuro)



array([ 1.35520000e+04,  9.51200000e+03,  1.23440000e+04,  9.12000000e+02,
       -6.09331115e+16])