In [82]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import set_config

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer

In [42]:
set_config(transform_output="pandas")

In [2]:
df = pd.read_csv("../data/insurance.csv")

In [3]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
df_pred = pd.DataFrame({
    'age': [37, 40, 21, 27, 35],
    'sex': ['male', 'female', 'female', 'male', 'male'],
    'bmi': [46.53, 32.39, 28.59, 21.89, 25.1],
    'children': [3, 1, 2, 2, 0],
    'smoker': ['no', 'no', 'no', 'yes', 'no'],
    'region': ['southwest', 'northwest', 'northeast', 'southeast', 'north']
})

In [35]:
X = df.drop(columns=["charges"])
y = df.charges

In [6]:
X = pd.get_dummies(X)

In [7]:
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=2023)

In [8]:
train_X

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
632,29,35.530,0,True,False,True,False,False,False,True,False
895,61,44.000,0,True,False,True,False,False,False,False,True
1173,38,29.260,2,False,True,True,False,False,True,False,False
1131,27,45.900,2,False,True,True,False,False,False,False,True
363,21,26.400,1,True,False,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
884,25,26.695,4,False,True,True,False,False,True,False,False
515,58,35.700,0,False,True,True,False,False,False,False,True
695,26,40.185,0,True,False,True,False,False,True,False,False
454,32,46.530,2,False,True,True,False,False,False,True,False


In [9]:
model = LinearRegression()
model.fit(train_X, train_y)

In [10]:
prediction = model.predict(test_X)

In [11]:
df_result = pd.DataFrame({
    'Actual': test_y,
    'Predicted': prediction
})

In [12]:
df_result.head()

Unnamed: 0,Actual,Predicted
748,8556.907,11616.373614
745,9910.35985,11501.220107
57,34303.1672,26817.780967
546,3268.84665,7059.059481
279,9855.1314,7781.918147


In [13]:
df_result["r2"] = r2_score(test_y, prediction)

In [14]:
df_result["mse"] = mean_squared_error(test_y, prediction)

In [15]:
df_result.head()

Unnamed: 0,Actual,Predicted,r2,mse
748,8556.907,11616.373614,0.758479,36403600.0
745,9910.35985,11501.220107,0.758479,36403600.0
57,34303.1672,26817.780967,0.758479,36403600.0
546,3268.84665,7059.059481,0.758479,36403600.0
279,9855.1314,7781.918147,0.758479,36403600.0


In [18]:
df_result.style.background_gradient()

TypeError: 'Styler' object is not callable

In [26]:
ohe = OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)

In [24]:
train_X

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
632,29,35.530,0,True,False,True,False,False,False,True,False
895,61,44.000,0,True,False,True,False,False,False,False,True
1173,38,29.260,2,False,True,True,False,False,True,False,False
1131,27,45.900,2,False,True,True,False,False,False,False,True
363,21,26.400,1,True,False,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...
884,25,26.695,4,False,True,True,False,False,True,False,False
515,58,35.700,0,False,True,True,False,False,False,False,True
695,26,40.185,0,True,False,True,False,False,True,False,False
454,32,46.530,2,False,True,True,False,False,False,True,False


In [27]:
ohe.fit(train_X)

In [28]:
ohe.transform(train_X)

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 0., 1.]])

In [29]:
ohe.transform(pd.get_dummies(df_pred))

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- region_north


In [32]:
categorical_columns = ['sex', 'region', 'smoker']
ohe.fit(df[categorical_columns])

In [34]:
X.head()

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,True,False,False,True,False,False,False,True
1,18,33.77,1,False,True,True,False,False,False,True,False
2,28,33.0,3,False,True,True,False,False,False,True,False
3,33,22.705,0,False,True,True,False,False,True,False,False
4,32,28.88,0,False,True,True,False,False,True,False,False


In [43]:
df_transformed = ohe.transform(X[categorical_columns])


In [45]:
df_transformed.head()

Unnamed: 0,sex_male,region_northeast,region_northwest,region_southeast,region_southwest,smoker_yes
0,0.0,0.0,0.0,0.0,1.0,1.0
1,1.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,1.0,0.0,0.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0


In [49]:
df_transformed = df_transformed.join(X.drop(columns=categorical_columns))

In [50]:
df_transformed.head()

Unnamed: 0,sex_male,region_northeast,region_northwest,region_southeast,region_southwest,smoker_yes,age,bmi,children
0,0.0,0.0,0.0,0.0,1.0,1.0,19,27.9,0
1,1.0,0.0,0.0,1.0,0.0,0.0,18,33.77,1
2,1.0,0.0,0.0,1.0,0.0,0.0,28,33.0,3
3,1.0,0.0,1.0,0.0,0.0,0.0,33,22.705,0
4,1.0,0.0,1.0,0.0,0.0,0.0,32,28.88,0


In [53]:
model.fit(df_transformed, y)

In [55]:
df_pred.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,37,male,46.53,3,no,southwest
1,40,female,32.39,1,no,northwest
2,21,female,28.59,2,no,northeast
3,27,male,21.89,2,yes,southeast
4,35,male,25.1,0,no,north


In [61]:
ohe.transform(df_pred[categorical_columns]).join(df_pred.drop(columns=categorical_columns))



Unnamed: 0,sex_male,region_northeast,region_northwest,region_southeast,region_southwest,smoker_yes,age,bmi,children
0,1.0,0.0,0.0,0.0,1.0,0.0,37,46.53,3
1,0.0,0.0,1.0,0.0,0.0,0.0,40,32.39,1
2,0.0,1.0,0.0,0.0,0.0,0.0,21,28.59,2
3,1.0,0.0,0.0,1.0,0.0,1.0,27,21.89,2
4,1.0,0.0,0.0,0.0,0.0,0.0,35,25.1,0


In [62]:
preds = model.predict(ohe.transform(df_pred[categorical_columns]).join(df_pred.drop(columns=categorical_columns)))



In [63]:
preds.head()

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [64]:
df_pred['result'] = preds

In [65]:
df_pred.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,result
0,37,male,46.53,3,no,southwest,13682.954149
1,40,female,32.39,1,no,northwest,9444.728134
2,21,female,28.59,2,no,northeast,4103.986756
3,27,male,21.89,2,yes,southeast,26054.726865
4,35,male,25.1,0,no,north,4846.865854


In [68]:
ohe = OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)
preprocess = ColumnTransformer([
    ('ohe', ohe, ['sex', 'smoker', 'region']),
    ('scaler', StandardScaler(), ['age', 'bmi', 'children'])
])


In [69]:
preprocess

In [73]:
preprocess.fit(df)

In [74]:
preprocess.transform(df)

Unnamed: 0,ohe__sex_male,ohe__smoker_yes,ohe__region_northeast,ohe__region_northwest,ohe__region_southeast,ohe__region_southwest,scaler__age,scaler__bmi,scaler__children
0,0.0,1.0,0.0,0.0,0.0,1.0,-1.438764,-0.453320,-0.908614
1,1.0,0.0,0.0,0.0,1.0,0.0,-1.509965,0.509621,-0.078767
2,1.0,0.0,0.0,0.0,1.0,0.0,-0.797954,0.383307,1.580926
3,1.0,0.0,0.0,1.0,0.0,0.0,-0.441948,-1.305531,-0.908614
4,1.0,0.0,0.0,1.0,0.0,0.0,-0.513149,-0.292556,-0.908614
...,...,...,...,...,...,...,...,...,...
1333,1.0,0.0,0.0,1.0,0.0,0.0,0.768473,0.050297,1.580926
1334,0.0,0.0,1.0,0.0,0.0,0.0,-1.509965,0.206139,-0.908614
1335,0.0,0.0,0.0,0.0,1.0,0.0,-1.509965,1.014878,-0.908614
1336,0.0,0.0,0.0,0.0,0.0,1.0,-1.296362,-0.797813,-0.908614


In [75]:
preprocess.transform(df_pred)



Unnamed: 0,ohe__sex_male,ohe__smoker_yes,ohe__region_northeast,ohe__region_northwest,ohe__region_southeast,ohe__region_southwest,scaler__age,scaler__bmi,scaler__children
0,1.0,0.0,0.0,0.0,0.0,1.0,-0.157143,2.602829,1.580926
1,0.0,0.0,0.0,1.0,0.0,0.0,0.056461,0.28324,-0.078767
2,0.0,0.0,1.0,0.0,0.0,0.0,-1.296362,-0.340129,0.751079
3,1.0,1.0,0.0,0.0,1.0,0.0,-0.869155,-1.439227,0.751079
4,1.0,0.0,0.0,0.0,0.0,0.0,-0.299545,-0.912645,-0.908614


In [76]:
model_pipeline=Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('lr_model', LinearRegression())
    ]
)

In [79]:
model_pipeline.fit(X, y)

In [80]:
model_pipeline.predict(df_pred)



array([1.36560000e+04, 9.49600000e+03, 3.90400000e+03, 2.61840000e+04,
       7.04690166e+16])

In [94]:
numeric_processor = make_pipeline(
    SimpleImputer(strategy='mean', fill_value = StandardScaler()),
)

categorical_processor = make_pipeline(
    SimpleImputer(strategy='most_frequent', fill_value=ohe)
)

In [95]:
preprocess2 = ColumnTransformer([
    ('ohe', categorical_processor, ['sex', 'smoker', 'region']),
    ('scaler', numeric_processor, ['age', 'bmi', 'children'])
])

model_pipeline2=Pipeline(
    steps=[
        ('preprocess', preprocess2),
        ('lr_model', LinearRegression())
    ]
)

In [96]:
model_pipeline2

In [97]:
model_pipeline2.fit(X,y)

ValueError: could not convert string to float: 'female'