In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
%matplotlib inline

In [2]:
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [3]:
#Data description
df.describe()

Unnamed: 0,age,bmi,children,expenses
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.665471,1.094918,13270.422414
std,14.04996,6.098382,1.205493,12110.01124
min,18.0,16.0,0.0,1121.87
25%,27.0,26.3,0.0,4740.2875
50%,39.0,30.4,1.0,9382.03
75%,51.0,34.7,2.0,16639.915
max,64.0,53.1,5.0,63770.43


In [27]:
#Making pipeline

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.ensemble import GradientBoostingRegressor

In [6]:
from sklearn import set_config
set_config(display='diagram')

In [7]:
df.head(2)

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55


In [8]:
transformer = ColumnTransformer(transformers=[
    ('tnf1',OneHotEncoder(sparse=False, handle_unknown='ignore'), [1,5]),
    ('tnf2',OrdinalEncoder(categories=[['no','yes']]),[4]),
    ('tnf3',StandardScaler(),[0,2,3])
],remainder='passthrough')

In [16]:
x_train,x_test,y_train,y_test = train_test_split(df.drop(columns=['expenses']),
                                                df['expenses'], test_size=0.2,random_state=42)

In [17]:
x_train.shape

(1070, 6)

In [18]:
x_test.shape

(268, 6)

In [19]:
gb_model = Pipeline(steps=[('transformer',transformer),
                           ('model', GradientBoostingRegressor())
                          ])

In [20]:
gb_model.fit(x_train,y_train)

In [21]:
y_pred = gb_model.predict(x_test)

In [22]:
y_pred

array([11421.6714098 ,  5808.63037282, 27906.9183161 ,  9681.11044612,
       34047.0200666 ,  5183.00393405,  2752.57227725, 18563.83887847,
        4827.72273672, 10634.7104045 , 18572.32120822,  7721.11801601,
        6003.7782621 , 45198.12616351, 47255.11398648, 44826.30873308,
       10177.23243304, 44617.77748845,  9686.23329811, 23765.43013936,
        5835.27730277,  8939.15318796,  2031.09584191,  4863.55281579,
       11577.69040603, 12938.72533886, 14575.42484381,  6786.2866381 ,
       11995.2453876 ,  3479.50992597,  7330.47394123, 12233.22259554,
        2859.07322313,  5945.76777319,  4528.81793137, 10500.57250467,
        3770.85395551,  9008.29985117, 26216.28064772, 40317.66934875,
        4896.22359098,  3882.74017424, 13182.82377272, 14455.7400242 ,
        7772.49461673, 15523.45251067,  6381.78017444,  6474.98554754,
       42256.00647235,  6733.42150055, 15026.57557543,  2732.91128299,
        6603.34192324,  2148.41472248, 12381.32166229, 11566.36512319,
      

In [23]:
from sklearn.metrics import accuracy_score,mean_squared_error,r2_score

In [24]:
gradient_boosting_mse = mean_squared_error(y_test, y_pred)
gradient_boosting_rmse = mean_squared_error(y_test, y_pred, squared=False)
gradient_boosting_r2_score = r2_score(y_test, y_pred)

print("The Mean Squared Error using Gradient Boosting Regressor : {}".format(gradient_boosting_mse))
print("The Root Mean Squared Error using Gradient Boosting Regressor : {}".format(gradient_boosting_rmse))
print("The r2_sccore using Gradient Boosting Regressor : {}".format(gradient_boosting_r2_score))

The Mean Squared Error using Gradient Boosting Regressor : 18724852.903812498
The Root Mean Squared Error using Gradient Boosting Regressor : 4327.222308110885
The r2_sccore using Gradient Boosting Regressor : 0.8793880474623511


In [26]:
import pickle
pickle.dump(gb_model, open('gb_model.pkl', 'wb'))