### Building pipleline to deploy premium prediction model

In [1]:
import numpy as np
import pandas as pd
import klib

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer 
from sklearn.pipeline import Pipeline,FeatureUnion

In [2]:
# loading dataset:
data_file = 'insurance_premium.csv'
ip = pd.read_csv(data_file)

# Using Kblib conver data types reduces memory
ip = klib.convert_datatypes(ip)
ip.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1337 entries, 0 to 1336
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   age       1337 non-null   int8    
 1   sex       1337 non-null   category
 2   bmi       1337 non-null   float32 
 3   children  1337 non-null   int8    
 4   smoker    1337 non-null   category
 5   region    1337 non-null   category
 6   expenses  1337 non-null   float32 
dtypes: category(3), float32(2), int8(2)
memory usage: 17.9 KB


In [3]:
ip_train, ip_test = train_test_split(ip, test_size=0.2, random_state=2)
print(ip_train.shape)
print(ip_test.shape)

(1069, 7)
(268, 7)


In [4]:
# Taking log of the target variable:
x_train = ip_train.drop('expenses', axis=1)
y_train = np.log(ip_train['expenses'])

x_test = ip_test.drop('expenses', axis=1)
y_test = np.log(ip_test['expenses'])

Defining get_dummies custom transformer

In [5]:
class get_dummies(BaseEstimator, TransformerMixin):
    
    def __init__(self,cols):
        self.cols=cols
        
    def fit(self,x,y=None):
        return self
            
    def transform(self, X):
#        print('In tranform from get_dummies')
        for col in self.cols:
#            print(col)
            temp_dummies = pd.get_dummies(X[col], drop_first=True, prefix=col)
            X = pd.concat([X, temp_dummies], axis=1)
            del X[col]
#        print(X.head(2))
        return X

In [6]:
class outlier_treatment(BaseEstimator, TransformerMixin):
    
    def __init__(self,cols):
        self.cols=cols
        
    def fit(self,x,y=None):
        return self
            
    def transform(self,X):
#        print('In transform from outlier_treatment')
        for col in self.cols:
#            print(col)
            tenth_percentile = np.percentile(X[col], 10)
            ninetieth_percentile = np.percentile(X[col], 90)
            X[col] = np.where(X[col]<tenth_percentile, tenth_percentile, X[col])
            X[col] = np.where(X[col]>ninetieth_percentile, ninetieth_percentile, X[col])
#        print(X.head(2))    
        return X

In [7]:
# Final model obtained after training and hyper tuning parameters:
prediction_model = GradientBoostingRegressor(learning_rate=0.01, max_features=4, n_estimators=700, 
                                                  max_depth=3, min_samples_leaf=3, min_samples_split=5, random_state=2)

In [8]:
pre_process = Pipeline([('cat_var_encode',get_dummies(['sex', 'smoker', 'region'])),
                        ('outlier_treatment',outlier_treatment(['bmi']))
                       ])

In [9]:
model_pipeline=Pipeline([
                        ('data_prep', pre_process),
                        ('model', prediction_model)
                        ])

In [10]:
model_pipeline.fit(x_train, y_train)

Pipeline(steps=[('data_prep',
                 Pipeline(steps=[('cat_var_encode',
                                  get_dummies(cols=['sex', 'smoker',
                                                    'region'])),
                                 ('outlier_treatment',
                                  outlier_treatment(cols=['bmi']))])),
                ('model',
                 GradientBoostingRegressor(learning_rate=0.01, max_features=4,
                                           min_samples_leaf=3,
                                           min_samples_split=5,
                                           n_estimators=700, random_state=2))])

In [11]:
print('Cross validation score : ', cross_val_score(model_pipeline, x_train, y_train, scoring='neg_root_mean_squared_error', cv=10).mean())

Cross validation score :  -0.36008679624594386


In [12]:
print('Train score : ', np.sqrt(mean_squared_error(y_train, model_pipeline.predict(x_train))))
print('Test score : ', np.sqrt(mean_squared_error(y_test, model_pipeline.predict(x_test))))

Train score :  0.3354904241281576
Test score :  0.36471931086375636


##### Saving model to pickle file

In [13]:
import sklearn.externals 
import joblib

joblib.dump(model_pipeline,'insurance_premium_perdiction.pkl')

['insurance_premium_perdiction.pkl']