In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Suppress every warning category for the rest of the session so library
# notices do not clutter the cell outputs.
# NOTE(review): this is a blanket filter — it also hides warnings that may be
# worth seeing; consider narrowing it to specific categories.
warnings.simplefilter("ignore")

In [2]:
# Location of the raw insurance dataset inside the Kaggle input mount.
INSURANCE_CSV = "/kaggle/input/medical-cost/insurance.csv"

# Raw, untouched copy of the dataset as read from disk.
data = pd.read_csv(INSURANCE_CSV)

In [3]:
# Work on an independent copy: the cells below mutate `df` in place
# (drop_duplicates / reset_index / drop / overwriting `charges`), and a plain
# `df = data` alias would silently apply all of that to the raw frame as well.
df = data.copy()

In [4]:
# Preview the first five rows to check that the file parsed as expected.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Feature Engineering

1. The data contains no missing values and only one duplicate row, so dropping that duplicate is the only cleaning step needed.

In [5]:
# Remove fully duplicated rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep="first", inplace=True)

In [6]:
# Renumber the rows 0..n-1 after the duplicate removal. With drop=False the
# previous row labels are kept as a new "index" column.
df.reset_index(drop=False, inplace=True)

In [7]:
# Discard the "index" column holding the old row labels; it carries no signal.
df.drop(columns="index", inplace=True)

**Handling target feature**

Since `charges` is our target feature, applying a log transformation (`log1p`) to reduce its skewness will be beneficial.

In [8]:
# Replace the target with log(1 + charges) to reduce its skewness.
# NOTE(review): this overwrites the column, so the cell is not idempotent —
# running it twice applies the transform twice. Everything downstream
# (training, metrics) works on this log scale.
log_charges = np.log1p(df["charges"])
df["charges"] = log_charges

In [9]:
# Skewness of the transformed target; a value near 0 means roughly symmetric.
charges_skew = df["charges"].skew()
charges_skew

-0.08955835073326

In [10]:
# Split the column names by dtype: object-typed columns are treated as
# categorical, everything else (including the target at this point) as numerical.
is_object_column = {column: df[column].dtype == "object" for column in df.columns}

numerical_feature = [column for column, is_obj in is_object_column.items() if not is_obj]
categorical_feature = [column for column, is_obj in is_object_column.items() if is_obj]

In [11]:
# Report how many categorical columns were detected and which ones.
n_categorical = len(categorical_feature)
print(f"We have {n_categorical} categorical features and are {categorical_feature}")

We have 3 categorical features and are ['sex', 'smoker', 'region']


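1. Label encoding should be used for `smoker` and `sex`.
2. One-hot encoding should be used for `region`.

Note: the preprocessing pipeline in the Model Training section currently applies one-hot encoding to all three categorical features.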

In [12]:
# The target must not be fed to the model as an input feature.
# Guarded so the cell can be re-run safely: an unconditional list.remove()
# raises ValueError once "charges" has already been removed.
if "charges" in numerical_feature:
    numerical_feature.remove("charges")

In [13]:
# Report how many numerical input columns remain and which ones.
n_numerical = len(numerical_feature)
print(f"We have {n_numerical} numerical features and these are {numerical_feature}")

We have 3 numerical features and these are ['age', 'bmi', 'children']


Among the numerical features, `bmi` contains outliers but has very low skewness (0.28), so it can be scaled using `RobustScaler`.

## Model Training

In [15]:
from sklearn.model_selection import train_test_split,GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder,OneHotEncoder, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

**Train_Test_Split**

In [16]:
# Target vector: the (log-transformed) charges column.
y = df["charges"]

# Feature matrix: every remaining column.
X = df.drop(columns=["charges"])

In [17]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Sanity check of the column groups handed to the preprocessing pipeline.
print(numerical_feature, categorical_feature, sep=" ")

['age', 'bmi', 'children'] ['sex', 'smoker', 'region']


In [19]:
# Categorical columns: dense one-hot encoding with no dropped level;
# categories unseen during fit are ignored instead of raising an error.
onehot = OneHotEncoder(handle_unknown="ignore", drop=None, sparse_output=False)
categorical_pipeline = Pipeline(steps=[("onehot_encoder", onehot)])

# Numerical columns: RobustScaler, chosen because bmi contains outliers.
robust = RobustScaler()
numerical_pipeline = Pipeline(steps=[("scaler", robust)])

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("Numerical Pipeline", numerical_pipeline, numerical_feature),
        ("Categorical Pipeline", categorical_pipeline, categorical_feature),
    ]
)

In [20]:
# Fit the preprocessor on the training split only, then apply the fitted
# transform to the test split (the right-hand side is evaluated left to right,
# so fitting happens before the test transform).
# NOTE(review): x_train / x_test are rebound from DataFrames to transformed
# outputs here, so this cell presumably cannot be re-run without re-running
# the split cell first.
x_train, x_test = (
    preprocessor.fit_transform(x_train),
    preprocessor.transform(x_test),
)

In [32]:
def evaluate_metrics(true, predicted):
    """Compute regression metrics for one set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions aligned with ``true``.

    Returns
    -------
    tuple
        ``(mse, mae, r2, rmse)`` in exactly that order.
    """
    squared_error = mean_squared_error(true, predicted)
    absolute_error = mean_absolute_error(true, predicted)
    r2 = r2_score(true, predicted)
    # RMSE is the square root of the MSE computed above.
    root_squared_error = np.sqrt(squared_error)
    return squared_error, absolute_error, r2, root_squared_error

In [29]:
# Baseline candidate regressors, keyed by the display name used in the report.
# Estimators with internal randomness get a fixed seed so that a fresh
# Restart & Run All reproduces the same scores and the same model ranking.
models = {
    "Linear Regression": LinearRegression(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(random_state=42),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False, max_depth=5, random_seed=42),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42),
    "SVR": SVR(),
}

In [35]:
def evaluate_models(X_train, X_test, y_train, y_test, models):
    """Fit every model, print train/test metrics, and return an R2 leaderboard.

    Parameters
    ----------
    X_train, X_test : array-like
        Preprocessed feature matrices for the training and test splits.
    y_train, y_test : array-like
        Targets aligned with ``X_train`` / ``X_test``.
    models : dict
        Maps a display name to an unfitted estimator exposing ``fit`` and
        ``predict``. The estimators are fitted in place.

    Returns
    -------
    pandas.DataFrame
        Columns ``Model Name`` and ``r2_score`` (test-set R2), sorted from
        best to worst.
    """

    def _print_metrics(header, mse, rmse, mae, r2):
        # One metrics section of the per-model printout.
        print(header)
        print(f"- Mean Squared error: {mse:.4f}")
        print(f"- Root Mean Squared Error: {rmse:.4f}")
        print(f"- Mean Absolute Error: {mae:.4f}")
        print(f"- R2 Score: {r2:.4f}")

    models_list = []
    r2_list = []

    for model_name, model in models.items():
        model.fit(X_train, y_train)

        # Score on both splits so over/under-fitting is visible side by side.
        train_mse, train_mae, train_r2, train_rmse = evaluate_metrics(
            y_train, model.predict(X_train)
        )
        test_mse, test_mae, test_r2, test_rmse = evaluate_metrics(
            y_test, model.predict(X_test)
        )

        print(model_name)
        models_list.append(model_name)

        _print_metrics('Model performance for Training set', train_mse, train_rmse, train_mae, train_r2)
        print('----------------------------------')
        _print_metrics('Model performance for Test set', test_mse, test_rmse, test_mae, test_r2)
        r2_list.append(test_r2)

        print('='*35)
        print('\n')

    rows = list(zip(models_list, r2_list))
    report = pd.DataFrame(rows, columns=['Model Name', 'r2_score'])
    report = report.sort_values(by=['r2_score'], ascending=False)

    return report

In [36]:
# Fit and score every baseline model; keep the leaderboard for comparison.
base_report = evaluate_models(
    X_train=x_train,
    X_test=x_test,
    y_train=y_train,
    y_test=y_test,
    models=models,
)

Linear Regression
Model performance for Training set
- Mean Squared error: 0.2057
- Root Mean Squared Error: 0.4535
- Mean Absolute Error: 0.2844
- R2 Score: 0.7497
----------------------------------
Model performance for Test set
- Mean Squared error: 0.1582
- Root Mean Squared Error: 0.3978
- Mean Absolute Error: 0.2606
- R2 Score: 0.8295


K-Neighbors Regressor
Model performance for Training set
- Mean Squared error: 0.1291
- Root Mean Squared Error: 0.3592
- Mean Absolute Error: 0.2198
- R2 Score: 0.8429
----------------------------------
Model performance for Test set
- Mean Squared error: 0.1538
- Root Mean Squared Error: 0.3921
- Mean Absolute Error: 0.2564
- R2 Score: 0.8343


Random Forest Regressor
Model performance for Training set
- Mean Squared error: 0.0230
- Root Mean Squared Error: 0.1517
- Mean Absolute Error: 0.0781
- R2 Score: 0.9720
----------------------------------
Model performance for Test set
- Mean Squared error: 0.1498
- Root Mean Squared Error: 0.3870
- Mean

In [37]:
# Baseline leaderboard: one row per model, sorted by test-set R2 (best first).
base_report

Unnamed: 0,Model Name,r2_score
6,SVR,0.885426
4,CatBoosting Regressor,0.862998
2,Random Forest Regressor,0.838595
1,K-Neighbors Regressor,0.834287
0,Linear Regression,0.829479
3,XGBRegressor,0.800271
5,AdaBoost Regressor,0.795463


## Hyperparameter Tuning

In [38]:
# Search space for CatBoostRegressor (4 x 5 = 20 combinations).
cat_params = dict(
    learning_rate=[0.1, 0.01, 0.6, 0.5],
    max_depth=[4, 5, 6, 8, 12],
)

# Search space for SVR (3 x 4 x 3 x 4 x 3 = 432 combinations).
# Per the scikit-learn docs, `degree` only affects the "poly" kernel and
# `gamma` is not used by the "linear" kernel, so some combinations describe
# the same effective model.
svr_params = dict(
    kernel=["rbf", "poly", "linear"],
    C=[0.1, 1, 10, 100],
    epsilon=[0.01, 0.1, 0.5],
    gamma=["scale", "auto", 0.01, 0.1],
    degree=[2, 3, 4],
)

In [39]:
# Each entry is (display name, fresh estimator, search space) for the two
# best-scoring baseline models.
svr_search_spec = ("SVR", SVR(), svr_params)
catboost_search_spec = ("CatBoost", CatBoostRegressor(verbose=False), cat_params)

randomcv_models = [svr_search_spec, catboost_search_spec]

In [41]:
## Applying hyperparameter tuning
# Run a randomized search (3-fold CV, default estimator scoring) for every
# candidate and keep the best parameter set found for each, keyed by name.
model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=100,
        cv=3,
        verbose=2,
        n_jobs=-1,
        # Fixed seed: without it the sampled candidates (and therefore the
        # selected "best" parameters) change from run to run.
        random_state=42,
    )
    random.fit(x_train, y_train)
    model_param[name] = random.best_params_

for model_name, best_params in model_param.items():
    print(f"---------------- Best Params for {model_name} -------------------")
    print(best_params)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END C=10, degree=3, epsilon=0.5, gamma=auto, kernel=linear; total time=   0.1s
[CV] END C=100, degree=3, epsilon=0.01, gamma=0.1, kernel=poly; total time=   1.3s
[CV] END C=100, degree=4, epsilon=0.01, gamma=scale, kernel=poly; total time=  48.4s
[CV] END C=1, degree=4, epsilon=0.01, gamma=0.01, kernel=linear; total time=   0.2s
[CV] END C=1, degree=4, epsilon=0.01, gamma=0.01, kernel=linear; total time=   0.2s
[CV] END C=1, degree=4, epsilon=0.01, gamma=0.01, kernel=linear; total time=   0.2s
[CV] END C=1, degree=3, epsilon=0.1, gamma=scale, kernel=poly; total time=   0.1s
[CV] END C=1, degree=3, epsilon=0.1, gamma=scale, kernel=poly; total time=   0.1s
[CV] END C=1, degree=3, epsilon=0.1, gamma=scale, kernel=poly; total time=   0.1s
[CV] END C=100, degree=3, epsilon=0.5, gamma=0.01, kernel=rbf; total time=   0.1s
[CV] END C=100, degree=3, epsilon=0.5, gamma=0.01, kernel=rbf; total time=   0.1s
[CV] END C=100, degree=

In [43]:
# Rebuild the two finalists with the best parameters from the randomized search.
# Note: this rebinds `models`, replacing the baseline dictionary.
tuned_svr = SVR(**model_param["SVR"])
tuned_catboost = CatBoostRegressor(**model_param['CatBoost'], verbose=False)

models = {
    "SVR": tuned_svr,
    "CatBoosting Regressor": tuned_catboost,
}

In [44]:
# Re-fit and re-score the tuned finalists on the same train/test split.
retrained_report = evaluate_models(
    X_train=x_train,
    X_test=x_test,
    y_train=y_train,
    y_test=y_test,
    models=models,
)

SVR
Model performance for Training set
- Mean Squared error: 0.1415
- Root Mean Squared Error: 0.3762
- Mean Absolute Error: 0.1767
- R2 Score: 0.8277
----------------------------------
Model performance for Test set
- Mean Squared error: 0.1009
- Root Mean Squared Error: 0.3176
- Mean Absolute Error: 0.1629
- R2 Score: 0.8913


CatBoosting Regressor
Model performance for Training set
- Mean Squared error: 0.1175
- Root Mean Squared Error: 0.3427
- Mean Absolute Error: 0.1785
- R2 Score: 0.8570
----------------------------------
Model performance for Test set
- Mean Squared error: 0.1059
- Root Mean Squared Error: 0.3255
- Mean Absolute Error: 0.1824
- R2 Score: 0.8858




In [45]:
# Leaderboard of the tuned models, sorted by test-set R2 (best first).
retrained_report

Unnamed: 0,Model Name,r2_score
0,SVR,0.891308
1,CatBoosting Regressor,0.885838


**Implementing Grid Search**

In [47]:
# Exhaustive search over the full SVR grid with 3-fold CV, selecting by
# (negated) mean squared error rather than the estimator's default score.
grid_model_svr = GridSearchCV(
    SVR(),
    svr_params,
    scoring="neg_mean_squared_error",
    cv=3,
    verbose=2,
    n_jobs=-1,
)

In [48]:
# Run the grid search on the training split only.
grid_model_svr.fit(X=x_train, y=y_train)

Fitting 3 folds for each of 432 candidates, totalling 1296 fits
[CV] END C=0.1, degree=2, epsilon=0.01, gamma=scale, kernel=poly; total time=   0.1s
[CV] END C=0.1, degree=2, epsilon=0.01, gamma=scale, kernel=linear; total time=   0.1s
[CV] END C=0.1, degree=2, epsilon=0.01, gamma=auto, kernel=rbf; total time=   0.1s
[CV] END C=0.1, degree=2, epsilon=0.01, gamma=auto, kernel=linear; total time=   0.1s
[CV] END C=0.1, degree=2, epsilon=0.01, gamma=0.01, kernel=rbf; total time=   0.1s
[CV] END C=0.1, degree=2, epsilon=0.01, gamma=0.01, kernel=poly; total time=   0.1s
[CV] END C=0.1, degree=2, epsilon=0.01, gamma=0.1, kernel=rbf; total time=   0.1s
[CV] END C=0.1, degree=2, epsilon=0.01, gamma=0.1, kernel=poly; total time=   0.1s
[CV] END C=0.1, degree=2, epsilon=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END C=0.1, degree=2, epsilon=0.1, gamma=scale, kernel=poly; total time=   0.0s
[CV] END C=0.1, degree=2, epsilon=0.1, gamma=scale, kernel=linear; total time=   0.0s
[CV] END C

In [51]:
# Keep the winning SVR parameter set under the "SVR" key.
# NOTE(review): this rebinds `svr_params`, which previously held the search
# grid — the grid-search cell cannot be re-run after this one without first
# re-running the cell that defines the grid.
svr_params = {"SVR": grid_model_svr.best_params_}


[CV] END C=100, degree=3, epsilon=0.5, gamma=scale, kernel=rbf; total time=   0.1s
[CV] END C=100, degree=3, epsilon=0.5, gamma=scale, kernel=rbf; total time=   0.1s
[CV] END C=100, degree=3, epsilon=0.5, gamma=scale, kernel=rbf; total time=   0.1s
[CV] END C=100, degree=3, epsilon=0.5, gamma=scale, kernel=poly; total time=   1.3s
[CV] END C=100, degree=3, epsilon=0.5, gamma=scale, kernel=poly; total time=   1.3s
[CV] END C=100, degree=3, epsilon=0.5, gamma=auto, kernel=linear; total time=   0.5s
[CV] END C=100, degree=3, epsilon=0.5, gamma=0.1, kernel=rbf; total time=   0.1s
[CV] END C=100, degree=3, epsilon=0.5, gamma=0.1, kernel=rbf; total time=   0.1s
[CV] END C=100, degree=3, epsilon=0.5, gamma=0.1, kernel=poly; total time=   0.1s
[CV] END C=100, degree=3, epsilon=0.5, gamma=0.1, kernel=linear; total time=   0.4s
[CV] END C=100, degree=4, epsilon=0.01, gamma=scale, kernel=rbf; total time=   1.8s
[CV] END C=100, degree=4, epsilon=0.01, gamma=scale, kernel=poly; total time=  46.9s


In [52]:
# Train the final SVR on the full training split using the grid-search winner.
best_svr_kwargs = svr_params["SVR"]
final_model = SVR(**best_svr_kwargs)
final_model.fit(x_train, y_train)

In [54]:
# Predict on both splits (train first, then test) with the final model.
y_train_pred, y_test_pred = (
    final_model.predict(features) for features in (x_train, x_test)
)

In [55]:
# R2 of the final model on each split (targets are on the log1p scale).
training_r2_score = r2_score(y_true=y_train, y_pred=y_train_pred)
testing_r2_score = r2_score(y_true=y_test, y_pred=y_test_pred)

In [56]:
# Show train and test R2 one per line.
print(
    f"Training R2_score is {training_r2_score}",
    f"Testing R2_score is {testing_r2_score}",
    sep="\n",
)

Training R2_score is 0.8277112548148613
Testing R2_score is 0.8913081703732441


**Best model is SVR, with a test R² score of 0.8913 (≈ 89.13%) on the log-transformed charges**

In [57]:
import pickle

# Persist the fitted final model next to the notebook.
with open('best_model.pkl', mode='wb') as model_file:
    pickle.dump(final_model, model_file)

In [58]:
# Persist the fitted preprocessor as well: new data must go through the same
# fitted transform before being passed to the saved model.
with open("preprocessor.pkl", mode="wb") as preprocessor_file:
    pickle.dump(preprocessor, preprocessor_file)