In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNetCV 
from sklearn.metrics import mean_absolute_error,mean_squared_error,root_mean_squared_error,r2_score

---
# Model Evalution without Hyper Parameter tunning

In [4]:
def model_evaluation(models):
    mae = []
    mse = []
    rmse = []
    score = []
    for model in models :
        model.fit(X_train , y_train)
        y_pred = model.predict(X_test)
        mae.append(mean_absolute_error(y_test , y_pred))
        mse.append(mean_squared_error(y_test , y_pred))
        rmse.append(root_mean_squared_error(y_test , y_pred))
        score.append(r2_score(y_test , y_pred))
    model_performance = {
        "Model" : models,
        "mean_absolute_error" : mae,
        "mean_squared_error" : mse,
        "root_mean_squared_error" : rmse,
        "r2_score" : score
    }
    return pd.DataFrame(model_performance)

In [6]:
dataframe = pd.read_csv(r'dataset.csv')
dataframe.drop('Unnamed: 0' , axis = 1 , inplace=True)
dataframe.head(3)

Unnamed: 0,Age,BMI,Children,Sex,Smoker,Region,Medical Cost
0,58,15.6,2,1,1,1,17907.54
1,24,29.8,0,1,1,0,16312.64
2,50,29.0,5,1,0,1,6819.21


In [8]:
X = dataframe.drop("Medical Cost" , axis = 1)
y = dataframe["Medical Cost"]

In [10]:
X_train , X_test , y_train , y_test = train_test_split( X , y , test_size = 0.2 , random_state = 42 )

In [12]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
models = [ DecisionTreeRegressor() , LinearRegression() , SVR() , ElasticNetCV() ]

In [16]:
performance_result = model_evaluation(models)
performance_result

Unnamed: 0,Model,mean_absolute_error,mean_squared_error,root_mean_squared_error,r2_score
0,DecisionTreeRegressor(),347.50001,180061.8,424.336864,0.995102
1,LinearRegression(),252.93201,84935.36,291.436709,0.997689
2,SVR(),5564.241996,44830710.0,6695.573807,-0.219558
3,ElasticNetCV(),5148.587853,27045220.0,5200.501766,0.264272


---
# Model Evaluation with Hyper Parameter Tunning

In [18]:
from sklearn.model_selection import GridSearchCV

In [19]:
def performance_result_hyperparameter_tunning(models , params):
    best_params = []
    mae = []
    mse = []
    rmse = []
    score = []
    for i in range(len(models)) :
        grid = GridSearchCV(estimator= models[i] , param_grid= params[i] , scoring = 'r2')
        grid.fit(X_train , y_train)
        best_params.append(grid.best_params_)
        y_pred = grid.predict(X_test)
        mae.append(mean_absolute_error(y_test , y_pred))
        mse.append(mean_squared_error(y_test , y_pred))
        rmse.append(root_mean_squared_error(y_test , y_pred))
        score.append(r2_score(y_test , y_pred))
    model_performance = {
        "Model" : models,
        "best_params" : best_params,
        "mean_absolute_error" : mae,
        "mean_squared_error" : mse,
        "root_mean_squared_error" : rmse,
        "r2_score" : score
    }
    return pd.DataFrame(model_performance)

In [20]:
'''
models_hyper = [ DecisionTreeRegressor() , SVR() , ElasticNetCV() ]
params = {
    0 : dict(criterion = [ 'squared_error' , 'friedman_mse' ] ,
     splitter =  [ 'best' ,  'random' ] ),
    1 : dict(kernel = ['linear', 'poly', 'rbf'],
     gamma = ['scale', 'auto']),
    2 : dict(l1_ratio = [.1, .5, .7, .9, .95, 1],
     selection = ['cyclic', 'random'])
} 
'''

"\nmodels_hyper = [ DecisionTreeRegressor() , SVR() , ElasticNetCV() ]\nparams = {\n    0 : dict(criterion = [ 'squared_error' , 'friedman_mse' ] ,\n     splitter =  [ 'best' ,  'random' ] ),\n    1 : dict(kernel = ['linear', 'poly', 'rbf'],\n     gamma = ['scale', 'auto']),\n    2 : dict(l1_ratio = [.1, .5, .7, .9, .95, 1],\n     selection = ['cyclic', 'random'])\n} \n"

In [21]:
#performance_result_hyperparameter_tunning = performance_result_hyperparameter_tunning(models_hyper , params)

In [22]:
#performance_result_hyperparameter_tunning

---
# Model Evaluation of Ensemblers

In [24]:
from sklearn import ensemble

In [25]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import VotingRegressor

In [26]:
models_ensemble = [ RandomForestRegressor() , AdaBoostRegressor() , 
                    BaggingRegressor() , ExtraTreesRegressor() , 
                    GradientBoostingRegressor() , HistGradientBoostingRegressor()
                  ]

In [27]:
performance_result_ensembler = model_evaluation(models_ensemble)

In [28]:
performance_result_ensembler

Unnamed: 0,Model,mean_absolute_error,mean_squared_error,root_mean_squared_error,r2_score
0,"(DecisionTreeRegressor(max_features=1.0, rando...",274.309541,105427.015566,324.695266,0.997132
1,"(DecisionTreeRegressor(max_depth=3, random_sta...",288.756869,123192.290624,350.987593,0.996649
2,(DecisionTreeRegressor(random_state=1726439530...,280.408789,112298.147691,335.109158,0.996945
3,"(ExtraTreeRegressor(random_state=1776628138), ...",285.229775,118068.436758,343.61088,0.996788
4,([DecisionTreeRegressor(criterion='friedman_ms...,254.871774,86829.283548,294.66809,0.997638
5,HistGradientBoostingRegressor(),256.293489,88670.432359,297.775809,0.997588


---
# pickling scaler object

In [30]:
import pickle as pkl
with open('Scaler.pkl' , 'wb') as file:
    pkl.dump(scaler , file)

---
# pickling GradientBoostingRegressor

In [32]:
model = GradientBoostingRegressor().fit(X_train , y_train)
with open('model.pkl' , 'wb') as file:
    pkl.dump(model , file)

In [64]:
X_train

array([[-0.55338227,  1.04408645,  1.46326866,  1.00075028, -1.00225254,
        -1.34016524],
       [ 1.6166379 ,  1.05787118,  0.87634716,  1.00075028,  0.99775253,
        -1.34016524],
       [-0.55338227, -0.78928261,  1.46326866,  1.00075028,  0.99775253,
         0.44211752],
       ...,
       [ 0.09762378, -1.03740775,  0.87634716, -0.99925028, -1.00225254,
         1.33325889],
       [-1.63839235, -1.5198733 ,  0.87634716,  1.00075028,  0.99775253,
        -1.34016524],
       [ 0.3146258 , -0.51358802,  0.28942566,  1.00075028, -1.00225254,
         0.44211752]])

In [46]:
with open('model.pkl' , 'rb') as file:
    m = pkl.load(file)

In [None]:
m.predict([[-0.55338227,  1.04408645,  1.46326866,  1.00075028, -1.00225254,
        -1.34016524]