In [61]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import pickle

##These 3 lines are needed to make utils module accessible to this module
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath("non-existent.txt"))))

from utils.utils import find_best_hyperparameters

In [4]:
import warnings
#suppress warnings
warnings.filterwarnings('ignore')

In [5]:
dataset = pd.read_csv("insurance_pre.csv")

In [6]:
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.9,0,yes,16884.924
1,18,male,33.77,1,no,1725.5523
2,28,male,33.0,3,no,4449.462
3,33,male,22.705,0,no,21984.47061
4,32,male,28.88,0,no,3866.8552


In [7]:
#check for any null values. If Non-null count is not equal to total rows then we need to drop rows with null columns
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 62.8+ KB


In [8]:
dataset = pd.get_dummies(dataset,drop_first=True)

In [15]:
#check how the columns have transformed
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         1338 non-null   int64  
 1   bmi         1338 non-null   float64
 2   children    1338 non-null   int64  
 3   charges     1338 non-null   float64
 4   sex_male    1338 non-null   bool   
 5   smoker_yes  1338 non-null   bool   
dtypes: bool(2), float64(2), int64(2)
memory usage: 44.6 KB


In [17]:
dataset.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.9,0,16884.924,False,True
1,18,33.77,1,1725.5523,True,False
2,28,33.0,3,4449.462,True,False
3,33,22.705,0,21984.47061,True,False
4,32,28.88,0,3866.8552,True,False


In [19]:
independent_vars = ["age","bmi","children","sex_male","smoker_yes"]

In [21]:
x = dataset[independent_vars]

In [23]:
x.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.9,0,False,True
1,18,33.77,1,True,False
2,28,33.0,3,True,False
3,33,22.705,0,True,False
4,32,28.88,0,True,False


In [25]:
x.shape

(1338, 5)

In [27]:
dependent_vars = ["charges"]

In [29]:
y=dataset[dependent_vars]

In [31]:
y.head()

Unnamed: 0,charges
0,16884.924
1,1725.5523
2,4449.462
3,21984.47061
4,3866.8552


In [33]:
y.shape

(1338, 1)

In [35]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.30,random_state=0)

In [37]:
def find_best_model_lr() -> dict:
    print("------------Findings of Multiple LR---------------------------")
    best_combo = dict()
    
    #There are no hyper parameters to be set. We will just get the r_score
    regressor = LinearRegression()
    regressor.fit(x_train, y_train)
    y_predict = regressor.predict(x_test)
    r_score = r2_score(y_test, y_predict)
    best_combo["regressor"] = regressor
    best_combo["r_score"] = r_score
    print(f'r_score={best_combo["r_score"]}')

    return best_combo
    

In [39]:
def find_best_model_svr() -> dict:
    param_dict = {
        "C": [10,100,500,1000,2000,3000],
        "kernel": ["linear", "poly", "rbf", "sigmoid"]
    }

    print("------------Findings of SVR---------------------------")
    #print csv header
    print("C,kernel,r_score")
    
    best_combo = find_best_hyperparameters(
        param_dict,
        x_train, y_train, x_test, y_test,
        create_regressor_callback=lambda combo: make_pipeline(StandardScaler(), SVR(**combo)),
        print_combo_callback=lambda combo: print(f'{combo["C"]},{combo["kernel"]},{combo["r_score"]}')
    )
    
    print("\nBest combination:")
    print(f'C={best_combo["C"]}, kernel={best_combo["kernel"]}, r_score={best_combo["r_score"]}')

    return best_combo



In [41]:
def find_best_model_dt() -> dict:
    param_dict = {
        "criterion": ["squared_error", "friedman_mse", "absolute_error", "poisson"],
        "splitter" : ["best","random"],
        "max_features": ["sqrt", "log2",None]
    }

    print("------------Findings of DecisionTree---------------------------")

    #print csv header
    print("criterion,splitter,max_features,r_score")
    
    best_combo = find_best_hyperparameters(
        param_dict,
        x_train, y_train, x_test, y_test,
        create_regressor_callback=lambda combo: DecisionTreeRegressor(**combo),
        print_combo_callback=lambda combo: print(f'{combo["criterion"]},{combo["splitter"]},{combo["max_features"]},{combo["r_score"]}')
    )
    
    print("\nBest combination:")
    print(f'criterion={best_combo["criterion"]}, splitter={best_combo["splitter"]}, max_features={best_combo["max_features"]},r_score={best_combo["r_score"]}')

    return best_combo

    

In [43]:
def find_best_model_rf() -> dict:
    param_dict = {
        "n_estimators": [50,100,200,500],
        "max_depth" : [None,10,20,30],
        "min_samples_split": [2,4,8],
        "max_features": [1.0,"sqrt", "log2",None],
        "random_state": [0]
    }
    print("------------Findings of RandomForest---------------------------")

    #print csv header
    print("n_estimators,max_depth,min_samples_split,max_features,random_state,r_score")
    
    best_combo = find_best_hyperparameters(
        param_dict,
        x_train, y_train, x_test, y_test,
        create_regressor_callback=lambda combo: RandomForestRegressor(**combo),
        print_combo_callback=lambda combo: print(f'{combo["n_estimators"]},{combo["max_depth"]},{combo["min_samples_split"]},{combo["max_features"]},{combo["random_state"]},{combo["r_score"]}')
    )
    
    print("\nBest combination:")
    print(f'n_estimators={best_combo["n_estimators"]}, max_depth={best_combo["max_depth"]}, min_samples_split={best_combo["min_samples_split"]},max_features={best_combo["max_features"]},random_state={best_combo["random_state"]},r_score={best_combo["r_score"]}')

    return best_combo


In [45]:
lr_result = find_best_model_lr()

------------Findings of Multiple LR---------------------------
r_score=0.7894790349867009


In [47]:
svr_result = find_best_model_svr()

------------Findings of SVR---------------------------
C,kernel,r_score
10,linear,0.462468414233968
10,poly,0.038716222760231456
10,rbf,-0.0322732939067103
10,sigmoid,0.03930714378274347
100,linear,0.6288792857320367
100,poly,0.6179569624059797
100,rbf,0.3200317832050832
100,sigmoid,0.5276103546510407
500,linear,0.763105797597539
500,poly,0.8263683541268981
500,rbf,0.6642984611986598
500,sigmoid,0.4446061033869473
1000,linear,0.7649311738597033
1000,poly,0.856648767594656
1000,rbf,0.8102064874808204
1000,sigmoid,0.2874706948697654
2000,linear,0.7440418308108018
2000,poly,0.8605579258597715
2000,rbf,0.8547766422240716
2000,sigmoid,-0.5939509731283503
3000,linear,0.7414236599249162
3000,poly,0.8598930084494385
3000,rbf,0.8663393963090398
3000,sigmoid,-2.1244194786689863

Best combination:
C=3000, kernel=rbf, r_score=0.8663393963090398


In [49]:
dt_result = find_best_model_dt()

------------Findings of DecisionTree---------------------------
criterion,splitter,max_features,r_score
squared_error,best,sqrt,0.7380769261109636
squared_error,best,log2,0.6900374461444749
squared_error,best,None,0.6797264926054288
squared_error,random,sqrt,0.707195125235043
squared_error,random,log2,0.6183761434872016
squared_error,random,None,0.7063972872711168
friedman_mse,best,sqrt,0.7675119985836285
friedman_mse,best,log2,0.7256905156181405
friedman_mse,best,None,0.7075954227130331
friedman_mse,random,sqrt,0.7102030341728325
friedman_mse,random,log2,0.6577815196542737
friedman_mse,random,None,0.7413075807638254
absolute_error,best,sqrt,0.48580312095134304
absolute_error,best,log2,0.739105508561963
absolute_error,best,None,0.6513855705079554
absolute_error,random,sqrt,0.7797752846762114
absolute_error,random,log2,0.7019856146291856
absolute_error,random,None,0.7454631015676909
poisson,best,sqrt,0.7423953161312683
poisson,best,log2,0.7253411450784442
poisson,best,None,0.73579272226

In [51]:
rf_result = find_best_model_rf()

------------Findings of RandomForest---------------------------
n_estimators,max_depth,min_samples_split,max_features,random_state,r_score
50,None,2,1.0,0,0.8495860472309916
50,None,2,sqrt,0,0.8699196004695238
50,None,2,log2,0,0.8699196004695238
50,None,2,None,0,0.8495860472309916
50,None,4,1.0,0,0.859394781190592
50,None,4,sqrt,0,0.8716951777191232
50,None,4,log2,0,0.8716951777191232
50,None,4,None,0,0.859394781190592
50,None,8,1.0,0,0.8703788087689401
50,None,8,sqrt,0,0.8770051502116674
50,None,8,log2,0,0.8770051502116674
50,None,8,None,0,0.8703788087689401
50,10,2,1.0,0,0.8569187146682384
50,10,2,sqrt,0,0.875894126475048
50,10,2,log2,0,0.875894126475048
50,10,2,None,0,0.8569187146682384
50,10,4,1.0,0,0.8641791908647469
50,10,4,sqrt,0,0.879150481572974
50,10,4,log2,0,0.879150481572974
50,10,4,None,0,0.8641791908647469
50,10,8,1.0,0,0.8713934379341124
50,10,8,sqrt,0,0.8797319437278242
50,10,8,log2,0,0.8797319437278242
50,10,8,None,0,0.8713934379341124
50,20,2,1.0,0,0.8499024837812883


In [57]:
#find the best model among the best model from each algorithm
list_of_dicts = [lr_result,svr_result,dt_result,rf_result]
max_dict = max(list_of_dicts, key=lambda x: x["r_score"])
print(max_dict)



{'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 8, 'max_features': 'sqrt', 'random_state': 0, 'r_score': 0.8844928536280982, 'regressor': RandomForestRegressor(max_depth=10, max_features='sqrt', min_samples_split=8,
                      n_estimators=200, random_state=0)}


In [67]:
#save the best model
best_regressor = max_dict["regressor"]
finalized_model = "insurance_model.sav"
pickle.dump(best_regressor,open(finalized_model,"wb"))
print(f"model {finalized_model} saved successfully")

model insurance_model.sav saved successfully
