In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt  
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler,RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

In [5]:
df=pd.read_csv(r"E:\InsurancePrediction\notebooks\data\insurance.csv")

In [6]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [7]:
input_df = df.drop(columns='expenses')
output_df=df["expenses"]

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
#splitting the dataset into train and test data
input_train,input_test,target_train,target_test=train_test_split(input_df,output_df,test_size=0.2,random_state=19)

In [10]:
numeric_features = [feature for feature in df.columns if df[feature].dtype != 'O']
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']

print('We have {} numerical features : {}'.format(len(numeric_features), numeric_features))
print('\nWe have {} categorical features : {}'.format(len(categorical_features), categorical_features))

We have 4 numerical features : ['age', 'bmi', 'children', 'expenses']

We have 3 categorical features : ['sex', 'smoker', 'region']


In [11]:
numeric_features =['age', 'bmi']
categorical_features =['sex', 'smoker', 'region']

In [12]:
transformer = ColumnTransformer(
                transformers=[
                    ('trf1',OneHotEncoder(sparse=False,handle_unknown='ignore'),categorical_features),
                    ('trf2',StandardScaler(),numeric_features),
                            ],remainder='passthrough'
                        ) 

In [13]:
transformer

In [14]:
train_array = np.c_[transformer.fit_transform(input_train), target_train]
test_array = np.c_[transformer.transform(input_test), target_test]




In [15]:
train_array.shape

(1070, 12)

In [16]:
models = {     
    "Random Forest": RandomForestRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Linear Regression": LinearRegression(),
    "XGBRegressor": XGBRegressor(),
            }

In [17]:
params={
    "Decision Tree": {
        # 'splitter':['best','random'],
        'max_features':['sqrt','log2'],
    },
    "Random Forest":{
        'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],

        'max_features':['sqrt','log2',None],
        'n_estimators': [8,16,32,64,128,256]
    },
    "Linear Regression":{},
    
    "XGBRegressor":{
        'learning_rate':[.1,.01,.05,.001],
        'n_estimators': [8,16,32,64,128,256]
    } 
}

In [18]:
X_train,y_train,X_test,y_test=(
                train_array[:,:-1],
                train_array[:,-1],
                test_array[:,:-1],
                test_array[:,-1]
            )

In [19]:
from sklearn.model_selection import GridSearchCV
def get_best_model(X_train,y_train,X_test,y_test,model_dict,params_,base_accuracy=0.7):
    models = model_dict.keys()
    for model in models:
        print(f'Initial base accuracy is {base_accuracy}')
        print(20*'*',"Estimator: ",model,20*'*')
        gs = GridSearchCV(estimator=model_dict[model],param_grid=params_[model],cv=3)
        gs.fit(X_train,y_train)
        print(f'Best parameters are :{gs.best_params_} With R2 score:{gs.best_score_}')
        
        regressor = model_dict[model].set_params(**gs.best_params_)
        regressor.fit(X_train,y_train)

        y_train_pred = regressor.predict(X_train)
        y_test_pred = regressor.predict(X_test)
        
        train_model_score = r2_score(y_train, y_train_pred)

        test_model_score = r2_score(y_test, y_test_pred)
        
        if test_model_score >= base_accuracy:
            print('Acceptabel model found!')
            base_accuracy = test_model_score
            best_model = {model:regressor}
        else:
            print('Model rejected\n',30*'__')
            
    
    print(f"Best model is:{list(best_model.keys())} with R2 Score: {round(test_model_score,2)}")        
    return best_model

In [20]:
get_best_model(X_train,y_train,X_test,y_test,model_dict=models,params_=params)

Initial base accuracy is 0.7
******************** Estimator:  Random Forest ********************


Best parameters are :{'criterion': 'squared_error', 'max_features': None, 'n_estimators': 128} With R2 score:0.8333613594033826
Acceptabel model found!
Initial base accuracy is 0.8637687406214307
******************** Estimator:  Decision Tree ********************
Best parameters are :{'max_features': 'sqrt'} With R2 score:0.6865440824566105
Model rejected
 ____________________________________________________________
Initial base accuracy is 0.8637687406214307
******************** Estimator:  Linear Regression ********************
Best parameters are :{} With R2 score:0.7444272069573613
Model rejected
 ____________________________________________________________
Initial base accuracy is 0.8637687406214307
******************** Estimator:  XGBRegressor ********************
Best parameters are :{'learning_rate': 0.05, 'n_estimators': 64} With R2 score:0.8349934319786455
Acceptabel model found!
Best model is:['XGBRegressor'] with R2 Score: 0.87


{'XGBRegressor': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.05, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=64, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)}

### Conclusion: We can find 'XGBRegressor is best model with r2 score :87%

In [21]:
model=XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
            colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
            early_stopping_rounds=None, enable_categorical=False,
            eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
            importance_type=None, interaction_constraints='',
            learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
            max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
            monotone_constraints='()', n_estimators=32, n_jobs=0,
            num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
            reg_lambda=1)

In [26]:
model.fit(X_train, y_train)

Parameters: { "predictor" } are not used.



In [23]:
y_pred=model.predict(X_test)

In [24]:
y_pred

array([33696.535 ,  5679.9473, 11939.843 ,  8832.17  ,  6460.8223,
       11897.357 , 44499.504 ,  5793.459 ,  2738.5825,  3812.117 ,
        6497.2793, 10378.021 ,  2602.7568, 16547.346 , 45761.78  ,
        8396.309 , 12977.093 ,  2747.0798,  9007.516 , 36992.746 ,
        8660.955 ,  5679.9473, 11825.154 , 15201.771 ,  8217.302 ,
       11535.888 ,  3884.7502, 43859.094 ,  6779.321 , 45460.99  ,
        9026.699 , 13408.273 , 13122.717 ,  6695.5645,  2995.7188,
       13133.109 , 13747.032 ,  5483.433 ,  6720.2285,  6117.2144,
       43511.625 , 39127.324 , 13315.57  , 19396.393 , 44277.3   ,
        7282.763 , 43956.72  , 13955.638 ,  4236.815 ,  8282.549 ,
       26913.44  , 27388.293 , 12347.5205, 10103.89  ,  5876.775 ,
        6407.917 , 13012.818 , 11787.423 , 10210.89  , 17669.26  ,
        4639.149 , 29863.533 ,  6358.5317,  2875.418 , 10202.172 ,
       17945.898 , 11856.545 , 10112.464 ,  6276.0864,  7264.112 ,
       26913.44  ,  4185.8833,  6641.3965, 22813.877 , 19747.3

In [25]:
print(r2_score(y_test,y_pred))

0.8717978601393725


In [29]:
y_pred=model.predict(X_test[:1])
y_pred[0]

33696.535