In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import StandardScaler, OneHotEncoder 
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error,accuracy_score

In [2]:
df = pd.read_csv('Final_Insurance_data_processed_imputed.csv')
df.head()

Unnamed: 0,age,bmi,children,charges,sex,smoker,region
0,19.0,27.9,0.0,16884.924,F,yes,southwest
1,18.0,33.77,1.0,1725.5523,M,no,southeast
2,28.0,33.0,3.0,13424.206937,M,no,southeast
3,33.0,22.705,0.0,13424.206937,M,no,northwest
4,32.0,28.88,0.0,13424.206937,M,no,northwest


In [3]:
x = df.drop(['charges'], axis=1)
y = df['charges']

In [4]:
x.head(1)

Unnamed: 0,age,bmi,children,sex,smoker,region
0,19.0,27.9,0.0,F,yes,southwest


In [5]:
print(type(x),'\n', x.shape)

<class 'pandas.core.frame.DataFrame'> 
 (1326, 6)


In [6]:
print(type(y),'\n', y.shape)

<class 'pandas.core.series.Series'> 
 (1326,)


In [7]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [8]:
x_train.shape

(928, 6)

In [9]:
x_test.shape

(398, 6)

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [11]:
df.columns

Index(['age', 'bmi', 'children', 'charges', 'sex', 'smoker', 'region'], dtype='object')

In [12]:
num_features = ['age', 'bmi']
cat_features = ['sex', 'smoker', 'region']

In [13]:
preprocessor = ColumnTransformer([
    ('scaler', StandardScaler(),num_features),
    ('encoding', OneHotEncoder(drop='first'), cat_features)
])

In [14]:
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('grad_boosting_model', GradientBoostingRegressor())
])

In [15]:
param_grid = {'grad_boosting_model__n_estimators' : [20,44,50,60],
             'grad_boosting_model__learning_rate' : [0.1,0.01,0.5]}
             # 'grad_boosting_model__max_depth':[2,3,5]}
             # 'grad_boosting_model__min_samples_split':[2,4,6],
             # 'grad_boosting_model__min_samples_leaf':[1,2,4,6]}
             #'grad_boosting_model__max_depth' : [3,4,5]}

In [16]:
grid_search_gb = GridSearchCV(pipe,param_grid,cv=5,scoring="r2")

In [17]:
grid_search_gb.fit(x_train, y_train)

In [18]:
print("best parameters are : ", grid_search_gb.best_params_ )

best parameters are :  {'grad_boosting_model__learning_rate': 0.1, 'grad_boosting_model__n_estimators': 50}


In [19]:
best_model_gb = grid_search_gb.best_estimator_
best_model_gb

In [20]:
best_model_gb.score(x_test,y_test)

0.561147895654881

In [21]:
y_pred_gb = best_model_gb.predict(x_test)

In [22]:
rmse = np.sqrt(mean_squared_error(y_test,y_pred_gb))
rmse

np.float64(6788.494456436838)

In [23]:
best_model_gb.score(x_train,y_train)

0.7149738265704253

In [24]:
best_model_gb.score(x_test,y_test)

0.561147895654881

# Linear Regression model

In [25]:
# Pipeline: Linear Regression
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('linear_model', LinearRegression())  # Ridge regression model
])


In [26]:
pipe.fit(x_train,y_train)

In [27]:
y_pred_lr = pipe.predict(x_test)

In [28]:
rmse = np.sqrt(mean_squared_error(y_test,y_pred_lr))
rmse

np.float64(7552.5365889893865)

## Polynomial Regression with Lasso

In [29]:
from sklearn.preprocessing import PolynomialFeatures

In [30]:
# Pipeline: Polynomial Features -> Linear Regression
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures()),  # Using polynomial degree 2
    ('lasso_model', Lasso())
])


In [31]:
# Hyperparameter grid
param_grid = {
    'poly__degree': [2, 3],  # Degree of polynomial expansion
    'lasso_model__alpha': [0.1, 1, 5]  # Regularization strength for Ridge regression
}

In [32]:
# Grid Search
grid_search = GridSearchCV(pipe, param_grid, cv=5)
grid_search.fit(x_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [33]:
y_pred_lasso = grid_search.predict(x_test)

In [34]:
rmse = np.sqrt(mean_squared_error(y_test,y_pred_lasso))
rmse

np.float64(7280.17366108862)

## RandomForest Regressor

In [35]:
# Pipeline: Polynomial Features -> Linear Regression
pipe = Pipeline([
    ('preprocessor', preprocessor),  # Using polynomial degree 2
    ('random_model', RandomForestRegressor())
])

In [36]:
param_grid = {'random_model__n_estimators':[20,30,50,66],
              'random_model__max_depth':[None,2,5],
              'random_model__min_samples_split':[2,4,6],
             'random_model__min_samples_leaf':[1,2,4,6]}

In [37]:
grid_search = GridSearchCV(pipe,param_grid,cv=5)

In [38]:
grid_search.fit(x_train,y_train)

In [39]:
y_pred_RF = grid_search.predict(x_test)

In [40]:
rmse = np.sqrt(mean_squared_error(y_test,y_pred_RF))
rmse

np.float64(7045.581894244044)

## import model

In [41]:
import pickle
pickle.dump(best_model_gb, open("GBR_model.pkl", "wb"))

In [42]:
new_data=pd.DataFrame({'age':19,'sex':'female','bmi':25.9,'children':3,'smoker':'no','region':'northeast'},index=[0])
new_data

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,25.9,3,no,northeast


In [43]:
new_data['smoker'] = new_data['smoker'].map({'yes':1,'no':0})
new_data

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,25.9,3,0,northeast


In [44]:
new_data=pd.DataFrame({'age':55,'sex':'male','bmi':43.9,'children':4,'smoker':'yes','region':'northeast'},index=[0])
new_data

Unnamed: 0,age,sex,bmi,children,smoker,region
0,55,male,43.9,4,yes,northeast


In [45]:
new_data['smoker'] = new_data['smoker'].map({'yes':1,'no':0})
new_data

Unnamed: 0,age,sex,bmi,children,smoker,region
0,55,male,43.9,4,1,northeast


In [46]:
new_data

Unnamed: 0,age,sex,bmi,children,smoker,region
0,55,male,43.9,4,1,northeast


In [47]:
new_data=pd.DataFrame({'age':55,'sex':'M','bmi':43.9,'children':4,'smoker':'yes','region':'northeast'},index=[0])
new_data

Unnamed: 0,age,sex,bmi,children,smoker,region
0,55,M,43.9,4,yes,northeast


In [48]:
# new_data['smoker'] = new_data['smoker'].map({'yes':1,'no':0})
# new_data

In [49]:
best_model_gb.predict(new_data)

array([41453.42867223])