In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the insurance dataset from a CSV file located in the working directory.
df = pd.read_csv(filepath_or_buffer='medical_insurance_expenditure_prediction.csv')

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(1338, 7)

In [5]:
# Per-column non-null counts and dtypes; a quick check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [6]:
# Column labels available in the raw frame.
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [7]:
# Column dtypes; object columns are the categorical features that need encoding.
df.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [9]:
# Feature matrix: every column except the prediction target.
X = df.drop(columns=['charges'])

In [10]:
# Target vector: the insurance charges column.
Y = df.loc[:, 'charges']

In [11]:
# Features and target must have the same number of rows.
(X.shape, Y.shape)

((1338, 6), (1338,))

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=7,
)

In [14]:
# Confirm the sizes of the four pieces produced by the split.
(
    X_train.shape,
    X_test.shape,
    Y_train.shape,
    Y_test.shape,
)

((936, 6), (402, 6), (936,), (402,))

In [16]:
# Dtypes of the training features before encoding.
X_train.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
dtype: object

In [15]:
# Show only the object-typed (categorical) training columns.
X_train.select_dtypes(include='object')

Unnamed: 0,sex,smoker,region
934,male,no,southeast
530,male,yes,southeast
518,female,no,southwest
619,female,no,southwest
691,male,no,southwest
...,...,...,...
211,male,no,northwest
502,male,yes,southeast
537,female,no,southwest
1220,female,no,northeast


In [17]:
# One-hot encode the categorical training columns; numeric columns pass through unchanged.
X_train_ohe = pd.get_dummies(data=X_train)

In [18]:
# Compare the column count before and after one-hot encoding.
(X_train.shape, X_train_ohe.shape)

((936, 6), (936, 11))

In [19]:
# Column labels of the encoded training frame, including the generated dummy columns.
X_train_ohe.columns

Index(['age', 'bmi', 'children', 'sex_female', 'sex_male', 'smoker_no',
       'smoker_yes', 'region_northeast', 'region_northwest',
       'region_southeast', 'region_southwest'],
      dtype='object')

In [21]:
# Dtypes of the encoded training frame.
X_train_ohe.dtypes

age                   int64
bmi                 float64
children              int64
sex_female             bool
sex_male               bool
smoker_no              bool
smoker_yes             bool
region_northeast       bool
region_northwest       bool
region_southeast       bool
region_southwest       bool
dtype: object

In [20]:
# Dtypes of the test features before encoding.
X_test.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
dtype: object

In [22]:
# Show only the object-typed (categorical) test columns.
X_test.select_dtypes(include='object')

Unnamed: 0,sex,smoker,region
13,female,no,southeast
1052,male,no,northeast
172,male,no,northeast
1085,female,yes,southwest
683,male,no,northwest
...,...,...,...
968,male,no,northeast
1145,male,no,northwest
4,male,no,northwest
697,male,yes,southeast


In [23]:
# One-hot encode the test split, then align it to the training columns.
# Encoding each split independently can produce different columns when a category
# is absent from one of them; reindexing guarantees the exact column layout the
# models are fit on (dummies missing from the test split become False, and
# columns unseen during training are dropped).
X_test_ohe = pd.get_dummies(X_test).reindex(columns=X_train_ohe.columns, fill_value=False)

In [24]:
# Compare the test column count before and after one-hot encoding.
(X_test.shape, X_test_ohe.shape)

((402, 6), (402, 11))

In [25]:
# Inspect the encoded TEST columns (this cell previously re-displayed the training
# frame); they must match the training columns for the fitted models to predict.
X_test_ohe.columns

Index(['age', 'bmi', 'children', 'sex_female', 'sex_male', 'smoker_no',
       'smoker_yes', 'region_northeast', 'region_northwest',
       'region_southeast', 'region_southwest'],
      dtype='object')

In [26]:
# Inspect the dtypes of the encoded TEST frame (this cell previously re-displayed
# the training frame).
X_test_ohe.dtypes

age                   int64
bmi                 float64
children              int64
sex_female             bool
sex_male               bool
smoker_no              bool
smoker_yes             bool
region_northeast       bool
region_northwest       bool
region_southeast       bool
region_southwest       bool
dtype: object

In [43]:
# Preview the first five rows of the encoded training frame.
X_train_ohe.head(n=5)

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
934,32,37.18,2,False,True,True,False,False,False,True,False
530,57,42.13,1,False,True,False,True,False,False,True,False
518,35,31.0,1,True,False,True,False,False,False,False,True
619,55,37.1,0,True,False,True,False,False,False,False,True
691,47,36.2,1,False,True,True,False,False,False,False,True


In [29]:
# Count missing values per training column.
X_train.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
dtype: int64

In [31]:
# Labels of the non-object (numeric) training columns.
numeric_cols = X_train.select_dtypes(exclude=['object']).columns

In [32]:
# Display the numeric column labels selected above.
numeric_cols

Index(['age', 'bmi', 'children'], dtype='object')

In [34]:
# Per-column medians of the numeric features, computed on the training split only.
training_median_values = X_train[numeric_cols].median()

In [35]:
# Impute missing numeric values with medians learned from the training split only.
# The result is assigned rather than applied with inplace=True so the cell is safe
# to re-run. The same training medians are also applied to the test split and to
# the one-hot encoded frames, because those encoded frames are what the models
# are fit and evaluated on; filling only X_train would never reach them.
X_train = X_train.fillna(training_median_values)
X_test = X_test.fillna(training_median_values)
X_train_ohe = X_train_ohe.fillna(training_median_values)
X_test_ohe = X_test_ohe.fillna(training_median_values)

# Linear Regression

In [58]:
from sklearn.linear_model import LinearRegression

In [57]:
from sklearn.metrics import mean_squared_error

In [39]:
# Baseline model: linear regression with default settings.
model = LinearRegression()

In [44]:
# Fit the baseline on the encoded training features.
model.fit(X=X_train_ohe, y=Y_train)

In [46]:
# Predict charges for the held-out rows.
Y_pred = model.predict(X=X_test_ohe)

In [47]:
# Mean squared error of the baseline on the held-out split.
mse = mean_squared_error(y_true=Y_test, y_pred=Y_pred)

In [48]:
# Report the linear regression test MSE.
print('mean squared error : ', mse)

mean squared error :  36313025.727236286


# Lasso Regression

In [49]:
from sklearn.linear_model import Lasso

In [50]:
# L1-regularised linear model with default hyperparameters.
lasso_model = Lasso()

In [52]:
# Fit the Lasso model on the encoded training features.
lasso_model.fit(X=X_train_ohe, y=Y_train)

In [54]:
# Lasso predictions for the held-out rows.
lasso_Y_pred = lasso_model.predict(X=X_test_ohe)

In [55]:
# Mean squared error of the Lasso model on the held-out split.
lasso_mse = mean_squared_error(y_true=Y_test, y_pred=lasso_Y_pred)

In [56]:
# Report the Lasso test MSE.
print('mean squared error : ', lasso_mse)

mean squared error :  36314506.46120058


# Ridge Regression

In [59]:
from sklearn.linear_model import Ridge

In [61]:
# L2-regularised linear model with default hyperparameters.
ridge_model = Ridge()

In [62]:
# Fit the Ridge model on the encoded training features.
ridge_model.fit(X=X_train_ohe, y=Y_train)

In [63]:
# Ridge predictions for the held-out rows.
ridge_Y_pred = ridge_model.predict(X=X_test_ohe)

In [64]:
# Mean squared error of the Ridge model on the held-out split.
ridge_mse = mean_squared_error(y_true=Y_test, y_pred=ridge_Y_pred)

In [65]:
# Report the Ridge test MSE.
print('mean squared error : ', ridge_mse)

mean squared error :  36328890.96067001


In [66]:
# Side-by-side test MSE for the three models (lower is better).
for mse_label, mse_value in (
    ("Linear Regression MSE:", mse),
    ("Lasso Regression MSE:", lasso_mse),
    ("Ridge Regression MSE:", ridge_mse),
):
    print(mse_label, mse_value)

Linear Regression MSE: 36313025.727236286
Lasso Regression MSE: 36314506.46120058
Ridge Regression MSE: 36328890.96067001


In [67]:
from sklearn.metrics import r2_score

In [68]:
# R-squared of the linear regression on the held-out split.
linear_r2 = r2_score(y_true=Y_test, y_pred=Y_pred)
print('Linear Regression R-squared : ', linear_r2)

Linear Regression R-squared :  0.7516810217281062


In [69]:
# R-squared of the Lasso model on the held-out split.
lasso_r2 = r2_score(y_true=Y_test, y_pred=lasso_Y_pred)
print('Lasso Regression R-squared : ', lasso_r2)

Lasso Regression R-squared :  0.7516708960407601


In [70]:
# R-squared of the Ridge model on the held-out split.
ridge_r2 = r2_score(y_true=Y_test, y_pred=ridge_Y_pred)
print('Ridge Regression R-squared : ', ridge_r2)

Ridge Regression R-squared :  0.7515725306707677


In [71]:
# Side-by-side test R-squared for the three models (higher is better).
for r2_label, r2_value in (
    ('Linear Regression R-squared : ', linear_r2),
    ('Lasso Regression R-squared : ', lasso_r2),
    ('Ridge Regression R-squared : ', ridge_r2),
):
    print(r2_label, r2_value)

Linear Regression R-squared :  0.7516810217281062
Lasso Regression R-squared :  0.7516708960407601
Ridge Regression R-squared :  0.7515725306707677
