In [16]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

import ml_utils as mt

In [2]:
# Load the insurance dataset from a CSV in the working directory and preview
# the first rows to eyeball the columns.
df = pd.read_csv('insurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Column dtypes and non-null counts: confirms which columns are numeric vs.
# object (string) and whether any values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [4]:
# Frequency of each distinct value in the categorical `sex` column.
df["sex"].value_counts()

sex
male      676
female    662
Name: count, dtype: int64

In [5]:
# Frequency of each distinct value in the categorical `smoker` column.
df["smoker"].value_counts()

smoker
no     1064
yes     274
Name: count, dtype: int64

In [6]:
# Frequency of each distinct value in the categorical `region` column.
df["region"].value_counts()

region
southeast    364
southwest    325
northwest    325
northeast    324
Name: count, dtype: int64

In [8]:
# Feature groups for preprocessing: numeric columns are listed separately from
# the string-valued (object dtype) columns.
numerical_cols = [
    'age',
    'bmi',
    'children',
]
categorical_cols = [
    'sex',
    'smoker',
    'region',
]

In [9]:
# Separate the regression target (`charges`) from the predictor columns.
y = df['charges']
X = df.drop(columns='charges')

In [10]:
# Column-wise preprocessing: standardize the numeric features and one-hot
# encode the categorical ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        # drop='first' removes one dummy per category to avoid multicollinearity.
        ('cat', OneHotEncoder(drop='first'), categorical_cols),
    ]
)

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Fit the preprocessor on the training split only, then apply the already
# fitted transformer to the test split, so test rows do not influence the
# scaling statistics or the encoder's categories.
# NOTE(review): this rebinds X_train / X_test from the raw frames to the
# transformed output, so the cell is not idempotent -- re-running it without
# first re-running the split cell will presumably fail or double-transform.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [13]:
# Sanity-check the transformed shapes: 3 scaled numeric columns plus the
# one-hot columns left after drop='first' (1 for sex, 1 for smoker, 3 for region).
X_train.shape, X_test.shape

((1070, 8), (268, 8))

In [17]:
# L1-regularized linear regression. alpha=100 is a hand-picked penalty
# strength; adjust alpha for tuning.
lasso = Lasso(alpha=100).fit(X_train, y_train)  # fit() returns the estimator itself
y_lasso_pred = lasso.predict(X_test)

# Score the predictions against the held-out test targets.
mse_lasso, r2_lasso = (
    mean_squared_error(y_test, y_lasso_pred),
    r2_score(y_test, y_lasso_pred),
)

In [18]:
# L2-regularized linear regression. alpha=100 is a hand-picked penalty
# strength; adjust alpha for tuning.
ridge = Ridge(alpha=100).fit(X_train, y_train)  # fit() returns the estimator itself
y_ridge_pred = ridge.predict(X_test)

# Score the predictions against the held-out test targets.
mse_ridge, r2_ridge = (
    mean_squared_error(y_test, y_ridge_pred),
    r2_score(y_test, y_ridge_pred),
)


In [19]:
# Report test-set MSE and R^2 for the two regularized models side by side.
# Both were fitted with the same alpha=100, so this compares the penalties at
# one fixed strength rather than at individually tuned values.
print(f"Lasso Regression - MSE: {mse_lasso}, R2: {r2_lasso}")
print(f"Ridge Regression - MSE: {mse_ridge}, R2: {r2_ridge}")

Lasso Regression - MSE: 34266062.50186862, R2: 0.7792828181421608
Ridge Regression - MSE: 48768747.58083766, R2: 0.6858670140991014


In [21]:
# Unregularized linear regression baseline, fitted on the same preprocessed
# training data as the Lasso and Ridge models.
model = LinearRegression()
model.fit(X_train, y_train)

In [22]:
# Fitted coefficients, one per preprocessed feature column.
# NOTE(review): presumably ordered as the preprocessor emits them (scaled
# numeric columns first, then the one-hot columns) -- confirm with
# preprocessor.get_feature_names_out().
model.coef_

array([ 3.61497541e+03,  2.03622812e+03,  5.16890247e+02, -1.85916916e+01,
        2.36511289e+04, -3.70677326e+02, -6.57864297e+02, -8.09799354e+02])

In [23]:
# Predict charges for the held-out test features with the baseline model.
y_pred = model.predict(X_test)

In [24]:
# Test-set MSE of the unregularized baseline, for comparison with the Lasso
# and Ridge figures printed earlier.
mean_squared_error(y_test, y_pred)

33596915.85136148