In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_predict
import seaborn as sns
!pip install seaborn
import matplotlib.pyplot as plt
import math


In [15]:
# A. Data Load
# Load the dataset as CSV file 
data = pd.read_csv('insurance.csv')
data.shape


(1338, 7)

In [16]:
# B. Exploratory Data Analysis

# Information of data 
data.info()

# Statistical summary data
print(data.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 57.6+ KB
               age          bmi     children       charges
count  1338.000000  1338.000000  1338.000000   1338.000000
mean     39.207025    30.663397     1.094918  13270.422265
std      14.049960     6.098187     1.205493  12110.011237
min      18.000000    15.960000     0.000000   1121.873900
25%      27.000000    26.296250     0.000000   4740.287150
50%      39.000000    30.400000     1.000000   9382.033000
75%      51.000000    34.693750     2.000000  16639.912515
max     

In [19]:
# Correlation matrix
correlation_matrix = data.corr()
print("Correlation Matrix:")
print(correlation_matrix)


  correlation_matrix = data.corr()


Correlation Matrix:
               age       bmi  children   charges
age       1.000000  0.109272  0.042469  0.299008
bmi       0.109272  1.000000  0.012759  0.198341
children  0.042469  0.012759  1.000000  0.067998
charges   0.299008  0.198341  0.067998  1.000000


In [59]:
# Other graphs (pairplot for numeric values)
sns.pairplot(data)
plt.show()

In [None]:
# Other graphs (scatter plots, histograms, etc.)
data['column'].hist(bins=20, color='red', edgecolor='black')
plt.xlabel('Smokers')
plt.ylabel('Region')
plt.title('Histogram of number smokers in each region')
plt.show()


# Relationships
sns.jointplot(x=data['age'],y=data['charges'])
sns.jointplot(x = data['smoker'],y = data['charges'])
sns.jointplot(x=data['region'],y=data['charges'])


In [None]:
# Missing for outliers(Age, Children, BMI...)
sns.boxplot(x=data['age'])
plt.show()

sns.boxplot(x=data['bmi'])
plt.show()

sns.boxplot(x=data['charges'])
plt.show()


In [29]:
# C. Data Preparation
#candidate checked only with isnull but better add sum to check number of nulls
print(data.isnull().sum())

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64


In [21]:
# Checking for missing values
data = data.dropna()
data.shape

(1338, 7)

In [31]:
# Categorical values encoding
data['sex'] = data['sex'].apply({'male':0, 'female':1}.get)
data['smoker'] = data['smoker'].apply({'yes':1, 'no':0}.get)
data['region'] = data['region'].apply({'southwest':1, 'southeast':2, 'northwest':3, 
'northeast':4}.get)

In [33]:
# Split dataset into training and test sets; isolate the test set
X = data[['age', 'bmi', 'children', 'smoker', 'region']]
y = data['charges']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [34]:
# Standart Scaling
scaler = StandardScaler()
data[['age', 'bmi', 'children']] = scaler.fit_transform(data[['age', 'bmi', 'children']])

In [55]:
# D. Model training

# Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Lasso Regression
lasso = Lasso(alpha=0.1)  # alpha is the regularization strength
lasso.fit(X_train, y_train)

#Elastic net 
elastic = ElasticNet(alpha=0.1, l1_ratio=0.5)  # alpha is the regularization strength, l1_ratio is the mix parameter
elastic.fit(X_train, y_train)


In [58]:
# E. Model evaluation
# Predict on the test set
rf_preds = rf_model.predict(X_test)
lr_preds = lr_model.predict(X_test)
lasso_preds = lasso.predict(X_test)
elastic_preds = elastic.predict(X_test)

# Regression metrics
print("Random Forest Metrics:")
print("MAE:", mean_absolute_error(y_test, rf_preds))
print("MSE:", mean_squared_error(y_test, rf_preds))
print("RMSE:", math.sqrt(mean_squared_error(y_test, rf_preds)))
print("R-squared:", r2_score(y_test, rf_preds))


print("Linear Regression Metrics:")
print("MAE:", mean_absolute_error(y_test, lr_preds))
print("MSE:", mean_squared_error(y_test, lr_preds))
print("RMSE:", math.sqrt(mean_squared_error(y_test, lr_preds)))
print("R-squared:", r2_score(y_test, lr_preds))

print("Lasso Metrics:")
print("MAE:", mean_absolute_error(y_test, lasso_preds))
print("MSE:", mean_squared_error(y_test, lasso_preds))
print("RMSE:", math.sqrt(mean_squared_error(y_test, lasso_preds)))
print("R-squared:", r2_score(y_test, lasso_preds))

print("Elastic net Metrics:")
print("MAE:", mean_absolute_error(y_test, elastic_preds))
print("MSE:", mean_squared_error(y_test, elastic_preds))
print("RMSE:", math.sqrt(mean_squared_error(y_test, elastic_preds)))
print("R-squared:", r2_score(y_test, elastic_preds))


# Confusion matrix is not applicable for regression tasks


Random Forest Metrics:
MAE: 2596.86941348332
MSE: 20996746.452338
RMSE: 4582.220690051713
R-squared: 0.8593471564218195
Linear Regression Metrics:
MAE: 4058.7743990269228
MSE: 35426968.29311936
RMSE: 5952.055803931896
R-squared: 0.7626820973862629
Lasso Metrics:
MAE: 4058.8130496205385
MSE: 35427153.29471496
RMSE: 5952.071344894562
R-squared: 0.7626808580989979
Elastic net Metrics:
MAE: 4606.221698631367
MSE: 41111053.465440676
RMSE: 6411.79019193865
R-squared: 0.7246055913693727
