In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Dataset: https://www.kaggle.com/datasets/mirichoi0218/insurance



In [5]:
# Read the insurance dataset (path is relative to the working directory)
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="insurance.csv")
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
# Summarize the raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [8]:
# One-hot encode the categorical (object) columns so every feature is numeric.
# drop_first=True omits the first level of each category, and dtype="int"
# yields 0/1 integer indicators instead of booleans.
df = pd.get_dummies(
    df,
    columns=["sex", "smoker", "region"],
    drop_first=True,
    dtype="int",
)

In [9]:
# Re-check the schema after encoding: all columns should now be numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1338 non-null   int64  
 1   bmi               1338 non-null   float64
 2   children          1338 non-null   int64  
 3   charges           1338 non-null   float64
 4   sex_male          1338 non-null   int32  
 5   smoker_yes        1338 non-null   int32  
 6   region_northwest  1338 non-null   int32  
 7   region_southeast  1338 non-null   int32  
 8   region_southwest  1338 non-null   int32  
dtypes: float64(2), int32(5), int64(2)
memory usage: 68.1 KB


In [10]:
# Preview the first rows of the encoded frame.
df.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,1,0,0


In [11]:
# Target: the insurance charges column; features: every remaining column.
y = df["charges"]
x = df.drop(columns=["charges"])

In [12]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [13]:
# Linear Regression Model
# fit() trains the estimator in place and returns it, so `linear_model`
# is simply another name for the fitted `linear_regresion` object.
linear_regresion = LinearRegression()
linear_regresion.fit(x_train, y_train)
linear_model = linear_regresion

# Linear Regression Model Score (R^2 on the held-out test split)
linear_model.score(x_test, y_test)

0.7696118054369011

In [15]:
# Linear Regression Prediction on the held-out features
y_pred = linear_model.predict(x_test)

# Actual vs. predicted charges, side by side; the frame keeps y_test's
# original row index so rows can be traced back to the dataset.
result = y_test.to_frame(name="Actual").assign(Prediction=y_pred)
result.head()

Unnamed: 0,Actual,Prediction
764,9095.06825,9016.367529
887,5272.1758,7019.976349
890,29330.98315,36870.834268
1293,9301.89355,9518.537736
259,33750.2918,26974.030822


In [14]:
# Random Forest Regression Model
# The forest is built from randomized bootstrap samples, so without a fixed
# random_state the fitted model (and the score below) changes on every run.
# Seeding it with the same value used for the train/test split makes the
# notebook reproducible under Restart & Run All.
random_forest = RandomForestRegressor(
    n_estimators=300,
    max_depth=4,
    random_state=42,
)
random_forest_model = random_forest.fit(x_train, y_train)

# Random Forest Model Score (R^2 on the held-out test split)
random_forest_model.score(x_test, y_test)

0.8655777699589466

In [16]:
# Random Forest Prediction on the held-out features
y_pred = random_forest_model.predict(x_test)

# Actual vs. predicted charges, side by side; the frame keeps y_test's
# original row index so rows can be traced back to the dataset.
result = y_test.to_frame(name="Actual").assign(Prediction=y_pred)
result.head()

Unnamed: 0,Actual,Prediction
764,9095.06825,9268.56391
887,5272.1758,5957.362539
890,29330.98315,27102.873773
1293,9301.89355,10211.780217
259,33750.2918,35335.164858
