# Insurance cost prediction using a DecisionTreeRegressor model

**Importing the necessary modules**

In [105]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder


**Loading the CSV and inspecting the first rows, data types, and summary statistics**

In [106]:
# Load the raw insurance dataset; `data` is kept as the untouched source frame.
data = pd.read_csv('insurance.csv')
# Work on an independent copy so later edits to `df` cannot leak back into `data`.
df = data.copy()

print(df.head(10))
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
df.info()
print(df.describe())


   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520
5   31  female  25.740         0     no  southeast   3756.62160
6   46  female  33.440         1     no  southeast   8240.58960
7   37  female  27.740         3     no  northwest   7281.50560
8   37    male  29.830         2     no  northeast   6406.41070
9   60  female  25.840         0     no  northwest  28923.13692
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   flo

**Encoding the categorical columns (`sex`, `smoker`, `region`) as numeric features**

In [107]:
# Binary categoricals -> 0.0/1.0 using an explicit category order
# (smoker: no=0, yes=1; sex: female=0, male=1).
# NOTE: this cell overwrites `df` (encodes columns, drops `region`), so it is
# meant to run once per fresh load of the data.
od_encoder = OrdinalEncoder(categories=[['no', 'yes'], ['female', 'male']])
df[['smoker', 'sex']] = od_encoder.fit_transform(df[['smoker', 'sex']])

# One-hot encode `region`; drop='first' omits the first category's column so
# the remaining indicator columns are not perfectly collinear.
oh_encoder = OneHotEncoder(drop='first', sparse_output=False)
region_encoded = oh_encoder.fit_transform(df[['region']])

# Reuse df's index: pd.concat(axis=1) aligns rows by index label, so a fresh
# RangeIndex here would misalign rows (and introduce NaNs) whenever `df` does
# not carry the default 0..n-1 index.
region_df = pd.DataFrame(
    region_encoded,
    columns=oh_encoder.get_feature_names_out(['region']),
    index=df.index,
)

# Swap the original string column for its one-hot indicator columns.
df = pd.concat([df.drop('region', axis=1), region_df], axis=1)

df.head(10)


Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northwest,region_southeast,region_southwest
0,19,0.0,27.9,0,1.0,16884.924,0.0,0.0,1.0
1,18,1.0,33.77,1,0.0,1725.5523,0.0,1.0,0.0
2,28,1.0,33.0,3,0.0,4449.462,0.0,1.0,0.0
3,33,1.0,22.705,0,0.0,21984.47061,1.0,0.0,0.0
4,32,1.0,28.88,0,0.0,3866.8552,1.0,0.0,0.0
5,31,0.0,25.74,0,0.0,3756.6216,0.0,1.0,0.0
6,46,0.0,33.44,1,0.0,8240.5896,0.0,1.0,0.0
7,37,0.0,27.74,3,0.0,7281.5056,1.0,0.0,0.0
8,37,1.0,29.83,2,0.0,6406.4107,0.0,0.0,0.0
9,60,0.0,25.84,0,0.0,28923.13692,1.0,0.0,0.0


**Creating the feature and target variables, then training and evaluating the model**

In [108]:
# Predictors: every encoded column except the target; target: `charges`.
feature_columns = [
    'age', 'sex', 'bmi', 'children', 'smoker',
    'region_northwest', 'region_southeast', 'region_southwest',
]
x = df[feature_columns]
y = df['charges']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Depth and minimum-sample limits constrain how far the tree can grow.
model = DecisionTreeRegressor(
    criterion='squared_error',
    max_depth=6,
    min_samples_split=20,
    min_samples_leaf=10,
    random_state=42,
)
model.fit(X_train, y_train)

# Score the held-out set, plus the training set for an overfitting comparison.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
r2_train = r2_score(y_train, model.predict(X_train))

print("MSE", mse)
print("Test_r2", r2)
print("Training R²:", r2_train)

MSE 20486710.216187872
Test_r2 0.8680394356892156
Training R²: 0.879078628744025
