In [1]:
import pandas as pd
import numpy as np
import pickle 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [2]:
# Read the insurance records from the CSV file in the working directory.
datasets = pd.read_csv(filepath_or_buffer="insurance_cost_prediction_dataset.csv")

In [3]:
# Display the loaded frame to inspect its columns and a sample of rows.
datasets

Unnamed: 0,age,sex,bmi,children,smoker,region,medical_history_score,annual_income,exercise_frequency,charges
0,56,male,25.2,1,no,southeast,1,940686,3,45846.46
1,46,female,37.4,2,no,northeast,5,668731,5,49306.03
2,32,female,20.8,4,yes,northwest,0,984752,0,64036.32
3,60,female,17.0,4,no,northeast,4,613532,5,36274.49
4,25,male,30.2,3,yes,northwest,6,357255,5,53202.85
...,...,...,...,...,...,...,...,...,...,...
995,22,male,23.4,1,no,southwest,5,566047,5,34050.65
996,40,male,15.3,1,no,southwest,1,718790,5,30612.95
997,27,female,26.6,2,no,northeast,5,349725,3,30001.14
998,61,male,33.5,4,yes,northeast,3,827867,1,76191.48


In [4]:
# Create a LabelEncoder, which maps each distinct label to an integer code.
le = LabelEncoder()

In [5]:
# Encode each text column with its own LabelEncoder and keep every fitted
# encoder. Refitting one shared encoder per column would retain only the
# mapping of the last column, leaving no way to encode new inputs or decode
# the "sex" and "smoker" codes later.
encoders = {}
for column in ("sex", "smoker", "region"):
    encoders[column] = LabelEncoder()
    datasets[column] = encoders[column].fit_transform(datasets[column])

# Keep `le` bound to the encoder fitted on "region", as it was before.
le = encoders["region"]

In [6]:
# Features: every column except the target. Target: the "charges" column.
X = datasets.drop(columns=["charges"])
y = datasets.loc[:, "charges"]

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [8]:
# (rows, columns) of the training features.
X_train.shape

(800, 9)

In [9]:
# (rows, columns) of the held-out test features.
X_test.shape

(200, 9)

In [10]:
# Shape of the training target; its length should match the X_train row count.
y_train.shape

(800,)

In [11]:
# Shape of the test target; its length should match the X_test row count.
y_test.shape

(200,)

In [12]:
# Random forest of 200 trees with no depth limit; the seed fixes the
# randomness so repeated runs build the same forest.
model = RandomForestRegressor(random_state=42, n_estimators=200, max_depth=None)

In [13]:
# Display the estimator and its hyperparameters.
model

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
# Train the forest on the training split only; fit() returns the estimator,
# which the notebook renders as this cell's output.
model.fit(X=X_train, y=y_train)

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [15]:
# Predict charges for the held-out test rows.
y_pred = model.predict(X=X_test)

In [16]:
# Display the predicted charges for the test rows.
y_pred

array([44773.23465, 68111.59225, 43252.1564 , 27320.3668 , 40235.4186 ,
       33864.9298 , 42764.13045, 31623.99225, 46246.29105, 46941.58185,
       36715.52065, 23411.15165, 38386.2392 , 38954.96955, 31127.59445,
       40804.493  , 31983.90405, 47373.0583 , 64133.4917 , 34606.93115,
       74921.67485, 65254.8296 , 50093.2771 , 32545.587  , 44278.8455 ,
       45437.70705, 37367.85405, 35911.6061 , 70586.13685, 53095.7847 ,
       46798.8475 , 70645.81405, 39443.6593 , 48345.1794 , 61321.27665,
       45116.8639 , 42311.9377 , 31534.1067 , 57643.7577 , 36746.79255,
       37693.3681 , 45589.01275, 27163.78935, 49468.4354 , 47458.81075,
       39832.08995, 47228.87925, 45248.5667 , 75624.6515 , 70337.4225 ,
       37168.77455, 45795.5097 , 26046.23295, 45504.72185, 41547.55715,
       49928.38545, 43492.785  , 35280.12925, 58167.57915, 38219.82065,
       68065.94135, 35346.2621 , 62617.1548 , 41185.28575, 51455.0415 ,
       48465.4966 , 39194.6633 , 29003.88245, 44581.4174 , 44384

In [17]:
# score() on a regressor returns R^2; comparing the training and test values
# shows how much the forest overfits.
train_r2 = model.score(X_train, y_train)
test_r2 = model.score(X_test, y_test)
print("Train Score :", train_r2)
print("Test Score  :", test_r2)


Train Score : 0.9877999697214759
Test Score  : 0.9181192337688866


In [18]:
# R^2 of the test predictions against the true test charges.
r_score = r2_score(y_true=y_test, y_pred=y_pred)

In [19]:
# Display the test-set R^2 computed from y_test and y_pred.
r_score

0.9181192337688866

In [20]:
# Output path (relative to the working directory) for the pickled model.
filename = "Final_model_insurance.pkl"

In [21]:
# Serialize the fitted model to `filename`. The context manager closes the
# file handle (flushing the bytes to disk) even if pickling raises.
# Note: only unpickle this file from a trusted source.
with open(filename, "wb") as model_file:
    pickle.dump(model, model_file)

In [22]:
# Print a confirmation message after the save cell.
print("Model Saved Successfully!")

Model Saved Successfully!
