In [48]:
# imports
import numpy as np
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
df = pl.read_csv('insurance.csv')
df.head(5)

age,sex,bmi,children,smoker,region,charges
i64,str,f64,i64,str,str,f64
19,"""female""",27.9,0,"""yes""","""southwest""",16884.924
18,"""male""",33.77,1,"""no""","""southeast""",1725.5523
28,"""male""",33.0,3,"""no""","""southeast""",4449.462
33,"""male""",22.705,0,"""no""","""northwest""",21984.47061
32,"""male""",28.88,0,"""no""","""northwest""",3866.8552


In [9]:
df

age,sex,bmi,children,smoker,region,charges
i64,str,f64,i64,str,str,f64
19,"""female""",27.9,0,"""yes""","""southwest""",16884.924
18,"""male""",33.77,1,"""no""","""southeast""",1725.5523
28,"""male""",33.0,3,"""no""","""southeast""",4449.462
33,"""male""",22.705,0,"""no""","""northwest""",21984.47061
32,"""male""",28.88,0,"""no""","""northwest""",3866.8552
…,…,…,…,…,…,…
50,"""male""",30.97,3,"""no""","""northwest""",10600.5483
18,"""female""",31.92,0,"""no""","""northeast""",2205.9808
18,"""female""",36.85,0,"""no""","""southeast""",1629.8335
21,"""female""",25.8,0,"""no""","""southwest""",2007.945


In [20]:
# checking for null values
df.drop_nulls()

age,sex,bmi,children,smoker,region,charges
i64,str,f64,i64,str,str,f64
19,"""female""",27.9,0,"""yes""","""southwest""",16884.924
18,"""male""",33.77,1,"""no""","""southeast""",1725.5523
28,"""male""",33.0,3,"""no""","""southeast""",4449.462
33,"""male""",22.705,0,"""no""","""northwest""",21984.47061
32,"""male""",28.88,0,"""no""","""northwest""",3866.8552
…,…,…,…,…,…,…
50,"""male""",30.97,3,"""no""","""northwest""",10600.5483
18,"""female""",31.92,0,"""no""","""northeast""",2205.9808
18,"""female""",36.85,0,"""no""","""southeast""",1629.8335
21,"""female""",25.8,0,"""no""","""southwest""",2007.945


In [32]:
df = df.rename({"sex": "gender"})

In [33]:
df.filter(df.is_duplicated())

age,gender,bmi,children,smoker,region,charges
i64,str,f64,i64,str,str,f64


In [34]:
df = df.filter(df.is_unique())

In [35]:
df

age,gender,bmi,children,smoker,region,charges
i64,str,f64,i64,str,str,f64
19,"""female""",27.9,0,"""yes""","""southwest""",16884.924
18,"""male""",33.77,1,"""no""","""southeast""",1725.5523
28,"""male""",33.0,3,"""no""","""southeast""",4449.462
33,"""male""",22.705,0,"""no""","""northwest""",21984.47061
32,"""male""",28.88,0,"""no""","""northwest""",3866.8552
…,…,…,…,…,…,…
50,"""male""",30.97,3,"""no""","""northwest""",10600.5483
18,"""female""",31.92,0,"""no""","""northeast""",2205.9808
18,"""female""",36.85,0,"""no""","""southeast""",1629.8335
21,"""female""",25.8,0,"""no""","""southwest""",2007.945


In [39]:
df.is_duplicated().any()

False

In [42]:
le = LabelEncoder()
df = df.with_columns(pl.Series("gender", le.fit_transform(df["gender"].to_list())))
df

age,gender,bmi,children,smoker,region,charges
i64,i64,f64,i64,str,str,f64
19,0,27.9,0,"""yes""","""southwest""",16884.924
18,1,33.77,1,"""no""","""southeast""",1725.5523
28,1,33.0,3,"""no""","""southeast""",4449.462
33,1,22.705,0,"""no""","""northwest""",21984.47061
32,1,28.88,0,"""no""","""northwest""",3866.8552
…,…,…,…,…,…,…
50,1,30.97,3,"""no""","""northwest""",10600.5483
18,0,31.92,0,"""no""","""northeast""",2205.9808
18,0,36.85,0,"""no""","""southeast""",1629.8335
21,0,25.8,0,"""no""","""southwest""",2007.945


In [43]:
df = df.with_columns(pl.Series("smoker", le.fit_transform(df["smoker"].to_list())))

In [45]:
df = df.with_columns(pl.Series("region", le.fit_transform(df["region"].to_list())))

In [46]:
df

age,gender,bmi,children,smoker,region,charges
i64,i64,f64,i64,i64,i64,f64
19,0,27.9,0,1,3,16884.924
18,1,33.77,1,0,2,1725.5523
28,1,33.0,3,0,2,4449.462
33,1,22.705,0,0,1,21984.47061
32,1,28.88,0,0,1,3866.8552
…,…,…,…,…,…,…
50,1,30.97,3,0,1,10600.5483
18,0,31.92,0,0,0,2205.9808
18,0,36.85,0,0,2,1629.8335
21,0,25.8,0,0,3,2007.945


## Model Creation

In [49]:

X = df.drop("charges").to_numpy()
y = df["charges"].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [57]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define the parameter grid
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
}

# Initialize the model
rf_model = RandomForestRegressor(random_state=42)

# Perform grid search with cross-validation
grid_search = GridSearchCV(
    estimator=rf_model, param_grid=param_grid, cv=5, scoring="neg_mean_squared_error"
)

# Fit the grid search model
grid_search.fit(X_train, y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print(f"Best Parameters: {best_params}")

# Predict using the best model
y_pred_best = best_model.predict(X_test)

# Evaluate the best model
mse_best = mean_squared_error(y_test, y_pred_best)
mae_best = mean_absolute_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print(f"Best Random Forest MSE: {mse_best}")
print(f"Best Random Forest MAE: {mae_best}")
print(f"Best Random Forest R²: {r2_best}")

Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
Best Random Forest MSE: 23370714.59795716
Best Random Forest MAE: 2702.563916808688
Best Random Forest R²: 0.8537894787120879


In [58]:
import pickle


with open("rf_model.pkl", "wb") as f:
    pickle.dump(best_model, f)