In [232]:
import polars as pl
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [233]:
df = pl.read_csv("/home/addo/dev/projects/mip/dumped_stuff/assets/insurance_new.csv")
df

age,sex,bmi,children,smoker,region,charges,heart_disease_history,occupation
i64,str,f64,i64,str,str,f64,str,str
19,"""female""",27.9,0,"""yes""","""Oceania""",16884.924,"""no""","""unemployed"""
18,"""male""",33.77,1,"""no""","""Asia""",1725.5523,"""no""","""part-time"""
28,"""male""",33.0,3,"""no""","""Oceania""",4449.462,"""no""","""salaried"""
33,"""male""",22.705,0,"""no""","""South America""",21984.47061,"""no""","""salaried"""
32,"""male""",28.88,0,"""no""","""Antarctica""",3866.8552,"""yes""","""business"""
…,…,…,…,…,…,…,…,…
47,"""female""",45.32,1,"""no""","""Asia""",8569.8618,"""yes""","""salaried"""
21,"""female""",34.6,0,"""no""","""Africa""",2020.177,"""no""","""part-time"""
19,"""male""",26.03,1,"""yes""","""Africa""",16450.8947,"""no""","""unemployed"""
23,"""male""",18.715,0,"""no""","""North America""",21595.38229,"""no""","""unemployed"""


In [234]:
# df = df.filter(df.is_unique())
# df
# df.is_duplicated().count()
df = df.filter(df.is_unique())

In [235]:
df = df.rename({"sex": "gender"})

In [236]:
le = LabelEncoder()
df = df.with_columns(pl.Series("gender", le.fit_transform(df["gender"].to_list())))
df = df.with_columns(pl.Series("smoker", le.fit_transform(df["smoker"].to_list())))

df = df.with_columns(
    pl.Series(
        "heart_disease_history", le.fit_transform(df["heart_disease_history"].to_list())
    )
)
df.head(10)

age,gender,bmi,children,smoker,region,charges,heart_disease_history,occupation
i64,i64,f64,i64,i64,str,f64,i64,str
19,0,27.9,0,1,"""Oceania""",16884.924,0,"""unemployed"""
18,1,33.77,1,0,"""Asia""",1725.5523,0,"""part-time"""
28,1,33.0,3,0,"""Oceania""",4449.462,0,"""salaried"""
33,1,22.705,0,0,"""South America""",21984.47061,0,"""salaried"""
32,1,28.88,0,0,"""Antarctica""",3866.8552,1,"""business"""
31,0,25.74,0,0,"""South America""",3756.6216,1,"""business"""
46,0,33.44,1,0,"""Europe""",8240.5896,1,"""business"""
37,0,27.74,3,0,"""Africa""",7281.5056,1,"""salaried"""
37,1,29.83,2,0,"""Africa""",6406.4107,1,"""salaried"""
60,0,25.84,0,0,"""Oceania""",28923.13692,1,"""retired"""


In [237]:
df = df.to_dummies("occupation")
df = df.to_dummies("region")
df

age,gender,bmi,children,smoker,region_Africa,region_Antarctica,region_Asia,region_Europe,region_North America,region_Oceania,region_South America,charges,heart_disease_history,occupation_business,occupation_consultant,occupation_part-time,occupation_retired,occupation_salaried,occupation_student,occupation_unemployed
i64,i64,f64,i64,i64,u8,u8,u8,u8,u8,u8,u8,f64,i64,u8,u8,u8,u8,u8,u8,u8
19,0,27.9,0,1,0,0,0,0,0,1,0,16884.924,0,0,0,0,0,0,0,1
18,1,33.77,1,0,0,0,1,0,0,0,0,1725.5523,0,0,0,1,0,0,0,0
28,1,33.0,3,0,0,0,0,0,0,1,0,4449.462,0,0,0,0,0,1,0,0
33,1,22.705,0,0,0,0,0,0,0,0,1,21984.47061,0,0,0,0,0,1,0,0
32,1,28.88,0,0,0,1,0,0,0,0,0,3866.8552,1,1,0,0,0,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
47,0,45.32,1,0,0,0,1,0,0,0,0,8569.8618,1,0,0,0,0,1,0,0
21,0,34.6,0,0,1,0,0,0,0,0,0,2020.177,0,0,0,1,0,0,0,0
19,1,26.03,1,1,1,0,0,0,0,0,0,16450.8947,0,0,0,0,0,0,0,1
23,1,18.715,0,0,0,0,0,0,1,0,0,21595.38229,0,0,0,0,0,0,0,1


In [238]:
X = df.drop("charges").to_numpy()
y = df.select(pl.col("charges")).to_numpy()

# y_log = np.log(y + 1)
y_log = y

X_train, X_test, y_train, y_test = train_test_split(
    X, y_log, test_size=0.2, random_state=42
)

In [239]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [240]:
param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5],
}

rf_model = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(
    estimator=rf_model, param_grid=param_grid, cv=5, scoring="neg_mean_squared_error"
)

grid_search.fit(X_train_scaled, y_train)

best_model = grid_search.best_estimator_

y_pred_best = best_model.predict(X_test_scaled)
# y_pred_best_original = np.exp(y_pred_best) - 1  # Inverse of log transformation
y_pred_best_original = y_pred_best

mse_best = mean_squared_error(y_test, y_pred_best_original)
mae_best = mean_absolute_error(y_test, y_pred_best_original)
r2_best = r2_score(y_test, y_pred_best_original)

print(f"Best Random Forest MSE: {mse_best}")
print(f"Best Random Forest MAE: {mae_best}")
print(f"Best Random Forest R²: {r2_best}")

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

Best Random Forest MSE: 12342526.145498062
Best Random Forest MAE: 1798.6348072513501
Best Random Forest R²: 0.9161952248690163


In [241]:
# import xgboost as xgb
# from sklearn.model_selection import RandomizedSearchCV

# # Define the model
# xgb_model = xgb.XGBRegressor(random_state=42)

# # Define the parameter grid for Randomized Search
# param_distributions = {
#     "n_estimators": [100, 200, 300],
#     "max_depth": [None, 10, 20, 30],
#     "learning_rate": [0.01, 0.1, 0.2],
#     "subsample": [0.5, 0.7, 1.0],
#     "colsample_bytree": [0.5, 0.7, 1.0],
# }

# # Set up Randomized Search CV
# randomized_search = RandomizedSearchCV(
#     estimator=xgb_model,
#     param_distributions=param_distributions,
#     n_iter=10,  # Number of parameter settings to sample
#     cv=5,
#     scoring="neg_mean_squared_error",
#     random_state=42,
#     verbose=1,
# )

# # Fit the model
# randomized_search.fit(X_train_scaled, y_train)

# # Get the best parameters and model
# best_params = randomized_search.best_params_
# best_model = randomized_search.best_estimator_

# print(f"Best Parameters: {best_params}")

# # Predictions
# y_pred_best = best_model.predict(X_test_scaled)

# # Convert predictions back to original scale if needed
# # y_pred_best_original = np.exp(y_pred_best)
# y_pred_best_original = y_pred_best

# np_stuff = y_test
# # Evaluate the model
# mse_best = mean_squared_error(np_stuff, y_pred_best_original)
# mae_best = mean_absolute_error(np_stuff, y_pred_best_original)
# r2_best = r2_score(np_stuff, y_pred_best_original)

# print(f"XGBoost MSE: {mse_best}")
# print(f"XGBoost MAE: {mae_best}")
# print(f"XGBoost R²: {r2_best}")

In [242]:
import pickle


with open("/home/addo/dev/projects/mip/dumped_stuff/model/rf_model.pkl", "wb") as f:
    pickle.dump(best_model, f)