In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Read the raw health-insurance table from the project-relative CSV into a DataFrame.
# Every later cell derives its inputs from `df`.
df = pd.read_csv(filepath_or_buffer='datasets/health_insurance_data.csv')

In [None]:
# Column schema used throughout the notebook.
# `numerical_cols` are passed to the scaler, `categorical_cols` to the one-hot
# encoder, and `target` is the column the regressor is trained to predict.
numerical_cols = [
    # Person-level attributes
    'Age',
    'BMI',
    'Annual Income',
    # One column per medical condition
    # (presumably 0/1 indicator flags -- confirm against the CSV)
    'Diabetes',
    'Hypertension',
    'Heart Disease',
    'Cancer',
    'Asthma',
    'Arthritis',
    'Stroke',
    'Epilepsy',
    'Kidney Disease',
    'Liver Disease',
    'Tuberculosis',
    'HIV',
]
categorical_cols = [
    'Sex',
    'Smoking Status',
    'Family History of Disease',
    'Occupation',
    'City',
]
target = 'Risk Score'

In [None]:
# Feature frame: numeric columns first, then categorical ones, in the order
# given by the two column lists.
X = df[[*numerical_cols, *categorical_cols]]
# Target column the model is trained to predict.
y = df[target]


In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardise the numeric columns. The scaler is fitted on the training rows
# only, and the same fitted statistics are then applied to both splits so that
# no information from the test rows leaks into preprocessing.
scaler = StandardScaler().fit(X_train[numerical_cols])
X_train_numerical = scaler.transform(X_train[numerical_cols])
X_test_numerical = scaler.transform(X_test[numerical_cols])

In [None]:
# One-hot encode the categorical columns, learning the category vocabulary
# from the training rows only.
#
# handle_unknown='ignore': with the default ('error'), `transform` raises a
# ValueError as soon as it meets a category value that was absent from the
# training split -- likely for columns such as 'City' or 'Occupation' after a
# random split, and again at inference time with the pickled encoder. With
# 'ignore', an unseen value is encoded as all zeros for that feature instead.
encoder = OneHotEncoder(handle_unknown='ignore')
# The encoder returns a sparse matrix by default; densify so it can be
# stacked next to the scaled numeric array.
X_train_categorical = encoder.fit_transform(X_train[categorical_cols]).toarray()
X_test_categorical = encoder.transform(X_test[categorical_cols]).toarray()

In [None]:
# Assemble the final design matrices by placing the scaled numeric columns
# first and the one-hot encoded columns after them. Both splits use the same
# column order, which is also the order the model is trained on.
# (numpy is imported in the notebook's top import cell.)
X_train_processed = np.hstack((X_train_numerical, X_train_categorical))
X_test_processed = np.hstack((X_test_numerical, X_test_categorical))

In [None]:
# Train a random forest of 100 trees on the preprocessed training matrix.
# The fixed random_state makes the fitted forest reproducible.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train_processed, y_train)

In [None]:
# Persist the fitted model together with the fitted scaler, the fitted encoder
# and the two column lists in a single pickle, so the exact preprocessing used
# for training travels with the model.
# NOTE: pickle files can execute arbitrary code when loaded -- only unpickle
# this artifact from a trusted location.
model_path = Path('src/models/insurance_model.pkl')
# Create the output directory if it is missing; otherwise opening the file for
# writing fails with FileNotFoundError (e.g. on a fresh checkout).
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as file:
    pickle.dump({
        'model': model,
        'scaler': scaler,
        'encoder': encoder,
        'categorical_cols': categorical_cols,
        'numerical_cols': numerical_cols
    }, file)

In [None]:
# Evaluate on the held-out rows. For scikit-learn regressors `.score` returns
# the coefficient of determination (R^2) of the predictions.
score = model.score(X=X_test_processed, y=y_test)
print(f'Model R^2 score: {score}')