In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import json
import pickle

In [2]:
# Read the raw insurance table from the CSV that sits next to this notebook.
insurance_df = pd.read_csv(
    'insurance.csv',
)

In [3]:
# Data preprocessing
# Integer codes for every categorical column. Labels and codes are the same
# as before; note that 'smoker' maps 'yes' -> 0 and 'no' -> 1.
CATEGORY_CODES = {
    'sex': {'male': 0, 'female': 1},
    'smoker': {'yes': 0, 'no': 1},
    'region': {'southeast': 0, 'southwest': 1, 'northeast': 2, 'northwest': 3},
}

for column, codes in CATEGORY_CODES.items():
    # Look each value up in the table and pass unknown values through
    # unchanged, so re-running this cell on already-encoded data is a no-op.
    encoded = insurance_df[column].map(lambda value, codes=codes: codes.get(value, value))
    # Cast explicitly instead of relying on DataFrame.replace silently
    # downcasting object columns to integers (deprecated in recent pandas).
    # The cast also raises right here if an unmapped label or a missing value
    # is left in the column, rather than failing later during model fitting.
    insurance_df[column] = encoded.astype('int64')

In [4]:
# Interaction features: element-wise products of existing columns.
# bmi_age scales BMI by age.
insurance_df['bmi_age'] = insurance_df['bmi'].mul(insurance_df['age'])
# children_smoker multiplies the child count by the encoded smoker flag.
# With the encoding used in preprocessing ('yes' -> 0, 'no' -> 1) this value
# is 0 for smokers and equals `children` for non-smokers.
insurance_df['children_smoker'] = insurance_df['children'].mul(insurance_df['smoker'])

In [5]:
# Persist the feature column names (every column except the 'charges' target),
# in DataFrame order, so the app can read them back from columns.json.
feature_names = [name for name in insurance_df.columns if name != 'charges']
columns = {'data_columns': feature_names}
with open("columns.json", "w") as columns_file:
    json.dump(columns, columns_file)

In [12]:
# Split the table into the target (y: the 'charges' column) and the
# feature matrix (X: every remaining column).
y = insurance_df.loc[:, 'charges']
X = insurance_df.drop(columns='charges')


In [7]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Fit an ordinary least-squares linear regression on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)
# Leave the fitted estimator as the cell's last expression so it is displayed.
model

In [13]:
# Serialize the fitted model to disk in binary mode.
# NOTE: unpickling can execute arbitrary code, so only load this file from a
# trusted location.
with open('insurance_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [10]:
# Test prediction
# Build the sample by feature name rather than by position. The model was
# fitted on a DataFrame, so a bare ndarray makes scikit-learn warn about
# missing feature names and would silently mis-assign values if the column
# order ever changed.
# NOTE(review): the raw-feature names below assume the CSV columns are
# age, sex, bmi, children, smoker, region in the same positional order as the
# original array -- confirm against insurance.csv.
sample = {
    'age': 19,
    'sex': 0,       # 'male' under the preprocessing encoding
    'bmi': 27.9,
    'children': 0,
    'smoker': 0,    # 'yes' under the preprocessing encoding
    'region': 1,    # 'southwest' under the preprocessing encoding
}
# Derived features must be computed exactly as in the feature-engineering cell.
sample['bmi_age'] = sample['bmi'] * sample['age']
sample['children_smoker'] = sample['children'] * sample['smoker']

# Reorder to the training column order; a missing feature raises a KeyError
# here instead of producing a wrong prediction.
test_input = pd.DataFrame([sample])[list(X_train.columns)]
print(model.predict(test_input)[0])

25952.209970397875


