# Train and Save a Model for Flask Application

In [1]:
# Essential imports

# Add project root to path
import sys
import os
sys.path.append(os.path.abspath('..'))

import warnings
warnings.filterwarnings("ignore")


import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

import pickle

## 1. Load and prepare data


In [2]:
# Load and prepare data
df = pd.read_csv('../data/insurance.csv')

# Remove duplicate
df = df.drop_duplicates()

# Feature engineering
def prepare_features(df):
    df_processed = df.copy()
    
    # Create BMI categories
    df_processed['bmi_category'] = pd.cut(df_processed['bmi'], 
                                        bins=[0, 18.5, 25, 30, 100], 
                                        labels=['underweight', 'normal', 'overweight', 'obese'])
    
    # Create age groups
    df_processed['age_group'] = pd.cut(df_processed['age'], 
                                     bins=[0, 30, 45, 60, 100], 
                                     labels=['young', 'adult', 'middle_aged', 'senior'])
    
    # Interaction features
    df_processed['smoker_age_interaction'] = df_processed['smoker'].map({'yes': 1, 'no': 0}) * df_processed['age']
    df_processed['smoker_bmi_interaction'] = df_processed['smoker'].map({'yes': 1, 'no': 0}) * df_processed['bmi']
    
    return df_processed

# Apply feature engineering
df_processed = prepare_features(df)

# Define features and target
X = df_processed.drop('charges', axis=1)
y = df_processed['charges']


In [3]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define preprocessing for numerical and categorical features
numerical_features = ['age', 'bmi', 'children']
categorical_features = ['sex', 'smoker', 'region', 'bmi_category', 'age_group']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features)
    ])

# Create pipeline with Random Forest
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

# Define parameter grid for the winning model configuration
param_grid = {
    'model__n_estimators': [100],
    'model__max_depth': [20],
    'model__min_samples_split': [2]
}

In [4]:
# Train the model with grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get best model
best_model = grid_search.best_estimator_

# Evaluate model
y_pred = best_model.predict(X_test)
test_r2 = r2_score(y_test, y_pred)


In [5]:
print(f"=== FINAL MODEL RESULTS ===")
print(f"Best CV R²: {grid_search.best_score_:.4f}")
print(f"Test R²: {test_r2:.4f}")
print(f"Best parameters: {grid_search.best_params_}")


=== FINAL MODEL RESULTS ===
Best CV R²: 0.8296
Test R²: 0.8655
Best parameters: {'model__max_depth': 20, 'model__min_samples_split': 2, 'model__n_estimators': 100}


In [None]:
# Save the trained model using pickle
pickle.dump(best_model, open('../models/insurance_model.pkl', 'wb'))

# Save model info
model_info = {
    'feature_names': list(X.columns),
    'numerical_features': numerical_features,
    'categorical_features': categorical_features,
    'test_r2': test_r2,
    'best_params': grid_search.best_params_
}

pickle.dump(model_info, open('../models/model_info.pkl', 'wb'))

print("\n=== MODEL SAVED ===")
print("insurance_model.pkl - The trained model")
print("model_info.pkl - Model metadata and feature information")


=== MODEL SAVED ===
insurance_model.pkl - The trained model
model_info.pkl - Model metadata and feature information


In [8]:
# Create prediction function for Flask
def prepare_prediction_features(input_data, model, model_info):
    """
    Prepare features for prediction in Flask app
    
    Args:
        input_data: Dictionary with input features
        model: Trained model pipeline
        model_info: Model metadata
    
    Returns:
        prediction: Model prediction
    """
    # Convert input data to DataFrame
    input_df = pd.DataFrame([input_data])
    
    # Apply the same feature engineering
    input_df = prepare_features(input_df)
    
    # Make prediction
    prediction = model.predict(input_df)[0]
    
    return prediction


In [10]:
# Test the prediction function
sample_input = {
    'age': 35,
    'sex': 'male',
    'bmi': 28.5,
    'children': 2,
    'smoker': 'no',
    'region': 'southeast'
}

loaded_model = pickle.load(open('../models/insurance_model.pkl', 'rb'))
loaded_model_info = pickle.load(open('../models/model_info.pkl', 'rb'))

sample_prediction = prepare_prediction_features(sample_input, loaded_model, loaded_model_info)
print(f"\n=== SAMPLE PREDICTION ===")
print(f"Input: {sample_input}")
print(f"Predicted charges: ${sample_prediction:.2f}")


=== SAMPLE PREDICTION ===
Input: {'age': 35, 'sex': 'male', 'bmi': 28.5, 'children': 2, 'smoker': 'no', 'region': 'southeast'}
Predicted charges: $11687.23


-------