In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Load processed data
processed_data = pd.read_csv('processed_data.csv')
print(f"Loaded processed data: {processed_data.shape}")

columns_to_remove = ['charges', 'children']  # These shouldn't be in features
for col in columns_to_remove:
    if col in processed_data.columns:
        processed_data = processed_data.drop(columns=col)
        print(f"Removed '{col}' column to prevent target leakage")

# Check and fix categorical data
print("\nColumns before preprocessing:")
print(processed_data.columns.tolist())

# Identify categorical columns
categorical_cols = processed_data.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"\nCategorical columns found: {categorical_cols}")

# One-hot encode if categorical columns exist
if categorical_cols:
    print("Applying one-hot encoding to categorical columns...")
    processed_data = pd.get_dummies(processed_data, columns=categorical_cols, drop_first=True, dtype=float)
    
# Verify all columns are numeric 
print("\nData types after encoding:")
print(processed_data.dtypes)

print("\nColumns after preprocessing:")
print(processed_data.columns.tolist())

# Prepare features and target
X = processed_data.drop(columns=['log_charges'])
y = processed_data['log_charges']

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize numerical features
scaler = StandardScaler()
num_cols = ['age', 'bmi', 'smoker_age']
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

print("\nData ready for modeling:")
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

Loaded processed data: (1337, 11)
Removed 'charges' column to prevent target leakage
Removed 'children' column to prevent target leakage

Columns before preprocessing:
['age', 'sex', 'bmi', 'smoker', 'region', 'log_charges', 'smoker_age', 'obese', 'has_children']

Categorical columns found: ['sex', 'smoker', 'region']
Applying one-hot encoding to categorical columns...

Data types after encoding:
age                   int64
bmi                 float64
log_charges         float64
smoker_age            int64
obese                 int64
has_children          int64
sex_male            float64
smoker_yes          float64
region_northwest    float64
region_southeast    float64
region_southwest    float64
dtype: object

Columns after preprocessing:
['age', 'bmi', 'log_charges', 'smoker_age', 'obese', 'has_children', 'sex_male', 'smoker_yes', 'region_northwest', 'region_southeast', 'region_southwest']

Data ready for modeling:
Training set: (1069, 10)
Test set: (268, 10)


In [2]:
# Initialize and train model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Evaluate model
def evaluate_model(y_true, y_pred, label):
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"{label} Performance:")
    print(f"R²: {r2:.4f}")
    print(f"RMSE: {rmse:.4f}")
    print("="*50)
    return r2, rmse

# Evaluate on log scale
print("Log-transformed scale:")
train_r2, train_rmse = evaluate_model(y_train, y_pred_train, "Training")
test_r2, test_rmse = evaluate_model(y_test, y_pred_test, "Test")

# Convert to dollars for interpretation
y_train_dollars = np.expm1(y_train)
y_test_dollars = np.expm1(y_test)
y_pred_train_dollars = np.expm1(y_pred_train)
y_pred_test_dollars = np.expm1(y_pred_test)

print("\nOriginal dollar scale:")
evaluate_model(y_train_dollars, y_pred_train_dollars, "Training")
evaluate_model(y_test_dollars, y_pred_test_dollars, "Test")

# Interpret coefficients
coefficients = pd.DataFrame({
    'Feature': X_train.columns,
    'Impact': model.coef_
}).sort_values('Impact', ascending=False)

print("\nFeature Impacts on Insurance Costs:")
print(coefficients)

Log-transformed scale:
Training Performance:
R²: 0.7882
RMSE: 0.4171
Test Performance:
R²: 0.8555
RMSE: 0.3661

Original dollar scale:
Training Performance:
R²: 0.7312
RMSE: 6066.7475
Test Performance:
R²: 0.7772
RMSE: 6397.8437

Feature Impacts on Insurance Costs:
            Feature    Impact
6        smoker_yes  2.789297
0               age  0.568943
4      has_children  0.238363
3             obese  0.119069
1               bmi  0.018645
7  region_northwest -0.039853
5          sex_male -0.067588
8  region_southeast -0.109959
9  region_southwest -0.121761
2        smoker_age -0.538560


## Insights

1. **Smoking is the number 1 cost driver**
    - Smokers pay 3 to 4 times more than non smokers
    - The age increases the cost for them

2. ** Age has a compunding effect **
    - Insurance costs increase exponentially with the age

3. ** BMI matters mostly at obesity levels**
    - People with more than 30 BMI have a significant higher cost
    - Below 30 there is only minimal impact

4. ** Region is of low impact**
    - Southeast has a slight higher cost
    - The rest does not show any considerable difference.