In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler

# Load processed data.
# The path is relative to the kernel's working directory, so report a missing
# file with an actionable message (same exception type as pandas raises).
DATA_PATH = 'processed_data.csv'
try:
    processed_data = pd.read_csv(DATA_PATH)
except FileNotFoundError as exc:
    raise FileNotFoundError(
        f"Could not find '{DATA_PATH}' in the current working directory. "
        "Create the file or fix the path before running this cell."
    ) from exc
print(f"Loaded processed data: {processed_data.shape}")

# Columns this cell relies on. Checking them up front reports a schema
# mismatch once, by name, instead of as a KeyError somewhere further down.
target_col = 'log_charges'
num_cols = ['age', 'bmi', 'smoker_age']
missing_cols = [c for c in [target_col, *num_cols] if c not in processed_data.columns]
if missing_cols:
    raise KeyError(f"processed data is missing required columns: {missing_cols}")

# Prepare features and target
# NOTE(review): every column except the target is used as a feature — confirm
# the CSV holds no other target-derived column (e.g. raw charges).
X = processed_data.drop(columns=[target_col])
y = processed_data[target_col]

# Split data (80/20, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Work on explicit copies so the column assignments below can never write
# into (or warn about writing into) a slice of X.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardize numerical features. The scaler is fitted on the training split
# only and then applied to the test split, so test statistics do not leak in.
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Sanity check: the split must account for every row exactly once.
assert len(X_train) + len(X_test) == len(X)

print("\nData ready for modeling:")
print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

FileNotFoundError: [Errno 2] No such file or directory: 'processed_data.csv'

In [None]:
# Fit an ordinary least-squares regressor on the training split
# (fit() returns the estimator itself, so construction and fitting chain).
model = LinearRegression().fit(X_train, y_train)

# Predict on both splits so training and test metrics can be compared
y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)

# Evaluate model
def evaluate_model(y_true, y_pred, label):
    """Print and return R² and RMSE for one set of predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted values, aligned with ``y_true``.
        label: Name of the data split; used as the heading of the report.

    Returns:
        Tuple ``(r2, rmse)``, where ``rmse`` is the square root of the mean
        squared error.
    """
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)

    # Build the whole report first, then emit it in one write.
    report_lines = [
        f"{label} Performance:",
        f"R²: {r2:.4f}",
        f"RMSE: {rmse:.4f}",
        "=" * 50,
    ]
    print("\n".join(report_lines))
    return r2, rmse

# Metrics in the space the model was fitted in (log-transformed target)
print("Log-transformed scale:")
train_r2, train_rmse = evaluate_model(y_train, y_pred_train, "Training")
test_r2, test_rmse = evaluate_model(y_test, y_pred_test, "Test")

# Back-transform targets and predictions to dollars for interpretation.
# np.expm1 is the inverse of np.log1p — assumes log_charges was built with
# log1p; TODO confirm against the preprocessing step.
y_train_dollars, y_test_dollars = np.expm1(y_train), np.expm1(y_test)
y_pred_train_dollars, y_pred_test_dollars = (
    np.expm1(y_pred_train),
    np.expm1(y_pred_test),
)

print("\nOriginal dollar scale:")
evaluate_model(y_train_dollars, y_pred_train_dollars, "Training")
evaluate_model(y_test_dollars, y_pred_test_dollars, "Test")

# One row per feature with its fitted coefficient, largest (signed) value first.
# NOTE(review): only the columns in num_cols were standardized, so coefficient
# magnitudes are not directly comparable across all features.
coefficients = (
    pd.DataFrame({'Feature': X_train.columns, 'Impact': model.coef_})
    .sort_values(by='Impact', ascending=False)
)

print("\nFeature Impacts on Insurance Costs:")
print(coefficients)

## Insights

1. **Smoking is the number-one cost driver**
    - Smokers pay 3 to 4 times more than non-smokers
    - For smokers, costs rise further with age

2. **Age has a compounding effect**
    - Insurance costs increase exponentially with age

3. **BMI matters mostly at obesity levels**
    - People with a BMI above 30 have significantly higher costs
    - Below 30 there is only minimal impact

4. **Region has a low impact**
    - The Southeast has slightly higher costs
    - The other regions do not show any considerable difference.