In [2]:
import pandas as pd
import numpy as np

# Seed NumPy's global generator so the synthetic sample is the same on every run
np.random.seed(42)

# --- Synthetic policyholder features ---
# NOTE: the draws below consume the global random stream in sequence;
# reordering them changes the generated data.
n_samples = 1000
age = np.random.randint(18, 65, size=n_samples)          # upper bound is exclusive
bmi = np.random.normal(loc=27, scale=5, size=n_samples)
smoker = np.random.choice(['yes', 'no'], size=n_samples, p=[0.2, 0.8])
dependents = np.random.randint(0, 5, size=n_samples)     # upper bound is exclusive
region = np.random.choice(['north', 'south', 'east', 'west'], size=n_samples)

# --- Target: linear in the features, a flat loading for smokers, plus Gaussian noise ---
smoker_loading = np.where(smoker == 'yes', 1000, 0)
noise = np.random.normal(loc=0, scale=100, size=n_samples)
premium = 200 + 3 * age + 2 * bmi + 50 * dependents
premium = premium + smoker_loading
premium = premium + noise

# --- Assemble the modelling frame (column order matters for the preview below) ---
columns = {
    'age': age,
    'BMI': bmi,
    'smoker': smoker,
    'dependents': dependents,
    'region': region,
    'insurance_premium': premium,
}
df = pd.DataFrame(columns)

df.head()

Unnamed: 0,age,BMI,smoker,dependents,region,insurance_premium
0,56,32.031464,no,2,south,582.621852
1,46,24.115541,no,4,west,735.142392
2,32,31.178461,yes,2,south,1685.501896
3,60,21.351466,yes,0,south,1382.263189
4,25,29.649021,no,0,north,383.440987


In [3]:
# Bucket age into bands. pd.cut intervals are right-closed by default,
# so the first bin (17, 25] holds ages 18 through 25.
age_bin_edges = [17, 25, 35, 45, 55, 65]
age_bin_labels = ['18-25', '26-35', '36-45', '46-55', '56-65']
df['age_band'] = pd.cut(df['age'], bins=age_bin_edges, labels=age_bin_labels)

In [4]:
# Create BMI bands with left-closed intervals so the cut-offs follow the usual
# convention: Underweight < 18.5 <= Normal < 25 <= Overweight < 30 <= Obese.
# (With pd.cut's default right-closed bins, a BMI of exactly 25.0 would be
# labelled 'Normal' and exactly 30.0 'Overweight'.)
# The top bin is open-ended so very large values are labelled 'Obese'
# instead of silently becoming NaN.
bmi_bin_edges = [0, 18.5, 25, 30, float('inf')]
bmi_bin_labels = ['Underweight', 'Normal', 'Overweight', 'Obese']
df['bmi_band'] = pd.cut(df['BMI'], bins=bmi_bin_edges, labels=bmi_bin_labels, right=False)

In [5]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns.
# drop='first' omits one dummy column per feature (the first category), which
# avoids perfectly collinear dummies in the linear model.
# sparse_output=False returns a dense array (this argument name requires a
# recent scikit-learn release).
categorical_cols = ['smoker','region','age_band','bmi_band']
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded = encoder.fit_transform(df[categorical_cols])

# Reuse df's index so the axis=1 concat below aligns row-for-row even if df
# does not carry a default 0..n-1 index (e.g. after filtering or sorting).
encoded_df = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index,
)

# Replace the original categorical columns with their encoded counterparts.
# NOTE: this rebinds `df`, so the cell is not re-runnable on its own -- the
# categorical columns no longer exist after the first execution.
df = pd.concat([df.drop(columns=categorical_cols), encoded_df], axis=1)

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

# Baseline model: ordinary least squares on every column of `df` except the target.

# 1. Features / target
target_col = 'insurance_premium'
X = df.drop(columns=[target_col])
y = df[target_col]

# 2. Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Fit on the training portion (fit() returns the estimator itself)
lr = LinearRegression().fit(X_train, y_train)

# 4. Score the held-out rows
y_pred_lr = lr.predict(X_test)

# 5. Mean absolute error, in the same units as the premium
mae_lr = mean_absolute_error(y_test, y_pred_lr)
print("Baseline Linear Regression MAE:", mae_lr)

Baseline Linear Regression MAE: 79.10205772435212


In [7]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted trees, trained and evaluated on the same split as the linear baseline
gbr_params = dict(n_estimators=200, learning_rate=0.1, max_depth=4, random_state=42)
gbr = GradientBoostingRegressor(**gbr_params).fit(X_train, y_train)
y_pred_gbr = gbr.predict(X_test)

# Same metric as the baseline so the two numbers are directly comparable
mae_gbr = mean_absolute_error(y_test, y_pred_gbr)
print("Boosting Model MAE:", mae_gbr)

Boosting Model MAE: 89.20097117755456


In [8]:

# Show the column layout after encoding: the original categorical columns have
# been replaced by their one-hot dummy columns (one category per feature dropped).
print(df.columns)

Index(['age', 'BMI', 'dependents', 'insurance_premium', 'smoker_yes',
       'region_north', 'region_south', 'region_west', 'age_band_26-35',
       'age_band_36-45', 'age_band_46-55', 'age_band_56-65', 'bmi_band_Obese',
       'bmi_band_Overweight', 'bmi_band_Underweight'],
      dtype='object')


In [9]:
# ================================
# SEGMENT-WISE MAE ANALYSIS (FULL)
# ================================

from sklearn.metrics import mean_absolute_error

# -------------------------------
# 1. Assemble the evaluation frame
# -------------------------------
# assign() returns a new frame, so X_test itself is left untouched.
#   y_true: actual premiums for the test rows (raw values, matched by position)
#   y_pred: predictions from the gradient boosting model
test_df = X_test.assign(y_true=y_test.values, y_pred=y_pred_gbr)


# -------------------------------
# 2. Rebuild Age Band
# -------------------------------

def get_age_band(row):
    """Recover the age-band label of one row from its one-hot dummy columns.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (e.g. a DataFrame row) that exposes
        the 'age_band_*' dummy columns.

    Returns
    -------
    str
        Label of the first dummy column (checked in ascending band order) whose
        value equals 1, or '18-25' when none is set -- that band has no dummy
        column because the encoder dropped the first category.
    """
    dummy_to_label = (
        ('age_band_26-35', '26-35'),
        ('age_band_36-45', '36-45'),
        ('age_band_46-55', '46-55'),
        ('age_band_56-65', '56-65'),
    )
    for column, label in dummy_to_label:
        if row[column] == 1:
            return label
    return '18-25'   # dropped first category


# Label every test row with its age band (axis=1 passes one row at a time)
age_band_labels = test_df.apply(get_age_band, axis=1)
test_df['age_band'] = age_band_labels


# -------------------------------
# 3. Rebuild BMI Band
# -------------------------------

def get_bmi_band(row):
    """Recover the BMI-band label of one row from its one-hot dummy columns.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by column name (e.g. a DataFrame row) that exposes
        the 'bmi_band_*' dummy columns.

    Returns
    -------
    str
        Label of the first dummy column (checked in the order Obese,
        Overweight, Underweight) whose value equals 1, or 'Normal' when none
        is set -- that band has no dummy column because the encoder dropped
        the first category.
    """
    dummy_to_label = (
        ('bmi_band_Obese', 'Obese'),
        ('bmi_band_Overweight', 'Overweight'),
        ('bmi_band_Underweight', 'Underweight'),
    )
    for column, label in dummy_to_label:
        if row[column] == 1:
            return label
    return 'Normal'   # dropped first category


# Label every test row with its BMI band (axis=1 passes one row at a time)
bmi_band_labels = test_df.apply(get_bmi_band, axis=1)
test_df['bmi_band'] = bmi_band_labels


# -------------------------------
# 4. MAE by Age Band
# -------------------------------

# Select only the two columns the metric needs before calling apply().
# Applying directly on the grouped frame also hands the grouping column to the
# function, which recent pandas versions flag with a DeprecationWarning.
# The result is a Series of per-band MAE values indexed by age band.
mae_by_age = test_df.groupby('age_band')[['y_true', 'y_pred']].apply(
    lambda grp: mean_absolute_error(grp['y_true'], grp['y_pred'])
)

print("========== MAE BY AGE BAND ==========")
print(mae_by_age)


# -------------------------------
# 5. MAE by BMI Band
# -------------------------------

# Select only the two columns the metric needs before calling apply().
# Applying directly on the grouped frame also hands the grouping column to the
# function, which recent pandas versions flag with a DeprecationWarning.
# The result is a Series of per-band MAE values indexed by BMI band.
mae_by_bmi = test_df.groupby('bmi_band')[['y_true', 'y_pred']].apply(
    lambda grp: mean_absolute_error(grp['y_true'], grp['y_pred'])
)

print("\n========== MAE BY BMI BAND ==========")
print(mae_by_bmi)

age_band
18-25    82.724077
26-35    91.259866
36-45    82.520129
46-55    92.654448
56-65    99.162130
dtype: float64

bmi_band
Normal         99.273307
Obese          79.119069
Overweight     89.705525
Underweight    89.515625
dtype: float64


  mae_by_age = test_df.groupby('age_band').apply(
  mae_by_bmi = test_df.groupby('bmi_band').apply(
