# **Import libraries**

In [None]:
import os

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder


# **Load dataset**

In [None]:
# Location of the insurance CSV. The default looks like a Google Drive mount
# inside Colab (NOTE(review): confirm Drive is mounted before this cell runs).
# Set the INSURANCE_CSV environment variable to read a copy stored elsewhere
# without editing the notebook.
path = os.environ.get(
    "INSURANCE_CSV",
    "/content/drive/MyDrive/Colab Notebooks/microsoft hackathon/insurance.csv",
)
df = pd.read_csv(path)

# Preview
print(df.head())


   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


In [None]:
# Separate the regression target ("charges") from the predictor columns.
y = df["charges"]
X = df.drop(columns="charges")

# Column groups: the first list is one-hot encoded, the second is passed
# through to the model unchanged.
numeric_features = ["age", "bmi", "children"]
categorical_features = ["sex", "smoker", "region"]

# Output column order is: encoded categorical columns, then numeric columns.
# drop="first" omits one level of every categorical column from the encoding.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(drop="first"), categorical_features),
        ("num", "passthrough", numeric_features),
    ]
)


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessing and the linear model are chained so that encoding is fitted
# on the training rows only.
model = LinearRegression()
pipeline = Pipeline([("preprocessor", preprocessor), ("model", model)])
pipeline.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = pipeline.predict(X_test)

print("R2 Score:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))


R2 Score: 0.7835929767120722
MAE: 4181.194473753643


In [None]:
# Names of the one-hot columns produced by the fitted encoder.
encoded_cat_features = (
    pipeline.named_steps["preprocessor"]
    .named_transformers_["cat"]
    .get_feature_names_out(categorical_features)
)

# The preprocessor emits the encoded categorical columns first and the
# passthrough numeric columns second, so the names are joined in that order
# to line up with the model's coefficient vector.
feature_names = [*encoded_cat_features, *numeric_features]
coefficients = pipeline.named_steps["model"].coef_

coef_df = pd.DataFrame({"Feature": feature_names, "Coefficient": coefficients})

print(coef_df)


            Feature   Coefficient
0          sex_male    -18.591692
1        smoker_yes  23651.128856
2  region_northwest   -370.677326
3  region_southeast   -657.864297
4  region_southwest   -809.799354
5               age    256.975706
6               bmi    337.092552
7          children    425.278784


In [None]:
# Currency value of a single risk point (1 point = ₹100). Model coefficients
# are divided by this constant to express them as points per unit.
SCALING_FACTOR = 100


In [None]:
# Convert each regression coefficient into risk points per unit of its feature.
coef_df["Points_per_unit"] = coef_df["Coefficient"].div(SCALING_FACTOR)
print(coef_df)


            Feature   Coefficient  Points_per_unit
0          sex_male    -18.591692        -0.185917
1        smoker_yes  23651.128856       236.511289
2  region_northwest   -370.677326        -3.706773
3  region_southeast   -657.864297        -6.578643
4  region_southwest   -809.799354        -8.097994
5               age    256.975706         2.569757
6               bmi    337.092552         3.370926
7          children    425.278784         4.252788


In [None]:
def calculate_risk_score(input_df, coef_df):
    """Return the weighted risk score for the first row of ``input_df``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Feature frame; only its first row is read.
    coef_df : pandas.DataFrame
        Table with a "Feature" column (feature names) and a
        "Points_per_unit" column (weight applied to that feature).

    Returns
    -------
    The sum of ``value * points`` over every feature listed in ``coef_df``
    that is also a column of ``input_df``. Features missing from
    ``input_df`` are skipped; if none match, the integer 0 is returned.
    """
    total = 0
    weights = zip(coef_df["Feature"], coef_df["Points_per_unit"])
    for name, points_per_unit in weights:
        if name not in input_df.columns:
            continue
        first_value = input_df[name].values[0]
        total += first_value * points_per_unit
    return total


In [None]:
# Encode every row with the already-fitted preprocessor and label the columns
# so they can be matched against coef_df by name.
X_encoded = pd.DataFrame(
    pipeline.named_steps["preprocessor"].transform(X),
    columns=feature_names,
)

# Vectorised risk score: accumulate one feature column at a time instead of
# calling calculate_risk_score once per row. Features are added in coef_df
# order, the same order the per-row helper uses, so each row's sum is built
# from the same sequence of additions.
score_series = pd.Series(0.0, index=X_encoded.index)
for feature, points in zip(coef_df["Feature"], coef_df["Points_per_unit"]):
    if feature in X_encoded.columns:
        score_series = score_series + X_encoded[feature] * points

# Kept as a plain list and assigned positionally, as before.
risk_scores = score_series.tolist()

df["risk_score"] = risk_scores


In [None]:
# One-feature linear map from risk score back to charges. Fitted on the
# "risk_score" column of df against the full target y.
score_to_charge = LinearRegression().fit(df[["risk_score"]], y)

print("Charge Mapping Coefficient:", score_to_charge.coef_)
print("Intercept:", score_to_charge.intercept_)


Charge Mapping Coefficient: [100.5865091]
Intercept: -12123.200203717966


In [None]:
# Example applicant, using the same predictor column names as the training frame.
new_patient = pd.DataFrame({
    "age": [35],
    "sex": ["male"],
    "bmi": [31.0],
    "children": [2],
    "smoker": ["yes"],
    "region": ["southwest"]
})

# Encode
new_encoded = pipeline.named_steps["preprocessor"].transform(new_patient)
new_encoded = pd.DataFrame(new_encoded, columns=feature_names)

# Risk Score
new_risk_score = calculate_risk_score(new_encoded, coef_df)

# Predicted Charges
# score_to_charge was fitted on a DataFrame with a "risk_score" column, so the
# input here carries the same column name. Passing a bare nested list makes
# scikit-learn warn that X has no valid feature names.
score_frame = pd.DataFrame({"risk_score": [new_risk_score]})
predicted_charge = score_to_charge.predict(score_frame)

print("Risk Score:", round(new_risk_score, 2))
print("Predicted Insurance Charge: ₹", round(predicted_charge[0], 2))


Risk Score: 431.17
Predicted Insurance Charge: ₹ 31247.0


