In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [8]:
# Path of the raw training data (relative path, resolved against the working directory).
insurance_data_path = 'insurance.csv'
insurance = pd.read_csv(insurance_data_path)
# Display the frame itself instead of head(<hard-coded row count>): pandas
# already truncates long frames to their first and last rows, and there is no
# literal that has to be kept in sync with the size of the file.
insurance

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19.0,female,27.900,0.0,yes,southwest,16884.924
1,18.0,male,33.770,1.0,no,Southeast,1725.5523
2,28.0,male,33.000,3.0,no,southeast,$4449.462
3,33.0,male,22.705,0.0,no,northwest,$21984.47061
4,32.0,male,28.880,0.0,no,northwest,$3866.8552
...,...,...,...,...,...,...,...
1333,50.0,male,30.970,3.0,no,Northwest,$10600.5483
1334,-18.0,female,31.920,0.0,no,Northeast,2205.9808
1335,18.0,female,36.850,0.0,no,southeast,$1629.8335
1336,21.0,female,25.800,0.0,no,southwest,2007.945


In [9]:
# Per-column overview of the loaded frame: column name, dtype, and the number
# of non-missing entries in that column.
summary = pd.DataFrame(
    {
        "Column": insurance.columns,
        "DataType": insurance.dtypes,
        "NonNullCount": insurance.count(),
    }
)
print(summary)

# Distinct raw region labels, left as the last expression so the notebook
# renders the array.
insurance["region"].unique()

            Column DataType  NonNullCount
age            age  float64          1272
sex            sex   object          1272
bmi            bmi  float64          1272
children  children  float64          1272
smoker      smoker   object          1272
region      region   object          1272
charges    charges   object          1284


array(['southwest', 'Southeast', 'southeast', 'northwest', 'Northwest',
       'Northeast', 'northeast', 'Southwest', nan], dtype=object)

In [10]:
def data_preprocessing(
    df,
    train_mode=True,
    region_categories=("northeast", "northwest", "southeast", "southwest"),
):
    """Clean, impute, encode and scale an insurance frame for modelling.

    Steps, in order:
      1. (train mode only) strip ``$`` from ``charges``, cast it to float and
         drop rows whose charge is missing.
      2. Drop rows with a negative ``age`` or ``children`` value (both modes).
      3. Impute ``age``/``bmi`` with the median and ``children`` with the mode
         of the frame being processed.
      4. Encode ``smoker`` (yes=1, no=0) and ``sex`` (male=1, female=0) after
         lower-casing and stripping; unmapped or missing values get the mode.
      5. Normalise ``region``, treat labels outside ``region_categories`` as
         missing, impute with the mode and one-hot encode against the fixed
         category list (first category dropped as the baseline).
      6. Standardise ``age``, ``bmi`` and ``children``.

    Args:
        df: Frame with columns ``age``, ``sex``, ``bmi``, ``children``,
            ``smoker`` and ``region`` (plus ``charges`` when ``train_mode``
            is true). It is not modified; a copy is processed.
        train_mode: When true, ``charges`` is cleaned and a new scaler is
            fitted and remembered. When false, the scaler remembered from the
            most recent train-mode call is reused so that the features are on
            the scale the model was trained on; if there has been no such
            call, a scaler is fitted on ``df`` itself.
        region_categories: Known region labels (lower-case). Fixing them keeps
            the dummy columns identical between calls even when a frame does
            not contain every region.

    Returns:
        A new DataFrame with the encoded and scaled features (and ``charges``
        as float in train mode).

    Raises:
        ValueError: In train mode, if a ``charges`` entry cannot be parsed as
            a float after removing ``$``.
        IndexError: If a column that is imputed with its mode has no valid
            value at all.
    """
    # Work on a private copy so the caller's frame is never mutated.
    df = df.copy()

    if train_mode:
        # Raw string: "\$" in a plain literal is an invalid escape sequence.
        # Plain column assignment (not .loc[:, ...]) lets the column take the
        # float dtype instead of staying object.
        df["charges"] = df["charges"].replace({r"\$": ""}, regex=True).astype(float)
        df = df[df["charges"].notna()]

    # Rows with a missing age/children compare as False and are kept for
    # imputation below. The copy avoids SettingWithCopyWarning on the
    # column assignments that follow.
    df = df[~((df["age"] < 0) | (df["children"] < 0))].copy()

    df["age"] = df["age"].fillna(df["age"].median())
    df["bmi"] = df["bmi"].fillna(df["bmi"].median())
    df["children"] = df["children"].fillna(df["children"].mode()[0])

    # Normalise case/whitespace before mapping, as is done for sex and region,
    # so variants such as "Yes" are encoded rather than imputed.
    df["smoker"] = df["smoker"].str.lower().str.strip().map({"yes": 1, "no": 0})
    df["smoker"] = df["smoker"].fillna(df["smoker"].mode()[0])

    region = df["region"].str.lower().str.strip()
    # Unknown labels are handled like unmapped sex values: treated as missing.
    region = region.where(region.isin(region_categories))
    region = region.fillna(region.mode()[0])
    # A categorical dtype makes get_dummies emit one column per known category
    # regardless of which categories occur in this particular frame.
    region = region.astype(pd.CategoricalDtype(categories=list(region_categories)))
    region_dummies = pd.get_dummies(region, prefix="region", drop_first=True)
    df = pd.concat([df.drop(columns=["region"]), region_dummies], axis=1)

    df["sex"] = df["sex"].str.lower().str.strip()
    sex_mapping = {
        "m": 1,
        "male": 1,
        "man": 1,
        "f": 0,
        "female": 0
    }
    df["sex"] = df["sex"].map(sex_mapping)
    df["sex"] = df["sex"].fillna(df["sex"].mode()[0])

    numeric_cols = ["age", "bmi", "children"]
    # Outside train mode, reuse the scaler fitted on the training data (if
    # any) instead of re-fitting on the frame being scored.
    scaler = None if train_mode else getattr(data_preprocessing, "_fitted_scaler", None)
    if scaler is None:
        scaler = StandardScaler()
        df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
        if train_mode:
            data_preprocessing._fitted_scaler = scaler
    else:
        df[numeric_cols] = scaler.transform(df[numeric_cols])

    return df

In [11]:
# Preprocess from the raw CSV rather than from `insurance` itself so this cell
# can be re-run safely: data_preprocessing expects raw labels such as
# "yes"/"no", and feeding it its own (already encoded and scaled) output fails.
insurance = data_preprocessing(pd.read_csv(insurance_data_path))

In [13]:
# Features are every preprocessed column except the target.
X = insurance.drop(columns=["charges"])
# Explicit float cast as a guard in case the column is still object-typed
# after the "$" clean-up; a no-op when it is already float.
y = insurance["charges"].astype(float)

model = LinearRegression()
# 5-fold cross-validation; report the per-fold scores as well as their mean.
cv_scores = cross_val_score(model, X, y, cv=5, scoring='r2')
print(f"Cross-validated R^2 scores: {cv_scores}")
print(f"Mean R^2 score: {np.mean(cv_scores)}")
# Final fit on all rows; this is the model used for later predictions.
model.fit(X, y)

Cross-validated R^2 scores: [0.69714266 0.68477326 0.77653326 0.67322763 0.76143529]
Mean R^2 score: 0.7186224197650594


In [None]:
# Predictions below this amount are raised to it.
MIN_PREDICTED_CHARGE = 1000

validation_set = pd.read_csv("validation_dataset.csv")
validation_set = data_preprocessing(validation_set, train_mode=False)
# Give the model exactly the columns it was trained on, in the same order:
# extra columns in the validation file are ignored, and a missing feature
# raises a KeyError naming it.
predictions = model.predict(validation_set[X.columns])
# Apply the floor in one vectorised step.
validation_set['predicted_charges'] = np.clip(predictions, MIN_PREDICTED_CHARGE, None)