In [2]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# Loading the insurance dataset
insurance_data_path = 'insurance.csv'
insurance = pd.read_csv(insurance_data_path)
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19.0,female,27.9,0.0,yes,southwest,16884.924
1,18.0,male,33.77,1.0,no,Southeast,1725.5523
2,28.0,male,33.0,3.0,no,southeast,$4449.462
3,33.0,male,22.705,0.0,no,northwest,$21984.47061
4,32.0,male,28.88,0.0,no,northwest,$3866.8552


In [3]:
# Check for missing values in the dataset
print(insurance.isna().sum())

age         66
sex         66
bmi         66
children    66
smoker      66
region      66
charges     54
dtype: int64


In [4]:
# Map categorical variables ('sex' and 'smoker') to numerical values
insurance['sex'] = insurance['sex'].map({'male': 0, 'female': 1})
insurance['smoker'] = insurance['smoker'].map({'no': 0, 'yes': 1})

# Handle missing values for the 'sex' column by imputing with the mode
sex_mode = insurance['sex'].mode()[0]  # Get the most frequent value
insurance['sex'].fillna(sex_mode, inplace=True)

In [5]:
# Standardize the 'region' column (convert to lowercase for consistency)
print(insurance['region'].values)
insurance['region'] = insurance['region'].str.lower()

# Impute missing values in the 'region' column
print(insurance['region'].value_counts())
region_mode = insurance['region'].mode()[0]  # Get the most frequent value
insurance['region'].fillna(region_mode, inplace=True)

# Convert categorical 'region' column into dummy variables
insurance = pd.get_dummies(insurance, columns=['region'], drop_first=True)


['southwest' 'Southeast' 'southeast' ... 'southeast' 'southwest'
 'northwest']
region
southeast    342
southwest    312
northwest    310
northeast    308
Name: count, dtype: int64


In [6]:
# Impute missing values in the 'age' column
# Replace missing values with the mean age to ensure no gaps in the data
# Use absolute values to handle any potential negative ages (data quality issue)
# Convert 'age' to an integer type for consistency
insurance['age'] = insurance['age'].fillna(insurance['age'].mean())
insurance['age'] = insurance['age'].apply(lambda x: abs(x)).astype("int")

In [7]:
# Impute missing values in the 'bmi' column
# Replace missing values with the mean BMI to handle gaps in the data
insurance['bmi'] = insurance['bmi'].fillna(insurance['bmi'].mean())

In [8]:
# Impute missing values in the 'children' column
# Replace missing values with the mean number of children to maintain numerical consistency
insurance['children'] = insurance['children'].fillna(insurance['children'].mean())


In [9]:
# Impute missing values in the 'smoker' column
# Replace missing values with the most frequent value (mode) to reflect the majority behavior
smoker_mode = insurance['smoker'].mode()[0]  # Find the most common value
insurance['smoker'].fillna(smoker_mode, inplace=True)

In [10]:
# Remove dollar sign from 'charges' and convert to numeric
insurance['charges'] = insurance['charges'].str.replace("$", "").astype("double")

# Impute missing values in the 'charges' column
# Replace missing values with the mean charge to ensure no gaps in the target variable
insurance['charges'] = insurance['charges'].fillna(insurance['charges'].mean())

In [11]:
# Prepare the model for training
model = LinearRegression()

# Define features (X) and target (y)
X_train = insurance.drop(columns='charges')  # Features
y_train = insurance["charges"]  # Target

In [12]:
# Scale features and target for better model performance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Scale features
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1))  # Scale target

In [13]:
# Train the linear regression model
model.fit(X_train_scaled, y_train_scaled)

In [14]:
# Predict on training data and evaluate the model
y_train_pred_scaled = model.predict(X_train_scaled)
r2_score_value = r2_score(y_train_scaled, y_train_pred_scaled)  # Calculate R-squared score
print("R-Squared Score:", r2_score_value)

R-Squared Score: 0.7039153630279834


In [15]:
# Load validation dataset for predictions
validation_data = pd.read_csv("validation_dataset.csv")
print(validation_data.head())

    age     sex        bmi  children smoker     region
0  18.0  female  24.090000       1.0     no  southeast
1  39.0    male  26.410000       0.0    yes  northeast
2  27.0    male  29.150000       0.0    yes  southeast
3  71.0    male  65.502135      13.0    yes  southeast
4  28.0    male  38.060000       0.0     no  southeast


In [16]:
# Preprocess the validation dataset
validation_data['sex'] = validation_data['sex'].map({'male': 0, 'female': 1})
validation_data['smoker'] = validation_data['smoker'].map({'no': 0, 'yes': 1})
sex_mode = validation_data['sex'].mode()[0]
validation_data['sex'].fillna(sex_mode, inplace=True)

validation_data['region'] = validation_data['region'].str.lower()
region_mode = validation_data['region'].mode()[0]
validation_data['region'].fillna(region_mode, inplace=True)
validation_data = pd.get_dummies(validation_data, columns=['region'], drop_first=True)

validation_data['age'] = validation_data['age'].fillna(validation_data['age'].mean())
validation_data['age'] = validation_data['age'].apply(lambda x: abs(x)).astype("int")
validation_data['bmi'] = validation_data['bmi'].fillna(validation_data['bmi'].mean())
validation_data['children'] = validation_data['children'].fillna(validation_data['children'].mean())
smoker_mode = validation_data['smoker'].mode()[0]
validation_data['smoker'].fillna(smoker_mode, inplace=True)

In [17]:
# Predict insurance charges for validation data
predicted_charges = model.predict(validation_data) * 1000  # Scale predictions back
predicted_charges = np.where(predicted_charges < 1000, 1000, predicted_charges)  # Minimum charge threshold

In [18]:
# Add predictions to the validation data
validation_data["predicted_charges"] = predicted_charges
print(validation_data)

    age  sex        bmi  ...  region_southeast  region_southwest  predicted_charges
0    18    1  24.090000  ...              True             False        8978.172018
1    39    0  26.410000  ...             False             False       16240.359964
2    27    0  29.150000  ...              True             False       13138.745225
3    71    0  65.502135  ...              True             False       32040.288769
4    28    0  38.060000  ...              True             False       14022.988473
5    70    1  72.958351  ...              True             False       32807.192930
6    29    1  32.110000  ...             False             False       13452.562747
7    42    1  41.325000  ...             False             False       18651.798103
8    48    1  36.575000  ...             False             False       19634.733064
9    63    0  33.660000  ...              True             False       23709.816206
10   27    0  18.905000  ...             False             False       10907