In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# Loading the insurance dataset
insurance_data_path = 'insurance.csv'
insurance = pd.read_csv(insurance_data_path)
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19.0,female,27.9,0.0,yes,southwest,16884.924
1,18.0,male,33.77,1.0,no,Southeast,1725.5523
2,28.0,male,33.0,3.0,no,southeast,$4449.462
3,33.0,male,22.705,0.0,no,northwest,$21984.47061
4,32.0,male,28.88,0.0,no,northwest,$3866.8552


In [14]:
print(insurance)
print(insurance.isna().sum())

       age     sex     bmi  children smoker     region       charges
0     19.0  female  27.900       0.0    yes  southwest     16884.924
1     18.0    male  33.770       1.0     no  Southeast     1725.5523
2     28.0    male  33.000       3.0     no  southeast     $4449.462
3     33.0    male  22.705       0.0     no  northwest  $21984.47061
4     32.0    male  28.880       0.0     no  northwest    $3866.8552
...    ...     ...     ...       ...    ...        ...           ...
1333  50.0    male  30.970       3.0     no  Northwest   $10600.5483
1334 -18.0  female  31.920       0.0     no  Northeast     2205.9808
1335  18.0  female  36.850       0.0     no  southeast    $1629.8335
1336  21.0  female  25.800       0.0     no  southwest      2007.945
1337  61.0  female  29.070       0.0    yes  northwest    29141.3603

[1338 rows x 7 columns]
age         66
sex         66
bmi         66
children    66
smoker      66
region      66
charges     54
dtype: int64


In [15]:
# Map 'sex' column to numerical values: 'male' -> 0, 'female' -> 1
insurance['sex'] = insurance['sex'].map({'male': 0, 'female': 1})

# Map 'smoker' column to numerical values: 'no' -> 0, 'yes' -> 1
insurance['smoker'] = insurance['smoker'].map({'no': 0, 'yes': 1})

# Get the most frequent value in the 'sex' column
sex_mode = insurance['sex'].mode()[0]

# Impute missing values in the 'sex' column with the most frequent value
insurance['sex'].fillna(sex_mode, inplace=True)

# Display the first few rows of the dataframe to verify changes
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19.0,1.0,27.9,0.0,1.0,southwest,16884.924
1,18.0,0.0,33.77,1.0,0.0,Southeast,1725.5523
2,28.0,0.0,33.0,3.0,0.0,southeast,$4449.462
3,33.0,0.0,22.705,0.0,0.0,northwest,$21984.47061
4,32.0,0.0,28.88,0.0,0.0,northwest,$3866.8552


In [16]:
# Print the unique values in the 'region' column
print(insurance['region'].values)

# Convert all values in the 'region' column to lowercase
insurance['region'] = insurance['region'].str.lower()

# Print the count of each unique value in the 'region' column
print(insurance['region'].value_counts())

# Get the most frequent value in the 'region' column
region_mode = insurance['region'].mode()[0]

# Impute missing values in the 'region' column with the most frequent value
insurance['region'].fillna(region_mode, inplace=True)

# Convert categorical 'region' column to dummy/indicator variables, dropping the first category to avoid multicollinearity
insurance = pd.get_dummies(insurance, columns=['region'], drop_first=True)

print(insurance)

['southwest' 'Southeast' 'southeast' ... 'southeast' 'southwest'
 'northwest']
southeast    342
southwest    312
northwest    310
northeast    308
Name: region, dtype: int64
       age  sex     bmi  ...  region_northwest  region_southeast region_southwest
0     19.0  1.0  27.900  ...                 0                 0                1
1     18.0  0.0  33.770  ...                 0                 1                0
2     28.0  0.0  33.000  ...                 0                 1                0
3     33.0  0.0  22.705  ...                 1                 0                0
4     32.0  0.0  28.880  ...                 1                 0                0
...    ...  ...     ...  ...               ...               ...              ...
1333  50.0  0.0  30.970  ...                 1                 0                0
1334 -18.0  1.0  31.920  ...                 0                 0                0
1335  18.0  1.0  36.850  ...                 0                 1                0
1336  

In [17]:
print(insurance['charges'].dtype)
insurance['charges'] = insurance['charges'].str.replace("$","").astype("double")
print(insurance['charges'])


object
0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64


In [18]:
print(insurance['age'].isna().sum() )
insurance['age'] = insurance['age'].fillna(insurance['age'].mean())
insurance['age'] = insurance['age'].apply(lambda x: abs(x)).astype("int")
print(insurance['age'])
print(insurance['age'].isna().sum() )

66
0       19
1       18
2       28
3       33
4       32
        ..
1333    50
1334    18
1335    18
1336    21
1337    61
Name: age, Length: 1338, dtype: int64
0


In [19]:
insurance['bmi']=insurance['bmi'].fillna(insurance['bmi'].mean())
insurance['children']=insurance['children'].fillna(insurance['children'].mean())
smoker_mode = insurance['smoker'].mode()[0]  # Get the most frequent value
insurance['smoker'].fillna(smoker_mode, inplace=True)  # Impute missing values
insurance['charges']=insurance['charges'].fillna(insurance['charges'].mean())

In [20]:
# Import the LinearRegression model from sklearn
model = LinearRegression()

# Separate the features (X) and the target variable (y) from the insurance DataFrame
X_train = insurance.drop(columns='charges')  # Features
y_train = insurance["charges"]  # Target variable

# Initialize the StandardScaler for feature scaling
scaler = StandardScaler()

# Fit the scaler on the features and transform them
X_train_scaled = scaler.fit_transform(X_train)

# Fit the scaler on the target variable and transform it
# Note: We reshape y_train to a 2D array before scaling
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1))

In [21]:
# Fit the Linear Regression model on the scaled training data
model.fit(X_train_scaled, y_train_scaled)

# Predict the target variable for the training data using the fitted model
y_train_pred_scaled = model.predict(X_train_scaled)

# Calculate the R-squared score to evaluate the model's performance
r2_score = r2_score(y_train_scaled, y_train_pred_scaled)

# Print the R-squared score
print("R-Squared Score:", r2_score)

R-Squared Score: 0.7039153630279834


In [22]:
validation_data = pd.read_csv("validation_dataset.csv")

print(validation_data.head())

    age     sex        bmi  children smoker     region
0  18.0  female  24.090000       1.0     no  southeast
1  39.0    male  26.410000       0.0    yes  northeast
2  27.0    male  29.150000       0.0    yes  southeast
3  71.0    male  65.502135      13.0    yes  southeast
4  28.0    male  38.060000       0.0     no  southeast


In [23]:
validation_data['sex'] = validation_data['sex'].map({'male': 0, 'female': 1})
validation_data['smoker'] = validation_data['smoker'].map({'no': 0, 'yes': 1})
sex_mode = validation_data['sex'].mode()[0]  # Get the most frequent value
validation_data['sex'].fillna(sex_mode, inplace=True)  # Impute missing values

validation_data['region']=validation_data['region'].str.lower()

region_mode = validation_data['region'].mode()[0]  # Get the most frequent value
validation_data['region'].fillna(region_mode, inplace=True)  # Impute missing values
validation_data = pd.get_dummies(validation_data, columns=['region'], drop_first=True)


validation_data['age'] = validation_data['age'].fillna(validation_data['age'].mean())
validation_data['age'] = validation_data['age'].apply(lambda x: abs(x)).astype("int")


validation_data['bmi']=validation_data['bmi'].fillna(validation_data['bmi'].mean())
validation_data['children']=validation_data['children'].fillna(validation_data['children'].mean())
smoker_mode = validation_data['smoker'].mode()[0]  # Get the most frequent value
validation_data['smoker'].fillna(smoker_mode, inplace=True)  # Impute missing values


In [24]:
# Predict charges using the model and scale the predictions by 1000
predicted_charges = model.predict(validation_data) * 1000

# Ensure that the minimum predicted charge is 1000
predicted_charges = np.where(predicted_charges < 1000, 1000, predicted_charges)

# Add the predicted charges to the validation_data DataFrame
validation_data["predicted_charges"] = predicted_charges

# Display the validation_data DataFrame
validation_data

Unnamed: 0,age,sex,bmi,children,smoker,region_northwest,region_southeast,region_southwest,predicted_charges
0,18,1,24.09,1.0,0,0,1,0,8978.172018
1,39,0,26.41,0.0,1,0,0,0,16240.359964
2,27,0,29.15,0.0,1,0,1,0,13138.745225
3,71,0,65.502135,13.0,1,0,1,0,32040.288769
4,28,0,38.06,0.0,0,0,1,0,14022.988473
5,70,1,72.958351,11.0,1,0,1,0,32807.19293
6,29,1,32.11,2.0,0,1,0,0,13452.562747
7,42,1,41.325,1.0,0,0,0,0,18651.798103
8,48,1,36.575,0.0,0,1,0,0,19634.733064
9,63,0,33.66,3.0,0,0,1,0,23709.816206
