In [78]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# Loading the insurance dataset
insurance_data_path = 'insurance.csv'
insurance = pd.read_csv(insurance_data_path)
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19.0,female,27.9,0.0,yes,southwest,16884.924
1,18.0,male,33.77,1.0,no,Southeast,1725.5523
2,28.0,male,33.0,3.0,no,southeast,$4449.462
3,33.0,male,22.705,0.0,no,northwest,$21984.47061
4,32.0,male,28.88,0.0,no,northwest,$3866.8552


In [79]:
print(insurance.head())
print(insurance.isna().sum())

    age     sex     bmi  children smoker     region       charges
0  19.0  female  27.900       0.0    yes  southwest     16884.924
1  18.0    male  33.770       1.0     no  Southeast     1725.5523
2  28.0    male  33.000       3.0     no  southeast     $4449.462
3  33.0    male  22.705       0.0     no  northwest  $21984.47061
4  32.0    male  28.880       0.0     no  northwest    $3866.8552
age         66
sex         66
bmi         66
children    66
smoker      66
region      66
charges     54
dtype: int64


In [80]:
insurance['sex'] = insurance['sex'].map({'male': 0, 'female': 1})
insurance['smoker'] = insurance['smoker'].map({'no': 0, 'yes': 1})
sex_mode = insurance['sex'].mode()[0]  # Get the most frequent value
insurance['sex'].fillna(sex_mode, inplace=True)  # Impute missing values
print(insurance.head())

    age  sex     bmi  children  smoker     region       charges
0  19.0  1.0  27.900       0.0     1.0  southwest     16884.924
1  18.0  0.0  33.770       1.0     0.0  Southeast     1725.5523
2  28.0  0.0  33.000       3.0     0.0  southeast     $4449.462
3  33.0  0.0  22.705       0.0     0.0  northwest  $21984.47061
4  32.0  0.0  28.880       0.0     0.0  northwest    $3866.8552


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  insurance['sex'].fillna(sex_mode, inplace=True)  # Impute missing values


In [81]:
print(insurance['region'].values)
insurance['region']=insurance['region'].str.lower()
print(insurance['region'].value_counts())
region_mode = insurance['region'].mode()[0]  # Get the most frequent value
insurance['region'].fillna(region_mode, inplace=True)  # Impute missing values
insurance = pd.get_dummies(insurance, columns=['region'], drop_first=True)
print(insurance.head())


['southwest' 'Southeast' 'southeast' ... 'southeast' 'southwest'
 'northwest']
region
southeast    342
southwest    312
northwest    310
northeast    308
Name: count, dtype: int64
    age  sex     bmi  children  smoker       charges  region_northwest  \
0  19.0  1.0  27.900       0.0     1.0     16884.924             False   
1  18.0  0.0  33.770       1.0     0.0     1725.5523             False   
2  28.0  0.0  33.000       3.0     0.0     $4449.462             False   
3  33.0  0.0  22.705       0.0     0.0  $21984.47061              True   
4  32.0  0.0  28.880       0.0     0.0    $3866.8552              True   

   region_southeast  region_southwest  
0             False              True  
1              True             False  
2              True             False  
3             False             False  
4             False             False  


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  insurance['region'].fillna(region_mode, inplace=True)  # Impute missing values


In [82]:
print(insurance['charges'].dtype)
insurance['charges'] = insurance['charges'].str.replace("$","").astype("double")
print(insurance['charges'])


object
0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64


In [83]:
print(insurance['age'].isna().sum() )
insurance['age'] = insurance['age'].fillna(insurance['age'].mean())
insurance['age'] = insurance['age'].apply(lambda x: abs(x)).astype("int")
print(insurance['age'])
print(insurance['age'].isna().sum() )

66
0       19
1       18
2       28
3       33
4       32
        ..
1333    50
1334    18
1335    18
1336    21
1337    61
Name: age, Length: 1338, dtype: int32
0


In [None]:
insurance['bmi']=insurance['bmi'].fillna(insurance['bmi'].mean())
insurance['children']=insurance['children'].fillna(insurance['children'].mean())
smoker_mode = insurance['smoker'].mode()[0]  # Get the most frequent value
insurance['smoker'].fillna(smoker_mode, inplace = True)  # Impute missing values
insurance['charges']=insurance['charges'].fillna(insurance['charges'].mean())

In [85]:
model = LinearRegression()
X_train = insurance.drop(columns='charges')

y_train = insurance["charges"]
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
y_train_scaled = scaler.fit_transform(y_train.values.reshape(-1, 1))

In [86]:
model.fit(X_train_scaled,y_train_scaled)
y_train_pred_scaled = model.predict(X_train_scaled)
r2_score = r2_score(y_train_scaled, y_train_pred_scaled)
print("R-Squared Score:", r2_score)

R-Squared Score: 0.7039153630279834


In [87]:
validation_data = pd.read_csv("validation_dataset.csv")

print(validation_data.head())

    age     sex        bmi  children smoker     region
0  18.0  female  24.090000       1.0     no  southeast
1  39.0    male  26.410000       0.0    yes  northeast
2  27.0    male  29.150000       0.0    yes  southeast
3  71.0    male  65.502135      13.0    yes  southeast
4  28.0    male  38.060000       0.0     no  southeast


In [None]:
validation_data['sex'] = validation_data['sex'].map({'male': 0, 'female': 1})
validation_data['smoker'] = validation_data['smoker'].map({'no': 0, 'yes': 1})
sex_mode = validation_data['sex'].mode()[0]  # Get the most frequent value
validation_data['sex'].fillna(sex_mode, inplace=True)  # Impute missing values

validation_data['region']=validation_data['region'].str.lower()

region_mode = validation_data['region'].mode()[0]  # Get the most frequent value
validation_data['region'].fillna(region_mode, inplace=True)  # Impute missing values
validation_data = pd.get_dummies(validation_data, columns=['region'], drop_first=True)




validation_data['age'] = validation_data['age'].fillna(validation_data['age'].mean())
validation_data['age'] = validation_data['age'].apply(lambda x: abs(x)).astype("int")


validation_data['bmi']=validation_data['bmi'].fillna(validation_data['bmi'].mean())
validation_data['children']=validation_data['children'].fillna(validation_data['children'].mean())
smoker_mode = validation_data['smoker'].mode()[0]  # Get the most frequent value
validation_data['smoker'].fillna(smoker_mode, inplace=True)  # Impute missing values


In [89]:
predicted_charges  = model.predict(validation_data)*1000
predicted_charges = np.where(predicted_charges < 1000, 1000, predicted_charges)
validation_data["predicted_charges"] = predicted_charges
print(validation_data)



    age  sex        bmi  children  smoker  region_northwest  region_southeast  \
0    18    1  24.090000       1.0       0             False              True   
1    39    0  26.410000       0.0       1             False             False   
2    27    0  29.150000       0.0       1             False              True   
3    71    0  65.502135      13.0       1             False              True   
4    28    0  38.060000       0.0       0             False              True   
5    70    1  72.958351      11.0       1             False              True   
6    29    1  32.110000       2.0       0              True             False   
7    42    1  41.325000       1.0       0             False             False   
8    48    1  36.575000       0.0       0              True             False   
9    63    0  33.660000       3.0       0             False              True   
10   27    0  18.905000       3.0       0             False             False   
11   51    1  36.670000     