In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the medical cost CSV into a DataFrame and preview its first rows.
DATA_PATH = '/content/medical_cost.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Id,age,sex,bmi,children,smoker,region,charges
0,1,19,female,27.9,0,yes,southwest,16884.924
1,2,18,male,33.77,1,no,southeast,1725.5523
2,3,28,male,33.0,3,no,southeast,4449.462
3,4,33,male,22.705,0,no,northwest,21984.47061
4,5,32,male,28.88,0,no,northwest,3866.8552


In [None]:
# Remove the identifier and region columns, then report the remaining (rows, columns).
unused_columns = ['Id', 'region']
df = df.drop(unused_columns, axis=1)
df.shape

(1338, 6)

In [None]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
charges     0
dtype: int64

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 62.8+ KB


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [None]:
# Feature engineering: encode the smoker flag as an integer (yes -> 1, no -> 0).
# A bare .map() is not safe to re-run: on a second execution the column already
# holds 0/1, no dictionary key matches, and every value silently becomes NaN.
# Guarding on the dtype makes this cell idempotent.
if not pd.api.types.is_numeric_dtype(df['smoker']):
    smoker_encoded = df['smoker'].map({'yes': 1, 'no': 0})
    # Any label other than 'yes'/'no' maps to NaN; fail loudly here rather than
    # passing missing values on to the model.
    if smoker_encoded.isna().any():
        raise ValueError("smoker column contains values other than 'yes'/'no'")
    df['smoker'] = smoker_encoded

In [None]:
# Spot-check ten random rows after the smoker encoding. The fixed seed (the same
# value used for the train/test split) keeps the displayed rows identical across
# Restart & Run All instead of changing on every execution.
df.sample(10, random_state=50)

Unnamed: 0,age,sex,bmi,children,smoker,charges
1192,58,female,32.395,1,0,13019.16105
833,58,male,34.39,0,0,11743.9341
1293,46,male,25.745,3,0,9301.89355
1109,45,male,20.35,3,0,8605.3615
802,21,male,22.3,1,0,2103.08
378,64,female,30.115,3,0,16455.70785
317,54,male,32.775,0,0,10435.06525
343,63,male,36.765,0,0,13981.85035
1211,39,male,34.1,2,0,23563.01618
983,27,female,30.59,1,0,16796.41194


In [None]:
# Discard the sex column and confirm the resulting (rows, columns).
df = df.drop('sex', axis='columns')
df.shape

(1338, 5)

In [None]:
# Separate the predictor columns from the regression target.
feature_columns = ['age', 'bmi', 'children', 'smoker']
target_column = 'charges'
X = df[feature_columns]
y = df[target_column]

In [None]:
# Keep 80% of the rows for training and hold out the rest for testing;
# the fixed seed makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=50,
)
# Report the (rows, columns) of each feature partition.
for features in (X_train, X_test):
    print(features.shape)

(1070, 4)
(268, 4)


In [None]:
# Fit a linear regression on the training split.
# fit() returns the estimator itself, so construction and training chain into
# one expression; the bare name on the last line displays the fitted model.
lr_model = LinearRegression().fit(X_train, y_train)
lr_model

In [None]:
# Predict charges for the held-out test features.
# NOTE: the linear model's output is unbounded, so a few predictions come out
# negative (see the output below) even though every observed charge is positive
# (describe() reports a minimum of about 1121.87).
y_predict = lr_model.predict(X_test)
y_predict

array([ 8006.6966127 , 11661.14105581, 17245.89901012, 11282.92858129,
         433.14185864, 38728.99916608, 26216.5728887 ,  7747.21084863,
         820.55875406, 11535.5620035 ,  9273.4524188 , 32112.92983658,
        8172.35663107, 30866.29570727,  7601.23118546,  9490.2268413 ,
       11578.987719  ,  4735.50528083, 12326.37118117,  8650.25434057,
       -1300.51302513, 36566.23218274,  6683.20506819,  8828.04549254,
        4241.7307814 , 10924.53574683,  5405.55519996, 38058.08970644,
         616.80972085,   109.41173146, 37209.03503087,  4265.91158158,
       14463.00021936,  1360.6856817 ,  1273.21286518,  6111.00530298,
        9287.43603658,  1274.76919435, 11866.60328834, 39063.32729146,
        9766.9248815 ,  6748.50634778, 16917.88321704, 13262.48054542,
       35336.6711358 , 13945.64302041, 34104.71741292,  4494.24802141,
       12504.10424206,  9765.2931492 , 39910.59336457,  9453.89153158,
        6516.83085805, 37948.44535298, 11499.64482162, 35289.47647911,
      

In [None]:
# Score of the fitted model on the training split.
# NOTE: despite the variable name this is not a classification accuracy; for a
# regressor, score() is the R^2 coefficient of determination (on the test split
# the same call yields exactly the value r2_score prints in the metrics cell).
train_accuracy = lr_model.score(X_train, y_train)
train_accuracy

0.7393654059979747

In [None]:
# R^2 of the model on the held-out test split; the value is identical to the
# r2_score figure printed in the metrics cell, so "accuracy" here means R^2.
test_accuracy = lr_model.score(X_test, y_test)
test_accuracy

0.7842410709074031

In [None]:
# Error metrics for the predictions on the held-out test split.
# The metric functions are imported once in the notebook's first cell, so this
# cell no longer repeats (or scatters) imports.
# Keyword arguments make the (true, predicted) order explicit; MAE and MSE are
# symmetric in their arguments but R^2 is not.
print('Mean absolute error = ', mean_absolute_error(y_true=y_test, y_pred=y_predict))
print('Mean square error = ', mean_squared_error(y_true=y_test, y_pred=y_predict))
print('R2 Score = ', r2_score(y_true=y_test, y_pred=y_predict))

Mean absolute error =  3972.0995171524482
Mean square error =  34584351.96422181
R2 Score =  0.7842410709074031
