In [1]:
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
%matplotlib inline

In [2]:
# Load the insurance dataset from the working directory and preview it.
df = pd.read_csv('insurance.csv')
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [3]:
# Count the missing values in each column of the dataset.
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [4]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
# Encode sex as a binary indicator: 1 for 'male', 0 for every other value.
# Matching both the raw label and the already-encoded 1 keeps this cell
# idempotent: the previous `x == 'male'` test turned every row into 0 when
# the cell was run a second time, because the column then held integers.
df['sex'] = df['sex'].isin(['male', 1]).astype('int64')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


In [6]:
# Encode smoker as a binary indicator: 1 for 'yes', 0 for every other value.
# Matching both the raw label and the already-encoded 1 keeps this cell
# idempotent: the previous `x == 'yes'` test turned every row into 0 when
# the cell was run a second time, because the column then held integers.
df['smoker'] = df['smoker'].isin(['yes', 1]).astype('int64')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


In [7]:
# One-hot encode the region column. drop_first=True omits the first category,
# so a row with all three indicators at 0 belongs to that dropped region.
# The dtype is pinned to an integer type: pandas >= 2.0 returns boolean
# dummies by default, and a frame mixing bool and float columns converts to
# an object array, which Keras cannot turn into a tensor.
Region = pd.get_dummies(df['region'], drop_first=True, dtype='int64')
Region

Unnamed: 0,northwest,southeast,southwest
0,0,0,1
1,0,1,0
2,0,1,0
3,1,0,0
4,1,0,0
...,...,...,...
1333,1,0,0
1334,0,0,0
1335,0,1,0
1336,0,0,1


In [8]:
# Place the one-hot region indicators to the right of the original columns.
new_df = pd.concat((df, Region), axis='columns')
new_df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,northwest,southeast,southwest
0,19,0,27.900,0,1,southwest,16884.92400,0,0,1
1,18,1,33.770,1,0,southeast,1725.55230,0,1,0
2,28,1,33.000,3,0,southeast,4449.46200,0,1,0
3,33,1,22.705,0,0,northwest,21984.47061,1,0,0
4,32,1,28.880,0,0,northwest,3866.85520,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,northwest,10600.54830,1,0,0
1334,18,0,31.920,0,0,northeast,2205.98080,0,0,0
1335,18,0,36.850,0,0,southeast,1629.83350,0,1,0
1336,21,0,25.800,0,0,southwest,2007.94500,0,0,1


In [9]:
# Drop the text region column; its one-hot indicators are already in new_df.
# Reassigning instead of using inplace=True, together with errors='ignore',
# lets this cell be re-run without a KeyError once the column is gone.
new_df = new_df.drop(columns='region', errors='ignore')
new_df

Unnamed: 0,age,sex,bmi,children,smoker,charges,northwest,southeast,southwest
0,19,0,27.900,0,1,16884.92400,0,0,1
1,18,1,33.770,1,0,1725.55230,0,1,0
2,28,1,33.000,3,0,4449.46200,0,1,0
3,33,1,22.705,0,0,21984.47061,1,0,0
4,32,1,28.880,0,0,3866.85520,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,10600.54830,1,0,0
1334,18,0,31.920,0,0,2205.98080,0,0,0
1335,18,0,36.850,0,0,1629.83350,0,1,0
1336,21,0,25.800,0,0,2007.94500,0,0,1


# SCALING THE AGE, BMI, CHILDREN AND CHARGES COLUMNS TO BETWEEN 0 AND 1

In [10]:
# Scale age into the 0-1 range by dividing by 100.
# The value is read from the untouched `df` rather than from new_df itself,
# so re-running this cell does not divide an already-scaled column again.
new_df['age'] = df['age'] / 100
new_df

Unnamed: 0,age,sex,bmi,children,smoker,charges,northwest,southeast,southwest
0,0.19,0,27.900,0,1,16884.92400,0,0,1
1,0.18,1,33.770,1,0,1725.55230,0,1,0
2,0.28,1,33.000,3,0,4449.46200,0,1,0
3,0.33,1,22.705,0,0,21984.47061,1,0,0
4,0.32,1,28.880,0,0,3866.85520,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,0.50,1,30.970,3,0,10600.54830,1,0,0
1334,0.18,0,31.920,0,0,2205.98080,0,0,0
1335,0.18,0,36.850,0,0,1629.83350,0,1,0
1336,0.21,0,25.800,0,0,2007.94500,0,0,1


In [11]:
# Scale bmi into the 0-1 range by dividing by 55.
# The value is read from the untouched `df` rather than from new_df itself,
# so re-running this cell does not divide an already-scaled column again.
new_df['bmi'] = df['bmi'] / 55
new_df

Unnamed: 0,age,sex,bmi,children,smoker,charges,northwest,southeast,southwest
0,0.19,0,0.507273,0,1,16884.92400,0,0,1
1,0.18,1,0.614000,1,0,1725.55230,0,1,0
2,0.28,1,0.600000,3,0,4449.46200,0,1,0
3,0.33,1,0.412818,0,0,21984.47061,1,0,0
4,0.32,1,0.525091,0,0,3866.85520,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,0.50,1,0.563091,3,0,10600.54830,1,0,0
1334,0.18,0,0.580364,0,0,2205.98080,0,0,0
1335,0.18,0,0.670000,0,0,1629.83350,0,1,0
1336,0.21,0,0.469091,0,0,2007.94500,0,0,1


In [12]:
# First-pass scaling of charges: divide by 45000.
# The value is read from the untouched `df` rather than from new_df itself,
# so re-running this cell does not divide an already-scaled column again.
new_df['charges'] = df['charges'] / 45000
new_df

Unnamed: 0,age,sex,bmi,children,smoker,charges,northwest,southeast,southwest
0,0.19,0,0.507273,0,1,0.375221,0,0,1
1,0.18,1,0.614000,1,0,0.038346,0,1,0
2,0.28,1,0.600000,3,0,0.098877,0,1,0
3,0.33,1,0.412818,0,0,0.488544,1,0,0
4,0.32,1,0.525091,0,0,0.085930,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,0.50,1,0.563091,3,0,0.235568,1,0,0
1334,0.18,0,0.580364,0,0,0.049022,0,0,0
1335,0.18,0,0.670000,0,0,0.036219,0,1,0
1336,0.21,0,0.469091,0,0,0.044621,0,0,1


In [15]:
# Scale children into the 0-1 range by dividing by 7.
# The value is read from the untouched `df` rather than from new_df itself,
# so re-running this cell does not divide an already-scaled column again.
new_df['children'] = df['children'] / 7
new_df

Unnamed: 0,age,sex,bmi,children,smoker,charges,northwest,southeast,southwest
0,0.19,0,0.507273,0.000000,1,0.375221,0,0,1
1,0.18,1,0.614000,0.142857,0,0.038346,0,1,0
2,0.28,1,0.600000,0.428571,0,0.098877,0,1,0
3,0.33,1,0.412818,0.000000,0,0.488544,1,0,0
4,0.32,1,0.525091,0.000000,0,0.085930,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,0.50,1,0.563091,0.428571,0,0.235568,1,0,0
1334,0.18,0,0.580364,0.000000,0,0.049022,0,0,0
1335,0.18,0,0.670000,0.000000,0,0.036219,0,1,0
1336,0.21,0,0.469091,0.000000,0,0.044621,0,0,1


In [16]:
# Summary statistics per column; the min/max rows show whether each scaled
# column actually lies between 0 and 1.
new_df.describe()

Unnamed: 0,age,sex,bmi,children,smoker,charges,northwest,southeast,southwest
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,0.39207,0.505232,0.557516,0.156417,0.204783,0.294898,0.2429,0.272048,0.2429
std,0.1405,0.50016,0.110876,0.172213,0.403694,0.269111,0.428995,0.445181,0.428995
min,0.18,0.0,0.290182,0.0,0.0,0.024931,0.0,0.0,0.0
25%,0.27,0.0,0.478114,0.0,0.0,0.10534,0.0,0.0,0.0
50%,0.39,1.0,0.552727,0.142857,0.0,0.20849,0.0,0.0,0.0
75%,0.51,1.0,0.630795,0.285714,0.0,0.369776,0.0,1.0,0.0
max,0.64,1.0,0.966,0.714286,1.0,1.417121,1.0,1.0,1.0


# The max value of the charges column is 1.417121, which is not between 0 and 1

In [17]:
# Halve the charges scale so its maximum falls below 1. The combined divisor
# is the earlier 45000 times 2, applied to the raw charges in `df`, so the
# result is the same as halving the scaled column once, but re-running this
# cell no longer halves the column again.
new_df['charges'] = df['charges'] / (45000 * 2)

In [18]:
# Re-check the summary statistics after rescaling charges.
new_df.describe()

Unnamed: 0,age,sex,bmi,children,smoker,charges,northwest,southeast,southwest
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,0.39207,0.505232,0.557516,0.156417,0.204783,0.147449,0.2429,0.272048,0.2429
std,0.1405,0.50016,0.110876,0.172213,0.403694,0.134556,0.428995,0.445181,0.428995
min,0.18,0.0,0.290182,0.0,0.0,0.012465,0.0,0.0,0.0
25%,0.27,0.0,0.478114,0.0,0.0,0.05267,0.0,0.0,0.0
50%,0.39,1.0,0.552727,0.142857,0.0,0.104245,0.0,0.0,0.0
75%,0.51,1.0,0.630795,0.285714,0.0,0.184888,0.0,1.0,0.0
max,0.64,1.0,0.966,0.714286,1.0,0.70856,1.0,1.0,1.0


Now all the columns have been scaled to between 0 and 1

# TRAIN TEST SPLIT

In [19]:
# Target: the scaled charges column. Features: every other column of new_df.
y = new_df['charges']
X = new_df.drop(columns='charges')

In [20]:
# Feature matrix: new_df without the charges column.
X

Unnamed: 0,age,sex,bmi,children,smoker,northwest,southeast,southwest
0,0.19,0,0.507273,0.000000,1,0,0,1
1,0.18,1,0.614000,0.142857,0,0,1,0
2,0.28,1,0.600000,0.428571,0,0,1,0
3,0.33,1,0.412818,0.000000,0,1,0,0
4,0.32,1,0.525091,0.000000,0,1,0,0
...,...,...,...,...,...,...,...,...
1333,0.50,1,0.563091,0.428571,0,1,0,0
1334,0.18,0,0.580364,0.000000,0,0,0,0
1335,0.18,0,0.670000,0.000000,0,0,1,0
1336,0.21,0,0.469091,0.000000,0,0,0,1


In [21]:
# Target vector: the scaled charges column of new_df.
y

0       0.187610
1       0.019173
2       0.049438
3       0.244272
4       0.042965
          ...   
1333    0.117784
1334    0.024511
1335    0.018109
1336    0.022311
1337    0.323793
Name: charges, Length: 1338, dtype: float64

In [39]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=10,
)

In [40]:
# (rows, columns) of the training feature matrix.
X_train.shape

(1070, 8)

In [41]:
# (rows, columns) of the held-out test feature matrix.
X_test.shape

(268, 8)

In [45]:
# Seed TensorFlow's global random generator so that weight initialisation
# (and, as far as TensorFlow allows, batch shuffling) repeats between runs.
tf.random.set_seed(10)

# Regression network: one hidden layer of 500 ReLU units feeding a single
# linear output. The input width comes from the training frame instead of a
# hard-coded 8, so the model keeps working if the feature columns change.
model = keras.Sequential([
    keras.layers.Dense(500, input_shape=(X_train.shape[1],), activation='relu'),
    keras.layers.Dense(1, activation='linear'),
])

# Optimise mean squared logarithmic error and report plain MSE alongside it.
model.compile(optimizer='adam',
              loss='mean_squared_logarithmic_error',
              metrics=['mse'])

# Keep the History object for later inspection; binding it also stops its
# repr from being echoed as the cell's output.
history = model.fit(X_train, y_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f1a1461c160>

In [46]:
# Score the model on the held-out split. The result lists the compiled loss
# (mean squared logarithmic error) followed by the 'mse' metric.
model.evaluate(x=X_test, y=y_test)



[0.002795339096337557, 0.0041354564018547535]

In [47]:
# Predict charges for the held-out rows. Values are on the same scaled units
# as the training target, not in the original currency.
model.predict(x=X_test)

array([[0.11677364],
       [0.06435171],
       [0.18758994],
       [0.17359024],
       [0.04150337],
       [0.1197096 ],
       [0.14817125],
       [0.23783195],
       [0.12109409],
       [0.22225678],
       [0.02913241],
       [0.055175  ],
       [0.10678592],
       [0.14305954],
       [0.14380458],
       [0.14780807],
       [0.0345294 ],
       [0.218695  ],
       [0.16245022],
       [0.33114308],
       [0.41274977],
       [0.02989998],
       [0.03090833],
       [0.10643118],
       [0.06424432],
       [0.14593819],
       [0.14086767],
       [0.13883427],
       [0.08294133],
       [0.35218623],
       [0.03760202],
       [0.46456912],
       [0.0982629 ],
       [0.12486142],
       [0.07378983],
       [0.07702719],
       [0.14834534],
       [0.16413711],
       [0.13137506],
       [0.13817632],
       [0.1670798 ],
       [0.11918239],
       [0.06394948],
       [0.11605267],
       [0.02947521],
       [0.10345337],
       [0.11988637],
       [0.388