In [1]:
## Dataset used
# https://www.kaggle.com/datasets/mirichoi0218/insurance

In [2]:
!pip install kaggle



In [3]:
import kagglehub

# Download the latest version of the dataset; the returned value is the
# local directory that holds the downloaded files.
path = kagglehub.dataset_download("mirichoi0218/insurance")

print(f"Path of dataset files : {path}")

Path of dataset files : /kaggle/input/insurance


In [4]:
import os
# Show the notebook's current working directory.
print(os.getcwd())

/content


In [5]:
import pandas as pd

# List the downloaded files. Use the `path` returned by
# kagglehub.dataset_download instead of a hard-coded directory, so the cell
# keeps working wherever kagglehub stores the dataset.
os.listdir(path)

['insurance.csv']

In [6]:
# Load the single CSV shipped with the dataset and preview the first rows.
csv_path = os.path.join(path, 'insurance.csv')
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [7]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [8]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split


In [10]:
# Split the dataset before encoding so the encoders and scaler are fitted on
# training rows only (no leakage from the test split).
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# The split frames are slices of `df`. Take explicit copies so the column
# assignments in the encoding step modify independent frames and do not
# trigger pandas' SettingWithCopyWarning.
train_df = train_df.copy()
test_df = test_df.copy()

In [11]:
# Encode categorical variables: each encoder is fitted on the training split,
# reused as-is on the test split, and kept in `label_encoder` for later use.
label_encoder = {}
for column in ('sex', 'smoker', 'region'):
  encoder = LabelEncoder()
  train_df[column] = encoder.fit_transform(train_df[column])
  test_df[column] = encoder.transform(test_df[column])
  label_encoder[column] = encoder


In [12]:
# Separate the predictors from the target column for both splits.
target_column = 'charges'

X_train, y_train = train_df.drop(columns=[target_column]), train_df[target_column]
X_test, y_test = test_df.drop(columns=[target_column]), test_df[target_column]

In [13]:
# Peek at the encoded training features and the matching targets.
print(X_train.head())
print(y_train.head())

      age  sex    bmi  children  smoker  region
560    46    0  19.95         2       0       1
1285   47    0  24.32         0       0       0
1142   52    0  24.86         0       0       2
969    39    0  34.32         5       0       2
486    54    0  21.47         3       0       1
560      9193.83850
1285     8534.67180
1142    27117.99378
969      8596.82780
486     12475.35130
Name: charges, dtype: float64


In [14]:
# Standardize features: statistics are learned from the training split only
# and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
# Inspect the standardized test features (now a NumPy array).
print(X_test)

[[ 0.40114007 -1.0246016  -0.89153925  0.73433626 -0.50874702 -1.35325561]
 [-0.23863782 -1.0246016  -0.08946143 -0.91119211 -0.50874702 -0.45611589]
 [ 1.75178229 -1.0246016  -0.60845296 -0.91119211  1.96561348 -0.45611589]
 ...
 [-0.09646495  0.97598911 -0.41972876 -0.08842793 -0.50874702 -1.35325561]
 [ 1.04091797 -1.0246016   2.78941026 -0.91119211  1.96561348  0.44102382]
 [ 0.82765867 -1.0246016   0.60252728 -0.08842793 -0.50874702  1.33816354]]


In [17]:
# Convert to float32 tensors; targets become column vectors of shape (N, 1)
# so they line up with the model's single-output predictions.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32).unsqueeze(1)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float32).unsqueeze(1)

In [18]:
# Check the target tensor's values and its (N, 1) shape.
print(y_train_tensor)
print(y_train_tensor.shape)

tensor([[ 9193.8389],
        [ 8534.6719],
        [27117.9941],
        ...,
        [11931.1250],
        [46113.5117],
        [10214.6357]])
torch.Size([1070, 1])


In [19]:
# Check the test feature tensor's values and shape.
print(X_test_tensor)
print(X_test_tensor.shape)

tensor([[ 0.4011, -1.0246, -0.8915,  0.7343, -0.5087, -1.3533],
        [-0.2386, -1.0246, -0.0895, -0.9112, -0.5087, -0.4561],
        [ 1.7518, -1.0246, -0.6085, -0.9112,  1.9656, -0.4561],
        ...,
        [-0.0965,  0.9760, -0.4197, -0.0884, -0.5087, -1.3533],
        [ 1.0409, -1.0246,  2.7894, -0.9112,  1.9656,  0.4410],
        [ 0.8277, -1.0246,  0.6025, -0.0884, -0.5087,  1.3382]])
torch.Size([268, 6])


In [20]:
# Define Neural network model

class SimpleNNRegressionModel(nn.Module):
  def __init__(self, input_dim):
    super(SimpleNNRegressionModel, self).__init__()
    self.network = nn.Sequential(
        nn.Linear(input_dim, 64),
        nn.ReLU(),
        nn.Linear(64, 128),
        nn.ReLU(),
        nn.Linear(128, 256),
        nn.ReLU(),
        nn.Linear(256, 1)
    )

  def forward(self, x):
    return self.network(x)

In [21]:
# Confirm the (rows, features) shape; the feature count sizes the model's input layer.
X_train_tensor.shape

torch.Size([1070, 6])

In [22]:
# Seed PyTorch's RNG so the randomly initialised layer weights, and therefore
# the training run and reported metrics, are the same on every Restart & Run All.
torch.manual_seed(42)

# Size the input layer from the number of feature columns.
input_dim = X_train_tensor.shape[1]
model = SimpleNNRegressionModel(input_dim)
print(model)

SimpleNNRegressionModel(
  (network): Sequential(
    (0): Linear(in_features=6, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=256, bias=True)
    (5): ReLU()
    (6): Linear(in_features=256, out_features=1, bias=True)
  )
)


In [23]:
# Loss and optimiser: mean squared error, minimised with Adam over all model parameters

criterion = nn.MSELoss()
optimiser = optim.Adam(model.parameters(), lr=0.01)

In [24]:
# Training loop: every step runs the whole training set through the model
# (no mini-batches).
epochs = 50000
log_every = 100  # print the training loss once per this many epochs

# NOTE(review): only the training loss is monitored here; with this many
# epochs consider tracking a held-out loss as well to detect overfitting.

# Training mode only needs to be set once; nothing inside the loop changes it.
model.train()

for epoch in range(epochs):
  optimiser.zero_grad()                         # clear gradients from the previous step
  predictions = model(X_train_tensor)
  loss = criterion(predictions, y_train_tensor)
  loss.backward()
  optimiser.step()

  if (epoch + 1) % log_every == 0:
    print(f"Epoch [{epoch+1}/{epochs}], Loss : {loss.item():.4f}")

Epoch [100/50000], Loss : 30215788.0000
Epoch [200/50000], Loss : 24620942.0000
Epoch [300/50000], Loss : 22695646.0000
Epoch [400/50000], Loss : 21287408.0000
Epoch [500/50000], Loss : 20100558.0000
Epoch [600/50000], Loss : 19179840.0000
Epoch [700/50000], Loss : 18653762.0000
Epoch [800/50000], Loss : 18188870.0000
Epoch [900/50000], Loss : 17705148.0000
Epoch [1000/50000], Loss : 17196004.0000
Epoch [1100/50000], Loss : 16668521.0000
Epoch [1200/50000], Loss : 16114807.0000
Epoch [1300/50000], Loss : 15449722.0000
Epoch [1400/50000], Loss : 14764694.0000
Epoch [1500/50000], Loss : 13987367.0000
Epoch [1600/50000], Loss : 13483533.0000
Epoch [1700/50000], Loss : 12906438.0000
Epoch [1800/50000], Loss : 12069613.0000
Epoch [1900/50000], Loss : 11531254.0000
Epoch [2000/50000], Loss : 10938952.0000
Epoch [2100/50000], Loss : 10153136.0000
Epoch [2200/50000], Loss : 9434129.0000
Epoch [2300/50000], Loss : 8450299.0000
Epoch [2400/50000], Loss : 8134511.5000
Epoch [2500/50000], Loss : 7

In [25]:
# Model Evaluation

model.eval()
# Inference needs no gradients: disable autograd tracking instead of building
# a graph and detaching the result afterwards.
with torch.no_grad():
  y_pred = model(X_test_tensor).numpy()

In [26]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Ground truth as a NumPy array so it can be compared with y_pred.
y_test_numpy = y_test_tensor.numpy()

# Regression metrics on the test split.
mse = mean_squared_error(y_test_numpy, y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test_numpy, y_pred)
r2 = r2_score(y_test_numpy, y_pred)

for metric_name, metric_value in (("MSE", mse), ("RMSE", rmse), ("MAE", mae), ("R2-Score", r2)):
  print(f"{metric_name} : {metric_value}")

# 0 --> 0

MSE : 43229056.0
RMSE : 6574.880683328025
MAE : 4047.92529296875
R2-Score : 0.7215497493743896


In [27]:
def predict_charges(age, sex, bmi, children, smoker,region):
  """Predict the insurance charge for a single person.

  Uses the notebook-level `label_encoder`, `scaler` and `model` objects, so
  those cells must have been run first.

  Args:
    age, bmi, children: numeric feature values.
    sex, smoker, region: raw category labels (e.g. 'male', 'yes',
      'southeast'); each must be a label its fitted encoder knows.

  Returns:
    The model's prediction as a Python float.
  """
  # Build a one-row frame with the same column order used for training.
  input_data = pd.DataFrame([[age, sex, bmi, children, smoker,region]],
               columns=['age', 'sex', 'bmi', 'children', 'smoker', 'region'])

  # Apply the same preprocessing as training: label-encode, then standardize.
  for col in ['sex', 'smoker', 'region']:
    input_data[col] = label_encoder[col].transform(input_data[col])
  input_data = scaler.transform(input_data)
  input_tensor = torch.tensor(input_data, dtype=torch.float32)

  # Predict in eval mode with autograd tracking off, regardless of the mode
  # the model was left in by earlier cells.
  model.eval()
  with torch.no_grad():
    predicted_charge = model(input_tensor).item()
  return predicted_charge


In [30]:
# Example: predict the charge for one hypothetical person, passing raw category labels.
predicted = predict_charges(28, 'male', 33, 3, 'yes', 'southeast')
print(f"Predicted insurance charge: ${predicted:.2f}")

Predicted insurance charge: $32462.47
