# Create Linear Regression model with Pytorch components
![](https://drive.google.com/uc?id=1K99pcqE9fbl6QL0UADlC27S_sstwJvPH)

In [1]:
# 1. Data gathering
# 2. Data preprocessing
# 3. Feature engineering
# 4. Model training
# 5. Testing

In [2]:
## Dataset used
# https://www.kaggle.com/datasets/mirichoi0218/insurance

In [3]:
!pip install kaggle



In [4]:
!pip install kagglehub



In [5]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mirichoi0218/insurance")

print(f"Path of dataset files : {path}")

  from .autonotebook import tqdm as notebook_tqdm


Path of dataset files : C:\Users\aman0\.cache\kagglehub\datasets\mirichoi0218\insurance\versions\1


In [6]:
import os
print(os.getcwd())

c:\Users\aman0\OneDrive\Desktop\Skills\DL(Images)\Sec(6)-PyTorch


In [7]:
import pandas as pd

In [8]:
os.listdir('C:/Users/aman0/.cache/kagglehub/datasets/mirichoi0218/insurance/versions/1')

['insurance.csv']

In [9]:
df = pd.read_csv(os.path.join(path, 'insurance.csv'))

In [10]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [12]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [13]:
!pip install scikit-learn



In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split


In [15]:
# Split dataset before encoding
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [16]:
for col in df.select_dtypes(include='object').columns :
    print(df[col].unique())

['female' 'male']
['yes' 'no']
['southwest' 'southeast' 'northwest' 'northeast']


In [17]:
# Encode cetagorical variable
label_encoder = {}
for col in ['sex', 'smoker', 'region']:
  le = LabelEncoder()
  train_df[col] = le.fit_transform(train_df[col])
  test_df[col] = le.transform(test_df[col])
  label_encoder[col] = le



In [18]:
# Features and target
X_train = train_df.drop(columns=['charges'])
y_train = train_df['charges']

X_test = test_df.drop(columns=['charges'])
y_test = test_df['charges']

In [19]:
print(X_train.head())
print(y_train.head())

      age  sex    bmi  children  smoker  region
560    46    0  19.95         2       0       1
1285   47    0  24.32         0       0       0
1142   52    0  24.86         0       0       2
969    39    0  34.32         5       0       2
486    54    0  21.47         3       0       1
560      9193.83850
1285     8534.67180
1142    27117.99378
969      8596.82780
486     12475.35130
Name: charges, dtype: float64


In [20]:
# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [21]:
print(X_train)

[[ 0.47222651 -1.0246016  -1.75652513  0.73433626 -0.50874702 -0.45611589]
 [ 0.54331294 -1.0246016  -1.03308239 -0.91119211 -0.50874702 -1.35325561]
 [ 0.8987451  -1.0246016  -0.94368672 -0.91119211 -0.50874702  0.44102382]
 ...
 [ 1.3252637   0.97598911 -0.89153925 -0.91119211 -0.50874702 -1.35325561]
 [-0.16755139 -1.0246016   2.82086429  0.73433626  1.96561348  1.33816354]
 [ 1.1120044   0.97598911 -0.10932713 -0.91119211 -0.50874702  1.33816354]]


In [22]:
# Convert to tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

In [23]:
print(y_train_tensor)
print(y_train_tensor.shape)

tensor([[ 9193.8389],
        [ 8534.6719],
        [27117.9941],
        ...,
        [11931.1250],
        [46113.5117],
        [10214.6357]])
torch.Size([1070, 1])


In [24]:
print(X_test_tensor)
print(X_test_tensor.shape)

tensor([[ 0.4011, -1.0246, -0.8915,  0.7343, -0.5087, -1.3533],
        [-0.2386, -1.0246, -0.0895, -0.9112, -0.5087, -0.4561],
        [ 1.7518, -1.0246, -0.6085, -0.9112,  1.9656, -0.4561],
        ...,
        [-0.0965,  0.9760, -0.4197, -0.0884, -0.5087, -1.3533],
        [ 1.0409, -1.0246,  2.7894, -0.9112,  1.9656,  0.4410],
        [ 0.8277, -1.0246,  0.6025, -0.0884, -0.5087,  1.3382]])
torch.Size([268, 6])


In [27]:
class NN(nn.Module):
    def __init__(self,input_size,hl1_size,hl2_size,output_size):
        super(NN,self).__init__() 
        '''Layers'''
        self.fc1=nn.Linear(input_size,hl1_size)  #hl1
        self.relu1=nn.ReLU()
        self.fc2=nn.Linear(hl1_size,hl2_size)    #hl2
        self.relu2=nn.ReLU()                     
        self.fc3=nn.Linear(hl2_size,output_size) #output layer
        self.relu3=nn.ReLU()                     #since output is positive always we can use relu instead of linear
        
    
    def forward(self,x):
        x=self.fc1(x)  
        x=self.relu1(x)
        x=self.fc2(x)
        x=self.relu2(x)
        x=self.fc3(x)
        x=self.relu3(x)
        return x  

In [39]:
model=NN(X_train_tensor.shape[1],64,64,1)

In [40]:
print(model)

NN(
  (fc1): Linear(in_features=6, out_features=64, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=64, out_features=1, bias=True)
  (relu3): ReLU()
)


In [41]:
criterion=nn.MSELoss()
optimizer=optim.Adam(model.parameters(),lr=0.01)

In [42]:
X_train_tensor.shape

torch.Size([1070, 6])

In [47]:
epochs=1000
for i in range(epochs):
    optimizer.zero_grad()
    y_hat=model(X_train_tensor)
    loss=criterion(y_hat,y_train_tensor)
    loss.backward()
    optimizer.step()
    if(i%10==0):
        print(f"Epoch {i}/[{epochs}] {'- '*10} Loss: {loss.item():.4f}")

Epoch 0/[1000] - - - - - - - - - -  Loss: 17335662.0000
Epoch 10/[1000] - - - - - - - - - -  Loss: 17310878.0000
Epoch 20/[1000] - - - - - - - - - -  Loss: 17288066.0000
Epoch 30/[1000] - - - - - - - - - -  Loss: 17265544.0000
Epoch 40/[1000] - - - - - - - - - -  Loss: 17239044.0000
Epoch 50/[1000] - - - - - - - - - -  Loss: 17214638.0000
Epoch 60/[1000] - - - - - - - - - -  Loss: 17192374.0000
Epoch 70/[1000] - - - - - - - - - -  Loss: 17174062.0000
Epoch 80/[1000] - - - - - - - - - -  Loss: 17149960.0000
Epoch 90/[1000] - - - - - - - - - -  Loss: 17132048.0000
Epoch 100/[1000] - - - - - - - - - -  Loss: 17111876.0000
Epoch 110/[1000] - - - - - - - - - -  Loss: 17097904.0000
Epoch 120/[1000] - - - - - - - - - -  Loss: 17075760.0000
Epoch 130/[1000] - - - - - - - - - -  Loss: 17052502.0000
Epoch 140/[1000] - - - - - - - - - -  Loss: 17035786.0000
Epoch 150/[1000] - - - - - - - - - -  Loss: 17015108.0000
Epoch 160/[1000] - - - - - - - - - -  Loss: 16997536.0000
Epoch 170/[1000] - - - - 

In [48]:
model.eval()

NN(
  (fc1): Linear(in_features=6, out_features=64, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=64, out_features=1, bias=True)
  (relu3): ReLU()
)

In [50]:
y_pred_train=model(X_train_tensor).detach().numpy()
y_pred_test=model(X_test_tensor).detach().numpy()

In [51]:
from sklearn.metrics import r2_score
print(f'"Train Accuracy" : {r2_score(y_pred_train,y_train)}')
print(f'"Test Accuracy" : {r2_score(y_pred_test,y_test)}')

"Train Accuracy" : 0.8818362821024882
"Test Accuracy" : 0.8222960193508666
