# Linear Regression Model in PyTorch

In [1]:
import torch 

## Implement Linear Regression Model from scratch in PyTorch:

To solve a Machine Learning problem there are multiple steps: 
1. Data gathering (collect data for your project, e.g from Kaggle online)
2. Data pre-processing (changing data type, removing useless features etc)
3. Feature engineering (derive more features from the one in the dataset, extract additional features from the available ones)
4. Model training (on the training set)
5. Testing the model (on the test set, how accurate is it)

In [8]:
# Dataset : https://www.kaggle.com/datasets/mirichoi0218/insurance
# ! pip install kaggle
# ! pip install kagglehub
! pip install pandas

Collecting pandas
  Downloading pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.3-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (12.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m26.3 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hDownloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
Downloading tzdata-2025.3-py2.py3-none-any.whl (348 kB)
Installing collected packages: pytz, tzdata, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [pandas]2m2/3[0m [pandas]
[1A[2KSuccessfully installed pandas-2.3.3 pytz-2025.2 tzdata-2025.3


In [None]:
import kagglehub

# Download latest version of the dataset with kaggle
path = kagglehub.dataset_download("mirichoi0218/insurance")

print(f"Path of dataset files : {path}") # Stored in the cache

Path of dataset files : /home/ale/.cache/kagglehub/datasets/mirichoi0218/insurance/versions/1


In [None]:
import os
print(os.getcwd()) # see current directory 

# Load the file from cache and use pandas to read it 
import pandas as pd

os.listdir(path) # work as ls to visualize the file in the path 
# The file name can be visualized

/home/ale/tutorials/computer-vision-deep-learning/code-python/pytorch_tutorials


['insurance.csv']

In [22]:
df = pd.read_csv(os.path.join(path, 'insurance.csv'))

df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [23]:
# Check more information on the dataframe 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [24]:
df.describe() # calculate description values for numerical values 
# Also we see that there are no null values, so removal of null is not necessary

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [None]:
# ! pip install scikit-learn



In [26]:
# Define components for data preprocessing for model training 
import torch
import torch.nn as nn
import torch.optim as optim

# Using sklearn for data pre-processing
from sklearn.preprocessing import LabelEncoder, StandardScaler 
# we need LabelEncoder to transform the dataset
from sklearn.model_selection import train_test_split # help to divide the dataset

In [27]:
# Split dataset 
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42) # 80% as train, 20% as test

In [28]:
# Encode the cathegorigal variables
# non numerical variables requires to be encoded...
# encoders for sex, smoker and region
label_encoder = {}

for col in ['sex', 'smoker', 'region']:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col] = le.transform(test_df[col]) # Transform test set 
    label_encoder[col] = le

print(label_encoder)


{'sex': LabelEncoder(), 'smoker': LabelEncoder(), 'region': LabelEncoder()}


In [37]:
# create features and target columns 
# X and Y

X_train = train_df.drop(columns=['charges']) # all except charges
y_train = train_df['charges'] # take just charges 

X_test = test_df.drop(columns=['charges']) # all except charges
y_test = test_df['charges'] # take just charges 

In [38]:
X_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
560,46,0,19.95,2,0,1
1285,47,0,24.32,0,0,0
1142,52,0,24.86,0,0,2
969,39,0,34.32,5,0,2
486,54,0,21.47,3,0,1


In [39]:
y_train.head()

560      9193.83850
1285     8534.67180
1142    27117.99378
969      8596.82780
486     12475.35130
Name: charges, dtype: float64

In [40]:
# Normalize features to avoid giving importance to numerical values in the range 
# Normalize by standard scalar 

scaler = StandardScaler()

# Normalize on all features (NOT TARGET)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [49]:
print(X_train)

print(y_train.values)
print(y_train.values.shape)

[[ 0.47222651 -1.0246016  -1.75652513  0.73433626 -0.50874702 -0.45611589]
 [ 0.54331294 -1.0246016  -1.03308239 -0.91119211 -0.50874702 -1.35325561]
 [ 0.8987451  -1.0246016  -0.94368672 -0.91119211 -0.50874702  0.44102382]
 ...
 [ 1.3252637   0.97598911 -0.89153925 -0.91119211 -0.50874702 -1.35325561]
 [-0.16755139 -1.0246016   2.82086429  0.73433626  1.96561348  1.33816354]
 [ 1.1120044   0.97598911 -0.10932713 -0.91119211 -0.50874702  1.33816354]]
[ 9193.8385   8534.6718  27117.99378 ... 11931.12525 46113.511
 10214.636  ]
(1070,)


In [52]:
# Convert it to tensors, numpy or pandas are not correct version for nn models 
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1) # Flatten the output
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

In [54]:
print(y_test_tensor)
print(y_test_tensor.shape) # you need to make it as a column vector from the original row shape 

print(X_test_tensor)
print(X_test_tensor.shape)

tensor([[ 9095.0684],
        [ 5272.1758],
        [29330.9824],
        [ 9301.8936],
        [33750.2930],
        [ 4536.2588],
        [ 2117.3389],
        [14210.5361],
        [ 3732.6250],
        [10264.4424],
        [18259.2168],
        [ 7256.7231],
        [ 3947.4131],
        [46151.1250],
        [48673.5586],
        [44202.6523],
        [ 9800.8887],
        [42969.8516],
        [ 8233.0977],
        [21774.3223],
        [ 5080.0962],
        [ 7441.5010],
        [ 1256.2990],
        [ 2755.0210],
        [11085.5869],
        [10923.9336],
        [12644.5889],
        [18804.7520],
        [ 9715.8408],
        [ 1131.5066],
        [15828.8213],
        [11842.6240],
        [ 2020.5522],
        [ 5693.4307],
        [ 2904.0879],
        [ 7448.4038],
        [ 2597.7791],
        [ 7337.7480],
        [23887.6621],
        [38709.1758],
        [ 4687.7969],
        [ 2643.2686],
        [11674.1299],
        [12124.9922],
        [ 4889.9995],
        [1

In [55]:
# Define NN model for linear regression over the dataset 

class SimpleNNRegressionModel(nn.Module):
    def __init__(self, input_dim):

        super(SimpleNNRegressionModel, self).__init__()

        # Very simple model, Sequential is enough
        self.network = nn.Sequential(
            nn.Linear(input_dim, 64), # 64 neurons of first hidden layer
            nn.ReLU(),
            nn.Linear(64, 128), # map to 128 neurons on next hidden layer 
            nn.ReLU(),
            nn.Linear(128, 1) # here the final layer of 128 is mapped to regrssion output of dimension 1
        )

    def forward(self, x):
        return self.network(x) # produce the output forward passing x on the network

In [56]:
# Initialize model from the custom NN 
input_dim = X_train_tensor.shape[1] # we are interested on feature lenght of each data
model = SimpleNNRegressionModel(input_dim=input_dim)

print(model) # Visualize model architecture 


SimpleNNRegressionModel(
  (network): Sequential(
    (0): Linear(in_features=6, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=1, bias=True)
  )
)


In [57]:
# Initialize other nn components, Loss and Optimizer

criterion = nn.MSELoss() # In regression MSE loss is enough
optimizer = optim.Adam(model.parameters(), lr=0.01) # Adam optimizer

In [59]:
# Create training loop 
epochs = 30000
for epoch in range(epochs):
    model.train()           # train method tell pytorch to store history of model training
    # while to test the model, model.eval() is to use in feed forward
    optimizer.zero_grad()   # clear gradients

    predictions = model(X_train_tensor)
    loss = criterion(predictions, y_train_tensor)
    loss.backward() #  compute gradients

    optimizer.step() # update weights using gradients computed 

    if (epoch + 1) % 100 == 0:
        print(f"Epoch [{epoch+1}/{epochs}], Loss : {loss.item():.4f}")

Epoch [100/30000], Loss : 6642561.5000
Epoch [200/30000], Loss : 6615907.5000
Epoch [300/30000], Loss : 6583331.5000
Epoch [400/30000], Loss : 6533506.0000
Epoch [500/30000], Loss : 6497858.0000
Epoch [600/30000], Loss : 6475542.0000
Epoch [700/30000], Loss : 6429766.5000
Epoch [800/30000], Loss : 6395507.5000
Epoch [900/30000], Loss : 6363408.5000
Epoch [1000/30000], Loss : 6336698.5000
Epoch [1100/30000], Loss : 6306668.0000
Epoch [1200/30000], Loss : 6281544.5000
Epoch [1300/30000], Loss : 6261196.5000
Epoch [1400/30000], Loss : 6225880.0000
Epoch [1500/30000], Loss : 6198025.5000
Epoch [1600/30000], Loss : 6177780.5000
Epoch [1700/30000], Loss : 6139461.0000
Epoch [1800/30000], Loss : 6109364.0000
Epoch [1900/30000], Loss : 6071241.5000
Epoch [2000/30000], Loss : 6058012.0000
Epoch [2100/30000], Loss : 6008117.0000
Epoch [2200/30000], Loss : 5986887.5000
Epoch [2300/30000], Loss : 5966286.5000
Epoch [2400/30000], Loss : 5948743.0000
Epoch [2500/30000], Loss : 5925193.5000
Epoch [26

In [None]:
# Model Evaluation (Testing how it perform)
model.eval() # In evaluation mode, use just for forward propagation
y_pred = model(X_test_tensor).detach().numpy() # detach from memory and convert to numpy, move to CPU 

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score # metrics to evaluate the moel 

y_test_numpy = y_test_tensor.numpy() # no need to detach, it is already a tensor on the RAM 

# calculate metrics
mse = mean_squared_error(y_test_numpy, y_pred)
rmse = mse ** 0.5 # root of mse 

mae = mean_absolute_error(y_test_numpy, y_pred)

r2 = r2_score(y_test_numpy, y_pred)

print(f"Metrics: \nMSE:{mse} \nRMSE:{rmse} \nMAE:{mae} \nR2-Score:{r2}")

# Regression problem 

Metrics: 
MSE:54482280.0 
RMSE:7381.211282709634 
MAE:4892.45654296875 
R2-Score:0.6490645408630371


In [64]:
# Use the model for prediction in a custom function 

def predict_charges(age, sex, bmi, children, smoker, region):
    input_data = pd.DataFrame([[age, sex, bmi, children, smoker, region]],
                 columns = ['age', 'sex', 'bmi', 'children', 'smoker', 'region'])
    
    # Use lable encoders previosuly defined to convert to the proper input format 
    for col in ['sex', 'smoker', 'region']:
        input_data[col] = label_encoder[col].transform(input_data[col])

    input_data = scaler.transform(input_data) # normalize properly
    input_tensor = torch.tensor(input_data, dtype=torch.float32)

    predicted_charge = model(input_tensor).item()
    return predicted_charge



In [67]:
# Create a value and see how it behaves 
predicted_charge = predict_charges(19, 'female', 27.9, 0, 'yes', 'southwest')
print(f"Predicted insurance charge: {predicted_charge:.2f} $")

Predicted insurance charge: 16805.43 $


In [None]:
# It is reasonable? What if I increase age 
predicted_charge = predict_charges(50, 'female', 27.9, 0, 'yes', 'southwest')
print(f"Predicted insurance charge: {predicted_charge:.2f} $")
# Increase with higher age... meaningful, reasonable result
# Linear pattern learned !

Predicted insurance charge: 30624.44 $
