# Understanding components of custom data loader in pytorch
![](https://drive.google.com/uc?id=1e92FXOYdRlmQTbK0WozmBN0ZO9KYCPJx)

## Recap - Creating a regression model (simple feed-forward neural network)

In [None]:
## Dataset used
# https://www.kaggle.com/datasets/mirichoi0218/insurance

In [4]:
!pip install kaggle



In [1]:
import kagglehub

# Download the latest version of the Kaggle "mirichoi0218/insurance" dataset.
# `path` is the local directory that holds the downloaded files; later cells read from it.
path = kagglehub.dataset_download("mirichoi0218/insurance")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\aman0\.cache\kagglehub\datasets\mirichoi0218\insurance\versions\1


In [2]:
import os
import pandas as pd

In [3]:
# List the downloaded files using the directory returned by kagglehub,
# rather than a machine-specific absolute path, so the cell runs on any machine.
os.listdir(path)

['insurance.csv']

In [4]:
# Load the insurance CSV from the downloaded dataset directory into a DataFrame
df = pd.read_csv(os.path.join(path, 'insurance.csv'))

In [5]:
# Preview the first rows to check that the columns loaded as expected
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [7]:
# Column dtypes and non-null counts: shows which columns are categorical (object) and need encoding
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [8]:
# Split dataset before encoding:
# the encoders below are fitted on the training rows only, so nothing is learned from the test rows.
# 80% train / 20% test; random_state=42 makes the split repeatable.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [9]:
# Turn each categorical column into integer codes.
# One encoder per column: learn the mapping on the training split, reuse it on the test split.
label_encoders = {}
for column_name in ('sex', 'smoker', 'region'):
    encoder = LabelEncoder()
    train_codes = encoder.fit_transform(train_df[column_name])
    train_df[column_name] = train_codes
    test_df[column_name] = encoder.transform(test_df[column_name])
    # Keep the fitted encoder so the mapping can be reused later
    label_encoders[column_name] = encoder

In [10]:
# Separate the inputs from the regression target ('charges') for both splits
X_train, y_train = train_df.drop(columns=['charges']), train_df['charges']
X_test, y_test = test_df.drop(columns=['charges']), test_df['charges']

In [11]:
# Sanity check: one shape per line (train features, train target, test features)
print(X_train.shape, y_train.shape, X_test.shape, sep='\n')

(1070, 6)
(1070,)
(268, 6)


In [12]:
# Standardise the features: statistics are learned from the training split only
# and then applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [13]:
# Build float32 tensors; the targets get a trailing dimension of size 1
# so they line up with the model's single output column.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

In [14]:
# Confirm the (rows, features) shape of the training tensor
print(X_train_tensor.shape)

torch.Size([1070, 6])


In [15]:
# Define Neural Network Model
class SimpleNNRegressionModel(nn.Module):
    """Small feed-forward regressor: input_dim -> 64 -> 128 -> 1, with ReLU after each hidden layer.

    Args:
        input_dim: number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        first_hidden, second_hidden = 64, 128
        # Layers are created in forward order and stored under `self.network`
        layers = [
            nn.Linear(input_dim, first_hidden),
            nn.ReLU(),
            nn.Linear(first_hidden, second_hidden),
            nn.ReLU(),
            nn.Linear(second_hidden, 1),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the stacked layers and return one predicted value per row."""
        output = self.network(x)
        return output


In [16]:
# Initialize model
# Seed PyTorch's RNG first so the random weight initialisation (and therefore the
# training results below) can be reproduced; 42 matches the seed used for the split.
torch.manual_seed(42)
# The input width is the number of feature columns in the training tensor
input_dim = X_train_tensor.shape[1]
model = SimpleNNRegressionModel(input_dim)

In [17]:
# Loss and optimizer
# Mean-squared error as the regression loss; Adam updates all model parameters with learning rate 0.01.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [18]:
'''
x_train_tensor = 1000000 --> 10gb OOM - out of memory
1000000 --> weight and bias

we are teaching human : A book of 1000 pages --> student : 10 : student, i am not able to understand.
1000
10 --> 100
100 feedback = 1000 pages --> 1 epoch

1000 pages total
100 epoch
10 pages feedback
100 iteration * 100



100 epoch
1 epoch --> 1070 rows
'''

'\nx_train_tensor = 1000000 --> 10gb OOM - out of memory\n1000000 --> weight and bias\n\nwe are teaching human : A book of 1000 pages --> student : 10 : student, i am not able to understand.\n1000\n10 --> 100\n100 feedback = 1000 pages --> 1 epoch\n\n1000 pages total\n100 epoch\n10 pages feedback\n100 iteration * 100\n\n\n\n100 epoch\n1 epoch --> 1070 rows\n'

In [19]:

# ✅ Common practice for choosing clip_value:
# Model type                                    | Typical clip_value
# Small RNN / LSTM / GRU models                 | 1.0 to 5.0
# Large Transformers (BERT, GPT)                | 0.5 to 1.0
# CNNs / feedforward networks (rarely needed)   | Usually no clipping, or 2.0~5.0 if unstable

# Gradient-norm clipping is used to prevent exploding gradients.
# ✅ Step-by-step:
# 1. Calculate the total gradient norm (L2 norm) across all parameters:
#
#        total_norm = \sqrt{ \sum_i g_i^2 }        (g_i = every individual gradient value)
#
# 2. Compare total_norm with clip_value:
#
#    If total_norm <= clip_value:
#        ➡ Do nothing, keep the gradients as they are.
#
#    Else:
#        scale_factor = clip_value / total_norm
#        ➡ Multiply every gradient by scale_factor, so the norm of the clipped gradients equals clip_value.


In [20]:
# Full-batch training: each epoch is a single forward/backward pass over all training rows
epochs = 1000
clip_value = 25
model.train()  # training mode is set once up front; nothing inside the loop changes it
for epoch in range(epochs):
    # Forward pass and loss on the whole training set
    predictions = model(X_train_tensor)
    loss = criterion(predictions, y_train_tensor)

    # Clear stale gradients, then back-propagate the new ones
    optimizer.zero_grad()
    loss.backward()

    # torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)

    optimizer.step()

    # Report the loss every 100 epochs
    if not (epoch + 1) % 100:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

Epoch [100/1000], Loss: 46238456.0000
Epoch [200/1000], Loss: 31916640.0000
Epoch [300/1000], Loss: 29426798.0000
Epoch [400/1000], Loss: 27384960.0000
Epoch [500/1000], Loss: 26143540.0000
Epoch [600/1000], Loss: 25205638.0000
Epoch [700/1000], Loss: 24448342.0000
Epoch [800/1000], Loss: 23932408.0000
Epoch [900/1000], Loss: 23538004.0000
Epoch [1000/1000], Loss: 23207750.0000


## Understanding Components of a Custom DataLoader in PyTorch

1. Dataset (torch.utils.data.Dataset)
2. DataLoader (torch.utils.data.DataLoader)

In [21]:
# A DataLoader works like a Python generator: it wraps a Dataset in an iterable and yields its samples batch by batch

In [22]:
# Creating our custom Dataset in PyTorch
# The following methods need to be overridden:
# __init__()    - initialises the dataset: loads the data and applies any preprocessing
# __len__()     - returns the total number of samples in the dataset
# __getitem__() - defines how to retrieve a single data sample when an index is provided

In [20]:
import torch
from torch.utils.data import Dataset, DataLoader

In [21]:
class InsuranceDataset(Dataset):
    """Map-style dataset that pairs each feature row with its regression target.

    Inputs are converted to float32 tensors once, at construction time, and are
    always indexed by position (pandas index labels are ignored).

    Args:
        X: features, shape (n_samples, n_features); NumPy array, tensor, or pandas DataFrame.
        y: targets, one value per sample; pandas Series, NumPy array, or tensor.

    Raises:
        ValueError: if X and y do not hold the same number of samples.
    """

    def __init__(self, X, y):
        self.X = self._to_float_tensor(X)
        # Keep a trailing dimension of 1 so a batch of targets has shape (batch, 1),
        # matching a model with a single output column. A (batch,) target against a
        # (batch, 1) prediction would be silently broadcast by the loss.
        self.y = self._to_float_tensor(y).reshape(-1, 1)
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, got {len(self.X)} and {len(self.y)}"
            )

    @staticmethod
    def _to_float_tensor(data):
        """Convert array-like data (including pandas objects) to a float32 tensor."""
        if hasattr(data, "to_numpy"):
            # pandas objects: drop the index and keep positional values only
            data = data.to_numpy()
        return torch.as_tensor(data, dtype=torch.float32)

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return (features, target) for position `idx`; target has shape (1,)."""
        return self.X[idx], self.y[idx]


In [22]:
# Display the training targets; note that the index still carries the original row labels, not 0..n-1
y_train

560      9193.83850
1285     8534.67180
1142    27117.99378
969      8596.82780
486     12475.35130
           ...     
1095     4561.18850
1130     8582.30230
1294    11931.12525
860     46113.51100
1126    10214.63600
Name: charges, Length: 1070, dtype: float64

In [23]:
type(X_train),type(y_train)
# X_train is already a NumPy array (the scaler returned one), so X_train.values is not needed.
# y_train is still a pandas Series, so y_train.values is required for positional access:
# y_train[0] is label-based (it looks up the row whose index label is 0), not the first element.

(numpy.ndarray, pandas.core.series.Series)

In [24]:
# First element by position (through the NumPy values) vs. the row whose index label is 0.
# They differ because the training split keeps the original row labels.
y_train.values[0],y_train[0]

(np.float64(9193.8385), np.float64(16884.924))

In [25]:
dataset = InsuranceDataset(X_train, y_train)
# The test data is not wrapped in a Dataset: it is small enough to be passed to the model in a single call

In [26]:
# num_workers=0 loads batches in the main process. Worker processes must be able to
# import the Dataset class; on platforms that spawn workers (e.g. Windows) a class
# defined in a notebook cell cannot be found by them, so iterating the loader fails or
# hangs. The data here is already in memory, so extra workers would not speed it up.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0)

In [27]:
# Argument        | Meaning
# dataset         | Your dataset object (could be a TensorDataset, a custom Dataset, etc.)
# batch_size=32   | Loads data in batches of 32 samples
# shuffle=True    | Randomly shuffles the data at the start of each epoch (good for training)
# num_workers=N   | Uses N parallel worker processes (e.g. 4) to speed up data loading (especially useful if your dataset is large or on disk); 0 loads the data in the main process

In [None]:
# Pull a single batch from the loader to inspect the tensor shapes, then stop
for batch_idx, (features, targets) in enumerate(dataloader):
  print(f"Batch {batch_idx+1} :")
  print("Features : ", features.shape)
  print("Targets : ", targets.shape)
  break  # only the first batch is needed

In [28]:
# 1070 training rows / batch size 32 -> 33 full batches plus a remainder of 14 rows
1070/32

33.4375

### ✅ Why is batch size important for gradient calculation?

- Larger batch → gradients computed over more data → smoother update
- Smaller batch → faster per-step iteration but noisier gradient estimate

| Batch size | Pros | Cons |
|---|---|---|
| Small (like 16, 32) | Less memory, faster per-step | Noisy gradients |
| Medium (64-128) | Good balance | Needs more memory |
| Large (256-1024) | Smooth gradients | Requires large GPU |

In [None]:
# Mini-batch training: each epoch walks through the DataLoader once,
# doing one optimizer step per batch.
epochs = 1000
for epoch in range(epochs):
    model.train()
    running_loss = 0.0   # sum of per-sample losses seen in this epoch
    samples_seen = 0

    for batch_X, batch_y in dataloader:
        # The model outputs shape (batch, 1); give the targets the same shape.
        # A (batch,) target would be broadcast against (batch, 1) inside MSELoss
        # and produce a wrong loss.
        batch_y = batch_y.view(-1, 1)

        optimizer.zero_grad()
        predictions = model(batch_X)
        loss = criterion(predictions, batch_y)
        loss.backward()
        optimizer.step()

        # Weight each batch by its size: the last batch can be smaller than the rest
        batch_size = batch_X.size(0)
        running_loss += loss.item() * batch_size
        samples_seen += batch_size

    # Log the epoch's mean loss every 100 epochs. Per-batch printing is left out on
    # purpose: two lines per batch over 1000 epochs would flood the cell output.
    if (epoch+1) % 100 == 0:
        epoch_loss = running_loss / max(samples_seen, 1)
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}')

In [None]:
# # For large datasets -- we use the dataset and dataloader to avoid oom, and for faster processing
# import torch
# from torch.utils.data import Dataset
# from PIL import Image
# import os

# class MyImageDataset(Dataset):
#     def __init__(self, image_dir, labels_dict, transform=None):
#         """
#         image_dir: Path to folder containing image files
#         labels_dict: Dictionary mapping image filename to label (e.g., {'img1.jpg': 0, 'img2.jpg': 1, ...})
#         transform: Optional torchvision transforms (like resize, normalize, etc.)
#         """
#         self.image_dir = image_dir
#         self.image_filenames = os.listdir(image_dir)
#         self.labels_dict = labels_dict
#         self.transform = transform

#     def __len__(self):
#         return len(self.image_filenames)

#     def __getitem__(self, idx):
#         image_name = self.image_filenames[idx]
#         image_path = os.path.join(self.image_dir, image_name)

#         # Load image
#         image = Image.open(image_path).convert('RGB')

#         # Apply transforms (resize, normalize, etc.)
#         if self.transform:
#             image = self.transform(image)

#         # Get label
#         label = self.labels_dict.get(image_name, -1)  # -1 if label missing

#         return image, label
