In [None]:
!pip install d2l --no-deps

Collecting d2l
  Downloading d2l-1.0.3-py3-none-any.whl.metadata (556 bytes)
Downloading d2l-1.0.3-py3-none-any.whl (111 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/111.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.7/111.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: d2l
Successfully installed d2l-1.0.3


In [None]:
%matplotlib inline
import random
import torch
from d2l import torch as d2l

## 1.1 Generating a dataset.

* Here we will work in low dimension for accuracy.
* Here we'll generate 1000 examples with 2-dimensional features drawn from a standard normal distribution.
* This results in a feature matrix `X`, with one row per example.
* We generate each label by applying a ground-truth linear function to the features and corrupting the result with additive noise `e`, drawn independently and identically for each example:

  * `y=Xw + b +e`

In [None]:
class SyntheticRegressionData(d2l.DataModule):
  """Synthetic dataset for linear regression: y = Xw + b + noise.

  Features are drawn from a standard normal distribution; labels come from a
  ground-truth linear function corrupted by additive Gaussian noise.

  Args:
    w: 1-D tensor of ground-truth weights; its length sets the number of features.
    b: Ground-truth bias (scalar).
    noise: Standard deviation of the additive Gaussian label noise.
    num_train: Number of training examples (the first rows of X and y).
    num_val: Number of validation examples (the rows after the training ones).
    batch_size: Minibatch size used when reading the data.
  """
  def __init__(self, w, b, noise=0.01, num_train=1000, num_val=1000, batch_size=32):
    super().__init__()
    # Presumably stores the constructor arguments as attributes: the data-loading
    # code reads self.num_train, self.num_val and self.batch_size.
    self.save_hyperparameters()
    n = num_train + num_val
    # randn samples a standard normal; rand would sample uniformly from [0, 1).
    self.X = torch.randn(n, len(w))
    # Separate name so the `noise` argument (the std-dev) is not overwritten.
    label_noise = torch.randn(n, 1) * noise
    # w is reshaped to a column so the product has shape (n, 1).
    self.y = torch.matmul(self.X, w.reshape((-1, 1))) + b + label_noise


* We set the true parameters $w=[2,-3.4]^T$ and $b=4.2$.

In [None]:
# Ground truth: w = [2, -3.4] and b = 4.2; all other settings keep their defaults.
data = SyntheticRegressionData(
    w=torch.tensor([2,-3.4]),
    b=4.2,
)

* Each row of the features `data.X` is a vector in $\mathbb{R}^2$ and each row of the labels `data.y` is a scalar.

In [None]:
# Show the first example: one feature row followed by its label.
print(f"Features:{data.X[0]}", f"\nLabel: {data.y[0]}", sep="\n")

Features:tensor([0.6352, 0.0926])

Label: tensor([5.1511])


## 1.2 Reading the dataset.


* Training ML models often requires multiple passes over the dataset, grabbing one minibatch of examples at a time and using it to update the model.
* To illustrate how this works, we implement the `get_dataloader` method, registering it in the `SyntheticRegressionData` via `add_to_class`.

In [None]:
@d2l.add_to_class(SyntheticRegressionData)
def get_dataloader(self,train):
  """Yield (features, labels) minibatches from the training or validation rows.

  Args:
    train: If truthy, iterate over the first `num_train` rows in random order;
      otherwise iterate over the following `num_val` rows in their stored order.

  Yields:
    Tuples `(X_batch, y_batch)` holding at most `batch_size` rows each.
  """
  if train:
    order = list(range(self.num_train))
    # Visit the training examples in a fresh random order on every pass.
    random.shuffle(order)
  else:
    first_val = self.num_train
    order = list(range(first_val, first_val + self.num_val))

  for start in range(0, len(order), self.batch_size):
    stop = start + self.batch_size
    # The final slice may be shorter than batch_size.
    picked = torch.tensor(order[start:stop])
    yield self.X[picked], self.y[picked]

In [None]:
# Draw a single minibatch from the training loader and report the tensor shapes.
X, y = next(iter(data.train_dataloader()))
print('X.shape:', X.shape, '\ny shape:', y.shape)

X.shape: torch.Size([32, 2]) 
y shape: torch.Size([32, 1])


* The iterator we've just built is inefficient in ways that might get us into trouble in the real world.
* For example, it requires that we load all the data in memory and that we perform lots of random memory access.
* However, the built-in iterators implemented in deep learning frameworks are considerably more efficient, and they can deal with sources such as data stored in files, data received via a stream, and data generated or processed on the fly.

## 1.3 Concise Implementation of the DataLoader

* Rather than writing our own iterator, we can call the framework's existing API to load data.
* We set `batch_size` in the built-in data loader and let it take care of shuffling examples efficiently.

In [None]:
# The leading "@" matters: without it add_to_class(...) is a bare expression whose
# result is discarded, and the function below is never attached to the class.
@d2l.add_to_class(d2l.DataModule)
def get_tensorloader(self,tensors,train,indices=slice(0,None)):
  """Wrap a selection of rows from `tensors` in a torch DataLoader.

  Args:
    tensors: Iterable of tensors that share their first dimension.
    train: Passed as `shuffle`; when truthy the loader shuffles the examples.
    indices: Index (a slice by default) applied to every tensor to pick rows.

  Returns:
    A `torch.utils.data.DataLoader` over the selected rows, batched by
    `self.batch_size`.
  """
  selected = tuple(t[indices] for t in tensors)
  dataset = torch.utils.data.TensorDataset(*selected)
  return torch.utils.data.DataLoader(dataset,self.batch_size,shuffle=train)

# The leading "@" matters: without it the method is only defined at module level
# and SyntheticRegressionData keeps using its earlier get_dataloader.
@d2l.add_to_class(SyntheticRegressionData)
def get_dataloader(self,train):
  """Return a DataLoader over the training rows or the validation rows.

  Args:
    train: If truthy, use rows [0, num_train) and shuffle them; otherwise use
      the rows from num_train to the end, unshuffled.

  Returns:
    Whatever `self.get_tensorloader` returns for the chosen slice of
    `(self.X, self.y)`.
  """
  i = slice(0,self.num_train) if train else slice(self.num_train,None)
  return self.get_tensorloader((self.X,self.y),train,i)

* The new dataloader behaves just like the previous one, except that it is more efficient and has some added functionality.

In [None]:
# Fetch one minibatch through train_dataloader() and print the tensor shapes.
X, y = next(iter(data.train_dataloader()))
print('X.shape', X.shape, '\ny shape: ', y.shape)

X.shape torch.Size([32, 2]) 
y shape:  torch.Size([32, 1])


* The dataloader supports the `__len__` method, which we can use to query the length of dataloaders, i.e. the number of minibatches.

In [None]:
# NOTE(review): X is the minibatch tensor fetched in the previous cell, so this
# returns its number of rows (the batch size), not the number of minibatches.
# Querying the loader itself would be len(data.train_dataloader()), which only
# works when train_dataloader() returns a torch DataLoader rather than a generator.
len(X)

32

In [None]:
##implementing a data generator that produces new data on the fly,
##every time the iterator is called
import torch
import random

def data_generator(w,b,batch_size,noise=0.01):
  """Endlessly yield freshly sampled minibatches of synthetic regression data.

  Every batch is drawn anew: features from a standard normal distribution,
  labels as `X @ w + b` plus Gaussian noise.

  Args:
    w: Ground-truth weights, a 1-D tensor or a sequence of numbers; its length
      is the number of features.
    b: Ground-truth bias (scalar).
    batch_size: Number of examples in each yielded batch.
    noise: Standard deviation of the additive Gaussian label noise. Defaults
      to 0.01, the value that was previously hard-coded.

  Yields:
    A tuple `(X, y)` with `X` of shape `(batch_size, len(w))` and `y` of
    shape `(batch_size, 1)`.
  """
  num_features = len(w)
  # Accept plain sequences and integer tensors too: matmul needs the weights to
  # share the floating dtype of the sampled features.
  w = torch.as_tensor(w, dtype=torch.get_default_dtype())

  while True:
    # Random features for this batch.
    X = torch.randn(batch_size,num_features)

    # Noise-free labels from the ground-truth linear model.
    y = torch.matmul(X,w) + b

    # Add Gaussian noise scaled by the requested standard deviation.
    y = y + torch.randn(y.shape)*noise

    # yield pauses here and hands back the batch; the next call to next()
    # resumes the loop and samples a new one.
    yield X,y.reshape((-1,1))

# Ground-truth parameters of the linear model and the minibatch size.
true_w, true_b = torch.tensor([2,-3.5,0.5]), 4.2
batch_size = 10

# Calling the generator function only builds the iterator; no data is sampled yet.
data_iter = data_generator(w=true_w, b=true_b, batch_size=batch_size)

# The first next() call runs the generator up to its yield and returns one batch.
X_batch, y_batch = next(data_iter)

print("First batch of features (X):\n", X_batch)
print("\nFirst batch of labels (y):\n", y_batch)


First batch of features (X):
 tensor([[ 0.5723,  1.3739, -1.8686],
        [ 0.3060,  0.9130,  0.7138],
        [ 1.3909,  0.7003,  0.8485],
        [ 0.4367,  0.6296, -0.5597],
        [ 0.2494, -1.1324, -1.1059],
        [-0.0696,  0.5318,  0.6042],
        [ 1.8589, -0.7398,  0.9072],
        [ 0.7863,  0.2056,  0.7796],
        [ 0.3892,  0.2855, -1.7331],
        [-0.3145,  0.1259,  1.6988]])

First batch of labels (y):
 tensor([[-0.3917],
        [ 1.9866],
        [ 4.9611],
        [ 2.5830],
        [ 8.1056],
        [ 2.5204],
        [10.9670],
        [ 5.4257],
        [ 3.1212],
        [ 3.9784]])


* We can see from above that, given a dataset of examples, we reshuffle it when reading the training set. But why do we need to reshuffle the data points in the dataset?
1. Preventing `shortcut learning`: if the data is ordered, the model can find a "cheat code". For example, suppose we have a dataset of cat and dog images and we split it into two batches, one containing only cat images and the other only dog images. While the model sees only cat images, it may notice that the cats are photographed indoors, next to things like carpets, and conclude that any indoor scene is a cat.
If the data is shuffled, this cheat code is broken. The only way the model can reduce the loss consistently is by learning the actual features, such as shape and color, that define the object.
2. Smoothing the gradient path:
With shuffled data, each training step is based on a random sample, so on average we keep moving towards the bottom of the loss surface.
With ordered data, we might take 100 steps to the left and then 100 steps to the right and never settle at the bottom.
