In [None]:
class HyperParameters:
    """The base class of hyperparameters.

    Subclasses call :meth:`save_hyperparameters` (typically from
    ``__init__``) to expose the caller's arguments as instance attributes.
    """

    def save_hyperparameters(self, ignore=None):
        """Save function arguments into class attributes.

        Defined in :numref:`sec_utils`

        The local variables of the *calling* frame are collected into
        ``self.hparams`` and each one is also set as an attribute on
        ``self``. ``self``, every name listed in ``ignore`` and every name
        starting with an underscore are skipped.

        Note that all locals existing in the caller at the time of the call
        are captured, not only its formal parameters, so call this before
        creating other local variables.

        Args:
            ignore: Optional iterable of names to skip. A single string is
                treated as one name. ``None`` (the default) skips nothing
                beyond ``self`` and underscore-prefixed names.
        """
        if ignore is None:
            ignore = ()
        elif isinstance(ignore, str):
            # A bare string would otherwise be split into single characters.
            ignore = (ignore,)
        # Built once, instead of once per local variable.
        excluded = {'self', *ignore}

        frame = inspect.currentframe().f_back
        _, _, _, local_vars = inspect.getargvalues(frame)
        self.hparams = {k: v for k, v in local_vars.items()
                        if k not in excluded and not k.startswith('_')}
        for k, v in self.hparams.items():
            setattr(self, k, v)

class DataModule(d2l.HyperParameters):
    """Common parent for dataset wrappers that hand out data loaders.

    Defined in :numref:`subsec_oo-design-models`"""

    def __init__(self, root='../data', num_workers=4):
        # Keep this the first statement: save_hyperparameters() inspects the
        # caller's frame, so no extra locals should exist at this point.
        self.save_hyperparameters()

    def get_dataloader(self, train):
        """Build the loader for one split; concrete subclasses override this."""
        raise NotImplementedError

    def train_dataloader(self):
        """Return the loader obtained with ``train=True``."""
        return self.get_dataloader(train=True)

    def val_dataloader(self):
        """Return the loader obtained with ``train=False``."""
        return self.get_dataloader(train=False)

    def get_tensorloader(self, tensors, train, indices=slice(0, None)):
        """Wrap the selected rows of ``tensors`` in a ``DataLoader``.

        Defined in :numref:`sec_synthetic-regression-data`

        ``self.batch_size`` is not set by this class; it is read here, so the
        concrete subclass has to provide it. Shuffling is enabled exactly
        when ``train`` is true.
        """
        selected = tuple(t[indices] for t in tensors)
        return torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(*selected),
            batch_size=self.batch_size,
            shuffle=train)
    
class SyntheticRegressionData(d2l.DataModule):
    """Toy linear-regression dataset following ``y = Xw + b + noise``.

    Defined in :numref:`sec_synthetic-regression-data`"""

    def __init__(self, w, b, noise=0.01, num_train=1000, num_val=1000,
                 batch_size=32):
        super().__init__()
        # Must run before any new local is created, because it snapshots the
        # locals of this frame.
        self.save_hyperparameters()
        total = num_train + num_val
        # Features are drawn first, then the noise term (order matters for
        # the random stream).
        self.X = d2l.randn(total, len(w))
        perturbation = d2l.randn(total, 1) * noise
        column_w = d2l.reshape(w, (-1, 1))
        self.y = d2l.matmul(self.X, column_w) + b + perturbation

    def get_dataloader(self, train):
        """Return a loader over the training rows or the validation rows.

        Defined in :numref:`sec_synthetic-regression-data`

        The first ``num_train`` rows are the training split; everything
        after them is the validation split.
        """
        if train:
            rows = slice(0, self.num_train)
        else:
            rows = slice(self.num_train, None)
        return self.get_tensorloader((self.X, self.y), train, rows)

In [None]:
@d2l.add_to_class(d2l.DataModule)  #@save
def get_tensorloader(self, tensors, train, indices=slice(0, None)):
    """Return a ``DataLoader`` over the rows of ``tensors`` picked by ``indices``.

    Every tensor is indexed with the same ``indices``; the default slice
    keeps all rows. Batches have ``self.batch_size`` examples and are
    shuffled only when ``train`` is true.
    """
    picked = tuple(t[indices] for t in tensors)
    return torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(*picked),
        batch_size=self.batch_size,
        shuffle=train)

@d2l.add_to_class(SyntheticRegressionData)  #@save
def get_dataloader(self, train):
    """Return the loader for the training rows or for the validation rows.

    Rows ``[0, num_train)`` form the training split; the remaining rows
    form the validation split.
    """
    if train:
        rows = slice(0, self.num_train)
    else:
        rows = slice(self.num_train, None)
    return self.get_tensorloader((self.X, self.y), train, rows)

# Draw one minibatch from a fresh training loader and report its shapes.
# NOTE(review): `data` is not defined in this cell; presumably it comes from
# an earlier notebook cell - confirm before running top to bottom.
X, y = next(iter(data.train_dataloader()))
print('X shape:', X.shape, '\ny shape:', y.shape)
# len() of a DataLoader is its number of minibatches per pass.
len(data.train_dataloader())

# #1

In [None]:
@d2l.add_to_class(d2l.DataModule)  #@save
def get_tensorloader(self, tensors, train, indices=slice(0, None)):
    """Return a ``DataLoader`` over the selected rows, dropping a short last batch.

    Every tensor is indexed with the same ``indices``. Batches contain
    ``self.batch_size`` examples and are shuffled only when ``train`` is
    true. ``drop_last=True`` discards a final batch that would be smaller
    than ``self.batch_size``.
    """
    chosen = tuple(t[indices] for t in tensors)
    loader_options = dict(batch_size=self.batch_size, shuffle=train,
                          drop_last=True)
    return torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(*chosen), **loader_options)

@d2l.add_to_class(SyntheticRegressionData)  #@save
def get_dataloader(self, train):
    """Pick the training or validation rows and wrap them in a loader.

    The first ``num_train`` rows are used when ``train`` is true; all rows
    after them are used otherwise.
    """
    boundary = self.num_train
    split = slice(0, boundary) if train else slice(boundary, None)
    features_and_labels = (self.X, self.y)
    return self.get_tensorloader(features_and_labels, train, split)

# Draw one minibatch from a fresh training loader and report its shapes.
# NOTE(review): `data` is not defined in this cell; presumably it comes from
# an earlier notebook cell - confirm before running top to bottom.
X, y = next(iter(data.train_dataloader()))
print('X shape:', X.shape, '\ny shape:', y.shape)
# Number of minibatches per pass; with drop_last=True a short final batch is
# not counted.
len(data.train_dataloader())

X shape: torch.Size([32, 2]) 
y shape: torch.Size([32, 1])


31

```python
@d2l.add_to_class(SyntheticRegressionData)  #@save
def get_dataloader(self, train):
    i = slice(0, self.num_train) if train else slice(self.num_train, None)
    return self.get_tensorloader((self.X, self.y), train, i)
```
##### What is a Slice Object? 
- A slice object in Python is used to specify a range of indices for slicing sequences like lists, tuples, and arrays.
Example: slice objects extract portions of sequences. If you have a list `my_list`, then `my_list[slice(0, 5)]` returns its first five elements (the same as `my_list[0:5]`).
- `None`: When None is provided as the stop index in a slice object, it means "up to the end of the sequence".

`get_dataloader()` then calls `get_tensorloader()`, passing it the tuple (`self.X`, `self.y`) (i.e. (`data.X`, `data.y`)), the `train` flag and the slice object.
Therefore, `get_dataloader()` has to be called separately for the training data and for the validation data, since `get_tensorloader()` takes only one slice object as input.

```python
@d2l.add_to_class(d2l.DataModule)  #@save
def get_tensorloader(self, tensors, train, indices=slice(0, None)):
    tensors = tuple(a[indices] for a in tensors)
    dataset = torch.utils.data.TensorDataset(*tensors)
    return torch.utils.data.DataLoader(dataset, self.batch_size, 
                                       shuffle=train, drop_last = True)
```
- The default value of the `indices` argument is `slice(0, None)`. If `indices` is not provided, nothing is sliced away, so every example in `tensors` ends up in the returned data loader.

- `dataset = torch.utils.data.TensorDataset(*tensors)`:
 Unpacking with `*` operator, torch.utils.data.TensorDataset expects one or more tensors. It creates a dataset object where each element is a tuple of the corresponding elements from the input tensors. 

- `return torch.utils.data.DataLoader(dataset, self.batch_size, shuffle=train, drop_last = True)` returns an instance of the `DataLoader` class. This class is a PyTorch utility that provides an iterable over the dataset, yielding batches of data (shuffled if `train=True`). With `drop_last=True`, a final batch smaller than `batch_size` is dropped.


# #2

1. If the dataset is too large to fit into memory, an "out of memory" error will occur. In such cases, only a portion of the data can be loaded, preventing complete shuffling of the entire dataset.

##### Why is Shuffling of Data Important ? 
[https://datascience.stackexchange.com/a/24539]

[https://datascience.stackexchange.com/a/24524]
Shuffling data serves the purpose of reducing variance and making sure that models remain general and overfit less.

The obvious case where you'd shuffle your data is if your data is sorted by their class/target. Here, you will want to shuffle to make sure that your training/test/validation sets are representative of the overall distribution of the data.

For batch gradient descent, the same logic applies. The idea behind batch gradient descent is that by calculating the gradient on a single batch, you will usually get a fairly good estimate of the "true" gradient. That way, you save computation time by not having to calculate the "true" gradient over the entire dataset every time.

You want to shuffle your entire dataset after each epoch because otherwise you always run the risk of creating batches that are not representative of the overall dataset, and therefore your estimate of the gradient will be off. Shuffling your data after each epoch ensures that you will not be "stuck" with too many bad batches.
(Epoch: one complete pass over the entire training set, i.e. all the iterations needed for the model to see every training example once.)

In regular stochastic gradient descent, when each batch has size 1, you still want to shuffle your data after each epoch to keep your learning general. Indeed, if data point 17 is always used after data point 16, its own gradient will be biased with whatever updates data point 16 is making on the model. By shuffling your data, you ensure that each data point creates an "independent" change on the model, without being biased by the same points before them.

2. ??
[https://www.youtube.com/watch?v=onWTiaDx3Lw]

[Apache Spark external sorting algorithm]

[Apache Hadoop external sorting algorithm]

[MapReduce: Simplified Data Processing on Large Clusters,Jeffrey Dean and Sanjay Ghemawat]

[External Memory Algorithms and Data Structures:Dealing with Massive Data, JEFFREY SCOTT VITTER]

# #3

In [None]:
class SyntheticRegressionData_onfly(d2l.DataModule):
    """Synthetic linear-regression data, ``y = Xw + b + noise``.

    All ``num_train + num_val`` examples are generated once in ``__init__``
    and stored in ``self.X`` / ``self.y``.
    """

    def __init__(self, w, b, noise=0.01, num_train=1000, num_val=1000,
                 batch_size=32):
        super().__init__()
        # Must run before any new local is created, because it snapshots the
        # locals of this frame.
        self.save_hyperparameters()
        total = num_train + num_val
        # Features first, then the noise term, to keep the random stream order.
        self.X = torch.randn(total, len(w))
        perturbation = torch.randn(total, 1) * noise
        column_w = w.reshape((-1, 1))
        self.y = torch.matmul(self.X, column_w) + b + perturbation

# Build a dataset with true weights [2, -3.4] and bias 4.2, then show the
# first example.
# NOTE(review): this instantiates SyntheticRegressionData, not the
# SyntheticRegressionData_onfly class defined just above - confirm which one
# was intended.
data = SyntheticRegressionData(w=torch.tensor([2, -3.4]), b=4.2)
print('features:', data.X[0], '\nlabel:', data.y[0])

This is the same as `SyntheticRegressionData`: the data are still generated once in `__init__`.

# #4

In [None]:
class SyntheticRegressionData(d2l.DataModule):
    """Reproducible synthetic linear-regression data, ``y = Xw + b + noise``.

    ``seed`` is passed to ``torch.manual_seed`` before any sample is drawn,
    so the same seed yields the same ``X`` and ``y``. This reseeds PyTorch's
    global random number generator as a side effect.
    """

    def __init__(self, w, b, noise=0.01, num_train=1000, num_val=1000,
                 batch_size=32, seed: int = 0):
        super().__init__()
        # Must run before any new local is created, because it snapshots the
        # locals of this frame.
        self.save_hyperparameters()
        total = num_train + num_val
        torch.manual_seed(seed)
        # Features first, then the noise term, to keep the random stream order.
        self.X = torch.randn(total, len(w))
        perturbation = torch.randn(total, 1) * noise
        column_w = w.reshape((-1, 1))
        self.y = torch.matmul(self.X, column_w) + b + perturbation

# Build the dataset with a fixed seed so the printed example is the same on
# every run, then show the first example.
data = SyntheticRegressionData(w=torch.tensor([2, -3.4]), b=4.2, seed=5)
print('features:', data.X[0], '\nlabel:', data.y[0])

features: tensor([1.8423, 0.5189]) 
label: tensor([6.1095])


`torch.manual_seed(seed)` initializes the random number generator (RNG) with a specific seed, ensuring that any subsequent random numbers generated will be deterministic and reproducible.