In [18]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import datasets

from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
import pandas as pd

First, let's create a Custom Dataset using random numbers, inheriting from the `Dataset` class:

In [5]:
class RandomIntDataset(Dataset):
    """Toy dataset of random integer feature rows paired with random integer labels.

    Args:
        start: Inclusive lower bound of the feature values.
        stop: Exclusive upper bound of the feature values.
        x: Number of samples (rows).
        y: Number of features per sample (columns).
        num_classes: Labels are drawn from ``[0, num_classes)``. Defaults to 10.
        generator: Optional ``torch.Generator`` used for both draws. Pass a
            seeded generator to get a reproducible dataset; ``None`` falls
            back to the global torch RNG.
    """

    def __init__(self, start, stop, x, y, num_classes=10, generator=None):
        super().__init__()
        # Features are drawn first, then labels, both from the same generator.
        self.data = torch.randint(start, stop, (x, y), generator=generator)
        self.labels = torch.randint(0, num_classes, (x,), generator=generator)

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __str__(self):
        """Return the string form of the data with the labels appended as a last column."""
        # unsqueeze turns the 1-D label vector into a column so it can be
        # concatenated next to the 2-D feature matrix.
        return str(torch.cat((self.data, self.labels.unsqueeze(1)), 1))

    def __getitem__(self, i):
        """Return the ``(features, label)`` pair stored at index ``i``."""
        return self.data[i], self.labels[i]

In [6]:
# 500 samples of 10 features each, with feature values drawn from [100, 1000)
dataset = RandomIntDataset(start=100, stop=1000, x=500, y=10)

There are some useful properties that we can use with our custom *Dataset*:

In [14]:
# len() is served by RandomIntDataset.__len__, i.e. the number of labels
len(dataset)

500

In [15]:
# str() goes through the custom __str__: the data with the labels appended as a final column
str(dataset)

'tensor([[779, 619, 804,  ..., 499, 265,   9],\n        [244, 943, 755,  ..., 220, 688,   9],\n        [587, 379, 261,  ..., 660, 522,   6],\n        ...,\n        [260, 759, 319,  ..., 638, 729,   6],\n        [611, 241, 974,  ..., 112, 688,   7],\n        [967, 677, 133,  ..., 209, 144,   7]])'

Also, because we inherit from the `Dataset` class, we can iterate over our data in batches with `DataLoader`:

In [17]:
# Wrap the dataset in a loader that reshuffles it and yields batches of 10 samples
dataset_loader = DataLoader(dataset=dataset, batch_size=10, shuffle=True)
# Pull only the first batch; each batch is a (data, labels) pair
data, labels = next(iter(dataset_loader))
data

tensor([[683, 484, 235, 188, 722, 391, 935, 723, 572, 824],
        [154, 960, 672, 540, 746, 491, 173, 704, 442, 182],
        [511, 125, 257, 619, 276, 502, 537, 963, 307, 999],
        [729, 749, 271, 924, 578, 784, 337, 858, 420, 303],
        [805, 383, 557, 768, 214, 244, 739, 448, 408, 425],
        [823, 789, 699, 961, 356, 135, 433, 248, 454, 947],
        [226, 181, 478, 139, 869, 629, 177, 442, 349, 901],
        [838, 379, 330, 698, 991, 659, 635, 667, 520, 970],
        [444, 171, 988, 759, 397, 267, 758, 985, 728, 760],
        [854, 934, 190, 433, 552, 100, 822, 860, 297, 793]])

You can also combine a `Dataset` with pandas readers (e.g. `pd.read_csv`) to build a dataset from your own data:

In [49]:
class TaxiSample(Dataset):
    """Dataset of taxi-trip features and trip durations read from a CSV file.

    Args:
        csv_path: Path of the CSV file to load. Defaults to the sample file
            used in this notebook.
        feature_columns: Names of the columns stacked into the feature tensor,
            in this order.
        target_column: Name of the column used as the label.

    Attributes:
        features: ``float32`` tensor with one row per CSV row and one column
            per entry of ``feature_columns``.
        labels: ``float32`` tensor holding ``target_column``.
    """

    def __init__(self,
                 csv_path='data/taxi_data_sample.csv',
                 feature_columns=('passenger_count',
                                  'pickup_longitude',
                                  'pickup_latitude',
                                  'dropoff_longitude',
                                  'dropoff_latitude'),
                 target_column='trip_duration'):
        super().__init__()
        df = pd.read_csv(csv_path)

        # list(...) so that a tuple of names selects several columns instead of
        # being interpreted as a single (multi-index style) key.
        self.features = torch.tensor(df[list(feature_columns)].values,
                                     dtype=torch.float32)

        self.labels = torch.tensor(df[target_column].values,
                                   dtype=torch.float32)

    def __len__(self):
        """Return the number of rows loaded."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at index ``idx``."""
        return self.features[idx], self.labels[idx]

In [57]:
# Defining data and loader
data_taxi = TaxiSample()
dataset_loader = DataLoader(data_taxi, batch_size=20, shuffle=True)

In [58]:
# Keep the iterator in its own variable so further batches can be pulled with next()
data_iterator = iter(dataset_loader)
# First batch: features and their matching labels
data, labels = next(data_iterator)

In [60]:
# Display the feature batch together with its labels as a tuple
data, labels

(tensor([[  1.0000, -73.9939,  40.7404, -73.9811,  40.7452],
         [  1.0000, -73.9975,  40.7417, -74.0117,  40.7078],
         [  1.0000, -73.9758,  40.7916, -73.9590,  40.7811],
         [  3.0000, -73.9726,  40.7942, -73.9869,  40.7767],
         [  5.0000, -74.0049,  40.7375, -73.9994,  40.7226],
         [  1.0000, -73.9825,  40.7685, -73.9686,  40.7647],
         [  2.0000, -73.9584,  40.7643, -73.9566,  40.7671],
         [  1.0000, -73.9780,  40.7865, -73.9697,  40.7602],
         [  1.0000, -73.9707,  40.7885, -73.9814,  40.7618],
         [  1.0000, -73.9786,  40.7410, -73.9916,  40.7392],
         [  1.0000, -73.7786,  40.6467, -73.9572,  40.7676],
         [  2.0000, -73.9700,  40.7892, -73.9644,  40.7548],
         [  1.0000, -73.9863,  40.7458, -73.9772,  40.7436],
         [  1.0000, -73.9558,  40.7643, -73.9670,  40.7600],
         [  1.0000, -74.0075,  40.7429, -73.9946,  40.7504],
         [  1.0000, -73.9650,  40.8001, -73.9466,  40.8061],
         [  1.0000, -74.