## 1. Get data



In [12]:
from typing_extensions import dataclass_transform
import os
import requests
import zipfile
from pathlib import Path
from torchvision import datasets, transforms

# Setup path to data folder
data_path = Path("data/")
image_path = data_path / "pizza_steak_sushi"

# If the image folder doesnt exist download
if image_path.is_dir():
  print(f"{image_path} already exist")
else:
  print(f"Not available yet. creating one")
  image_path.mkdir(parents=True, exist_ok=True)

# Download dataset
with open(data_path / "pizza_steak_sushi.zip", "wb") as f:
  request = requests.get("https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip")
  print("Downloading data")
  f.write(request.content)

# Unzip pizza, steak, sushi data
with zipfile.ZipFile(data_path / "pizza_steak_sushi.zip", "r") as zip_ref:
    print("Unzipping pizza, steak, sushi data...")
    zip_ref.extractall(image_path)

# Remove zip file
os.remove(data_path / "pizza_steak_sushi.zip")

# create simple transfomr
data_transform = transforms.Compose([
    transforms.Resize((64,64)),
    transforms.ToTensor(),
])

data/pizza_steak_sushi already exist
Downloading data
Unzipping pizza, steak, sushi data...


## 2. Datasets and Dataloader with script

In [16]:
# Create a directory for going_modular scripts
import os
os.makedirs("going_modular")

FileExistsError: [Errno 17] File exists: 'going_modular'

In [14]:
%%writefile going_modular/data_setup.py

""" Contains functionality for creating PyTorch DataLoader for image classification data """
import os

from torchvision import datasets, transforms
from torch.utils.data import DataLoader

NUM_WORKERS = os.cpu_count()

def create_dataloaders(
    train_dir: str,
    test_dir: str,
    transform: transforms.Compose,
    batch_size: int,
    numworkers: int=NUM_WORKERS
):
  """ Creates training and testing DataLoaders
  Takes in a training directory and testing directory path and turns
  them into PyTorch Datasets and then into PyTorch DataLoaders.

  Args:
    train_dir: Path to training directory.
    test_dir: Path to testing directory.
    transform: torchvision transforms to perform on training and testing data.
    batch_size: Number of samples per batch in each of the DataLoaders.
    num_workers: An integer for number of workers per DataLoader.

  Returns:
    A tuple of (train_dataloader, test_dataloader, class_names).
    Where class_names is a list of the target classes.
    Example usage:
      train_dataloader, test_dataloader, class_names = \
        = create_dataloaders(train_dir=path/to/train_dir,
                             test_dir=path/to/test_dir,
                             transform=some_transform,
                             batch_size=32,
                             num_workers=4)
  """

  # Use imagefolder to create dataset
  train_data = datasets.ImageFolder(train_dir, transform = transform)
  test_data = datasets.ImageFolder(test_dir, transform = transform)

  train_dir = image_path / "train"
  test_dir = image_path / "test"

  # Get class names
  class_names = train_data.classes

  # Turn images into data loaders
  train_dataloader = DataLoader(
      train_data,
      batch_size = batch_size,
      shuffle = True,
      num_workers = NUM_WORKERS,
      pin_memory= True
  )
  test_dataloader = DataLoader (
      test_data,
      batch_size=batch_size,
      shuffle=False,
      num_workers=NUM_WORKERS,
      pin_memory=True
  )

  return train_dataloader, test_dataloader, class_names

Overwriting going_modular/data_setup.py


In [15]:
# Test out the data_setup.py
# Import data_setup.py
from going_modular import data_setup

# Create train/test dataloader and get class names as a list
train_dataloader, test_dataloader, class_names = data_setup.create_dataloaders(train_dir = train_dir,
                                                                               test_dir = test_dir,
                                                                               transforms = data_transform,
                                                                               batch_size=32)

NameError: name 'train_dir' is not defined

# I think its just adding docstring to our previous learn and use the %%writefile on top of it - https://www.learnpytorch.io/05_pytorch_going_modular/