# PyTorch Custom Data Creation (from Food101)

Making a dataset to use with notebook 04 (creating a PyTorch dataset).

Going to leverage the fact that PyTorch now incorporates Food101.

I'll get the images for 3 classes: Pizza, Steak, Sushi and store them in their respective files.

> **Note:** To use the Food101 dataset with PyTorch requires PyTorch 1.11+ and `torchvision` 0.12.0+.

Want:

```
pizza_steak_sushi/
    train/
        pizza/
            image01.jpeg
            image02.jpeg
            ...
        steak/
            image04.jpeg
            image05.jpeg
            ...
        sushi/
            image07.jpeg
            ...
    test/
        pizza/
            image101.jpeg
            image102.jpeg
            ...
        steak/
            image104.jpeg
            image105.jpeg
            ...
        sushi/
            image107.jpeg
            ...
```

In [19]:
import torch
import torchvision

# Note: Required to have PyTorch > 1.11.0 & torchvision > 0.12.0 for Food101 dataset
#assert int(torch.__version__.split(".")[1]) >= 11
#assert int(torchvision.__version__.split(".")[1]) >= 11

import torchvision.datasets as datasets
import torchvision.transforms as transforms

# Setup data directory
import pathlib
data_dir = pathlib.Path("../data")

## Download data

Get the Food101 dataset from PyTorch.
* Food101 in `torchvision.datasets` - https://pytorch.org/vision/stable/generated/torchvision.datasets.Food101.html
* Original Food101 dataset - https://data.vision.ee.ethz.ch/cvl/datasets_extra/food-101/ 

> **Note:** Downloading the dataset from PyTorch may take ~10-15 minutes depending on your internet speed. It will download ~5GB of data to the specified `root` directory.

In [20]:
# Get training data
train_data = datasets.Food101(root=data_dir,
                              split="train",
                              # transform=transforms.ToTensor(),
                              download=True)

# Get testing data
test_data = datasets.Food101(root=data_dir,
                             split="test",
                             # transform=transforms.ToTensor(),
                             download=True)

In [21]:
train_data

Dataset Food101
    Number of datapoints: 75750
    Root location: ../data
    split=train

In [22]:
class_names = train_data.classes
class_names[:10]

['apple_pie',
 'baby_back_ribs',
 'baklava',
 'beef_carpaccio',
 'beef_tartare',
 'beet_salad',
 'beignets',
 'bibimbap',
 'bread_pudding',
 'breakfast_burrito']

In [None]:
# View first sample (PIL Image format)
print(class_names[train_data[0][1]])
train_data[0][0]

## Find subset of appropriate classes

Want: Steak, pizza, sushi.

Current path setup:

```
../data/food-101/images/CLASS_NAME/IMAGES.jpg
```

Going to get a list of the different target image classes (`pizza`, `steak`, `sushi`) filenames and then copy the images to separate folders.

I'd like to get a random 10% of the images from the target classes from both datasets.

In [30]:
# Get random 10% of training images
import random

# Setup data paths
data_path = data_dir / "food-101" / "images"
target_classes = ["pizza", "steak", "sushi"]

# Change amount of data to get (e.g. 0.1 = random 10%, 0.2 = random 20%)
amount_to_get = 0.77

# Create function to separate a random amount of data
def get_subset(image_path=data_path,
               data_splits=["train", "test"], 
               target_classes=["pizza", "steak", "sushi"],
               amount=0.1,
               seed=42):
    random.seed(73)
    label_splits = {}
    
    # Get labels
    for data_split in data_splits:
        print(f"[INFO] Creating image split for: {data_split}...")
        label_path = data_dir / "food-101" / "meta" / f"{data_split}.txt"
        with open(label_path, "r") as f:
            labels = [line.strip("\n") for line in f.readlines() if line.split("/")[0] in target_classes] 
        
        # Get random subset of target classes image ID's
        number_to_sample = round(amount * len(labels))
        print(f"[INFO] Getting random subset of {number_to_sample} images for {data_split}...")
        sampled_images = random.sample(labels, k=number_to_sample)
        
        # Apply full paths
        image_paths = [pathlib.Path(str(image_path / sample_image) + ".jpg") for sample_image in sampled_images]
        label_splits[data_split] = image_paths
    return label_splits
        
label_splits = get_subset(amount=amount_to_get)
label_splits["train"][:10]

[INFO] Creating image split for: train...
[INFO] Getting random subset of 1732 images for train...
[INFO] Creating image split for: test...
[INFO] Getting random subset of 578 images for test...


[PosixPath('../data/food-101/images/steak/290850.jpg'),
 PosixPath('../data/food-101/images/pizza/3464027.jpg'),
 PosixPath('../data/food-101/images/sushi/3792053.jpg'),
 PosixPath('../data/food-101/images/sushi/3218663.jpg'),
 PosixPath('../data/food-101/images/steak/1000205.jpg'),
 PosixPath('../data/food-101/images/sushi/2847583.jpg'),
 PosixPath('../data/food-101/images/steak/3223601.jpg'),
 PosixPath('../data/food-101/images/pizza/2481333.jpg'),
 PosixPath('../data/food-101/images/sushi/3392092.jpg'),
 PosixPath('../data/food-101/images/sushi/166134.jpg')]

## Move training and testing images to dedicated folders

In [31]:
# Create target directory path
target_dir_name = f"../data/pizza_steak_sushi_{str(int(amount_to_get*100))}_percent"
print(f"Creating directory: '{target_dir_name}'")

# Setup the directories
target_dir = pathlib.Path(target_dir_name)

# Make the directories
target_dir.mkdir(parents=True, exist_ok=True)

Creating directory: '../data/pizza_steak_sushi_77_percent'


In [32]:
import shutil

for image_split in label_splits.keys():
    for image_path in label_splits[str(image_split)]:
        dest_dir = target_dir / image_split / image_path.parent.stem / image_path.name
        if not dest_dir.parent.is_dir():
            dest_dir.parent.mkdir(parents=True, exist_ok=True)
        print(f"[INFO] Copying {image_path} to {dest_dir}...")
        shutil.copy2(image_path, dest_dir)

[INFO] Copying ../data/food-101/images/steak/290850.jpg to ../data/pizza_steak_sushi_77_percent/train/steak/290850.jpg...
[INFO] Copying ../data/food-101/images/pizza/3464027.jpg to ../data/pizza_steak_sushi_77_percent/train/pizza/3464027.jpg...
[INFO] Copying ../data/food-101/images/sushi/3792053.jpg to ../data/pizza_steak_sushi_77_percent/train/sushi/3792053.jpg...
[INFO] Copying ../data/food-101/images/sushi/3218663.jpg to ../data/pizza_steak_sushi_77_percent/train/sushi/3218663.jpg...
[INFO] Copying ../data/food-101/images/steak/1000205.jpg to ../data/pizza_steak_sushi_77_percent/train/steak/1000205.jpg...
[INFO] Copying ../data/food-101/images/sushi/2847583.jpg to ../data/pizza_steak_sushi_77_percent/train/sushi/2847583.jpg...
[INFO] Copying ../data/food-101/images/steak/3223601.jpg to ../data/pizza_steak_sushi_77_percent/train/steak/3223601.jpg...
[INFO] Copying ../data/food-101/images/pizza/2481333.jpg to ../data/pizza_steak_sushi_77_percent/train/pizza/2481333.jpg...
[INFO] Cop

In [33]:
# Check lengths of directories
def walk_through_dir(dir_path):
  """
  Walks through dir_path returning its contents.
  Args:
    dir_path (str): target directory
  
  Returns:
    A print out of:
      number of subdiretories in dir_path
      number of images (files) in each subdirectory
      name of each subdirectory
  """
  import os
  for dirpath, dirnames, filenames in os.walk(dir_path):
    print(f"There are {len(dirnames)} directories and {len(filenames)} images in '{dirpath}'.")
    
walk_through_dir(target_dir)

There are 2 directories and 0 images in '../data/pizza_steak_sushi_77_percent'.
There are 3 directories and 0 images in '../data/pizza_steak_sushi_77_percent/train'.
There are 0 directories and 578 images in '../data/pizza_steak_sushi_77_percent/train/sushi'.
There are 0 directories and 582 images in '../data/pizza_steak_sushi_77_percent/train/steak'.
There are 0 directories and 572 images in '../data/pizza_steak_sushi_77_percent/train/pizza'.
There are 3 directories and 0 images in '../data/pizza_steak_sushi_77_percent/test'.
There are 0 directories and 188 images in '../data/pizza_steak_sushi_77_percent/test/sushi'.
There are 0 directories and 196 images in '../data/pizza_steak_sushi_77_percent/test/steak'.
There are 0 directories and 194 images in '../data/pizza_steak_sushi_77_percent/test/pizza'.


Looks like we've got about ~75 training images per class and ~25 testing images per class (or more if you're using a higher percentage, e.g. ~150 training images per class and ~50 testing images per class for 20% of the data).

This should be enough for a starting dataset.

We can always increased them if needed.

## Zip up images folder to be more easily transported

In [34]:
# Zip pizza_steak_sushi images
zip_file_name = data_dir / f"pizza_steak_sushi_{str(int(amount_to_get*100))}_percent"
shutil.make_archive(zip_file_name, 
                    format="zip", 
                    root_dir=target_dir)

'/data/pizza_steak_sushi_77_percent.zip'

In [35]:
!ls -la ../data/

total 5299732
drwxr-xr-x 6 root root       4096 May 23 15:12 .
drwxr-xr-x 1 root root       4096 May 23 14:52 ..
drwxr-xr-x 4 3156  320       4096 Jul  9  2014 food-101
-rw-r--r-- 1 root root 4996278331 May 23 14:56 food-101.tar.gz
drwxr-xr-x 4 root root       4096 May 23 14:58 pizza_steak_sushi_100_percent
-rw-r--r-- 1 root root  158207541 May 23 14:59 pizza_steak_sushi_100_percent.zip
drwxr-xr-x 4 root root       4096 May 23 15:12 pizza_steak_sushi_77_percent
-rw-r--r-- 1 root root  121983360 May 23 15:12 pizza_steak_sushi_77_percent.zip
drwxr-xr-x 4 root root       4096 May 23 15:09 pizza_steak_sushi_95_percent
-rw-r--r-- 1 root root  150421213 May 23 15:11 pizza_steak_sushi_95_percent.zip


In [36]:
!mkdir -p pizza_steak_sushi
!unzip ../data/pizza_steak_sushi_20_percent.zip -d pizza_steak_sushi

unzip:  cannot find or open ../data/pizza_steak_sushi_20_percent.zip, ../data/pizza_steak_sushi_20_percent.zip.zip or ../data/pizza_steak_sushi_20_percent.zip.ZIP.


In [37]:
!ls ../data

food-101			   pizza_steak_sushi_77_percent
food-101.tar.gz			   pizza_steak_sushi_77_percent.zip
pizza_steak_sushi_100_percent	   pizza_steak_sushi_95_percent
pizza_steak_sushi_100_percent.zip  pizza_steak_sushi_95_percent.zip


In [38]:
walk_through_dir("pizza_steak_sushi")

There are 0 directories and 0 images in 'pizza_steak_sushi'.


In [39]:
# # Remove extra data
# import os
# os.remove("pizza_steak_sushi")
# shutil.rmtree("pizza_steak_sushi")

In [40]:
!ls

pizza_steak_sushi  sample_data


In [41]:
# Imports
import os
import zipfile
from pathlib import Path
import requests


# Mount the GDrive
from google.colab import drive
drive.mount('/content/drive')





Mounted at /content/drive


In [42]:
data_path = "/content/drive/Othercomputers/My MacBook Air/GitHub/-Machine_Learning/Learning_Pytorch/"


In [45]:
!cp "/data/pizza_steak_sushi_95_percent.zip" "/content/drive/Othercomputers/My MacBook Air/GitHub/-Machine_Learning/Learning_Pytorch/"