preface

cell no.: contents
3: Downloading data
5: Transforming the dataset in desired format
8: Checking lengths of dirs(walk_through_dir)
9: Zip images folder to be easily transported
13: How to remove these dirs (for learning purposes)
15: Train and Test dir paths

In [1]:
import torch, torchvision, os, requests
from torch import nn
import matplotlib.pyplot as plt
import torchvision.datasets as datasets
import torchvision.transforms as transforms

In [2]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

In [None]:
# Download the Food101 dataset (train and test splits) through torchvision.
import pathlib

data_dir = pathlib.Path('/data')
image_path = data_dir / 'pizza_steak_sushi'

# Arguments shared by both splits. No transform is passed
# (transform=transforms.ToTensor() is left out).
food101_kwargs = dict(root=data_dir, download=True)

# Training split
train_data = datasets.Food101(split="train", **food101_kwargs)

# Testing split
test_data = datasets.Food101(split="test", **food101_kwargs)

# NOTE: the dataset is about 5 GB, so the download takes some time.

Downloading https://data.vision.ee.ethz.ch/cvl/food-101.tar.gz to /data/food-101.tar.gz


 93%|█████████▎| 4.63G/5.00G [05:13<00:29, 12.4MB/s]

In [None]:
# Look at the label names and at the first training sample.
class_names = train_data.classes
print('first 10 class names:')
print(class_names[:10])

# A sample is an (image, label index) pair; the image is in PIL format.
first_image, first_label = train_data[0]
print(class_names[first_label])
first_image

### In the following cells I will transform the Food-101 data into a layout like this (the code additionally places each class folder under a `train/` or `test/` folder):

/data/pizza_steak_sushi_10_percent/
  - pizza/
    - image1.jpg
    - image2.jpg
  - steak/
    - image3.jpg
  - sushi/
    - image4.jpg


In [None]:
# Transforming the dataset into the desired format
# The following cell is for practice

# First we will train a model on 3 classes: pizza, steak, sushi

import random
data_path = data_dir / "food-101" / "images"
target_classes = ['pizza', 'steak', 'sushi']
amount_to_get = .1

# Take 10% of the Food101 images of the target classes (10% because we want to
# start small); amount_to_get above holds that fraction.
# A function to separate out a random amount of the data

def get_subset(image_path=data_path,
               data_splits=["train", "test"],
               target_classes=["pizza", "steak", "sushi"],
               amount=0.1,
               seed=42):
    """Pick a random fraction of the Food101 images of the chosen classes.

    Args:
        image_path: Directory that contains the per-class image folders.
        data_splits: Split names; "<split>.txt" is read from the dataset's
            "meta" folder for each of them.
        target_classes: Class (folder) names to keep.
        amount: Fraction of the matching images to sample for every split.
        seed: Seed for the ``random`` module, so the sample is reproducible.

    Returns:
        dict mapping each split name to a list of ``pathlib.Path`` image files.
    """
    # Seed from the argument so that callers can actually vary the sample.
    random.seed(seed)
    wanted_classes = set(target_classes)
    label_splits = {}

    for data_split in data_splits:
        print(f"[INFO] Creating image split for: {data_split}...")
        label_path = data_dir / "food-101" / "meta" / f"{data_split}.txt"

        # Every line of the listing is "<class name>/<image id>" (the ".jpg"
        # extension is appended further down). Keep only the target classes
        # and drop the trailing newline of each kept line.
        with open(label_path, "r") as f:
            labels = [line.strip("\n") for line in f
                      if line.split("/")[0] in wanted_classes]

        number_to_sample = round(amount * len(labels))
        print(f"[INFO] Getting random subset of {number_to_sample} images for {data_split}...")
        sampled_images = random.sample(labels, k=number_to_sample)

        # Turn the sampled listing entries into full paths to the image files.
        label_splits[data_split] = [
            pathlib.Path(str(image_path / sample_image) + ".jpg")
            for sample_image in sampled_images
        ]

    return label_splits

# Build the sampled image lists, using the fraction stored in amount_to_get.
label_splits = get_subset(amount=amount_to_get)
# Preview the first 10 sampled training image paths.
label_splits["train"][:10]

In [None]:
# Create the directory that will hold the sampled subset.
import pathlib

amount = .1
target_dir_name = f"../data/pizza_steak_sushi_{int(amount * 100)}_percent"
print(f"Creating directory {target_dir_name}")

# pathlib is used here instead of os (os was giving me problems).
target_dir = pathlib.Path(target_dir_name)

# parents=True creates any missing parent folders; exist_ok=True makes it
# safe to re-run the cell when the directory is already there.
target_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Copy every sampled image to target_dir/<split>/<class name>/<file name>.
# Path reminders, for a file such as ".../images/pizza/12345.jpg":
#   .parent -> the containing directory (".../images/pizza")
#   .name   -> the last path component with its extension ("12345.jpg")
#   .stem   -> the last path component without its extension ("12345")
# So <file>.parent.name is the class folder name ("pizza").

import shutil  # high-level file and directory operations

# The loop variable is intentionally not called image_path: that name holds
# the dataset folder defined in the download cell and is read again when the
# train/test paths are built, so it must not be overwritten here.
for image_split, split_images in label_splits.items():
    for src_file in split_images:
        dest_file = target_dir / image_split / src_file.parent.name / src_file.name
        # Make sure <split>/<class name>/ exists before copying into it.
        dest_file.parent.mkdir(parents=True, exist_ok=True)
        print(f"[INFO] Copying {src_file} to {dest_file}...")
        shutil.copy2(src_file, dest_file)

In [None]:
# Checking lengths of dirs(walk_through_dir)


# A helper function to walk through a directory and see its contents.
# os.walk() is used for this purpose:
# It generates a 3-tuple for each directory:

# Root Directory: The current directory path.
# Directories: A list of subdirectories within the current directory.
# Files: A list of files within the current directory.

def walk_through_dir(dir_path):
    """Print a summary line for dir_path and for every directory below it.

    Args:
        dir_path (str): directory to start walking from

    Returns:
        Nothing. For each directory visited, one line is printed with:
          the number of subdirectories directly inside it
          the number of files (images) directly inside it
          the path of the directory itself
    """
    import os

    for current_dir, subdirs, files in os.walk(dir_path):
        dir_count = len(subdirs)
        file_count = len(files)
        print(f"There are {dir_count} directories and {file_count} images in '{current_dir}'.")

# Print the directory/file counts of the subset that was copied into target_dir.
walk_through_dir(target_dir)

# Zip images folder to be easily transported

In [None]:
# Zip the contents of target_dir into <data_dir>/pizza_steak_sushi_<percent>_percent.zip
# (make_archive appends the ".zip" extension to the base name itself).
zip_file_name = data_dir / f"pizza_steak_sushi_{int(amount_to_get * 100)}_percent"
shutil.make_archive(base_name=zip_file_name, format="zip", root_dir=target_dir)

## I got a bit confused by the root_dir argument, so here is what it does.
This is how target_dir looks:

/data/pizza_steak_sushi_10_percent/
  - pizza/
    - image1.jpg
    - image2.jpg
  - steak/
    - image3.jpg
  - sushi/
    - image4.jpg

This is how the archive will look:
pizza_steak_sushi_10_percent.zip
  - pizza/
    - image1.jpg
    - image2.jpg
  - steak/
    - image3.jpg
  - sushi/
    - image4.jpg

So root_dir is the directory whose contents become the top level of the archive: everything inside target_dir is zipped, without the target_dir folder itself.


In [None]:
# Long-format listing (including hidden entries) of ../data/
!ls -la ../data/

In [None]:
import os

# Check if the directory exists
if not os.path.exists("pizza_steak_sushi"):
  # Create the directory if it doesn't exist
  !mkdir -p pizza_steak_sushi # -p means to create any necessary parents dir that don't exists
else:
  print("Directory 'pizza_steak_sushi' already exists. Skipping creation.")

# Unzip the archive
!unzip -o ../data/pizza_steak_sushi_10_percent.zip -d pizza_steak_sushi
# -o tells unzip to overwrite the files(so it would ask for replace in the terminal)
# -d stads for destination(a shell command)

In [None]:
# Print the directory/file counts of the unzipped "pizza_steak_sushi" folder.
walk_through_dir("pizza_steak_sushi")

# How to remove these dirs

In [None]:
# # Remove extra data
# import os
# os.remove("pizza_steak_sushi")      <- removes a file; raises an error if "pizza_steak_sushi" is a directory
# shutil.rmtree("pizza_steak_sushi")  <- recursively removes the directory and its subdirectories

# I need these files of course so I commented these out

# The following is not needed as I created the zip myself

In [None]:
# if os.path.isdir(data_path):
#   print(f"{image_path} directory exists.")
# else:
#   print(f"Did not find {image_path} directory, creating one...")
#   os.makedirs(image_path, exist_ok=True)
#   with open(data_path / "pizza_steak_sushi.zip", "wb") as f:
#     response = requests.get("https://github.com/mrdbourke/pytorch-deep-learning/raw/main/data/pizza_steak_sushi.zip")
#     print("Downloading pizza, steak, sushi data...")
#     f.write(response.content)

#   with zipfile.ZipFile(data_path/"pizza_steak_sushi.zip", 'r') as zip_ref:
#     print("Unzipping pizza, steak, sushi data...")
#     zip_ref.extractall(image_path)

In [None]:
# Train and test dir paths
import pathlib

# From the walk_through_dir("pizza_steak_sushi") output: there are about 75
# training images each for pizza, steak and sushi, and 19, 31, 25 test images
# (starting small, then gradually increasing the number as needed).

# Point at the folder the archive was unzipped into. The earlier image_path
# value cannot be used: it is "/data/pizza_steak_sushi", which no cell ever
# creates, and the copy loop may have reused the name for individual files.
image_path = pathlib.Path("pizza_steak_sushi")
train_dir = image_path / "train"
test_dir = image_path / "test"

train_dir, test_dir