# Imports

In [None]:
import torch
import sklearn
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Computer vision libraries
import torchvision
from torchvision import datasets
from torchvision.transforms import v2
#from torchvision import transforms as T


from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.utils.data import random_split
from torch.utils.data import ConcatDataset

# Directory libraries
import pathlib
import os
import shutil

# Extras
import random
from collections import defaultdict
from PIL import Image

# Jupyter Themes
from jupyterthemes import jtplot
jtplot.style(theme='gruvboxd')

In [None]:
# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
# `is_available` has to be *called*: the bare function object is always truthy,
# which would select 'cuda' even on machines without a GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

# Download data

In [None]:
# Root folder passed to the dataset constructors and used to locate `food-101`.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
data_path = r'C:\Users\alvar\Documentos\Jupyter-Projects\FreeCode\data'

In [None]:
# Build the training and testing Food101 datasets with identical settings.
# Each split gets its own ToImage transform; download=True asks torchvision
# to fetch the data into `data_path`.
train_data, test_data = (
    datasets.Food101(root=data_path,
                     split=split_name,
                     transform=v2.ToImage(),
                     download=True)
    for split_name in ("train", "test")
)

In [None]:
# Display both dataset objects as the cell output.
train_data, test_data

In [None]:
# Class names as exposed by the dataset's `classes` attribute.
class_names = train_data.classes
# Show the dataset type and the number of classes.
type(train_data), len(class_names)

In [None]:
# Print every class name next to its index as a fixed-width grid.

# Grid dimensions: num_rows * num_cols slots, unused slots stay empty
num_rows = 21
num_cols = 5

# Walk the grid row by row; each row covers `num_cols` consecutive indices
for row_start in range(0, num_rows * num_cols, num_cols):
    row_stop = min(row_start + num_cols, len(class_names))
    row_cells = [f'| {idx} {class_names[idx]:<25} ' for idx in range(row_start, row_stop)]
    print(''.join(row_cells), end='')
    print("\n\n") # Blank lines separate one row from the next

In [None]:
# Grab the first (image, label) pair of the training split
img, label = train_data[0]

In [None]:
# Preview the sample: move the first axis last for matplotlib
# (assumes img is channel-first (C, H, W) — TODO confirm).
plt.imshow(img.permute(1, 2, 0))
plt.title(class_names[label])
plt.axis(False);
print(img.shape)

# Subset

In [None]:
# Create function to separate a random amount of data
def get_subset(data_dir=None,
               data_splits=("train", "test"),
               target_classes=("pizza", "steak", "sushi"),
               amount=0.1):
    '''
    Select a random subset of Food101 image paths restricted to some classes.

    For every split, the image IDs listed in
    `<data_dir>/food-101/meta/<split>.txt` (one `class_name/image_id` per line)
    are filtered down to the lines whose class is in `target_classes`. Then
    `round(amount * n)` of the `n` remaining IDs are drawn at random without
    replacement.

    The draw is made over the pooled IDs of all target classes, not class by
    class, so the number of images per class may vary.

    Example:
    Food101 has 750 training and 250 testing images per class. With
    `amount=0.1` and 5 target classes:
    Training data: 10% of (5 * 750) = 375 images, e.g. [76, 79, 92, 53, 75]
    Testing data: 10% of (5 * 250) = 125 images

    Args:
    data_dir: Folder that contains the `food-101` directory. When omitted
    (None), the notebook-level `data_path` is used.
    data_splits: Names of the splits to process (one meta file per split).
    target_classes: Class names to keep.
    amount: Fraction (0.0 - 1.0) of the matching images to sample.

    Returns:
    A dictionary keyed by split name. Each value is a list with one path per
    sampled image: `<data_dir>/food-101/images/<class_name>/<image_id>.jpg`.

    Raises:
    ValueError: If `amount` asks for more images than are available (or for a
    negative number of images); raised by `random.sample`.
    '''
    # Resolve the default lazily so that an explicit `data_dir` is always honoured
    # and the function does not depend on `data_path` existing at definition time
    if data_dir is None:
        data_dir = data_path

    # A set gives constant-time membership tests while filtering the meta file
    wanted_classes = set(target_classes)

    label_splits = {}
    # Forward slashes are kept on purpose: downstream code parses these paths
    image_path = f'{data_dir}/food-101'

    for data_split in data_splits:
        print(f"[INFO] Creating image split for: {data_split}...")

        # The meta file lists every image of the split as `class_name/image_id`
        label_path = f'{image_path}/meta/{data_split}.txt'
        with open(label_path, "r", encoding="utf-8") as f:
            # Keep only the IDs whose class (text before the first "/") is wanted
            labels = [line.strip() for line in f if line.split("/")[0] in wanted_classes]

        # Total of elements to select
        number_to_sample = round(amount * len(labels))
        print(f"[INFO] Getting random subset of {number_to_sample} images for {data_split}...")
        # Random subset of image IDs (list with length equal to `number_to_sample`)
        sampled_images = random.sample(labels, k=number_to_sample)

        # Turn every sampled ID into the full path of its image file
        label_splits[data_split] = [f'{image_path}/images/{sample_image}.jpg'
                                    for sample_image in sampled_images]
    return label_splits

In [None]:
def print_classes_amount(dictionary_classes, data_split):
    '''
    Print how many elements each class holds, followed by the grand total.

    Args:
    dictionary_classes (dict): Maps each class name to a sized collection of items.
    data_split (str): Name of the split, used only in the printed header.
    '''
    print(f'***** {data_split} data *****')
    running_total = 0
    for class_name, members in dictionary_classes.items():
        class_size = len(members)
        print(f'{class_name}: {class_size}')
        running_total += class_size
    # The extra '\n' leaves a blank line after the summary
    print(f'Total elements: {running_total}' + '\n')

In [None]:
def get_amount_per_class(dict_data):
    '''
    Group the image paths of every split by class and print the class sizes.

    The class of an image is the name of the folder that directly contains it
    (e.g. `.../images/pizza/2572488.jpg` belongs to `pizza`), so the function
    works regardless of where the dataset is stored.

    Args:
    dict_data (dict): Dictionary where the key indicates if it's training or testing data. For each
    data (training, testing) the value is a list with the path of each image.

    Returns:
    A dictionary with the same keys as `dict_data`. Each value is another dictionary (a
    `defaultdict(list)`) that maps a class name to the list of image paths of that class.
    For example:
    dict_split.keys() ---> ['train', 'test']
    dict_split['train'].keys() ---> ['pizza', 'tacos', 'donuts']
    dict_split['train']['pizza'] ---> ['./food_path/pizza/12345.jpg', './food_path/pizza/12346.jpg', ...]
    dict_split['train']['tacos'] ---> ['./food_path/tacos/12355.jpg', './food_path/tacos/12356.jpg', ...]
    '''
    dict_split = {}

    for data_split, image_paths in dict_data.items():
        # A fresh defaultdict per split: missing classes start as an empty list
        lists_dict = defaultdict(list)

        for item in image_paths:
            # The parent folder of the image is its class name
            image_type = os.path.basename(os.path.dirname(item))
            lists_dict[image_type].append(item)

        # Add the dictionary to each type of data
        dict_split[data_split] = lists_dict

        # Print the amount of each class
        print_classes_amount(lists_dict, data_split)
    return dict_split

In [None]:
# Classes to keep in the subset
target_classes = ['tacos', 'ramen', 'pizza', 'guacamole', 'donuts']

# Fraction of the matching images to sample (e.g. 0.1 = random 10%, 0.2 = random 20%).
# `get_subset` applies it to the pooled images of all target classes, so the
# number of images per class can differ.
amount_to_get = 0.2

In [None]:
# Sample image paths for the chosen classes, then show how many were drawn per split.
dict_data = get_subset(target_classes=target_classes, amount=amount_to_get)
len(dict_data['train']), len(dict_data['test'])

In [None]:
# Group the sampled paths by class for each split (also prints the amount per class)
dict_per_class = get_amount_per_class(dict_data)

# Move training and testing images to dedicated folders

In [None]:
def copy_images_to(destiny_path, dict_data, amount_data):
    '''
    Copy the images from their source paths into a new dataset folder laid out as
    `<destiny_path>/<classes>_<percent>_percent/<split>/<class>/<image>`.
    The function creates all folders that are necessary.

    Args:
    * destiny_path: Directory path where the dataset folder is going to be created (if it doesn't
    exist, it will be created).
    * dict_data: Dictionary that contains the path of each image. The first keys are the splits
    (e.g. 'train' and 'test'), each split maps the class to which the images belong to the list of
    image paths.
    Example:
        dict_data.keys() ---> ['train', 'test']
        dict_data['train'].keys() ---> ['tacos', 'pizza', 'sushi', ...]
        dict_data['train']['tacos'] ---> ['image_path/1234.jpg', 'image_path/1235.jpg', ...]
    * amount_data: Fraction of data obtained, e.g. 0.2 (it's just to name the destination folder).

    Return:
    Return the directory path where all images were copied.
    '''

    # Folder name: every class found in `dict_data`, in first-seen order, joined by '_'
    all_classes = dict.fromkeys(
        img_class for classes in dict_data.values() for img_class in classes
    )
    target_dir_name = '_'.join(all_classes)

    # round() instead of int(): 0.29 * 100 is 28.999999999999996 and must name "29"
    percent = round(amount_data * 100)
    target_dir_path = os.path.join(destiny_path, f'{target_dir_name}_{percent}_percent')

    if os.path.exists(target_dir_path):
        print(f"Folder: {target_dir_path} exists!")
    else:
        os.makedirs(target_dir_path, exist_ok=True)
        print(f'Directory created at: {target_dir_path}')
    print()

    # Only the `dict_data` argument is read here (no notebook-level variables)
    for image_split, classes in dict_data.items():
        for img_class, img_paths in classes.items():
            dest_dir = os.path.join(target_dir_path, image_split, img_class)
            os.makedirs(dest_dir, exist_ok=True)
            for img_path in img_paths:
                # copy2 also preserves the file metadata
                shutil.copy2(img_path, dest_dir)
                print(f"[INFO] Copying {img_path} to {dest_dir}...")
    return target_dir_path

In [None]:
# Absolute path of the `data/food-101/` folder under the current working directory
current_directory = os.getcwd()
prueba = 'data/food-101/'  # "prueba" is Spanish for "test"
folder_path = os.path.join(current_directory, prueba)

In [None]:
# Copy the sampled images into their own folder tree and keep the returned root path
target_path = copy_images_to(folder_path, dict_per_class, amount_to_get)

# Becoming one with the data (data preparation and data exploration)

In [None]:
def walk_through_dir(dir_path):
    '''
    Walk through `dir_path` and print a summary of its contents.

    For `dir_path` and every directory below it, one line is printed with the
    number of sub-directories and the number of files it directly contains.
    Nothing is returned.

    Args:
    dir_path: Root directory to inspect.
    '''

    for dirpath, dirnames, filenames in os.walk(dir_path):
        # Print the path as reported by os.walk; slicing it at a fixed offset
        # only works for one specific absolute path length
        print(f'There are {len(dirnames)} directories and {len(filenames)} images in {dirpath}')

In [None]:
# Summarize the folder tree that was just created
walk_through_dir(target_path)

In [None]:
# Setup the training and testing directories (the `train` / `test` sub-folders of `target_path`)
TRAIN_DIR = target_path + '/train'
TEST_DIR = target_path + '/test'

# Transforming data

Before we can use our image data with PyTorch, we need to:  
1.- Turn our target data into tensors (in our case, a numerical representation of our images).  
2.- Turn it into a `torch.utils.data.Dataset` and subsequently a `torch.utils.data.DataLoader`. We'll call these `Dataset` and `DataLoader`.

In [None]:
# Basic transform: convert to an image tensor, scale to float32, resize to
# 256x256, flip horizontally half of the time and normalize every channel.
data_transform = v2.Compose([
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Resize(size=(256, 256)),
    v2.RandomHorizontalFlip(p=0.5),
    # Alternative normalization (presumably the ImageNet statistics), kept for reference:
    #v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    # (x - 0.5) / 0.5 per channel
    v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

# Transform for Image Augmentation: same conversion, resize and normalization
# as above, with extra random flips, rotation, grayscale and color jitter.
data_transform_2 = v2.Compose([
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Resize(size=(256, 256)),
    v2.RandomVerticalFlip(0.5),
    v2.RandomHorizontalFlip(),
    v2.RandomRotation(10),
    v2.RandomGrayscale(p=0.1),
    v2.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),
    v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
])

In [None]:
# Undo Normalize(mean=0.5, std=0.5), i.e. y = (x - 0.5) / 0.5.
# Normalizing again with mean = -0.5 / 0.5 = -1 and std = 1 / 0.5 = 2 gives
# (y + 1) / 2 = x, which recovers the original value.
reverse_normalize = v2.Compose([    
    v2.Normalize(mean=[-0.5 / 0.5, -0.5 / 0.5, -0.5 / 0.5], std=[1 / 0.5, 1 / 0.5, 1 / 0.5])    
])

In [None]:
# Apply the basic transform to the sample image and preview the result.
# NOTE(review): `a` is normalized with mean 0.5 / std 0.5, so its values are
# no longer confined to [0, 1] when plotted.
a = data_transform(img)  
plt.imshow(a.permute(1, 2, 0))

In [None]:
# Undo the normalization before plotting the transformed image
a_2 = reverse_normalize(a)
plt.imshow(a_2.permute(1, 2, 0))

# Dataset

In [None]:
class Food_Dataset(Dataset):
    '''
    Image classification dataset read from a folder with one sub-folder per class:

        root_dir/<class_name>/<image file>

    The folder is scanned once, when the dataset is created, so `__len__` and
    `__getitem__` do not touch the directory listing again and an index always
    refers to the same image. Files added afterwards are not picked up.

    Attributes:
    root_dir: Folder that was scanned.
    transform: Callable applied to every PIL image, or None.
    classes: Sorted names of the class folders; a label is an index into this list.
    samples: List of `(image_path, class_idx)` tuples ordered by class and then by file name.
    '''

    def __init__(self, root_dir, transform=None):
        '''
        Args:
        root_dir: Folder that contains one sub-folder per class.
        transform: Optional callable applied to every loaded image. When it is
        None the image is only converted with `v2.ToImage()`.
        '''
        self.root_dir = root_dir
        self.transform = transform
        # Each folder's name is the name of the class that it contains. Entries
        # that are not folders (stray files in the root) are ignored.
        self.classes = sorted(
            entry for entry in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, entry))
        )

        # Flat index of every image. Sorting the file names makes the order
        # deterministic, because os.listdir gives no ordering guarantee.
        self.samples = []
        for class_idx, class_dir in enumerate(self.classes):
            class_path = os.path.join(root_dir, class_dir)
            for img_name in sorted(os.listdir(class_path)):
                img_path = os.path.join(class_path, img_name)
                if os.path.isfile(img_path):
                    self.samples.append((img_path, class_idx))

    def __len__(self):
        # Total of images (all classes)
        return len(self.samples)

    def __getitem__(self, idx):
        '''
        Return the `(image, class_idx)` pair stored at position `idx`.

        Negative indices count from the end, as in a list.

        Raises:
        IndexError: If `idx` is out of range.
        '''
        total_samples = len(self.samples)
        if idx < 0:
            idx += total_samples
        if not 0 <= idx < total_samples:
            raise IndexError(f'index out of range for a dataset with {total_samples} samples')

        img_path, class_idx = self.samples[idx]
        # The context manager closes the file handle. Converting to RGB loads
        # the pixels and guarantees 3 channels (needed by 3-channel transforms
        # such as Normalize) even for grayscale or RGBA files.
        with Image.open(img_path) as img_file:
            image = img_file.convert('RGB')

        if self.transform is not None:
            image = self.transform(image)
        else:
            image = v2.ToImage()(image)

        return image, class_idx