# Imports

## Python Libraries

In [298]:
import numpy as np
import os
import pandas as pd
import pickle
import torch
import torch.nn.functional as F

from PIL import Image
from torch.utils.data import DataLoader,Dataset
from torch.utils.tensorboard import SummaryWriter
from torchvision import transforms



## Importing the datasets

In [299]:
dataset = pd.read_csv('Datasets/Products.csv', lineterminator='\n')

# Cleaning the Dataset

## Cleaning the table

In [300]:
dataset.count(), dataset.dropna().count(), dataset["price"].map(lambda x: x[0]).unique(), dataset["price"].map(lambda x: x[-3:]).unique()

(Unnamed: 0             7156
 id                     7156
 product_name           7156
 category               7156
 product_description    7156
 price                  7156
 location               7156
 dtype: int64,
 Unnamed: 0             7156
 id                     7156
 product_name           7156
 category               7156
 product_description    7156
 price                  7156
 location               7156
 dtype: int64,
 array(['£'], dtype=object),
 array(['.00', '.99', '.78', '.01', '.97', '.25', '.50', '.20', '.90',
        '.80', '.60', '.23', '.05', '.75', '.56', '.40', '.44', '.95',
        '.66', '.35', '.85', '.30', '.45', '.16', '.69', '.49', '.55',
        '.09', '.11'], dtype=object))

In [301]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7156 entries, 0 to 7155
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Unnamed: 0           7156 non-null   int64 
 1   id                   7156 non-null   object
 2   product_name         7156 non-null   object
 3   category             7156 non-null   object
 4   product_description  7156 non-null   object
 5   price                7156 non-null   object
 6   location             7156 non-null   object
dtypes: int64(1), object(6)
memory usage: 391.5+ KB


# So the dataset seems to already be clean

Now we convert the price column

In [302]:
dataset_cleaned = dataset

def remove_pound_sign(string_to_replace) -> str:
    return 

dataset_cleaned['price'] = dataset_cleaned['price'].map(lambda x : x.replace('£', '')) #Removes the pound signs
dataset_cleaned['price'] = dataset_cleaned['price'].map(lambda x: x.replace(',', '')) #Removes commas

dataset_cleaned['price'] = pd.to_numeric(dataset_cleaned['price'])

# Now we extract the root category from each item

In [303]:
dataset_cleaned['category'] = dataset_cleaned['category'].map(lambda x: x.split(' /' )[0])

In [304]:
dataset_cleaned['category'].unique()

array(['Home & Garden', 'Baby & Kids Stuff', 'DIY Tools & Materials',
       'Music, Films, Books & Games', 'Phones, Mobile Phones & Telecoms',
       'Clothes, Footwear & Accessories', 'Other Goods',
       'Health & Beauty', 'Sports, Leisure & Travel', 'Appliances',
       'Computers & Software', 'Office Furniture & Equipment',
       'Video Games & Consoles'], dtype=object)

Now we create an encoder to go to and from the categories and integers

In [306]:
list_of_categories = list(dataset_cleaned['category'].unique())

encoder = {x: list_of_categories.index(x) for x in list_of_categories}

decoder = {list_of_categories.index(x):x for x in list_of_categories}

encoder, decoder

#Let's save these to pickle files:

with open("encoder_pickle", 'wb') as encody:
    pickle.dump(encoder, encody)

with open("decoder_pickle", 'wb') as decody:
    pickle.dump(decoder, decody)

# Now we merge this with the original table:

In [242]:
dataset_cleaned['root_category'] = dataset_cleaned['category']
dataset_cleaned['root_category_index'] = dataset_cleaned['category'].map(lambda x:encoder[x])

# Next we merge this with the images table:

In [243]:
images_dataset = pd.read_csv('Datasets/Images.csv')

dataset_cleaned["merge_column"] = dataset_cleaned['id']
images_dataset['merge_column'] = images_dataset['product_id']

dataset_cleaned.dtypes, images_dataset.dtypes

(Unnamed: 0               int64
 id                      object
 product_name            object
 category                object
 product_description     object
 price                  float64
 location                object
 root_category           object
 root_category_index      int64
 merge_column            object
 dtype: object,
 Unnamed: 0       int64
 id              object
 product_id      object
 merge_column    object
 dtype: object)

In [244]:
merged_df = images_dataset.merge(dataset_cleaned, on='merge_column')

In [245]:
merged_df = merged_df[["id_x", "product_id", "root_category", "root_category_index"]]
merged_df = merged_df.rename(columns={'id_x':'id'})
merged_df = merged_df.drop(columns=['product_id'])

merged_df.dtypes

id                     object
root_category          object
root_category_index     int64
dtype: object

In [246]:
merged_df.to_csv('Datasets/training_data.csv', index=False)

## Clean Image Dataset:

In [287]:
def resize_image(final_size, im):
    size = im.size
    ratio = float(final_size) / max(size)
    new_image_size = tuple([int(x*ratio) for x in size])
    im = im.resize(new_image_size)
    new_im = Image.new("RGB", (final_size, final_size))
    new_im.paste(im, ((final_size-new_image_size[0])//2, (final_size-new_image_size[1])//2))
    return new_im

def clean_images(path_to_extract = "Datasets/images/", path_to_save = "Datasets/cleaned_images/"):
    dirs = os.listdir(path_to_extract)
    final_size = 224
    for n, item in enumerate(dirs, 1):
        #print(n, item)
        im = Image.open(path_to_extract + item)
        #print(im.width, im.height)
        new_im = resize_image(final_size, im)
        #print(new_im.width, new_im.height)
        new_im.save(path_to_save + item)


In [288]:
#clean_images(path_to_save="Datasets/cleaned_images_224/")

# Working with the Model

## Casting into a PyTorch dataset

In [332]:
class ImageDataset(Dataset):
    def __init__(self, df_of_keys:pd.DataFrame, folder_of_images:str) -> None:
        super().__init__()

        # Set the labels to be the column 'root_cotegory_index' of df_of_keys
        self.labels = torch.tensor(df_of_keys['root_category_index'])

        # Assings image_paths to the file name from the column 'id' of df_of_keys and maps it to it's path relative to the project root folder
        self.image_paths = df_of_keys['id'].map(lambda x : folder_of_images + x + '.jpg')

        # This will be used later to turn images from self.image_paths into tensors
        self.image_transformer = transforms.PILToTensor()
    
    def __getitem__(self, index):
        
        # Opens the image at index with using PIL.Image
        with Image.open(self.image_paths[index]) as img:

            # Sets the feature to be a tensor obtained by applying PILToTensor to the relevant image
            X = self.image_transformer(img)
        
        # y is the label from self.labels
        y = torch.tensor(self.labels[index])

        #print(X.shape, y.shape)
        return (X, y)
    
    def __len__(self):
        return len(self.labels)

In [341]:
my_dataset = ImageDataset(merged_df, 'Datasets/cleaned_images_224/')

# A quick sanity check of the dataset:
my_dataset[1][0].shape, my_dataset[1][1].shape, len(my_dataset)

  y = torch.tensor(self.labels[index])


(torch.Size([3, 224, 224]), torch.Size([]), 12604)

## Splitting the dataset:

In [342]:
train_dataset, test_dataset = torch.utils.data.random_split(
    my_dataset, [10000, len(my_dataset) - 10000]
)

train_dataset, validation_dataset = torch.utils.data.random_split(
    train_dataset, [8000, 2000]
)

train_loader = DataLoader(train_dataset, batch_size=8)
validation_loader = DataLoader(validation_dataset, batch_size=len(validation_dataset))
test_loader = DataLoader(test_dataset, batch_size=len(test_dataset))

## Defining the model

In [343]:
# Sets the device for PyTorch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Loads the resnet50 model
resnet50 = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', model = 'nvidia_resnet50', pretrained = True )
# Don't know what this does
utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_convnets_processing_utils')
# Again don't know what this does
resnet50.eval().to(device)

Using cache found in /home/ibs/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub
Using cache found in /home/ibs/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layers): Sequential(
    (0): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
          (0): Conv2d

## Examining the model

Let's print out the layers of resnet50:

In [344]:
resnet50.modules

<bound method Module.modules of ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layers): Sequential(
    (0): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): S

- The final layer, `fc` takes in 2048 features and outputs 1000 features. The number of labels we need to classify, i.e. the number of distinct values in `merged_df['root_categories']` is 13.

- So we need to modify the `fc` layer so that it has 13 outputs. We do this by changin `fc` to `torch.nn.Linear(2048, 13)`.

In [345]:
class ImageClassifier(torch.nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.resnet50 = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_resnet50', pretrained=True)
        self.resnet50.fc = torch.nn.Linear(2048, 13)
        #print(resnet50.modules)
    
    def forward(self, X):
        return F.softmax(self.resnet50(X.float()))

In [346]:
model = ImageClassifier()

Using cache found in /home/ibs/.cache/torch/hub/NVIDIA_DeepLearningExamples_torchhub


In [365]:
def train(model, train_loader, validation_loader, epochs = 10, learning_rate =0.01, stop_point:int = 0):

    optimiser = torch.optim.SGD(model.parameters(), lr=learning_rate) #Automatically updates the parameters of model, works due to inheritacne of torch.nn.module
    writer = SummaryWriter()
    batch_index = 0

    for epoch in range(epochs):
        for batch in train_loader:
            features, labels = batch
            predictions = model(features)
            labels = labels
            loss = F.cross_entropy(predictions, labels)
            loss.backward() #Creates grad attributes
            print(f"Epoch: {epoch}, batch index: {batch_index}, loss:{loss.item()}")

            optimiser.step() #Optimiser looks through the optimiser for parameters and updates them using grad attribute
            optimiser.zero_grad() #Without this, loss.backwards will add to the new gradients, so things will be messed up
            writer.add_scalar('loss', loss.item(), batch_index)

            batch_index += 1
            if batch_index == stop_point:
                break
        if batch_index == stop_point:
                break
        # Print the validation loss
        validation_predictions = model(next(iter(validation_loader))[0])
        validation_labels = next(iter(validation_loader))[1]
        validation_loss = F.cross_entropy(validation_predictions, validation_labels)
        print(f"Epoch {epoch}, validation loss{validation_loss}")
        



In [366]:
train(model, train_loader, validation_loader, stop_point=5)

  y = torch.tensor(self.labels[index])


In [360]:
next(iter(validation_loader))[0]

  y = torch.tensor(self.labels[index])


tensor([[[[0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          ...,
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0]],

         [[0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          ...,
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0]],

         [[0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          ...,
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0]]],


        [[[0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          ...,
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0, 0]],

         [[0, 0, 0,  ..., 0, 0, 0],
          [0, 0, 0,  ..., 0, 0