# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F
# from torchsummary import summary
from tqdm import tqdm
from torch.utils.data import random_split
import matplotlib.pyplot as plt # plotting distribution of classes

from torchvision import models
from torchvision import transforms

import os
from PIL import Image

from collections import OrderedDict

# Defining Class to Load the Data

In [2]:
# Maps each category name found in the training CSV to a 1-based class id.
# The ids follow the order of the tuple below, so the order must not change.
LABEL_DICT = {
    category: class_id
    for class_id, category in enumerate(
        (
            'Sandal',
            'Bottomwear',
            'Shoes',
            'Topwear',
            'Innerwear',
            'Loungewear and Nightwear',
            'Watches',
            'Fragrance',
            'Eyewear',
            'Lips',
            'Bags',
            'Saree',
            'Wallets',
            'Scarves',
            'Jewellery',
            'Dress',
            'Ties',
            'Flip Flops',
            'Headwear',
            'Makeup',
            'Belts',
            'Socks',
            'Nails',
            'Free Gifts',
            'Apparel Set',
            'Cufflinks',
            'Accessories',
        ),
        start=1,
    )
}

In [3]:
class FashionDatasetStacking(Dataset):
    """Fashion dataset yielding ``(image, features, text, label)`` samples.

    Each CSV row describes one product. The categorical columns ``gender``,
    ``baseColour``, ``season`` and ``usage`` are one-hot encoded with
    ``pd.get_dummies``, and the product image is read from
    ``<root_dir>/<id>.jpg``.

    NOTE(review): the dummy columns are derived independently for each CSV,
    so the train and test splits only get the same feature width if both
    files contain the same set of category values -- confirm for new data.
    """

    # Defaults reproduce the Kaggle paths the notebook was written against.
    DEFAULT_TRAIN_CSV = '/kaggle/input/uw-cs480-winter23/train.csv'
    DEFAULT_TEST_CSV = '/kaggle/input/uw-cs480-winter23/test.csv'
    DEFAULT_ROOT_DIR = '/kaggle/input/uw-cs480-winter23/noisy-images/noisy-images'

    def __init__(self, test=False, csv_file=None, root_dir=None, transform=None):
        """
        Arguments:
            test (bool): Load the test split instead of the training split.
                This also changes which columns ``__getitem__`` reads.
            csv_file (string, optional): Path to the csv file with
                annotations. Defaults to the train or test Kaggle CSV,
                depending on ``test``.
            root_dir (string, optional): Directory with all the images.
                Defaults to the Kaggle noisy-images directory.
            transform (callable, optional): Transform applied to each image.
                Defaults to ``ToTensor`` followed by a per-channel
                ``Normalize`` with mean 0.5 and std 0.5.
        """
        self.test = test
        if csv_file is None:
            csv_file = self.DEFAULT_TEST_CSV if self.test else self.DEFAULT_TRAIN_CSV
        self.fashion_frame = pd.read_csv(csv_file)
        self.fashion_frame = pd.get_dummies(
            self.fashion_frame,
            columns=['gender', 'baseColour', 'season', 'usage'],
        )
        self.root_dir = self.DEFAULT_ROOT_DIR if root_dir is None else root_dir
        if transform is None:
            transform = transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
            ])
        self.transform_img = transform

    def __len__(self):
        """Return the number of rows in the annotation frame."""
        return len(self.fashion_frame)

    def __getitem__(self, idx):
        """Return ``(image, features, text, label)`` for row ``idx``.

        For the training split ``label`` is the 1-based class id looked up
        in ``LABEL_DICT``; for the test split it is the value of column 0
        (the same value used to build the image file name).
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Column 0 provides the image file stem.
        img_name = os.path.join(self.root_dir,
                                str(self.fashion_frame.iloc[idx, 0]) + '.jpg')
        image = plt.imread(img_name)

        if self.test:
            # Test rows: column 1 is the text, one-hot features start at 2.
            features = self.fashion_frame.iloc[idx, 2:]
            text = self.fashion_frame.iloc[idx, 1]
            label = self.fashion_frame.iloc[idx, 0]
        else:
            # Train rows: column 1 is the category name, column 2 is the
            # text, and the one-hot features start at column 3.
            features = self.fashion_frame.iloc[idx, 3:]
            text = self.fashion_frame.iloc[idx, 2]
            label = LABEL_DICT[self.fashion_frame.iloc[idx, 1]]
        features = np.array(features).astype('float32')

        # The array is copied before the transform, as in the original code.
        image = self.transform_img(image.copy())

        return image, features, text, label

# Defining the Model

In [4]:
class FashionClassifierLSTM(nn.Module):
    """Three-branch classifier over image, categorical features and text.

    Each branch produces a vector of width 32*20*15; the three vectors are
    concatenated and passed through four fully connected layers that end in
    a 27-way log-softmax.

    The embedding table size is read from the module-level ``vocab_size``,
    which therefore has to be defined before the model is instantiated.
    """

    def __init__(self):
        super().__init__()
        kernel = (5, 3)
        branch_width = 32 * 20 * 15

        # Image branch: two conv/conv/pool stages with 'same' padding.
        self.conv1 = nn.Conv2d(3, 16, kernel, padding='same')
        self.conv2 = nn.Conv2d(16, 16, kernel, padding='same')
        self.conv3 = nn.Conv2d(16, 32, kernel, padding='same')
        self.conv4 = nn.Conv2d(32, 32, kernel, padding='same')
        self.pool = nn.MaxPool2d(2, 2)

        # Text branch: embedding -> 3-layer LSTM -> linear projection.
        self.hidden_size = 1024
        self.embeddings = nn.Embedding(vocab_size, 50, padding_idx=0)
        self.lstm = nn.LSTM(
            50,
            self.hidden_size,
            num_layers=3,
            batch_first=True,
            dropout=0.4,
        )
        self.linearlstm = nn.Linear(self.hidden_size, branch_width)

        # Categorical branch: 62 one-hot inputs projected to the branch width.
        self.ln1 = nn.Linear(62, 512)
        self.ln2 = nn.Linear(512, branch_width)

        # Head applied to the concatenation of the three branches.
        self.fc1 = nn.Linear(branch_width * 3, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, 512)
        self.fc4 = nn.Linear(512, 27)

        # Dropout applied after each pooling stage of the image branch.
        self.dp1 = nn.Dropout(p=0.25)
        self.dp2 = nn.Dropout(p=0.4)

    def forward(self, image, features, text, offsets):
        """Return per-class log-probabilities for a batch.

        ``offsets`` is accepted for call compatibility but is not used.
        NOTE(review): the flattened image branch must have 32*20*15 values
        per sample to match ``fc1``, so ``image`` is presumably 3x80x60 --
        confirm against the data.
        """
        # Image branch.
        maps = F.relu(self.conv2(F.relu(self.conv1(image))))
        maps = self.dp1(self.pool(maps))
        maps = F.relu(self.conv4(F.relu(self.conv3(maps))))
        maps = self.dp2(self.pool(maps))
        image_vec = torch.flatten(maps, 1)

        # Categorical branch (no activation between the two linear layers).
        feature_vec = self.ln2(self.ln1(features))

        # Text branch: project the final hidden state of the top LSTM layer.
        _, (hidden, _) = self.lstm(self.embeddings(text))
        text_vec = self.linearlstm(hidden[-1])

        merged = torch.cat((image_vec, feature_vec, text_vec), 1)
        for dense in (self.fc1, self.fc2, self.fc3):
            merged = F.relu(dense(merged))

        return F.log_softmax(self.fc4(merged), 1)

# Load the Data and Run Experiments on the Model

In [5]:
# Training and test splits share one dataset class; only the flag differs.
fashion_data = FashionDatasetStacking(test=False)
fashion_test_data = FashionDatasetStacking(test=True)

# Prefer CUDA, then Apple MPS, and fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [6]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# torchtext's built-in 'basic_english' tokenizer, used for both building the
# vocabulary and encoding descriptions.
tokenizer = get_tokenizer('basic_english')
# Fixed sequence length: collate_batch_LSTM pads or truncates every encoded
# description to this many token ids.
max_length = 14

def yield_tokens(data_iter):
    """Yield the token list of each sample's text in ``data_iter``.

    Every item of ``data_iter`` must unpack into four values; only the third
    (the text) is used.
    """
    for sample in data_iter:
        _image, _features, description, _label = sample
        yield tokenizer(description)

# Build the vocabulary from the training descriptions. Tokens not in the
# vocabulary resolve to the "<unk>" entry through the default index.
# NOTE(review): "<unk>" is the only special token and presumably receives
# index 0, the same value used for padding in collate_batch_LSTM and as
# padding_idx in the embedding -- confirm this overlap is intended.
vocab = build_vocab_from_iterator(
    yield_tokens(fashion_data),
    specials=["<unk>"],
)
vocab.set_default_index(vocab["<unk>"])


def text_pipeline(x):
    """Tokenize ``x`` and map the tokens to vocabulary indices."""
    return vocab(tokenizer(x))


def label_pipeline(x):
    """Convert a 1-based class id to the 0-based index used by the loss."""
    return int(x) - 1

def collate_batch_LSTM(batch, testing=False):
    """Collate dataset samples into batched tensors for the LSTM model.

    Arguments:
        batch: iterable of ``(image, features, text, label)`` samples.
        testing: when equal to ``True`` the labels are kept as they are;
            otherwise they go through ``label_pipeline``.

    Returns:
        ``(images, features, token_ids, labels, offsets)``. ``token_ids`` has
        one row of ``max_length`` int64 ids per sample, zero-padded or
        truncated. ``offsets`` holds the cumulative start position of each
        sample's untruncated token sequence.
    """
    images, feature_rows, token_rows, targets = [], [], [], []
    lengths = [0]

    for image, features, _text, _label in batch:
        # The equality test (rather than truthiness) is kept deliberately so
        # that non-bool arguments behave exactly as before.
        targets.append(_label if testing == True else label_pipeline(_label))

        token_ids = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        kept = min(max_length, len(token_ids))
        padded = torch.zeros(max_length, dtype=torch.int64)
        padded[:kept] = token_ids[:kept]

        token_rows.append(padded)
        lengths.append(token_ids.size(0))
        images.append(image)
        feature_rows.append(features)

    label_list = torch.tensor(targets, dtype=torch.int64)
    image_list = torch.utils.data.default_collate(images)
    feature_list = torch.utils.data.default_collate(feature_rows)
    # Dropping the last length before the cumulative sum turns the lengths
    # into start positions.
    offsets = torch.tensor(lengths[:-1]).cumsum(dim=0)
    text_list = torch.utils.data.default_collate(token_rows)
    return image_list, feature_list, text_list, label_list, offsets

In [7]:
def train(dataloader, model, loss_fn, optimizer):
    """Train ``model`` for one pass over ``dataloader``.

    Returns:
        ``(accuracy, average_train_loss)``: the fraction of correctly
        classified samples and the mean of the per-batch losses. Both are
        measured on the predictions made before each optimizer step.
    """
    sample_count = len(dataloader.dataset)
    batch_count = len(dataloader)
    model.train()

    running_loss = 0
    hits = 0
    for batch_tensors in dataloader:
        image, features, text, label, offsets = (
            tensor.to(device) for tensor in batch_tensors
        )

        pred = model(image, features, text, offsets)
        loss = loss_fn(pred, label)
        running_loss += loss.item()
        hits += (pred.argmax(1) == label).type(torch.float).sum().item()

        # Backpropagation, with the gradient norm clipped to 5.
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 5)
        optimizer.step()

    average_train_loss = running_loss / batch_count
    accuracy = hits / sample_count
    return accuracy, average_train_loss

In [8]:
def test(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Returns:
        ``(accuracy, average_test_loss)``: the fraction of correctly
        classified samples and the mean of the per-batch losses.
    """
    sample_count = len(dataloader.dataset)
    batch_count = len(dataloader)
    model.eval()

    running_loss = 0
    hits = 0
    with torch.no_grad():
        for batch_tensors in dataloader:
            image, features, text, label, offsets = (
                tensor.to(device) for tensor in batch_tensors
            )
            pred = model(image, features, text, offsets)
            running_loss += loss_fn(pred, label).item()
            hits += (pred.argmax(1) == label).type(torch.float).sum().item()

    average_test_loss = running_loss / batch_count
    accuracy = hits / sample_count
    return accuracy, average_test_loss

In [9]:
def experiment(model, optimizer, train_dataloader, test_dataloader, loss_fn, epochs=10):
    """Alternate one training epoch and one evaluation pass ``epochs`` times.

    A summary line is printed after each epoch.

    Returns:
        ``(train_accuracies, test_accuracies)``: two lists with one accuracy
        per epoch.
    """
    train_history = []
    test_history = []

    for t in tqdm(range(epochs)):
        train_accuracy, average_train_loss = train(
            train_dataloader, model, loss_fn, optimizer
        )
        train_history.append(train_accuracy)

        test_accuracy, average_test_loss = test(test_dataloader, model, loss_fn)
        test_history.append(test_accuracy)

        print(f"Epoch {t+1}:\t Train accuracy: {100*train_accuracy:0.1f}%\t Avg train loss: {average_train_loss:>6f}\t Test accuracy: {100*test_accuracy:0.1f}%\t Avg test loss: {average_test_loss:>6f}")

    return train_history, test_history

In [10]:
# Despite its name, this value is passed to DataLoader as the batch size.
num_batches = 512

# Training batches are shuffled and their labels converted by the collate fn.
train_loader = DataLoader(
    fashion_data,
    batch_size=num_batches,
    shuffle=True,
    collate_fn=lambda samples: collate_batch_LSTM(samples, testing=False),
)
# validate_loader = DataLoader(validate_set, shuffle=True, batch_size=num_batches, collate_fn=lambda x: collate_batch_LSTM(x, testing=False))
# Test batches keep file order and their labels are left untouched.
test_loader = DataLoader(
    fashion_test_data,
    batch_size=num_batches,
    shuffle=False,
    collate_fn=lambda samples: collate_batch_LSTM(samples, testing=True),
)

loss_fn = nn.NLLLoss()
# Read by FashionClassifierLSTM.__init__ to size the embedding table.
vocab_size = len(vocab)

In [11]:
from tqdm import tqdm
# batch size was 512

# Train one model on the full training set; no validation pass is run here.
model = FashionClassifierLSTM().to(device)

loss_fn = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters())
epochs = 40

epoch_bar = tqdm(range(epochs))
for t in epoch_bar:
    train_accuracy, average_train_loss = train(
        train_loader, model, loss_fn, optimizer
    )
    print(f"Epoch {t+1}:\t Train accuracy: {100*train_accuracy:0.1f}%")


  2%|▎         | 1/40 [09:16<6:01:43, 556.49s/it]

Epoch 1:	 Train accuracy: 43.6%


  5%|▌         | 2/40 [18:32<5:52:17, 556.24s/it]

Epoch 2:	 Train accuracy: 70.3%


  8%|▊         | 3/40 [27:51<5:43:43, 557.39s/it]

Epoch 3:	 Train accuracy: 79.5%


 10%|█         | 4/40 [37:08<5:34:19, 557.21s/it]

Epoch 4:	 Train accuracy: 84.0%


 12%|█▎        | 5/40 [46:32<5:26:29, 559.69s/it]

Epoch 5:	 Train accuracy: 86.7%


 15%|█▌        | 6/40 [55:51<5:17:02, 559.48s/it]

Epoch 6:	 Train accuracy: 88.9%


 18%|█▊        | 7/40 [1:05:00<5:05:46, 555.94s/it]

Epoch 7:	 Train accuracy: 90.4%


 20%|██        | 8/40 [1:14:13<4:56:08, 555.27s/it]

Epoch 8:	 Train accuracy: 92.3%


 22%|██▎       | 9/40 [1:23:30<4:47:08, 555.75s/it]

Epoch 9:	 Train accuracy: 92.7%


 25%|██▌       | 10/40 [1:32:51<4:38:37, 557.26s/it]

Epoch 10:	 Train accuracy: 94.4%


 28%|██▊       | 11/40 [1:42:17<4:30:40, 560.01s/it]

Epoch 11:	 Train accuracy: 95.4%


 30%|███       | 12/40 [1:51:52<4:23:26, 564.53s/it]

Epoch 12:	 Train accuracy: 95.8%


 32%|███▎      | 13/40 [2:01:22<4:14:46, 566.16s/it]

Epoch 13:	 Train accuracy: 96.7%


 35%|███▌      | 14/40 [2:10:48<4:05:17, 566.05s/it]

Epoch 14:	 Train accuracy: 97.3%


 38%|███▊      | 15/40 [2:20:08<3:55:04, 564.18s/it]

Epoch 15:	 Train accuracy: 97.4%


 40%|████      | 16/40 [2:29:31<3:45:32, 563.86s/it]

Epoch 16:	 Train accuracy: 97.2%


 42%|████▎     | 17/40 [2:38:54<3:36:07, 563.81s/it]

Epoch 17:	 Train accuracy: 97.8%


 45%|████▌     | 18/40 [2:48:13<3:26:10, 562.27s/it]

Epoch 18:	 Train accuracy: 98.0%


 48%|████▊     | 19/40 [2:57:36<3:16:51, 562.44s/it]

Epoch 19:	 Train accuracy: 98.1%


 50%|█████     | 20/40 [3:07:01<3:07:42, 563.13s/it]

Epoch 20:	 Train accuracy: 98.4%


 52%|█████▎    | 21/40 [3:16:23<2:58:15, 562.94s/it]

Epoch 21:	 Train accuracy: 98.5%


 55%|█████▌    | 22/40 [3:25:41<2:48:28, 561.57s/it]

Epoch 22:	 Train accuracy: 98.6%


 57%|█████▊    | 23/40 [3:35:04<2:39:13, 561.97s/it]

Epoch 23:	 Train accuracy: 99.1%


 60%|██████    | 24/40 [3:44:24<2:29:40, 561.27s/it]

Epoch 24:	 Train accuracy: 99.1%


 62%|██████▎   | 25/40 [3:53:41<2:19:58, 559.92s/it]

Epoch 25:	 Train accuracy: 98.9%


 65%|██████▌   | 26/40 [4:03:05<2:10:55, 561.11s/it]

Epoch 26:	 Train accuracy: 98.9%


 68%|██████▊   | 27/40 [4:12:26<2:01:33, 561.04s/it]

Epoch 27:	 Train accuracy: 98.6%


 70%|███████   | 28/40 [4:21:51<1:52:28, 562.33s/it]

Epoch 28:	 Train accuracy: 99.1%


 72%|███████▎  | 29/40 [4:31:13<1:43:03, 562.13s/it]

Epoch 29:	 Train accuracy: 99.4%


 75%|███████▌  | 30/40 [4:40:38<1:33:51, 563.13s/it]

Epoch 30:	 Train accuracy: 99.3%


 78%|███████▊  | 31/40 [4:49:57<1:24:16, 561.79s/it]

Epoch 31:	 Train accuracy: 99.3%


 80%|████████  | 32/40 [4:59:09<1:14:32, 559.06s/it]

Epoch 32:	 Train accuracy: 99.4%


 82%|████████▎ | 33/40 [5:08:26<1:05:08, 558.31s/it]

Epoch 33:	 Train accuracy: 99.2%


 85%|████████▌ | 34/40 [5:17:43<55:47, 557.90s/it]  

Epoch 34:	 Train accuracy: 99.2%


 88%|████████▊ | 35/40 [5:27:03<46:32, 558.46s/it]

Epoch 35:	 Train accuracy: 99.3%


 90%|█████████ | 36/40 [5:36:32<37:26, 561.65s/it]

Epoch 36:	 Train accuracy: 99.4%


 92%|█████████▎| 37/40 [5:45:55<28:06, 562.12s/it]

Epoch 37:	 Train accuracy: 99.4%


 95%|█████████▌| 38/40 [5:55:13<18:41, 560.96s/it]

Epoch 38:	 Train accuracy: 99.4%


 98%|█████████▊| 39/40 [6:04:35<09:21, 561.21s/it]

Epoch 39:	 Train accuracy: 99.2%


100%|██████████| 40/40 [6:13:51<00:00, 560.78s/it]

Epoch 40:	 Train accuracy: 99.2%





In [12]:
# Persist only the model's parameters (its state_dict) to the working directory.
torch.save(model.state_dict(), 'lstm_whole_model.pt')

In [13]:
# model = FashionClassifierLSTM().to(device)
# model.load_state_dict(torch.load('/kaggle/input/lstm-model/lstm_final_model.pt'))
# model.to(device)

In [14]:
# model = FashionClassifierLSTM().to(device)
# optimizer = torch.optim.Adam(model.parameters())
# loss_fn = nn.NLLLoss()
# all_train_accuracies, all_test_accuracies = experiment(model, optimizer, train_loader, validate_loader, loss_fn, epochs=50)

First submission was 5 models
Second was 100 models

In [15]:
# Predict a category for every test item and write the submission file.
model.eval()

# Model outputs are 0-based class indices, while LABEL_DICT stores 1-based
# ids. Inverting the mapping once avoids depending on dict insertion order
# and avoids rebuilding the key list for every row.
index_to_category = {class_id - 1: name for name, class_id in LABEL_DICT.items()}

batch_frames = []
with torch.no_grad():
    for batch_idx, (image, features, text, label, offsets) in enumerate(test_loader):
        # `label` carries the item ids for the test split and stays on the CPU.
        image, features, text, offsets = image.to(device), features.to(device), text.to(device), offsets.to(device)

        # A single forward pass is enough: in eval mode dropout is disabled,
        # so the previous sum over 20 passes added the same log-probabilities
        # 20 times and could not change the argmax.
        pred = model(image, features, text, offsets)
        final_pred = pred.argmax(1)

        batch_frames.append(pd.DataFrame({
            'id': label.tolist(),
            'category': final_pred.cpu().tolist(),
        }))
        print('Batch {} Done'.format(batch_idx+1))

# DataFrame.append was removed in pandas 2.0; concatenate once instead.
if batch_frames:
    result_pred = pd.concat(batch_frames, ignore_index=True)
else:
    result_pred = pd.DataFrame(columns=['id', 'category'])

result_pred['category'] = result_pred['category'].map(index_to_category)
result_pred.to_csv('results.csv', index=False)



Batch 1 Done
Batch 2 Done
Batch 3 Done
Batch 4 Done
Batch 5 Done
Batch 6 Done
Batch 7 Done
Batch 8 Done
Batch 9 Done
Batch 10 Done
Batch 11 Done
Batch 12 Done
Batch 13 Done
Batch 14 Done
Batch 15 Done
Batch 16 Done
Batch 17 Done
Batch 18 Done
Batch 19 Done
Batch 20 Done
Batch 21 Done
Batch 22 Done
Batch 23 Done
Batch 24 Done
Batch 25 Done
Batch 26 Done
Batch 27 Done
Batch 28 Done
Batch 29 Done
Batch 30 Done
Batch 31 Done
Batch 32 Done
Batch 33 Done
Batch 34 Done
Batch 35 Done
Batch 36 Done
Batch 37 Done
Batch 38 Done
Batch 39 Done
Batch 40 Done
Batch 41 Done
Batch 42 Done
Batch 43 Done


# Bootstrapping the models

In [16]:
# fashion_data = FashionDatasetStacking()
# fashion_test_data = FashionDatasetStacking(test=True)

# device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
# print(f"Using {device} device")

In [17]:
# batch_size = 512

# test_loader = DataLoader(fashion_test_data, shuffle=False, batch_size=batch_size, collate_fn=lambda x: collate_batch_LSTM(x, testing=True))
# num_models = 5
# num_epochs = 100

In [18]:
# from tqdm import tqdm

# model_list = []
# for i in range(num_models):
#     model = FashionClassifierLSTM().to(device)
#     model_list.append(model)

# for i in range(num_models):
#     model = model_list[i]
#     generator1 = torch.Generator()
#     optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
#     loss_fn = nn.NLLLoss()
#     train_loader = DataLoader(fashion_data, batch_size=batch_size
#                           , collate_fn=lambda x: collate_batch_LSTM(x, testing=False)
#                           , sampler=torch.utils.data.RandomSampler(fashion_data, replacement=True, num_samples=len(fashion_data), generator=None))
#     for t in tqdm(range(num_epochs)):
#         # train
#         train_accuracy, average_train_loss = train(train_loader, model, loss_fn, optimizer)
#         print(f"Epoch {t+1}:\t Train accuracy: {100*train_accuracy:0.1f}%\t Avg train loss: {average_train_loss:>6f}")




In [19]:
# for i in range(num_models):
#     model = model_list[i]
#     torch.save(model.state_dict(), f"model{i+1}.pt")

In [20]:
# for i in range(num_models):
#     model_list[i].eval()
# result_pred = pd.DataFrame(columns=['id', 'category'])

# with torch.no_grad():
#     for batch_idx, (image, features, text, label, offsets) in enumerate(test_loader):
#         intermed_pred = pd.DataFrame(columns=['id', 'category'])
#         image, features, text, offsets = image.to(device), features.to(device), text.to(device), offsets.to(device)
        
#         pred = torch.zeros([features.shape[0],27]).to(device)
#         for i in range(num_models):
#             pred += model(image, features, text, offsets)
        
#         final_pred = pred.argmax(1)
        
#         intermed_pred['id'] = label
#         intermed_pred['category'] = final_pred.cpu()
#         result_pred = result_pred.append(intermed_pred)
#         print('Batch {} Done'.format(batch_idx+1))

# result_pred['category'] = result_pred['category'].map(lambda i: list(LABEL_DICT.keys())[i])
# result_pred.to_csv('results.csv', index=False)



# Written Report

* <b>Categorical attributes</b>: The one-hot encoded categorical attributes are combined using two linear layers (`ln1` and `ln2` in the model above) before being merged with the other inputs. 
* <b>Noisy text description</b>: For text classification, an LSTM was used. LSTM units are more expressive and can control the content of their memory by utilizing gates. Before the LSTM is applied, each description is tokenized and mapped to a sequence of integer vocabulary indices, which `nn.Embedding` then converts into dense vectors, making classification easier. Additionally, since the longest text description had 14 words, every sequence is padded (or truncated) to a length of 14.
* <b>Image Classification</b>: To take advantage of the images for classification, convolutional layers with 5x3 kernels and a large number of channels were implemented. This performed well because the greater the number of channels, the greater the capacity of the network. Additionally, a 5x3 kernel was used since a smaller kernel has fewer parameters, which improves the generalization of the model. Moreover, dropout has been applied twice since it helps achieve better generalization, even though it makes the model learn more slowly.
* <b>Ensemble learning</b>: Two ensemble techniques were used: stacking and bagging. For stacking, the outputs of the sub-models that extract features from the categorical attributes, the text description and the images were concatenated and then passed through several fully connected layers to extract important features from the combined representation. For bagging, 5 models were trained on separate data sets obtained by repeatedly sampling, with replacement, from the training data. The vectors of log-probabilities output by the models were then summed, and the prediction for each item is the class with the largest total.