In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import torch
import pandas as pd
from PIL import ImageDraw, ImageFont, Image
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torch
import torch.nn as nn
from transformers import ViTModel
from torchinfo import summary  # 
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")
import random
import time

In [2]:
DEVICE="cuda:0"
def setAllSeeds(seed):
  os.environ['MY_GLOBAL_SEED'] = str(seed)
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
setAllSeeds(42)

In [13]:
df = pd.read_csv("train.csv")
categories=df["Category"].unique()
print(categories)
category=categories[1]
df = df[df["Category"]==category]

['Men Tshirts' 'Sarees' 'Kurtis' 'Women Tshirts' 'Women Tops & Tunics']


In [14]:
delCol = []
idxCol = []
trackNum = []
for i in range(1,11):
    uniName = df["attr_"+str(i)].unique()
    # print(len(uniName))
    if(len(uniName)==1):
        delCol.append("attr_"+str(i))
    else:
        idxCol.append("attr_"+str(i))
        trackNum.append(len(uniName))

In [16]:
df = df.drop(delCol,axis=1)
df.shape

(7267, 8)

In [17]:
df.isna().sum()

(4575, 8)

(2237, 13)

In [9]:
# df=df[0:100]

In [10]:
id2label={}
label2id={}
attrs={}
total_attr=len(df.columns)
for i in range(3,total_attr):
    labels=df[df.columns[i]].dropna().unique()
    # print(df.columns[i],labels)
    id2label[i-3]={k:labels[k] for k in range(len(labels))}
    label2id[i-3]={labels[k]:k for k in range(len(labels))}
    attrs[i-3]=df.columns[i]
print(id2label)
print(label2id)
print(attrs)

{0: {0: 'same as saree', 1: 'NA', 2: 'solid', 3: 'same as border', 4: 'default'}, 1: {0: 'woven design', 1: 'zari', 2: 'no border', 3: 'solid', 4: 'default', 5: 'temple border', 6: 'NA'}, 2: {0: 'small border', 1: 'big border', 2: 'NA', 3: 'no border'}, 3: {0: 'multicolor', 1: 'cream', 2: 'white', 3: 'default', 4: 'NA', 5: 'navy blue', 6: 'yellow', 7: 'green', 8: 'pink'}, 4: {0: 'party', 1: 'traditional', 2: 'daily', 3: 'NA', 4: 'wedding'}, 5: {0: 'jacquard', 1: 'NA', 2: 'default', 3: 'tassels and latkans'}, 6: {0: 'woven design', 1: 'NA', 2: 'same as saree', 3: 'default', 4: 'zari woven'}, 7: {0: 'zari woven', 1: 'NA', 2: 'woven design', 3: 'default', 4: 'solid', 5: 'printed'}, 8: {0: 'applique', 1: 'elephant', 2: 'floral', 3: 'ethnic motif', 4: 'NA', 5: 'peacock', 6: 'default', 7: 'solid', 8: 'checked', 9: 'botanical'}, 9: {0: 'no', 1: 'yes', 2: 'NA'}}
{0: {'same as saree': 0, 'NA': 1, 'solid': 2, 'same as border': 3, 'default': 4}, 1: {'woven design': 0, 'zari': 1, 'no border': 2, '

In [11]:
def categorize(example):
    for i in attrs:
        # print(example[attrs[i]],type(example[attrs[i]]),pd.isna(example[attrs[i]]))
        if not pd.isna(example[attrs[i]]):
            example[attrs[i]]=label2id[i][example[attrs[i]]]
        else:
            example[attrs[i]]=-100
    return example
df=df.apply(categorize,axis=1)
df.head()
    

In [12]:
from transformers import ViTImageProcessor
model_name = 'google/vit-base-patch16-224'
processor = ViTImageProcessor.from_pretrained(model_name)

In [13]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.3)
val_df,test_df=train_test_split(val_df,test_size=0.33)

In [14]:
class CustomFashionManager(Dataset):
    def __init__(self,csv_file, root_dir="./",transforms =None):
        self.fashionItems = csv_file
        self.root_dir = root_dir
        self.transforms = transforms
    
    def __len__(self):
        return len(self.fashionItems)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = os.path.join(self.root_dir,f"{self.fashionItems.iloc[idx, 0]:06d}"+'.jpg')
        image = Image.open(img_name)
        attributes = self.fashionItems.iloc[idx, 3:]
        attributes = np.array(attributes)
        attributes = attributes.astype('float')
        # print(attributes.shape)
        # attributes = attributes.astype('float').reshape(-1, len(attributes))
        # swap color axis because
        # numpy image: H x W x C
        # torch image: C X H X W
        inputs=processor(image, return_tensors='pt')
        inputs['labels']=torch.tensor(attributes, dtype=torch.long)
        return inputs

        # if self.transforms:
        #     sample = self.transforms(sample)

        # return sample


In [15]:
train_fashion_data = CustomFashionManager(csv_file=train_df,
                                    root_dir='train_images')
val_fashion_data = CustomFashionManager(csv_file=val_df,
                                    root_dir='train_images')
test_fashion_data = CustomFashionManager(csv_file=test_df,root_dir='train_images')

fig = plt.figure()
        

<Figure size 640x480 with 0 Axes>

In [16]:
import sys
from typing import List
from transformers import ViTConfig,ViTPreTrainedModel,DeiTConfig,DeiTPreTrainedModel,DeiTModel


class CustomConfig(DeiTConfig):
    def __init__(self,num_classes_per_label:List[int]=[1],**kwargs):
        super().__init__(**kwargs)
        self.num_classes_per_label = num_classes_per_label

class MultiLabelMultiClassViT(DeiTPreTrainedModel):
    config_class=CustomConfig
    def __init__(self, config: CustomConfig) -> None:
        super().__init__(config)

        self.vit = DeiTModel(config, add_pooling_layer=False)
        # for param in self.vit.parameters():
        #     param.requires_grad = False
        self.classifiers = nn.ModuleList([
            nn.Linear(config.hidden_size, num_classes) 
            for num_classes in config.num_classes_per_label
        ])
        # Initialize weights and apply final processing
        self.post_init()


    def forward(self, pixel_values,labels=None):
        outputs = self.vit(pixel_values).last_hidden_state[:, 0, :]  # CLS token representation
        logits = [classifier(outputs) for classifier in self.classifiers]
        if labels is not None:
            loss=0
            for i in range(len(logits)):
                target=labels[:,i]
                loss += torch.nn.functional.cross_entropy(logits[i], target)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}

# Example usage
num_labels = len(trackNum)  # For example, 5 different labels



In [17]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import classification_report
batch_size = 32
def collate_fn(batch):
    return {
        'pixel_values': torch.cat([x['pixel_values'] for x in batch],dim=0),
        'labels': torch.stack([x['labels'] for x in batch])
    }

def compute_metrics(pred):
    logits = pred.predictions
    labels=pred.label_ids
    probs = np.stack([np.argmax(logit,axis=1) for logit in logits])
    probs=probs.T
    truth_labels=[]
    preds=[]
    for i in range(len(probs)):
        pred=[]
        true=[]
        for j in range(len(probs[i])):
            pred.append(id2label[j][probs[i][j]])
            if labels[i][j]==-100:
                true.append(-100)
            else:
                true.append(id2label[j][labels[i][j]])
        preds.append(pred)
        truth_labels.append(true)

    preds=np.array(preds)
    truth_labels=np.array(truth_labels)

    labels=truth_labels.flatten()
    probs=preds.flatten()

    non_padding_indices = [i for i, label in enumerate(labels) if label != '-100']

# Use the filtered indices to get non-padding true and predicted labels
    labels = [labels[i] for i in non_padding_indices]
    probs = [probs[i] for i in non_padding_indices]

    print(classification_report(labels,probs))
    report=classification_report(labels,probs,output_dict=True)
    return {'accuracy': report['accuracy'],"macro avg f1":report['macro avg']['f1-score']}

training_args = TrainingArguments(
  output_dir="./vit3/"+category,
  per_device_train_batch_size=64,
  per_device_eval_batch_size=64,
  evaluation_strategy="epoch",
  save_strategy="epoch",
  logging_strategy="epoch",
  num_train_epochs=5,
  fp16=True,
  learning_rate=1e-4,
  save_total_limit=1,
  remove_unused_columns=False,
  push_to_hub=False,
  report_to='wandb',
  load_best_model_at_end=True,
  metric_for_best_model="macro avg f1"
)
config=ViTConfig.from_pretrained(model_name)
config=CustomConfig(num_classes_per_label=trackNum,**config.to_dict())
model = MultiLabelMultiClassViT.from_pretrained(model_name,config=config)

trainer = Trainer(
    model,
    training_args,
    train_dataset=train_fashion_data,
    eval_dataset=val_fashion_data,
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    tokenizer=processor,
)

Some weights of MultiLabelMultiClassViT were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['classifiers.0.bias', 'classifiers.0.weight', 'classifiers.1.bias', 'classifiers.1.weight', 'classifiers.2.bias', 'classifiers.2.weight', 'classifiers.3.bias', 'classifiers.3.weight', 'classifiers.4.bias', 'classifiers.4.weight', 'classifiers.5.bias', 'classifiers.5.weight', 'classifiers.6.bias', 'classifiers.6.weight', 'classifiers.7.bias', 'classifiers.7.weight', 'classifiers.8.bias', 'classifiers.8.weight', 'classifiers.9.bias', 'classifiers.9.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [18]:
trainer.train()
trainer.save_model(f"./vit3/{category}/final")

[2024-10-16 13:11:07,260] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/aseems/anaconda3/envs/mhcp4/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkaran21258[0m ([33mkaran912[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Macro avg f1
1,No log,7.660187,0.680933,0.432265
2,No log,7.478254,0.688717,0.438877
3,No log,7.423473,0.696935,0.518036
4,7.428600,7.359627,0.697912,0.485185
5,7.428600,7.354586,0.697044,0.501498


In [19]:
trainer.evaluate(test_fashion_data)

{'eval_loss': 7.306195259094238,
 'eval_accuracy': 0.6957622454595487,
 'eval_macro avg f1': 0.48441997780365265,
 'eval_runtime': 20.5063,
 'eval_samples_per_second': 88.607,
 'eval_steps_per_second': 0.927,
 'epoch': 5.0}