# Image feature extraction

### setup

###### install

In [1]:
# Colab setup: model/dataset libraries (transformers, datasets, timm) and
# wandb for experiment tracking; -q keeps pip's output short.
!pip install -q transformers datasets timm
!pip install -q wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m549.1/549.1 KB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 KB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.9/132.9 KB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# repo

# develop setup (only needed for pushing changes back to the repository).
# NOTE: keep these lines commented out and never replace the SECRET_*
# placeholders with real credentials in a shared copy of the notebook.
# !git config --global user.email "SECRET_MAIL@mail.ru"
# !git config --global user.name "SECRET NAME"
# !git clone https://abletobetable:SECRET_TOKEN@github.com/Abletobetable/smart-product.git

# anonymous clone of the project repository; the `src` helpers imported
# later in the notebook come from this checkout
!git clone https://github.com/Abletobetable/smart-product.git

Cloning into 'smart-product'...
remote: Enumerating objects: 177, done.[K
remote: Counting objects: 100% (171/171), done.[K
remote: Compressing objects: 100% (128/128), done.[K
remote: Total 177 (delta 102), reused 95 (delta 40), pack-reused 6[K
Receiving objects: 100% (177/177), 74.90 MiB | 25.79 MiB/s, done.
Resolving deltas: 100% (102/102), done.


###### import

In [3]:
import os
import json
import shutil

from PIL import Image
import pandas as pd
import numpy as np

import timm
from transformers import AutoModel, AutoImageProcessor
import torch

# init run for efficient logging; `run` is reused by later cells to download
# and to log artifacts
import wandb
run = wandb.init(project="kazan_internship2023")

# imports from repo: switch into the cloned repository while importing
# (presumably so that the `src` package can be found), then return to /content
%cd /content/smart-product
from src.dataset_utils import expand_text_fields, add_images_path, create_image_datasets
from src.features_for_images import create_model_and_trainer, get_image_features
from src.train_utils import train_pipeline
%cd /content

# use the GPU when torch reports one, otherwise fall back to the CPU
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(DEVICE)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


/content/smart-product
/content
cpu


### image feature extraction

load dataset from saved

In [None]:
# W&B artifact references in "name:version" form for the saved datasets; the
# same strings are reused later as sub-folder names under `artifacts/`
table_dataset_version = 'processed_table:v0'
images_dataset_version = 'image_dataset:v1'

table data

In [None]:
# Resolve the processed-table artifact through the active run and fetch its
# files; `artifact_dir` holds whatever `download()` returns.
table_artifact_ref = f'abletobetable/kazan_internship2023/{table_dataset_version}'
artifact = run.use_artifact(table_artifact_ref, type='preprocessed_data')
artifact_dir = artifact.download()

[34m[1mwandb[0m: Downloading large artifact processed_table:v0, 390.46MB. 2 files... 
[34m[1mwandb[0m:   2 of 2 files downloaded.  
Done. 0:0:4.0


images data

In [None]:
# Slow step: expect roughly 10-30 minutes for the image artifact download.
images_artifact_ref = f'abletobetable/kazan_internship2023/{images_dataset_version}'
artifact = run.use_artifact(images_artifact_ref, type='raw_data')
artifact_dir = artifact.download()

[34m[1mwandb[0m: Downloading large artifact image_dataset:v1, 1579.90MB. 107980 files... 
[34m[1mwandb[0m:   107980 of 107980 files downloaded.  
Done. 0:15:10.2


Move the images into the right folder so that the image paths stored in the table data are correct.

In [None]:
# Move the downloaded image splits into /content/images so that the image
# paths stored in the table data point to existing files.
images_root = '/content/images'
os.makedirs(images_root, exist_ok=True)

for split in ('train', 'test'):
    src_dir = f'/content/artifacts/{images_dataset_version}/{split}'
    dst_dir = os.path.join(images_root, split)

    # Each split is handled on its own: an already-moved `train` must not
    # stop `test` from being moved (and vice versa) when the cell is re-run.
    if os.path.isdir(dst_dir):
        print(f'{split}: already moved in right folder')
    elif os.path.isdir(src_dir):
        shutil.move(src_dir, images_root)
    else:
        # The data is in neither location: fail loudly instead of reporting
        # "already moved", which a bare `except:` would do for any error.
        raise FileNotFoundError(
            f'{src_dir} not found and {dst_dir} does not exist; '
            'download the image artifact first'
        )

work with datasets, 

Prepare the parts of the dataset used for training, validation, feature extraction and prediction.

When creating the datasets I also apply simple image augmentations from albumentations for better training.

In [None]:
# Load the preprocessed train / predict tables (tab-separated files) from the
# downloaded table artifact.
prep_train_df = pd.read_csv(
    f'artifacts/{table_dataset_version}/train_processed.tsv', sep='\t'
)
prep_predict_df = pd.read_csv(
    f'artifacts/{table_dataset_version}/predict_processed.tsv', sep='\t'
)

# Build all dataset views in one call and unpack the six returned values.
(
    unsplitted_dataset,
    train_dataset,
    valid_dataset,
    predict_dataset,
    label2id,
    id2label,
) = create_image_datasets(prep_train_df, prep_predict_df)

Number of labels: 874
rare categories: [12836, 11875, 11549, 12901]
len train split: 72899
len valid split: 18225


#### ViT

I will use a pretrained BEiT (a ViT-style image transformer) from 🤗 transformers: https://huggingface.co/microsoft/beit-base-patch16-224-pt22k-ft22k

The `freeze` parameter is needed if you want to set `requires_grad = False` for the feature extraction layers.

##### want to train?)

I added an option for training and evaluating a classifier based only on features from images.

So, if I want to fine-tune the image feature extractor, I can easily do this in just a couple of commands, then save my model on the 🤗 model hub and use it in my pipeline.

In [None]:
# Build the BEiT-based classifier together with its trainer for fine-tuning
# on the train / valid splits created earlier.
model, trainer = create_model_and_trainer(
    model_checkpoint='microsoft/beit-base-patch16-224-pt22k-ft22k', 
    train_dataset=train_dataset, 
    valid_dataset=valid_dataset,
    num_epochs=3, 
    batch_size=16, 
    lr=2e-5,
    # freeze=False: the feature extraction layers are not frozen, i.e. the
    # whole network is fine-tuned
    freeze=False, 
    # NOTE(review): 874 matches the "Number of labels" printed when the
    # datasets were created; presumably equal to len(label2id) -- confirm
    num_labels=874,
    label2id=label2id, 
    id2label=id2label,
    report_to="wandb",
    push_to_hub=True,
    )

In [None]:
# Start fine-tuning with the trainer created by create_model_and_trainer.
trainer.train()

***** Running training *****
  Num examples = 72899
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 13671
  Number of trainable parameters = 86434090
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,F1
1,3.0016,2.421942,0.43576
2,1.6151,1.803694,0.563603


***** Running Evaluation *****
  Num examples = 18225
  Batch size = 16
  metric = load_metric('f1')


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

Saving model checkpoint to image_feature_extractor/checkpoint-4557
Configuration saved in image_feature_extractor/checkpoint-4557/config.json
Model weights saved in image_feature_extractor/checkpoint-4557/pytorch_model.bin
Image processor saved in image_feature_extractor/checkpoint-4557/preprocessor_config.json
***** Running Evaluation *****
  Num examples = 18225
  Batch size = 16
Saving model checkpoint to image_feature_extractor/checkpoint-9114
Configuration saved in image_feature_extractor/checkpoint-9114/config.json
Model weights saved in image_feature_extractor/checkpoint-9114/pytorch_model.bin
Image processor saved in image_feature_extractor/checkpoint-9114/preprocessor_config.json


Push to the 🤗 hub manually if needed, e.g. when the trainer was created with push_to_hub = False.

In [None]:
# Push a locally saved checkpoint (model + image processor) to the Hugging
# Face hub. The access token is taken from the HF_TOKEN environment variable
# or asked for interactively, so it never has to be written into the notebook.
from getpass import getpass

access_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# NOTE(review): make sure this checkpoint directory exists -- the step number
# in its name depends on the batch size and number of epochs used for training.
checkpoint_dir = "/content/image_feature_extractor/checkpoint-3417"
hub_repo_name = 'smart-product-ViT'

pt_model = AutoModel.from_pretrained(checkpoint_dir,
                                     use_auth_token=access_token)
pt_processor = AutoImageProcessor.from_pretrained(checkpoint_dir,
                                                  use_auth_token=access_token)

# Upload the processor and the model to the same hub repository.
pt_processor.push_to_hub(hub_repo_name, use_auth_token=access_token)
pt_model.push_to_hub(hub_repo_name, use_auth_token=access_token)

##### get features:

first init model for extraction

In [None]:
# Feature-extraction model: BEiT checkpoint created with freeze=True.
# Only the model is kept; the second returned value is discarded.
model, _ = create_model_and_trainer(
    model_checkpoint='microsoft/beit-base-patch16-224-pt22k-ft22k',
    train_dataset=train_dataset,
    valid_dataset=valid_dataset,
    num_epochs=1,
    batch_size=64,
    freeze=True,
    num_labels=874,
    label2id=label2id,
    id2label=id2label,
)

# Features for the full (unsplit) training data, passed together with the
# product ids and category ids from the train table.
train_product_ids = prep_train_df['product_id']
train_category_ids = prep_train_df['category_id']
X_train = get_image_features(
    unsplitted_dataset, model, DEVICE,
    train_product_ids, train_category_ids,
    model_type='ViT',
)

# Features for the prediction data; only the product ids are passed here.
predict_product_ids = prep_predict_df['product_id']
X_predict = get_image_features(
    predict_dataset, model, DEVICE,
    predict_product_ids,
    model_type='ViT',
)

# Output folder for the extracted features (created on first use).
folder_path = '/content/beit_features'
if not os.path.exists(folder_path):
    os.mkdir(folder_path)

# Persist both results as .npy files so they can be logged afterwards.
np.save(f'{folder_path}/train_beit_features.npy', X_train)
np.save(f'{folder_path}/predict_beit_features.npy', X_predict)

log features with wandb

In [None]:
# Bundle the folder with the saved BEiT feature files into a W&B artifact and
# log it through the current run.
beit_dataset = wandb.Artifact("beit_features", type="preprocessed_data")
beit_dataset.add_dir("/content/beit_features")
run.log_artifact(beit_dataset)

[34m[1mwandb[0m: Adding directory to artifact (/content/beit_features)... Done. 3.6s


<wandb.sdk.wandb_artifacts.Artifact at 0x7f9eaff36fd0>

#### CNN

I will use pretrained efficientnet from 🤗 timm: https://huggingface.co/timm/tf_efficientnetv2_b3.in21k_ft_in1k

##### want to train?)

Initialize the model and replace the classifier head.

In [None]:
# Load the pretrained EfficientNetV2-B3 checkpoint from the timm hub.
cnn_model = timm.create_model("hf_hub:timm/tf_efficientnetv2_b3.in21k_ft_in1k", 
                              pretrained=True)

# Replace the pretrained classification head with one sized for this task.
# The input width is read from the existing head instead of being hard-coded
# (it is 1536 for this backbone), so the cell keeps working if the checkpoint
# is swapped for another EfficientNet variant.
cnn_model.classifier = torch.nn.Linear(
    in_features=cnn_model.classifier.in_features,
    out_features=874,  # number of labels reported when the datasets were created
    bias=True,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/58.1M [00:00<?, ?B/s]

run training

The train_pipeline function creates dataloaders and optimizers and runs a training loop written in PyTorch with logging to wandb. If requested, it also calls the testing function.

In [None]:
# Hyper-parameters and bookkeeping options handed to train_pipeline.
cfg = {
    'count_of_epoch': 2,
    'batch_size': 64,
    'lr': 1e-4,
    'criterion': 'CrossEntropyLoss',
    'optimizer': 'Adam',
    'scheduler': 'StepLR',
    'step_size': 25,
    'step_gamma': 0.1,
    'project': 'kazan_internship2023',
    'model_name': 'efficientnet',
    'device': DEVICE,
}

# Run the pipeline with both training and testing enabled; the returned
# model is kept under `model`.
model = train_pipeline(
    cnn_model, train_dataset, valid_dataset, cfg,
    saved_model=None,
    to_train=True,
    to_test=True,
    report_to='wandb',
)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

config:
{'batch_size': 64,
 'count_of_epoch': 2,
 'criterion': 'CrossEntropyLoss',
 'device': 'cuda',
 'lr': 0.0001,
 'model_name': 'efficientnet',
 'optimizer': 'Adam',
 'project': 'kazan2023',
 'report_to': 'wandb',
 'scheduler': 'StepLR',
 'step_gamma': 0.1,
 'step_size': 25}

running on device: cuda 



epochs:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1139 [00:00<?, ?it/s]

  0%|          | 0/285 [00:00<?, ?it/s]

train loss on 000 epoch: 3.059568 with lr: 0.0001000000
valid loss on 000 epoch: 1.936718
valid accuracy: 0.53
Validation Loss Decreased(inf--->1.936718) 	 Saving The Model



  0%|          | 0/1139 [00:00<?, ?it/s]

  0%|          | 0/285 [00:00<?, ?it/s]

train loss on 001 epoch: 1.621257 with lr: 0.0001000000
valid loss on 001 epoch: 1.624085
valid accuracy: 0.60
Validation Loss Decreased(1.936718--->1.624085) 	 Saving The Model



  0%|          | 0/285 [00:00<?, ?it/s]

0.5951028875045535


finish logging

In [None]:
# Finish the active W&B run.
# NOTE(review): cells further down still log artifacts through `run`.
wandb.finish()

VBox(children=(Label(value='109.320 MB of 109.320 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0,…

0,1
epoch,▁█
learning_rate,▁▁
train_loss,█▁
valid_f1,▁█
valid_loss,█▁

0,1
epoch,1.0
learning_rate,0.0001
train_loss,1.62126
valid_f1,0.5951
valid_loss,1.62408


push trained model to the hub for further using

In [None]:
# need for login in huggingface_hub
!pip install -q huggingface_hub
!huggingface-cli login

In [None]:
# Upload the trained EfficientNet (`model` as returned by train_pipeline) to
# the Hugging Face hub under the given repository name.
timm.models.hub.push_to_hf_hub(model, 'smart-product-EfficientNet')

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/57.3M [00:00<?, ?B/s]

'https://huggingface.co/abletobetable/smart-product-EfficientNet/tree/main/'

##### get features

first init model for extraction

In [None]:
# Load the EfficientNet from the project's hub repository; the trailing
# comment keeps the name of the original timm checkpoint as an alternative.
pt_model = timm.create_model("hf_hub:abletobetable/smart-product-EfficientNet", 
                              pretrained=True) # "hf_hub:timm/tf_efficientnetv2_b3.in21k_ft_in1k"

# will save output in this folder
folder_path = '/content/EfficientNet_features_pt'
if not os.path.exists(folder_path):
    os.mkdir(folder_path)

# Features for the full (unsplit) training data; product ids and category ids
# from the train table are passed along.
X_train = get_image_features(unsplitted_dataset, pt_model, DEVICE, 
                             prep_train_df['product_id'], 
                             prep_train_df['category_id'], 
                             model_type = 'CNN')

# save features in .npy for further logging
# (written before the predict pass starts, so the train features are already
# on disk if that pass fails)
np.save(f'{folder_path}/train_EfficientNet_features.npy', X_train)

# Features for the prediction data; only the product ids are passed here.
X_predict = get_image_features(predict_dataset, pt_model, DEVICE, 
                               prep_predict_df['product_id'], 
                               model_type = 'CNN')

# save features in .npy for further logging
np.save(f'{folder_path}/predict_EfficientNet_features.npy', X_predict)

Downloading (…)lve/main/config.json:   0%|          | 0.00/17.6k [00:00<?, ?B/s]

Using CNN for feature extraction


100%|██████████| 91120/91120 [45:54<00:00, 33.08it/s]


Using CNN for feature extraction


100%|██████████| 16860/16860 [08:08<00:00, 34.49it/s]


save artifact in wandb

In [11]:
# wandb.finish() is called earlier in the notebook, so on a fresh
# top-to-bottom run there is no active run left to log to. Reuse the active
# run when there is one, otherwise start a new run in the same project.
if wandb.run is None:
    run = wandb.init(project="kazan_internship2023")
else:
    run = wandb.run

# Bundle the folder with the saved EfficientNet feature files into a W&B
# artifact and log it through the run.
EfficientNet_dataset = wandb.Artifact("EfficientNet_features", type="preprocessed_data")
EfficientNet_dataset.add_dir("/content/EfficientNet_features_pt")
run.log_artifact(EfficientNet_dataset)

[34m[1mwandb[0m: Adding directory to artifact (/content/EfficientNet_features_pt)... Done. 11.2s


<wandb.sdk.wandb_artifacts.Artifact at 0x7f1987663f70>