In [3]:
!pip install open_clip_torch pytorch_lightning

Collecting pytorch_lightning
  Downloading pytorch_lightning-2.1.0-py3-none-any.whl (774 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m774.6/774.6 kB[0m [31m48.9 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0
  Downloading torchmetrics-1.2.0-py3-none-any.whl (805 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.2/805.2 kB[0m [31m72.0 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0
  Downloading lightning_utilities-0.9.0-py3-none-any.whl (23 kB)
Installing collected packages: lightning-utilities, torchmetrics, pytorch_lightning
Successfully installed lightning-utilities-0.9.0 pytorch_lightning-2.1.0 torchmetrics-1.2.0
[0m

In [1]:
import os
import pandas as pd
import numpy as np
from PIL import Image
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel,AutoTokenizer
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score,f1_score
from sklearn.preprocessing import LabelEncoder 
import open_clip
pl.seed_everything(56)

Seed set to 56


56

In [2]:
!nvidia-smi

Thu Nov  2 20:40:13 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100 80G...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   56C    P0    82W / 300W |   1139MiB / 81920MiB |      3%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
open_clip.list_pretrained()

[('RN50', 'openai'),
 ('RN50', 'yfcc15m'),
 ('RN50', 'cc12m'),
 ('RN50-quickgelu', 'openai'),
 ('RN50-quickgelu', 'yfcc15m'),
 ('RN50-quickgelu', 'cc12m'),
 ('RN101', 'openai'),
 ('RN101', 'yfcc15m'),
 ('RN101-quickgelu', 'openai'),
 ('RN101-quickgelu', 'yfcc15m'),
 ('RN50x4', 'openai'),
 ('RN50x16', 'openai'),
 ('RN50x64', 'openai'),
 ('ViT-B-32', 'openai'),
 ('ViT-B-32', 'laion400m_e31'),
 ('ViT-B-32', 'laion400m_e32'),
 ('ViT-B-32', 'laion2b_e16'),
 ('ViT-B-32', 'laion2b_s34b_b79k'),
 ('ViT-B-32', 'datacomp_xl_s13b_b90k'),
 ('ViT-B-32', 'datacomp_m_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_clip_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_laion_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_image_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_text_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_basic_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_s128m_b4k'),
 ('ViT-B-32', 'datacomp_s_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_clip_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_laion_s13m_b4k'),
 ('ViT-B-32', 'commonpool_

In [4]:
class CFG:
    wandb=False
    num_workers=8
    model="ViT-B-16-SigLIP"
    pretrained = "webli"
    text_encoder = 'ai-forever/ruBert-base'
    #model = 'ViT-B-32'
    #pretrained = 'commonpool_m_clip_s128m_b4k'
    hidden_size = 768 * 2
    train_path='./train-test-csvs/train-7.csv'
    test_path = './train-test-csvs/test-5.csv'
    val_split_size = 0.2
    num_labels = 8
    scheduler='cosine'
    max_epoches=1
    clip_lr = 4e-6
    mlp_lr = 2e-4
    text_lr = 3e-5
    eps=1e-6
    weights_decay=0.01
    batch_size=16
    seed=56

In [5]:
class PLDataset(torch.utils.data.Dataset):
    def __init__(self, df,preprocess,tokenizer):
        super().__init__()
        self.cfg = CFG()
        self.data = df[['image','label','text']]
        self.data = self.data.values
        self.processor = preprocess
        self.tokenizer = tokenizer
    def __getitem__(self, index):
        image = Image.open(self.data[index][0])
        image = self.processor(image)
        text_encoding = self.tokenizer.encode_plus(self.data[index][2],
                                                   padding='max_length',
                                                   truncation=True,
                                                   max_length=48,
                                                   return_tensors='pt')
        label = self.data[index][1]
        return {'image':image,
                'text_ids':text_encoding.input_ids[0],
                'attention_mask':text_encoding.attention_mask[0],
                'label':label}
    def __len__(self):
        return len(self.data)

In [6]:
le = LabelEncoder()
label_vc = {'Развлечения и юмор':0,
            'Кулинария':1,
            'Торговля и объявления':2,
            'СМИ':3,
            'Философия и религия':4,
            'Животные':5,
            'Творчество и дизайн':6,
            'Путешествия':7}

def process_labels(label):
    if label not in label_vc.keys():
        return -1
    else:
        return label_vc[label]
def process_text(text:str):
    return text.strip().lower()

def make_df(path):
    df = pd.DataFrame()
    data = pd.read_csv(path,sep=';')
    if 'label' in data.columns:
        data['label'] = data['label'].map(process_labels)
        data = data[data['label'] != -1].reset_index()
        df['text'] = data[data['label'] != -1]['description'].fillna('')
        df['label'] = le.fit_transform(data['label'])
        df['image'] = data['id'].map(lambda x:'./vseros-final-data/Data/Train/'+str(x))
    else:
        df['label'] = 0
        df['text'] = data['description'].fillna('')
        df['image'] = data['id'].map(lambda x:'./vseros-final-data/Data/Test/'+str(x))
    return df

In [7]:
class PLDataModule(pl.LightningDataModule):
    def __init__(self,processor,tokenizer):
        super().__init__()
        self.processor = processor
        self.tokenizer = tokenizer
        self.cfg = CFG()
        self.train_dataset_path = self.cfg.train_path
        self.test_dataset_path = self.cfg.test_path
        self.val_split_size = self.cfg.val_split_size
        self.batch_size = self.cfg.batch_size
        self.num_workers = self.cfg.num_workers
        self.is_setup = False
    def prepare_data(self):
        self.train_df = make_df(self.train_dataset_path)
        self.test_df = make_df(self.test_dataset_path)
    def setup(self, stage: str):
        if self.is_setup:
            return None
        self.train_df, self.val_df = train_test_split(self.train_df, test_size=self.val_split_size,random_state=self.cfg.seed)
        self.train_dataset = PLDataset(self.train_df,self.processor,self.tokenizer)
        self.val_dataset = PLDataset(self.val_df,self.processor,self.tokenizer)
        self.predict_dataset = PLDataset(self.test_df,self.processor,self.tokenizer)
        self.is_setup = True
    
    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.train_dataset,
                                           batch_size=self.batch_size,
                                           num_workers=self.num_workers,
                                           shuffle=True)
    def val_dataloader(self):
        return torch.utils.data.DataLoader(self.val_dataset,
                                           batch_size=self.batch_size,
                                           num_workers=self.num_workers)

    def predict_dataloader(self):
        return torch.utils.data.DataLoader(self.predict_dataset,
                                           batch_size=self.batch_size,
                                           num_workers=self.num_workers,
                                           shuffle = False)

In [8]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

In [16]:
class PLModule(pl.LightningModule):
    def __init__(self,clip,bert):
        super().__init__()
        self.cfg = CFG()
        self.clip = clip
        self.text_model = bert
        self.mlp = nn.Sequential(nn.Linear(self.cfg.hidden_size,self.cfg.hidden_size * 2),
                                 nn.LayerNorm(self.cfg.hidden_size * 2),
                                 nn.GELU(),
                                 nn.Linear(self.cfg.hidden_size * 2,self.cfg.num_labels))
        self.criterion = nn.CrossEntropyLoss()
        self.pool = MeanPooling()
        self.val_targets = []
        self.val_preds = []
        self.get_features = False
        
    def get_clip_features(self,batch):
        image_features = self.clip.encode_image(batch['image'])
        text_features = self.text_model(batch['text_ids'],
                                        attention_mask=batch['attention_mask'])['last_hidden_state']
        text_features = self.pool(text_features,batch['attention_mask'])
        return torch.cat([image_features,text_features],axis=-1)
    def forward(self,batch):
        features = self.get_clip_features(batch)
        return self.mlp(features)

    def training_step(self, batch, _):
        logits = self(batch)
        loss = self.criterion(logits, batch['label'])
        return loss
        
    def validation_step(self, batch, _):
        logits = self(batch).argmax(dim=-1).cpu().detach().tolist()
        self.val_targets += batch['label'].tolist()
        self.val_preds += logits
        
    def predict_step(self, batch, _):
        x1,x2,_ = batch
        if not self.get_features:
            logits = self(x1,x2).argmax(dim=-1).cpu().detach().tolist()
            return logits
        else:
            features = self.get_clip_features(x1,x2).cpu().detach().tolist()
            return features
        
    def calc_metric(self):
        return 100 * (f1_score(self.val_targets,self.val_preds,average='macro'))
        
    def on_validation_epoch_end(self):
        print(self.calc_metric())
        self.val_targets, self.val_preds = [],[]
            
    def configure_optimizers(self):
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in self.clip.visual.named_parameters()],
                "lr": self.cfg.clip_lr
            },
            {
                "params": [p for n, p in self.text_model.named_parameters()],
                "lr": self.cfg.clip_lr
            },
            {
                "params": [p for n, p in self.mlp.named_parameters()],
                "lr": self.cfg.mlp_lr
            }
        ]
        return torch.optim.AdamW(optimizer_grouped_parameters,
                                 weight_decay=self.cfg.weights_decay
                                )

In [10]:
clip, _, preprocess = open_clip.create_model_and_transforms(CFG.model, pretrained=CFG.pretrained)

In [11]:
bert = AutoModel.from_pretrained(CFG.text_encoder)
tokenizer = AutoTokenizer.from_pretrained(CFG.text_encoder)

Some weights of the model checkpoint at ai-forever/ruBert-base were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
dm = PLDataModule(preprocess,tokenizer)
dm.prepare_data()
dm.setup(0)

In [17]:
model_pl = PLModule(clip,bert)

In [18]:
lr_monitor = pl.callbacks.LearningRateMonitor(logging_interval='epoch')
checkpoint_cb = pl.callbacks.ModelCheckpoint(
    dirpath='./outputs/',
    filename='model_{epoch:02d}-{val_acc:.4f}',
    monitor='val_acc',
    mode='max',
    save_last=True
)

trainer = pl.Trainer(
    accelerator="gpu",
    precision=32,
    min_epochs=1,
    check_val_every_n_epoch=1,
    max_epochs=5
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [19]:
trainer.fit(model_pl,datamodule=dm)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type             | Params
------------------------------------------------
0 | clip       | CustomTextCLIP   | 203 M 
1 | text_model | BertModel        | 178 M 
2 | mlp        | Sequential       | 4.8 M 
3 | criterion  | CrossEntropyLoss | 0     
4 | pool       | MeanPooling      | 0     
------------------------------------------------
386 M     Trainable params
0         Non-trainable params
386 M     Total params
1,544.863 Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

6.292517006802721


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

83.14589395522628


Validation: |          | 0/? [00:00<?, ?it/s]

81.87668074750547


Validation: |          | 0/? [00:00<?, ?it/s]

85.85470501685974


Validation: |          | 0/? [00:00<?, ?it/s]

85.87569018995427


Validation: |          | 0/? [00:00<?, ?it/s]

85.78031232382189


`Trainer.fit` stopped: `max_epochs=5` reached.


In [16]:
test_preds = trainer.predict(model_pl,datamodule=dm)



Predicting: 0it [00:00, ?it/s]

In [17]:
df = pd.DataFrame()

In [18]:
df['label'] = np.sum(test_preds)
df['id'] = pd.read_csv(CFG.test_path,sep=';').id

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


In [19]:
df['label'] = df['label'].apply(lambda x:list(label_vc.keys())[x])

In [20]:
df.to_csv('sfsfvt10.csv',index=False,sep=';')