In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp39-cp39-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.0/47.0 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Collecting plotly
  Downloading plotly-5.18.0-py3-none-any.whl (15.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.6/15.6 MB[0m [31m97.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tenacity>=6.2.0
  Downloading tenacity-8.2.3-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, graphviz, plotly, catboost
Successfully installed catboost-1.2.2 graphviz-0.20.1 plotly-5.18.0 tenacity-8.2.3
[0m

In [21]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score,f1_score
from sklearn.preprocessing import LabelEncoder 
from catboost import CatBoostClassifier,Pool,cv
import open_clip

In [2]:
le = LabelEncoder()
label_vc = {'Развлечения и юмор':0,
            'Кулинария':1,
            'Торговля и объявления':2,
            'СМИ':3,
            'Философия и религия':4,
            'Животные':5,
            'Творчество и дизайн':6,
            'Путешествия':7}

def process_labels(label):
    if label not in label_vc.keys():
        return -1
    else:
        return label_vc[label]
def process_text(text:str):
    return text.strip().lower()

def make_df(path):
    df = pd.DataFrame()
    data = pd.read_csv(path,sep=';')
    if 'label' in data.columns:
        data['label'] = data['label'].map(process_labels)
        data = data[data['label'] != -1].reset_index()
        df['text'] = data[data['label'] != -1]['description'].fillna('')
        df['label'] = le.fit_transform(data['label'])
        df['image'] = data['id'].map(lambda x:'./vseros-final-data/Data/Train/'+str(x))
    else:
        df['label'] = 0
        df['text'] = data['description'].fillna('')
        df['image'] = data['id'].map(lambda x:'./vseros-final-data/Data/Test/'+str(x))
    return df

In [3]:
df = make_df('./train-test-csvs/train-7.csv')

In [4]:
class PLDataset(torch.utils.data.Dataset):
    def __init__(self, df,preprocess,tokenizer):
        super().__init__()
        self.data = df[['image','label','text']]
        self.data = self.data.values
        self.processor = preprocess
        self.tokenizer = tokenizer
    def __getitem__(self, index):
        image = Image.open(self.data[index][0])
        image = self.processor(image)
        text = self.tokenizer(self.data[index][2])
        label = self.data[index][1]
        return image,text[0],label
    def __len__(self):
        return len(self.data)

In [5]:
clip, _, preprocess = open_clip.create_model_and_transforms('xlm-roberta-base-ViT-B-32',
                                                            pretrained='laion5b_s13b_b90k')

In [6]:
tokenizer = open_clip.get_tokenizer('xlm-roberta-base-ViT-B-32')

In [10]:
train_ds = PLDataset(df,preprocess,tokenizer)
train_dl = DataLoader(train_ds,
                      batch_size=32,
                      num_workers=4,
                      shuffle=False)

In [12]:
features = []
for batch in tqdm(train_dl):
    x1,x2,targets = batch
    image_features = clip.encode_image(x1)
    text_features = clip.encode_text(x2)
    features += [torch.cat([image_features,text_features],axis=-1).cpu().detach().numpy()]

  0%|          | 0/155 [00:00<?, ?it/s]

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [13]:
len(features)

155

In [15]:
stack_feat = np.concatenate(features).T
for i in range(stack_feat.shape[0]):
    df[f'feature_{i}'] = stack_feat[i]

  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
  df[f'feature_{i}'] = stack_feat[i]
 

In [16]:
df

Unnamed: 0,text,label,image,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,...,feature_1014,feature_1015,feature_1016,feature_1017,feature_1018,feature_1019,feature_1020,feature_1021,feature_1022,feature_1023
0,"""Когда устал и жить не хочешь,Полезно помнить ...",4,./vseros-final-data/Data/Train/814469951099,-0.066702,0.155645,-0.028238,-0.313402,0.071953,-0.176494,0.105081,...,0.011616,-0.022740,0.001822,-0.027504,-0.029541,-0.001247,-0.006176,0.000297,0.005331,0.021493
1,,2,./vseros-final-data/Data/Train/849433210092,1.517470,0.098180,0.022926,-0.118828,-1.001894,0.049392,-0.372113,...,-0.020388,0.020386,0.019053,0.001716,-0.096211,-0.010076,0.028890,0.008629,-0.018711,-0.012377
2,"""МИР ВАШЕМУ ДОМУ! ДОРОГИЕ ДРУЗЬЯ, ХРАНИ ВАС ГО...",4,./vseros-final-data/Data/Train/852458632411,0.198421,-0.094205,0.158658,0.279904,-0.411174,-0.381934,0.301060,...,0.007204,0.010326,0.005440,-0.020561,-0.020253,-0.010979,0.017347,0.010617,-0.000670,0.022870
3,"""Альбом \""Праздничные мопсы\"" https://ok.ru/mo...",5,./vseros-final-data/Data/Train/860243294215,0.017377,0.503241,-0.782633,-0.020394,0.200160,-0.295039,0.393912,...,0.003428,0.037173,0.012656,-0.010435,-0.018469,-0.002034,-0.010821,0.021637,-0.003497,-0.000402
4,Умнее некоторых людей,5,./vseros-final-data/Data/Train/861555576675,0.099410,-0.229236,-0.382765,0.187351,-0.365266,-0.441162,0.139805,...,-0.016369,0.013807,0.031914,0.005379,-0.015435,0.014899,0.014931,0.023953,-0.033931,-0.002533
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4942,Депутат Госдумы Михаил Романов обратился к губ...,3,./vseros-final-data/Data/Train/909322079091,-0.696525,-0.162396,-0.617308,0.681656,-0.298284,0.298156,0.715054,...,-0.000414,-0.012995,0.003661,0.027963,-0.014852,-0.020780,-0.010124,0.011329,-0.000221,0.007858
4943,"""Сколько людей, столько и идей. Наша подписчиц...",6,./vseros-final-data/Data/Train/909332887609,-0.079451,-0.086231,0.506067,0.910091,-0.052220,0.139546,-0.038095,...,-0.005147,-0.004233,0.012897,-0.013363,-0.025559,0.018540,0.021115,0.003747,-0.030671,-0.016799
4944,В Петербурге готовят к работе 70 пунктов вакци...,3,./vseros-final-data/Data/Train/909333845107,-0.321878,0.218555,0.072714,0.252609,-0.603051,-0.686623,0.757196,...,-0.008435,0.020820,0.006765,0.048533,-0.016744,-0.001586,-0.030573,0.030244,-0.003102,0.009634
4945,Фарерские острова,7,./vseros-final-data/Data/Train/909334888398,0.850247,-0.846517,-0.285012,0.650887,0.278446,-0.054432,-0.034436,...,0.033173,0.033054,0.032514,0.042018,-0.026123,0.015691,0.010501,-0.004853,-0.004943,0.009525


In [17]:
train_df,val_df = train_test_split(df,test_size=0.2,random_state=56)

In [23]:
train_pool = Pool(train_df.drop(['image','label','text'],axis=1),
                  label = train_df['label'])

eval_pool = Pool(val_df.drop(['image','label','text'],axis=1),
                  label = val_df['label'])

In [26]:
params = {'iterations':1200,
          'learning_rate':0.1,
          'loss_function':'MultiClass',
          'max_depth':5,
          'eval_metric':'TotalF1:average=Macro',
          'random_seed':56}

model = CatBoostClassifier(**params)

In [27]:
model.fit(train_pool,eval_set=eval_pool,verbose=50)

0:	learn: 0.4370929	test: 0.4271271	best: 0.4271271 (0)	total: 189ms	remaining: 3m 46s
50:	learn: 0.8131086	test: 0.7408012	best: 0.7408012 (50)	total: 4.88s	remaining: 1m 49s
100:	learn: 0.8794594	test: 0.7835221	best: 0.7848605 (99)	total: 9.57s	remaining: 1m 44s
150:	learn: 0.9132952	test: 0.7944287	best: 0.7976831 (131)	total: 14.2s	remaining: 1m 38s
200:	learn: 0.9282556	test: 0.8009781	best: 0.8030024 (189)	total: 18.8s	remaining: 1m 33s
250:	learn: 0.9380436	test: 0.8101491	best: 0.8102158 (242)	total: 23.3s	remaining: 1m 28s
300:	learn: 0.9496398	test: 0.8125159	best: 0.8164204 (294)	total: 27.9s	remaining: 1m 23s
350:	learn: 0.9620381	test: 0.8161070	best: 0.8206473 (304)	total: 32.4s	remaining: 1m 18s
400:	learn: 0.9703372	test: 0.8193496	best: 0.8211527 (387)	total: 36.9s	remaining: 1m 13s
450:	learn: 0.9779112	test: 0.8219863	best: 0.8237064 (432)	total: 41.4s	remaining: 1m 8s
500:	learn: 0.9819688	test: 0.8258938	best: 0.8258938 (494)	total: 46.2s	remaining: 1m 4s
550:	lea

<catboost.core.CatBoostClassifier at 0x7f2ca040ef10>