# Data preprocessing

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
from typing import Dict, List, Set, Union
import numpy as np
import pandas as pd
from tqdm import tqdm
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import classification_report
import catboost as cb
from copy import deepcopy

In [2]:
data_path = './train.csv'

In [3]:
tracks = pd.read_csv(data_path)
tracks_without_text = tracks[tracks.isnull().any(axis=1)]
tracks_with_text = tracks.dropna()

In [4]:
knn_size = int(0.6 * len(tracks))
catboost_train_size = int(0.3 * len(tracks))

In [5]:
knn_train, catboost_train_test, knn_train_is_cover, catboost_train_test_is_cover = train_test_split(
    tracks_with_text.drop('isCover', axis=1), 
    tracks_with_text['isCover'], 
    train_size=knn_size,
    random_state=42,
    stratify=tracks_with_text['isCover'],
)

In [6]:
knn_train['isCover'] = knn_train_is_cover
catboost_train_test['isCover'] = catboost_train_test_is_cover

In [7]:
catboost_train_test = pd.concat([catboost_train_test, tracks_without_text], axis=0, ignore_index=True)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    catboost_train_test.drop('isCover', axis=1), 
    catboost_train_test['isCover'], 
    train_size=catboost_train_size,
    random_state=1337,
    stratify=catboost_train_test['isCover'],
)

In [9]:
# sentence_model = SentenceTransformer('cointegrated/rubert-tiny')
sentence_model = SentenceTransformer('DeepPavlov/rubert-base-cased-sentence')

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/976 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/711M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

No sentence-transformers model found with name C:\Users\azatv/.cache\torch\sentence_transformers\DeepPavlov_rubert-base-cased-sentence. Creating a new one with MEAN pooling.


In [11]:
class KNNOnEmbeddings:
    def __init__(self, model, ):
        super().__init__()
        self.model = model
        self.embeddings = []
        self.embedding_id_to_track_id = []
        self.track_info = []
        self.text_field = 'text'
        
    def _parse_track(self, track_text: str) -> Set[str]:
        return set([line for line in track_text.split('\n') if line])
    
    def _get_line_embedding(self, line: str) -> np.ndarray:
        return self.model.encode(line)
    
    def __len__(self):
        return len(self.track_info)
    
    def _get_track_embeddings(self, track_text: str):
        parsed_text = self._parse_track(track_text)
        
        embeddings = []
        for line in parsed_text:
            embedding = self._get_line_embedding(line)
            embeddings.append(embedding)
            
        return embeddings
                
    def append(self, track: Dict[str, Union[str, int, bool, None]]) -> None:
        self.track_info.append(track)
        embeddings = self._get_track_embeddings(track[self.text_field])
        self.embeddings += embeddings
        self.embedding_id_to_track_id += [len(self.track_info) - 1, ] * len(embeddings)
        
    def make_from_dataframe(self, tracks: pd.DataFrame) -> None:
        for _, track in tqdm(tracks.iterrows()):
            self.append(dict(track))
            
    def dump(self, file_path: str) -> None:
        with open(file_path, 'wb') as file:
            pickle.dump(self, file)
    
    @staticmethod
    def load(file_path: str) -> None:
        with open(file_path, 'rb') as file:
            obj = pickle.load(file)
        
        return obj
    
    def _from_embedding_id_to_track_id(self, embedding_ids: List[int]) -> List[int]:
        return [self.embedding_id_to_track_id[embedding_id] for embedding_id in embedding_ids]
            
    def _find_k_nearest_neigbors_by_score(self, scores: List[float], k: int = 3):
        ids = np.argpartition(scores, -k)[-k:]
        return list(ids), scores[ids]
        
    def _find_k_nearest_neigbors_by_embeddings(self, embeddings: np.ndarray, k: int = 3):
        cos_sims = util.cos_sim(embeddings, knn.embeddings).numpy()
        scores = []
        for line_cos_sim in cos_sims:
            scores.append(self._find_k_nearest_neigbors_by_score(line_cos_sim, k=k))
        return scores
 
    def find_k_nearest_neighbors(self, track: Dict[str, Union[str, int, bool, None]], k: int = 3):
        embeddings = self._get_track_embeddings(track[self.text_field])
        
        ids_scores = self._find_k_nearest_neigbors_by_embeddings(embeddings, k=k)
        
        ids = []
        scores = []
        for ids_, scores_ in ids_scores:
            scores_ /= scores_.sum()
            
            ids += self._from_embedding_id_to_track_id(ids_)
            scores += list(scores_)
        
        ids_scores = sorted(zip(ids, scores))
        ids_agg_scores = []
        agg_score = 0.
        for i, (idx, score) in enumerate(ids_scores):
            if not i or idx == ids_scores[i - 1][0]:
                agg_score += score
            else:
                ids_agg_scores.append((ids_scores[i - 1][0], agg_score))
                agg_score = score
        
        ids_agg_scores.append((ids_scores[-1][0], agg_score))
        ids_agg_scores = np.array(ids_agg_scores)
                
        top_k_ids, _ = self._find_k_nearest_neigbors_by_score(ids_agg_scores[:, 1], k=k)
        top_k = ids_agg_scores[top_k_ids, :]
        top_k = top_k[::-1]
        top_k[:, 0] = top_k[:, 0].astype(int)
        return top_k
    
    def find_k_nearest_tracks(self, track: Dict[str, Union[str, int, bool, None]], fields: List[str], k: int = 3):
        if isinstance(track.get(self.text_field, 0.), str):
            top_k = self.find_k_nearest_neighbors(track, k=k)

            tracks = []

            for neighbor in top_k:
                idx = int(neighbor[0])
                track_info = self.track_info[idx]

                for field in fields:
                    tracks.append(track_info.get(field, None))

                tracks.append(neighbor[1])
        else:
            tracks = [None, ] * ((len(fields) + 1) * k)
            
        return tracks

In [12]:
knn = KNNOnEmbeddings(sentence_model)

In [13]:
knn.make_from_dataframe(knn_train)

1198it [12:57,  1.54it/s]


In [14]:
fields = [
    'track_id',
    'artistsName',
    'trackTitle',
    'isCover',
    'rating',
    'title_regex_flag',
    'other_titles_regex_flag',
    'version_regex_flag',
]

In [15]:
neighbors_info_train = []

for _, track in tqdm(X_train.iterrows()):
    neighbors_info_train.append(knn.find_k_nearest_tracks(dict(track), fields))

  a = torch.tensor(a)
599it [07:57,  1.26it/s]


In [16]:
additional_features_names = [field + '_top' + suffix for suffix in ['1', '2', '3'] for field in fields + ['similarity']]

In [17]:
train_additional_features = pd.DataFrame(neighbors_info_train, columns=additional_features_names)

In [18]:
X_train = pd.concat([X_train.reset_index(drop=True), train_additional_features], axis=1, ignore_index=False)

In [19]:
y_train.reset_index(drop=True, inplace=True)

In [20]:
neighbors_info_test = []

for _, track in tqdm(X_test.iterrows()):
    neighbors_info_test.append(knn.find_k_nearest_tracks(dict(track), fields))

200it [02:17,  1.46it/s]


In [21]:
test_additional_features = pd.DataFrame(neighbors_info_test, columns=additional_features_names)

In [22]:
X_test = pd.concat([X_test.reset_index(drop=True), test_additional_features], axis=1, ignore_index=False)

In [23]:
y_test.reset_index(drop=True, inplace=True)

In [24]:
X_train_ = deepcopy(X_train)

for i in range(1, 4):
    X_train_[f'similarity_top{i}'] = X_train[f'similarity_top{i}'].fillna(0.)
    
X_test_ = deepcopy(X_test)

for i in range(1, 4):
    X_test_[f'similarity_top{i}'] = X_test[f'similarity_top{i}'].fillna(0., in)

# Train Catboost

In [25]:
categorical_features = [
    'track_id', 
    'rating', 
    'rating_nan', 
    'title_regex_flag', 
    'other_titles_regex_flag', 
    'version_regex_flag',
    'track_id_top1',
    'isCover_top1',
    'rating_top1', 
    'title_regex_flag_top1', 
    'other_titles_regex_flag_top1',
    'version_regex_flag_top1',
    'track_id_top2',
    'isCover_top2',
    'rating_top2', 
    'title_regex_flag_top2', 
    'other_titles_regex_flag_top2',
    'version_regex_flag_top2',
    'track_id_top3',
    'isCover_top3',
    'rating_top3', 
    'title_regex_flag_top3', 
    'other_titles_regex_flag_top3',
    'version_regex_flag_top3',
]

text_features = [
    'artistsName', 
    'trackTitle', 
    'text',
    'artistsName_top1', 
    'trackTitle_top1',
    'artistsName_top2', 
    'trackTitle_top2',
    'artistsName_top3', 
    'trackTitle_top3',
]

In [26]:
train_dataset = cb.Pool(
    X_train_.drop(text_features, axis=1).fillna(-1).astype(int).replace(-1, 'NaN'),
    y_train,
    cat_features=categorical_features,
)

test_dataset = cb.Pool(
    X_test_.drop(text_features, axis=1).fillna(-1).astype(int).replace(-1, 'NaN'),
    y_test,
    cat_features=categorical_features,
)

whole_dataset = cb.Pool(
    pd.concat([X_train_, X_test_], axis=0, ignore_index=True).drop(text_features, axis=1).fillna(-1).astype(int).replace(-1, 'NaN'),
    pd.concat([y_train, y_test], axis=0, ignore_index=True),
    cat_features=categorical_features,
)

In [37]:
catboost_classifier = cb.CatBoostClassifier(iterations=200)

In [38]:
catboost_classifier.fit(whole_dataset)

Learning rate set to 0.040952
0:	learn: 0.6645523	total: 43.4ms	remaining: 8.63s
1:	learn: 0.6383854	total: 62.7ms	remaining: 6.21s
2:	learn: 0.6136822	total: 78.2ms	remaining: 5.13s
3:	learn: 0.5921611	total: 84.6ms	remaining: 4.14s
4:	learn: 0.5726525	total: 93.8ms	remaining: 3.66s
5:	learn: 0.5549478	total: 97.1ms	remaining: 3.14s
6:	learn: 0.5386716	total: 116ms	remaining: 3.2s
7:	learn: 0.5231883	total: 135ms	remaining: 3.24s
8:	learn: 0.5100157	total: 139ms	remaining: 2.95s
9:	learn: 0.4980694	total: 145ms	remaining: 2.75s
10:	learn: 0.4866310	total: 151ms	remaining: 2.6s
11:	learn: 0.4759861	total: 170ms	remaining: 2.67s
12:	learn: 0.4671004	total: 174ms	remaining: 2.51s
13:	learn: 0.4589574	total: 186ms	remaining: 2.47s
14:	learn: 0.4507517	total: 216ms	remaining: 2.67s
15:	learn: 0.4441143	total: 223ms	remaining: 2.57s
16:	learn: 0.4379295	total: 235ms	remaining: 2.53s
17:	learn: 0.4324539	total: 241ms	remaining: 2.44s
18:	learn: 0.4271699	total: 259ms	remaining: 2.46s
19:	lea

165:	learn: 0.3405205	total: 3.1s	remaining: 636ms
166:	learn: 0.3403893	total: 3.13s	remaining: 618ms
167:	learn: 0.3402317	total: 3.15s	remaining: 600ms
168:	learn: 0.3400640	total: 3.16s	remaining: 580ms
169:	learn: 0.3398927	total: 3.2s	remaining: 564ms
170:	learn: 0.3398159	total: 3.21s	remaining: 544ms
171:	learn: 0.3396289	total: 3.24s	remaining: 528ms
172:	learn: 0.3395258	total: 3.26s	remaining: 510ms
173:	learn: 0.3391102	total: 3.29s	remaining: 491ms
174:	learn: 0.3384200	total: 3.32s	remaining: 475ms
175:	learn: 0.3382686	total: 3.37s	remaining: 459ms
176:	learn: 0.3380429	total: 3.41s	remaining: 443ms
177:	learn: 0.3380256	total: 3.48s	remaining: 430ms
178:	learn: 0.3379207	total: 3.49s	remaining: 410ms
179:	learn: 0.3376212	total: 3.52s	remaining: 392ms
180:	learn: 0.3375669	total: 3.55s	remaining: 373ms
181:	learn: 0.3372319	total: 3.58s	remaining: 354ms
182:	learn: 0.3370882	total: 3.6s	remaining: 334ms
183:	learn: 0.3369449	total: 3.62s	remaining: 314ms
184:	learn: 0.3

<catboost.core.CatBoostClassifier at 0x25407d06f40>

In [40]:
print(classification_report(whole_dataset.get_label(), catboost_classifier.predict(whole_dataset)))

              precision    recall  f1-score   support

           0       0.89      0.99      0.94       698
           1       0.73      0.11      0.19       101

    accuracy                           0.88       799
   macro avg       0.81      0.55      0.56       799
weighted avg       0.87      0.88      0.84       799



# Dump model weights

In [41]:
knn.dump('knn_weights')

In [42]:
catboost_classifier.save_model('catboost_weights')