In [1]:
import pandas as pd
import numpy as np
import faiss
import catboost as cb
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors


In [2]:
# Load the base vectors; the first CSV column is used as the row index.
df_base = pd.read_csv("data/base.csv", index_col=0)
# Preview the first five rows.
df_base.head(n=5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,62,63,64,65,66,67,68,69,70,71
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0-base,-115.08389,11.152912,-64.42676,-118.88089,216.48244,-104.69806,-469.070588,44.348083,120.915344,181.4497,...,-42.808693,38.800827,-151.76218,-74.38909,63.66634,-4.703861,92.93361,115.26919,-112.75664,-60.830353
1-base,-34.562202,13.332763,-69.78761,-166.53348,57.680607,-86.09837,-85.076666,-35.637436,119.718636,195.23419,...,-117.767525,41.1,-157.8294,-94.446806,68.20211,24.346846,179.93793,116.834,-84.888941,-59.52461
2-base,-54.233746,6.379371,-29.210136,-133.41383,150.89583,-99.435326,52.554795,62.381706,128.95145,164.38147,...,-76.3978,46.011803,-207.14442,127.32557,65.56618,66.32568,81.07349,116.594154,-1074.464888,-32.527206
3-base,-87.52013,4.037884,-87.80303,-185.06763,76.36954,-58.985165,-383.182845,-33.611237,122.03191,136.23358,...,-70.64794,-6.358921,-147.20105,-37.69275,66.20289,-20.56691,137.20694,117.4741,-1074.464888,-72.91549
4-base,-72.74385,6.522049,43.671265,-140.60803,5.820023,-112.07408,-397.711282,45.1825,122.16718,112.119064,...,-57.199104,56.642403,-159.35184,85.944724,66.76632,-2.505783,65.315285,135.05159,-1074.464888,0.319401


In [3]:
# Fit the scaler on the base vectors and standardise them; the fitted
# scaler is reused later to transform the query vectors the same way.
scaler = StandardScaler().fit(df_base)
scaled_vectors = pd.DataFrame(data=scaler.transform(df_base), index=df_base.index)

In [4]:
# Search configuration
k_clusters = 1024               # number of clusters (inverted lists) in the index
k_neighbors = 100               # number of neighbors to retrieve per query
dim = scaled_vectors.shape[1]   # dimensionality of the vectors

# Build the (still untrained) IVF index on top of a flat L2 quantizer
quantizer = faiss.IndexFlatL2(dim)
idx_l2 = faiss.IndexIVFFlat(quantizer, dim, k_clusters)

# Number of clusters to inspect for each query at search time
idx_l2.nprobe = 16

In [5]:
# Convert the scaled base vectors to a C-contiguous float32 matrix once and
# reuse it for both training and insertion, instead of converting the whole
# matrix separately for each call.
base_vectors_f32 = np.ascontiguousarray(scaled_vectors.values, dtype='float32')
idx_l2.train(base_vectors_f32)
idx_l2.add(base_vectors_f32)
# The temporary matrix is only needed for the two calls above; drop the
# reference so it does not stay alive in the kernel namespace.
del base_vectors_f32

In [6]:
# Keep a mapping from row position (0, 1, 2, ...) to the base item id
base_index = dict(enumerate(scaled_vectors.index.tolist()))

In [7]:
# Load the training queries; the first CSV column is used as the row index.
df_train = pd.read_csv("data/train.csv", index_col=0)
# Preview the first five rows.
df_train.head(n=5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,63,64,65,66,67,68,69,70,71,Target
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0-query,-53.882748,17.971436,-42.117104,-183.93668,187.51749,-87.14493,-347.360606,38.307602,109.08556,30.413513,...,70.10736,-155.80257,-101.965943,65.90379,34.4575,62.642094,134.7636,-415.750254,-25.958572,675816-base
1-query,-87.77637,6.806268,-32.054546,-177.26039,120.80333,-83.81059,-94.572749,-78.43309,124.9159,140.33107,...,4.669178,-151.69771,-1.638704,68.170876,25.096191,89.974976,130.58963,-1035.092211,-51.276833,366656-base
2-query,-49.979565,3.841486,-116.11859,-180.40198,190.12843,-50.83762,26.943937,-30.447489,125.771164,211.60782,...,78.039764,-169.1462,82.144186,66.00822,18.400496,212.40973,121.93147,-1074.464888,-22.547178,1447819-base
3-query,-47.810562,9.086598,-115.401695,-121.01136,94.65284,-109.25541,-775.150134,79.18652,124.0031,242.65065,...,44.515266,-145.41675,93.990981,64.13135,106.06192,83.17876,118.277725,-1074.464888,-19.902788,1472602-base
4-query,-79.632126,14.442886,-58.903397,-147.05254,57.127068,-16.239529,-321.317964,45.984676,125.941284,103.39267,...,45.02891,-196.09207,-117.626337,66.92622,42.45617,77.621765,92.47993,-1074.464888,-21.149351,717819-base


In [8]:
# Split off the label column: `pop` returns the "Target" column and removes
# it from df_train in place, leaving only the feature columns.
targets = df_train.pop("Target")

In [9]:
# Scale the query vectors with the scaler that was fitted on the base set
scaled_train = pd.DataFrame(data=scaler.transform(df_train), index=df_train.index)

In [10]:
# For every query, get the distances to and the index ids of its
# k_neighbors nearest base vectors
dist, idx = idx_l2.search(
    np.ascontiguousarray(scaled_train.to_numpy()).astype(np.float32),
    k_neighbors,
)

In [11]:
# Translate the positions returned by the search into base item ids,
# stored as a 2-D object array (one row per query)
idx = np.array(
    [[base_index[position] for position in row] for row in idx.tolist()],
    dtype=object,
)

In [12]:
# Hit rate of the candidate retrieval: the percentage of queries whose true
# neighbor is among the k_neighbors ids returned by the search.
# With k_neighbors = 100 this is accuracy@100 (not accuracy@5).
acc = sum(
    target in candidates
    for target, candidates in zip(targets.values.tolist(), idx)
)

print(100 * acc / len(idx))

76.077


In [13]:
# Binary labels for CatBoost: 1 where a candidate id equals the query's true
# neighbor, 0 otherwise. The targets are broadcast as a column so each row of
# idx is compared with its own target in a single vectorised comparison
# instead of a nested Python loop.
cb_target = (idx == targets.to_numpy()[:, None]).astype(int)

In [14]:
# Report the shapes of the search results and of the labels
for label, shape in (('dist:', dist.shape), ('idx:', idx.shape), ('targets:', targets.shape)):
    print(label, shape)

dist: (100000, 100)
idx: (100000, 100)
targets: (100000,)


In [15]:
# Free memory: drop the id mapping and the raw (unscaled) frames
del base_index, df_train, df_base

In [16]:
# Features of every retrieved candidate and of each query's true neighbor.
# NOTE(review): the true neighbor's features are derived from the label, so
# they would not be available for unlabeled queries — this looks like target
# leakage; confirm whether the query features (scaled_train) were intended.
candidate_features = scaled_vectors.loc[idx.ravel()].to_numpy()
neighbor_features = scaled_vectors.loc[targets].to_numpy()

# Distances as a single column: one row per (query, candidate) pair
reshaped_dist = dist.reshape(dist.size, 1)

# Repeat each true-neighbor row once per candidate so the rows line up
# with the flattened candidates
repeated_neighbor = np.repeat(neighbor_features, repeats=k_neighbors, axis=0)

# Check the shapes before stacking
print('dist:', reshaped_dist.shape)
print('idx:', candidate_features.shape)
print('targets:', repeated_neighbor.shape)

# Column-wise concatenation: [distance | candidate features | neighbor features]
cb_features = np.concatenate(
    (reshaped_dist, candidate_features, repeated_neighbor),
    axis=1,
)
print('CB_features:', cb_features.shape)

dist: (10000000, 1)
idx: (10000000, 72)
targets: (10000000, 72)
CB_features: (10000000, 145)


In [17]:
# Free memory: drop the scaled frames, the search results and the
# intermediate arrays used to assemble cb_features
del (
    scaled_vectors,
    neighbor_features,
    reshaped_dist,
    repeated_neighbor,
    candidate_features,
    idx,
    dist,
    scaled_train,
    targets,
)