In [1]:
import numpy as np
from sklearn.decomposition import PCA

import util
from NNS import NNS

## 1. Data Loading

In [2]:
retrieval_repository_data = util.load_data('./image_retrieval_repository_data.pkl')

## 2. Data Exploration

In [3]:
print("Image Retrieval Repository Data Shape:", retrieval_repository_data.shape)

Image Retrieval Repository Data Shape: (5000, 257)


## 3. Data Preprocessing 

In [4]:
# remove index column
repository_data_index = retrieval_repository_data[:, 0]
retrieval_repository_data = retrieval_repository_data[:, 1:]

In [5]:
repository_data_index.shape

(5000,)

In [6]:
retrieval_repository_data.shape

(5000, 256)

## 4. Model

In [7]:
nns_model = NNS(k=5)

## 5. Train

In [8]:
nns_model.fit(X_train=retrieval_repository_data)

## 6. Predict

In [9]:
k_nearest = nns_model.predict(retrieval_repository_data[:1000])

100%|██████████| 1000/1000 [00:19<00:00, 51.67it/s]


In [13]:
k_nearest.shape

(1000, 5)

In [25]:
util.save_data('knn.pkl', nns_model)

Saved successfully


In [30]:
from retrieval import Retrieval
ret = Retrieval()
y_pred = ret.inference(retrieval_repository_data[:1000])


 26%|██▌       | 260/1000 [00:05<00:14, 49.76it/s]


KeyboardInterrupt: 

In [38]:
cnt = 0
y_test = repository_data_index[:1000]
for i in range(1000):
    if y_pred[i].any() == y_test[i]:
        cnt += 1
print(cnt)

0


In [52]:
from tqdm import tqdm
from sklearn.neighbors import KNeighborsClassifier
from retrieval import Retrieval
from util import load_data
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

X = load_data('./image_retrieval_repository_data.pkl')
y = X[:4000, 1]
X = X[:, 1:]
print(X.shape)
knn = NearestNeighbors(n_neighbors=5, metric='euclidean', algorithm='ball_tree')
for i in tqdm(range(X.shape[0]), desc="Fitting K-NN"):
    knn.fit(X[:i+1])

(5000, 256)


Fitting K-NN: 100%|██████████| 5000/5000 [02:26<00:00, 34.13it/s] 


In [55]:
util.save_data('knn.pkl', knn)
X = load_data('./image_retrieval_repository_data.pkl')
y = X[:, 1]
X = X[:, 1:]
X_test = X[4000:, :]
distances, indices = knn.kneighbors(X_test)
print(sum(sum(distances)))

Saved successfully
1121535.9648807435


In [99]:
from sklearn.decomposition import PCA
pca = PCA(n_components=50)
pca.fit(X)
X_pca = pca.transform(X)
util.save_data('pca.pkl', pca)
print(X_pca.shape)
knn_pca = NearestNeighbors(n_neighbors=5, metric='euclidean', algorithm='ball_tree')
for i in tqdm(range(X_pca.shape[0]), desc="Fitting K-NN"):
    knn_pca.fit(X_pca[:i+1])
util.save_data('knn.pkl', knn_pca)

Saved successfully
(5000, 50)


Fitting K-NN: 100%|██████████| 5000/5000 [00:20<00:00, 243.24it/s]

Saved successfully





In [96]:
distances, indices = knn.kneighbors(pca.transform(X_test))
print(sum(sum(distances)))


953890.9949200222
Saved successfully


In [89]:
from imblearn.over_sampling import SMOTE

X = load_data('./image_retrieval_repository_data.pkl')
y = X[:, 0]
X = X[:, 1:]
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
print(X_resampled.shape)
print(y_resampled.shape)

(5000, 256)
(5000,)


原始数据集大小: 5000
过采样后的数据集大小: 5082
