In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics, model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
import os
from metric_learn import MMC
from metric_learn import ITML
from metric_learn import SDML

# Data import

In [35]:
if not os.path.exists(r'Dataset\reduced_x_train.npy'):
    PCA_preprocessing()
pca_reduced_x_train = np.load(r'Dataset\reduced_x_train.npy')
pca_reduced_x_test = np.load(r'Dataset\reduced_x_test.npy')
pca_reduced_y_train = np.load(r'Dataset\reduced_y_train.npy')
pca_reduced_y_test = np.load(r'Dataset\reduced_y_test.npy')

# Preprocessor
Get the target input representations (pair,triplet or quadru-plets) from the original input representation(array-like).

**Note:**
The way of specifying pairs is not recommended for a large number of tuples, as it is redundant and hence takes a lot of memory. Indeed each feature vector of a point will be replicated as many times as a point is involved in a tuple. 

Instead of forming each point in each tuple, a more efficient representation would be to keep the dataset of points `X-features` aside, and just represent tuples as a collection of **tuples of indices** from the points in `X-features`.

# Pairs construction
Get 15000 similar pairs' indices and 15000 dissimilar pairs' indices from pca_reduced_x_train, total number of training pairs: 30000

One pair sample: (a,b), where a,b are both sampled from pca_reduced_x_train

In [95]:
np.random.seed=1     # set the numpy.random.seed to ensure that each permutation follow the same order, which means that permutated_pca_reduced_x_train,permutated_pca_reduced_y_train are still matched

### permutated data

In [102]:
permutated_pca_reduced_x_train=np.random.permutation(pca_reduced_x_train)
permutated_pca_reduced_y_train=np.random.permutation(pca_reduced_y_train)

In [104]:
p_sorted_pca_reduced_y_train=np.sort(np.squeeze(permutated_pca_reduced_y_train))
p_sort_indices=np.argsort(np.squeeze(permutated_pca_reduced_y_train))
p_sorted_pca_reduced_x_train=permutated_pca_reduced_x_train[p_sort_indices,:]

In [105]:
print(p_sorted_pca_reduced_x_train.shape,p_sorted_pca_reduced_y_train.shape)

(22393, 50) (22393,)


In [106]:
p_sort_indices[:10]

array([ 8343,  6550,  8187, 15921,  6005, 14205, 20194,  1010, 15898,
        9743], dtype=int64)

In [107]:
p_sorted_pca_reduced_y_train[:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

### non-permutated data

In [57]:
sorted_pca_reduced_y_train=np.sort(np.squeeze(pca_reduced_y_train))
sort_indices=np.argsort(np.squeeze(pca_reduced_y_train))
sorted_pca_reduced_x_train=pca_reduced_x_train[sort_indices,:]

In [58]:
print(sorted_pca_reduced_x_train.shape,sorted_pca_reduced_y_train.shape)

(22393, 50) (22393,)


In [83]:
sort_indices[:10]

array([    0,  1699, 18938,  4676,  9152,  4685, 12619,  4696, 12602,
       18960], dtype=int64)

In [84]:
sorted_pca_reduced_y_train[:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)

## Dissimilar pairs construction 

In [60]:
a_train,_,a_y_train,_=model_selection.train_test_split(sorted_pca_reduced_x_train, sorted_pca_reduced_y_train, test_size=0.32,random_state=1, stratify=sorted_pca_reduced_y_train)

In [61]:
print(a_train.shape,a_y_train.shape)

(15227, 50) (15227,)


In [62]:
a_y_train[:10]

array([29, 24, 19, 13,  1, 40, 35, 20,  3, 21], dtype=int64)

In [63]:
b_train,_,b_y_train,_=model_selection.train_test_split(sorted_pca_reduced_x_train, sorted_pca_reduced_y_train, test_size=0.32,random_state=2, stratify=sorted_pca_reduced_y_train)

In [64]:
print(a_train.shape,a_y_train.shape)

(15227, 50) (15227,)


In [108]:
b_y_train[:10]     # we can see that (a_train,b_train) are dissimilar pairs 

array([ 1, 48, 37, 26, 23, 27, 37, 20, 33, 39], dtype=int64)

In [112]:
train_pairs_dissimilar=[]
for row in range(a_train.shape[0]):
    tmp_pair=[a_train[row].tolist(),b_train[row].tolist()]
    train_pairs_dissimilar.append(tmp_pair)
np.shape(train_pairs_dissimilar)    # (n_tuples, tuple_size, n_features)

(15227, 2, 50)

In [114]:
y_pairs_dissimilar=-1*np.ones(np.shape(train_pairs_dissimilar)[0])

In [117]:
print(np.shape(train_pairs_dissimilar),y_pairs_dissimilar.shape)

(15227, 2, 50) (15227,)


## Similar pairs construction 

In [118]:
b_train_si,_,b_y_train_si,_=model_selection.train_test_split(p_sorted_pca_reduced_x_train, p_sorted_pca_reduced_y_train, test_size=0.32,random_state=1, stratify=p_sorted_pca_reduced_y_train)

In [119]:
a_y_train[:10]

array([29, 24, 19, 13,  1, 40, 35, 20,  3, 21], dtype=int64)

In [120]:
b_y_train_si[:10]

array([29, 24, 19, 13,  1, 40, 35, 20,  3, 21], dtype=int64)

In [125]:
train_pairs_similar=[]
for row in range(a_train.shape[0]):
    tmp_pair=[a_train[row].tolist(),b_train_si[row].tolist()]
    train_pairs_similar.append(tmp_pair)
np.shape(train_pairs_similar)    # (n_tuples, tuple_size, n_features)

(15227, 2, 50)

In [126]:
y_pairs_similar=np.ones(np.shape(train_pairs_similar)[0])

In [128]:
print(np.shape(train_pairs_similar),y_pairs_similar.shape)

(15227, 2, 50) (15227,)


# Train pairs construction

In [131]:
train_pairs=np.vstack((train_pairs_dissimilar,train_pairs_similar))
train_pairs.shape

(30454, 2, 50)

In [134]:
y_pairs=np.vstack((np.reshape(y_pairs_dissimilar,(-1,1)),np.reshape(y_pairs_similar,(-1,1))))
y_pairs.shape

(30454, 1)

In [136]:
# save the data as .npy
np.save(r'Dataset\train_pairs', train_pairs)
np.save(r'Dataset\y_pairs', y_pairs)
np.save(r'Dataset\train_pairs_dissimilar', train_pairs_dissimilar)
np.save(r'Dataset\train_pairs_similar', train_pairs_similar)
np.save(r'Dataset\y_pairs_dissimilar', y_pairs_dissimilar)
np.save(r'Dataset\y_pairs_similar', y_pairs_similar)

# Learning on pairs
In this case, one should provide the algorithm with `n` pairs of points, with a corresponding target containing `n` values being either +1 or -1. These values indicate whether the given pairs are **similar points or dissimilar points.**

Pari metric learning algorithms:
* MMC
* ITML
* SDML

# MMC

In [137]:
mmc = MMC(random_state=1234, max_iter=100, convergence_threshold=1e-3,verbose=True)

In [138]:
mmc.fit(pairs=train_pairs,y=np.squeeze(y_pairs))
mmc_metric=mmc.get_metric()

In [139]:
knn_mmc = KNeighborsClassifier(n_neighbors=5,metric=mmc_metric)
knn_mmc.fit(pca_reduced_x_train, pca_reduced_y_train)
Y_pred_mmc = knn_mmc.predict(pca_reduced_x_test)
print("Accuracy:",metrics.accuracy_score(pca_reduced_y_test, Y_pred_mmc))

  return self._fit(X, y)


Accuracy: 0.7302565476589189


# ITML

In [140]:
itml=ITML()

In [141]:
itml.fit(pairs=train_pairs,y=np.squeeze(y_pairs))
itml_metric=itml.get_metric()

  X = np.vstack({tuple(row) for row in pairs.reshape(-1, pairs.shape[2])})
  alpha = min(_lambda[i], gamma_proj * (1. / wtw - 1. / pos_bhat[i]))


In [142]:
knn_itml = KNeighborsClassifier(n_neighbors=5,metric=itml_metric)
knn_itml.fit(pca_reduced_x_train, pca_reduced_y_train)
Y_pred_itml = knn_itml.predict(pca_reduced_x_test)
print("Accuracy:",metrics.accuracy_score(pca_reduced_y_test, Y_pred_itml))

  return self._fit(X, y)


Accuracy: 0.8694487239600777


# SDML

In [143]:
sdml=SDML()

In [None]:
sdml.fit(pairs=train_pairs,y=np.squeeze(y_pairs))
sdml_metric=sdml.get_metric()

In [None]:
knn_sdml = KNeighborsClassifier(n_neighbors=5,metric=sdml_metric)
knn_sdml.fit(pca_reduced_x_train, pca_reduced_y_train)
Y_pred_sdml = knn_sdml.predict(pca_reduced_x_test)
print("Accuracy:",metrics.accuracy_score(pca_reduced_y_test, Y_pred_sdml))