# Load files

In [1]:
# from numpy.linalg import eig
from scipy.linalg import eig
import numpy as np


def pca_cust(A, k_num=-1, return_order=False):
    """
    Eigen-decompose the covariance matrix of A.

    :param A: 2-D data matrix; it is passed to ``np.cov`` unchanged
    :param k_num: requested number of components, -1 meaning
        min(A.shape[0], A.shape[1]). The value is only validated; the
        returned matrices are not truncated to k_num.
    :param return_order: when True, additionally return the (weight, index)
        list produced by get_sorted_matrix_on_weights
    :return: (U, S, U transposed) as float arrays, plus the order list when
        return_order is True
    :raises ValueError: if k_num exceeds min(A.shape[0], A.shape[1])
    """
    max_k = min(A.shape[0], A.shape[1])
    if k_num == -1:
        k_num = max_k
    if k_num > max_k:
        # Format the numbers explicitly: concatenating int and str raises
        # TypeError and would hide the intended ValueError.
        raise ValueError(
            "k_num {} must not exceed min(A.shape[0],A.shape[1]) {}".format(k_num, max_k))
    cov = np.cov(A)
    k, U = eig(cov)

    # Only the ordering is needed here; the sorted vectors are discarded.
    _, k_order = get_sorted_matrix_on_weights(k, U, return_order=True)
    # NOTE(review): the reversal below presumes eig() returned the eigenvalues
    # in ascending order - confirm, as that ordering is not enforced here.
    k = k[::-1]
    U = U.transpose()[::-1]
    decomposition = (U.transpose().astype(float), np.diag(k.astype(float)), U.astype(float))
    if return_order:
        return decomposition + (k_order,)
    return decomposition


def svd_cust(A, k_num=-1, return_order=False):
    """
    Build an SVD-style factorisation of A from the eigen-decompositions of
    A.A^T and A^T.A.

    :param A: 2-D data matrix. When it has more rows than columns it is
        transposed for the computation and the factors are swapped back
        before returning.
    :param k_num: requested number of components, -1 meaning
        min(A.shape[0], A.shape[1]). The value is only validated; the
        returned matrices are not truncated to k_num.
    :param return_order: when True, additionally return the (weight, index)
        list produced by get_sorted_matrix_on_weights for A.A^T
    :return: (U, S, V^T) as float arrays, plus the order list when
        return_order is True
    :raises ValueError: if k_num exceeds min(A.shape[0], A.shape[1])
    """
    max_k = min(A.shape[0], A.shape[1])
    if k_num == -1:
        k_num = max_k
    if k_num > max_k:
        # Format the numbers explicitly: concatenating int and str raises
        # TypeError and would hide the intended ValueError.
        raise ValueError(
            "k_num {} must not exceed min(A.shape[0],A.shape[1]) {}".format(k_num, max_k))

    transpose = A.shape[0] > A.shape[1]
    if transpose:
        A = np.transpose(A)
    data_mat = np.dot(A, np.transpose(A))
    feature_mat = np.dot(np.transpose(A), A)

    k1, U = eig(data_mat)
    k2, V = eig(feature_mat)

    V = np.transpose(V).astype(float)
    U_sorted, k1_order = get_sorted_matrix_on_weights(k1, U, return_order=True)
    # U_sorted is a list of (weight, vector) pairs; split it into its columns.
    k1 = np.transpose(np.array(U_sorted, dtype=object))[0]
    U = np.stack(np.transpose(np.array(U_sorted, dtype=object))[1])

    V_sorted = get_sorted_matrix_on_weights(k2, V)
    # Rescale each sorted right vector by its own weight over the matching
    # left weight; zip stops once the left weights are exhausted.
    V = [vector * weight / left_weight
         for (weight, vector), left_weight in zip(V_sorted, k1)]

    left = np.array(U).astype(float)
    # sqrt of a negative entry yields NaN, which nan_to_num maps to 0.
    singular = np.diag(np.nan_to_num(np.sqrt(k1.astype(float))))
    right = np.array(V).astype(float)
    if transpose:
        # The work was done on A^T, so the two outer factors trade places.
        left, right = np.transpose(right), np.transpose(left)

    if return_order:
        return left, singular, right, k1_order
    return left, singular, right


def get_sorted_matrix_on_weights(weights, V, return_order=False):
    """
    Pair each weight with the matching entry of V and sort by weight,
    largest first.

    A negative weight is replaced by its negation and the paired entry of V
    is negated with it, so every returned weight is non-negative.

    Entries are kept in a list rather than a dict keyed by weight, so two
    entries that share the same weight are both preserved.

    :param weights: sequence of scalar weights
    :param V: sequence indexable by the same positions as weights; V[i] is
        paired with weights[i]
    :param return_order: when True, also return the (weight, original index)
        pairs in the same sorted order
    :return: list of (weight, V entry) tuples, and additionally the list of
        (weight, index) tuples when return_order is True
    """
    V = np.array(V)
    entries = []
    for index in range(len(weights)):
        weight = weights[index]
        if weight < 0:
            entries.append((-weight, index, -V[index]))
        else:
            entries.append((weight, index, V[index]))

    # Sort on (weight, index) only, so the array payload is never compared.
    entries.sort(key=lambda entry: (entry[0], entry[1]), reverse=True)

    V_sorted = [(weight, vector) for weight, _, vector in entries]
    weights_order = [(weight, index) for weight, index, _ in entries]
    if return_order:
        return V_sorted, weights_order
    return V_sorted

In [2]:
# Top K image search using original images
# Here I am using a library named faiss for the similarity search
# Euclidean and cosine :: TODO :: explain why these and not other distances
import random
import os
import pandas as pd
import numpy as np
from scipy.stats import wasserstein_distance
from scipy.spatial import distance


def euclidean_fn(xb, xq):
  """
  Euclidean (L2) distance between xq and every row of xb.
  :param xb: Data matrix whose rows are compared against the query
  :param xq: Query; it is subtracted from xb using numpy broadcasting
  :return: Array of distances, one per row of xb
  """
  offsets = xb - xq
  squared_norms = np.sum(offsets * offsets, axis=1)
  return np.sqrt(squared_norms)


def cosine_fn(xb, xq):
  """
  Cosine distance between xq and every row of xb.
  :param xb: Data matrix; iterated row by row
  :param xq: Query vector handed to scipy's distance.cosine for each row
  :return: Array of cosine distances, one per row of xb
  """
  distances = []
  for row in xb:
    distances.append(distance.cosine(row, xq))
  return np.array(distances)


def manhattan_fn(xb, xq):
  """
  Manhattan (L1) distance between xq and xb.

  The sum runs over the last axis, so xb may be a 2-D matrix (one distance
  per row) or a single 1-D vector (one scalar distance).
  :param xb: Data matrix, or a single vector
  :param xq: Query; it is subtracted from xb using numpy broadcasting
  :return: Array of distances for a 2-D xb, a scalar for a 1-D xb
  """
  # axis=-1 equals axis=1 for 2-D input and also accepts 1-D vectors,
  # which a hard-coded axis=1 rejects.
  return np.sum(np.absolute(xb - xq), axis=-1)


def kl_divergence_fn(xb, xq):
  """
  Row-wise KL divergence of xb relative to xq.
  :param xb: Data matrix; entries equal to 0 contribute nothing to the sum
  :param xq: Query the rows of xb are compared against
  :return: Array with one divergence value per row of xb
  """
  is_nonzero = xb != 0
  # Both branches are evaluated by np.where; the mask only selects results.
  terms = np.where(is_nonzero, xb * np.log(xb / xq), 0)
  return np.sum(terms, axis=1)


def euclidean(xb, k, xq):
  """
  Pick the k rows of xb with the smallest euclidean distance to xq.
  :param xb: Data matrix to search in
  :param k: Number of matches to return
  :param xq: Query to find matches for
  :return: (distances, indices) of the k selected rows; np.argpartition
           does not sort them among themselves
  """
  distances = euclidean_fn(xb, xq)
  nearest = np.argpartition(distances, k)[:k]
  return distances[nearest], nearest

def kl_divergence(xb, k, xq):
  """
  Pick the k rows of xb with the smallest KL divergence relative to xq.
  :param xb: Data matrix to search in
  :param k: Number of matches to return
  :param xq: Query to find matches for
  :return: (divergences, indices) of the k selected rows; np.argpartition
           does not sort them among themselves
  """
  divergences = kl_divergence_fn(xb, xq)
  nearest = np.argpartition(divergences, k)[:k]
  return divergences[nearest], nearest


def cosine(xb, k, xq):
  """
  Pick the k rows of xb with the smallest cosine distance to xq.
  :param xb: Data matrix to search in
  :param k: Number of matches to return
  :param xq: Query to find matches for
  :return: (distances, indices) of the k selected rows; np.argpartition
           does not sort them among themselves
  """
  cos = cosine_fn(xb, xq)
  # Partition the cosine distances themselves (an undefined name was used
  # here before, which raised NameError on every call).
  idx = np.argpartition(cos, k)[:k]
  return cos[idx], idx


def manhattan(xb, k, xq):
  """
  Pick the k rows of xb with the smallest manhattan distance to xq.
  :param xb: Data matrix to search in
  :param k: Number of matches to return
  :param xq: Query to find matches for
  :return: (distances, indices) of the k selected rows; np.argpartition
           does not sort them among themselves
  """
  distances = manhattan_fn(xb, xq)
  nearest = np.argpartition(distances, k)[:k]
  return distances[nearest], nearest


def earth_movers(xb, k, xq):
  """
  Pick the k rows of xb with the smallest earth mover's (Wasserstein)
  distance to xq.
  :param xb: Data matrix to search in
  :param k: Number of matches to return
  :param xq: Query to find matches for
  :return: (distances, indices) of the k selected rows; np.argpartition
           does not sort them among themselves
  """
  # NOTE(review): element [1] of np.histogram's result is the bin-edge array,
  # so the distance compares bin edges rather than bin counts - confirm this
  # is intended.
  distances = []
  for row in xb:
    row_edges = np.histogram(row)[1]
    query_edges = np.histogram(xq)[1]
    distances.append(wasserstein_distance(row_edges, query_edges))
  em = np.array(distances)
  nearest = np.argpartition(em, k)[:k]
  return em[nearest], nearest


def top_k_match(xb, k, xq, method="euclidean"):
  """
  General function to call the top-k distance functions by name
  :param xb: Data matrix to find similarity in
  :param k: Number of top objects to return
  :param xq: Query matrix to find similarity for
  :param method: One of "euclidean", "cosine", "manhattan", "earth_movers"
                 or "kl_divergence"
  :return: Whatever the selected distance function returns
  :raises ValueError: if method is not one of the supported names
  """
  if method == "euclidean":
    return euclidean(xb, k, xq)
  elif method == "cosine":
    return cosine(xb, k, xq)
  elif method == "manhattan":
    return manhattan(xb, k, xq)
  elif method == "earth_movers":
    return earth_movers(xb, k, xq)
  elif method == "kl_divergence":
    return kl_divergence(xb, k, xq)
  # Fail loudly instead of falling through and handing None to the caller.
  raise ValueError("Unsupported distance method: {!r}".format(method))


def get_image_file(features_dir, image_ids):
  """
  Read the image id mapping file and return its "image_idx" column.
  :param features_dir: directory that contains image_ids.csv
  :param image_ids: currently not used; the whole column is returned
  :return: list with every value of the "image_idx" column
  """
  mapping_path = os.path.join(features_dir, "image_ids.csv")
  mapping = pd.read_csv(mapping_path)
  image_column = mapping["image_idx"]
  return image_column.to_list()

In [3]:
import pandas as pd
import os


class Pca:
    """
    PCA-style dimensionality reduction built on an eigen-decomposition of
    np.cov applied to the data matrix.

    Construct it either as ``Pca(k, X)`` to compute the decomposition, or as
    ``Pca(folder)`` to reload one written by ``save``.

    Attributes:
        x_: the data matrix (stored as float32 when computed from X)
        u_, s_, u_transpose_: the three factors returned by get_decomposition;
            s_ is a diagonal matrix of the k retained eigenvalues
        sub_wt_pairs: (weight, index) ordering from get_sorted_matrix_on_weights

    Methods:
        transform(data_matrix)
            Projects a query onto the retained components
    """

    def __init__(self, *args):
        """
        Parameters (two-argument form):
            k: int
                Number of reduced features
            X: ndarray of shape (num_objects, num_features)
                Data matrix to be reduced

        Parameters (one-argument form):
            folder: str
                Folder holding U.csv, S.csv, X.csv and sub_wt_pairs.csv

        Raises:
            Exception: when called with any other number of arguments
        """
        if len(args) == 2:
            k, X = args
            self.x_ = np.array(X, dtype=np.float32)
            self.features_ = self.x_.shape[1]

            self.x_covariance_ = np.cov(self.x_)
            self.eigen_values_, self.eigen_vectors_ = eig(self.x_covariance_)

            # Only the ordering is kept; the sorted vectors are not used.
            _, self.sub_wt_pairs = get_sorted_matrix_on_weights(
                self.eigen_values_, self.eigen_vectors_, return_order=True)
            # NOTE(review): the reversal presumes eig() returned the
            # eigenvalues in ascending order - confirm.
            self.eigen_vectors_ = self.eigen_vectors_.transpose()[::-1]
            self.eigen_values_ = self.eigen_values_[::-1]

            self.u_ = self.eigen_vectors_[:k].transpose()
            # The builtin float replaces the np.float alias, which newer
            # NumPy releases no longer provide.
            self.s_ = np.diag(self.eigen_values_[:k].astype(float))
            self.u_transpose_ = self.eigen_vectors_[:k]
        elif len(args) == 1:
            self.u_ = pd.read_csv(os.path.join(args[0], "U.csv")).to_numpy()
            self.u_transpose_ = self.u_.transpose()
            self.s_ = pd.read_csv(os.path.join(args[0], "S.csv")).to_numpy()
            self.x_ = pd.read_csv(os.path.join(args[0], "X.csv")).to_numpy()
            self.sub_wt_pairs = pd.read_csv(os.path.join(args[0], "sub_wt_pairs.csv")).to_numpy()
        else:
            # The message lists k first because that is the order this class unpacks.
            raise Exception("Invalid object instantiation: parameters must be either <k:int>,<data_matrix:2D numpy> "
                            "or <folder:string>.")

    def get_decomposition(self):
        """
        Returns:
            The stored factors (u_, s_, u_transpose_)
        """
        return self.u_, self.s_, self.u_transpose_

    def get_latent_features(self):
        """
        :return: the stored factors (u_, s_)
        """
        return self.u_, self.s_

    def transform(self, data_matrix):
        """
        Project a query onto the retained components.

        The query rows are appended below the stored data, np.cov is
        recomputed over the stacked rows, and the last row of that covariance
        (without its final entry, the query's own variance) is multiplied by
        u_. Only the last appended row is therefore projected.

        :param data_matrix: matrix to transform (query_objects, num_features)
        :return: pca transformed query
        """
        Q = np.concatenate((self.x_, data_matrix), axis=0)
        return np.dot(np.cov(Q)[-1][:-1], self.u_)

    def get_obj_weight_pairs(self):
        """
        :return: the stored (weight, index) pairs
        """
        return self.sub_wt_pairs

    def save(self, folder):
        """
        Save PCA to given folder as U.csv, X.csv, S.csv and sub_wt_pairs.csv
        """
        pd.DataFrame(self.u_).to_csv(os.path.join(folder, "U.csv"), index=False)
        pd.DataFrame(self.x_).to_csv(os.path.join(folder, "X.csv"), index=False)
        pd.DataFrame(self.s_).to_csv(os.path.join(folder, "S.csv"), index=False)
        pd.DataFrame(self.sub_wt_pairs).to_csv(os.path.join(folder, "sub_wt_pairs.csv"), index=False)

    def get_top_k_matches(self, k, xq):
        """
        :param k: number of matches to return
        :param xq: single query vector
        :return: result of euclidean() between the stored objects in latent
                 space and the transformed query
        """
        # s_ is already a diagonal matrix, so it is multiplied directly;
        # wrapping it in np.diag again would reduce it to a vector and
        # collapse the projection to one dimension.
        return euclidean(np.dot(self.u_, self.s_), k, self.transform([xq]))


In [4]:
from sklearn.decomposition import LatentDirichletAllocation as LDA


class Lda:
    """
    Wrapper around sklearn's LatentDirichletAllocation used for feature
    reduction.

    Construct it either as ``Lda(data_matrix, k)`` to fit a model, or as
    ``Lda(folder)`` to reload one written by ``save``.

    Attributes:
        lda_: the underlying LatentDirichletAllocation estimator
        new_object_map: the fitted data transformed by lda_
        sub_wt_pairs: (weight, index) ordering from get_sorted_matrix_on_weights

    Methods:
        transform(data_matrix)
            Returns lda_.transform(data_matrix)
    """

    def __init__(self, *args):
        """
        :param data_matrix: input matrix of shape (num_objects, num_features)
                Data matrix to be reduced
        :param k: Number of reduced features

        OR
        :param folder: folder containing LDA latent features

        :raises Exception: when called with any other number of arguments
        """
        if len(args) == 2:
            # normal object instantiation with data_matrix and k
            self.data_matrix = args[0]
            self.k = args[1]
            self.lda_ = LDA(n_components=self.k).fit(self.data_matrix)
            self.new_object_map = self.transform(self.data_matrix)
            # Take average as sum of all probabilities will always be 1
            # NOTE(review): this passes the 2-D data matrix as `weights`;
            # get_sorted_matrix_on_weights evaluates `weights[i] < 0` in an
            # `if`, which is ambiguous for a multi-element row - TODO confirm
            # the intended arguments.
            _, self.sub_wt_pairs = get_sorted_matrix_on_weights(
                self.data_matrix, np.average(self.lda_.components_, axis=0), return_order=True)
        elif len(args) == 1:
            # load object from folder
            self.lda_ = LDA()
            self.lda_.components_ = pd.read_csv(os.path.join(args[0], "components.csv")).to_numpy()
            self.lda_.exp_dirichlet_component_ = pd.read_csv(
                os.path.join(args[0], "exp_dirichlet_components.csv")).to_numpy()
            self.new_object_map = pd.read_csv(os.path.join(args[0], "new_object_map.csv")).to_numpy()
            self.lda_.set_params(n_components=self.lda_.components_.shape[0])
            self.lda_.doc_topic_prior_ = 1 / self.lda_.components_.shape[0]
            self.sub_wt_pairs = pd.read_csv(os.path.join(args[0], "sub_wt_pairs.csv")).to_numpy()
        else:
            raise Exception("Invalid object instantiation: parameters must be either <data_matrix:2D numpy>,<k:int> "
                            "or <folder:string>.")

    def transform(self, data_matrix):
        """
        :param data_matrix: matrix to transform (query_objects, num_features)
        :return: lda transformed matrix
        """
        return self.lda_.transform(data_matrix)

    def get_latent_features(self):
        """
        :return: the components_ attribute of the underlying estimator
        """
        return self.lda_.components_

    def get_obj_weight_pairs(self):
        """
        :return: the stored (weight, index) pairs
        """
        return self.sub_wt_pairs

    def save(self, folder):
        """
        Save LDA to given folder as components.csv,
        exp_dirichlet_components.csv, new_object_map.csv and sub_wt_pairs.csv
        """
        pd.DataFrame(self.lda_.components_).to_csv(os.path.join(folder, "components.csv"), index=False)
        pd.DataFrame(self.lda_.exp_dirichlet_component_).to_csv(
            os.path.join(folder, "exp_dirichlet_components.csv"), index=False)
        pd.DataFrame(self.new_object_map).to_csv(os.path.join(folder, "new_object_map.csv"), index=False)
        # The folder constructor reads sub_wt_pairs.csv, so it must be written too.
        pd.DataFrame(self.sub_wt_pairs).to_csv(os.path.join(folder, "sub_wt_pairs.csv"), index=False)

    def get_top_k_matches(self, k, xq):
        """
        :param k: number of matches to return
        :param xq: single query vector
        :return: result of kl_divergence() between the stored topic
                 distributions and the transformed query
        """
        # KL divergence as it is a probability distribution
        return kl_divergence(self.new_object_map, k, self.transform([xq]))


In [5]:
import numpy as np
import pandas as pd
import os


class Svd:
    """
    SVD-based dimensionality reduction backed by svd_cust.

    Construct it either as ``Svd(data_matrix, k)`` to compute the
    factorisation, or as ``Svd(folder)`` to reload one written by ``save``.

    Attributes:
        U, S, VT: the three factors returned by svd_cust
        sub_wt_pairs: the ordering list returned by svd_cust

    Methods:
        transform(data_matrix)
            Multiplies data_matrix by the transpose of VT
    """

    def __init__(self, *args):
        """
        :param data_matrix: input matrix of shape (num_objects, num_features)
                Data matrix to be reduced
        :param k: Number of reduced features

        OR
        :param folder: folder containing the saved SVD csv files

        :raises Exception: when called with any other number of arguments
        """
        arg_count = len(args)
        if arg_count == 2:
            # normal object instantiation with data_matrix and k
            data_matrix, k = args
            decomposition = svd_cust(data_matrix, k_num=k, return_order=True)
            self.U, self.S, self.VT, self.sub_wt_pairs = decomposition
        elif arg_count == 1:
            folder = args[0]

            def load(file_name):
                # Read one saved factor back as a plain numpy array.
                return pd.read_csv(os.path.join(folder, file_name)).to_numpy()

            self.U = load("U.csv")
            self.S = load("S.csv")
            self.VT = load("VT.csv")
            self.sub_wt_pairs = load("sub_wt_pairs.csv")
        else:
            raise Exception("Invalid object instantiation: parameters must be either <data_matrix:2D numpy>,<k:int> "
                            "or <folder:string>.")

    def get_decomposition(self):
        """
        Returns:
            The stored factors (U, S, VT)
        """
        factors = (self.U, self.S, self.VT)
        return factors

    def get_latent_features(self):
        """
        :return: U and S
        """
        latent = (self.U, self.S)
        return latent

    def transform(self, data_matrix):
        """
        :param data_matrix: matrix to transform (query_objects, num_features)
        :return: data_matrix multiplied by the transpose of VT
        """
        basis = np.transpose(self.VT)
        return np.dot(data_matrix, basis)

    def get_obj_weight_pairs(self):
        """
        :return: the stored ordering pairs
        """
        return self.sub_wt_pairs

    def save(self, folder):
        """
        Save SVD to given folder as U.csv, S.csv, VT.csv and sub_wt_pairs.csv
        """
        outputs = (
            (self.U, "U.csv"),
            (self.S, "S.csv"),
            (self.VT, "VT.csv"),
            (self.sub_wt_pairs, "sub_wt_pairs.csv"),
        )
        for values, file_name in outputs:
            pd.DataFrame(values).to_csv(os.path.join(folder, file_name), index=False)

    def get_top_k_matches(self, k, xq):
        """
        :param k: number of matches to return
        :param xq: single query vector
        :return: result of euclidean() between U.S and the transformed query
        """
        stored = np.dot(self.U, self.S)
        query = self.transform([xq])
        return euclidean(stored, k, query)



In [6]:
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import os


class Kmeans:
    """
    K-means based reduction: every object is described by its manhattan
    distance to each of the k cluster centers.

    Construct it either as ``Kmeans(items, k)`` to fit the clusters, or as
    ``Kmeans(folder)`` to reload a model written by ``save``.

    Attributes:
        centers: cluster centers, one row per cluster
        new_object_map: (num_objects, k) distances of each object to each center
        weight: per-object sum of its distances to all centers
        sub_wt_pairs: (weight, index) ordering from get_sorted_matrix_on_weights

    Methods:
        transform(data_matrix)
            Returns the distances of each row of data_matrix to each center
    """

    def __init__(self, *args):
        """
        :param items: sequence whose entries hold a 1-D feature vector at
                index 1 (index 0 of each entry is not read)
        :param k: Number of clusters / reduced features

        OR
        :param folder: folder containing the saved Kmeans csv files

        :raises Exception: when called with any other number of arguments
        """
        if len(args) == 2:
            # normal object instantiation with the item list and k
            imgs_slc, k = args
            num_imgs = len(imgs_slc)
            arr_shp = imgs_slc[0][1].shape[0]
            imgs_flat = np.zeros((num_imgs, arr_shp))
            for i, item in enumerate(imgs_slc):
                imgs_flat[i] = item[1]
            kmeans = KMeans(n_clusters=k, random_state=0).fit(imgs_flat)
            self.centers = kmeans.cluster_centers_
            self.new_object_map = self._l1_to_centers(imgs_flat, self.centers)
            self.weight = np.sum(self.new_object_map, axis=1)
            # NOTE(review): the previous call passed the 2-D object map as the
            # weights and a scalar average as the vectors, which cannot be
            # indexed or compared. Ranking the objects by their summed
            # distance is the reading that matches self.weight - confirm this
            # is the intended semantics.
            _, self.sub_wt_pairs = get_sorted_matrix_on_weights(
                self.weight, self.new_object_map, return_order=True)
        elif len(args) == 1:
            self.centers = pd.read_csv(os.path.join(args[0], "centers.csv")).to_numpy()
            self.new_object_map = pd.read_csv(os.path.join(args[0], "new_object_map.csv")).to_numpy()
            self.weight = pd.read_csv(os.path.join(args[0], "weight.csv")).to_numpy()
            self.sub_wt_pairs = pd.read_csv(os.path.join(args[0], "sub_wt_pairs.csv")).to_numpy()
        else:
            raise Exception("Invalid object instantiation: parameters must be either <data_matrix:2D numpy>,<k:int> "
                            "or <folder:string>.")

    @staticmethod
    def _l1_to_centers(points, centers):
        """Return the (num_points, num_centers) matrix of manhattan distances."""
        points = np.asarray(points, dtype=float)
        centers = np.asarray(centers, dtype=float)
        distances = np.zeros((len(points), len(centers)))
        for j, center in enumerate(centers):
            distances[:, j] = np.sum(np.absolute(points - center), axis=1)
        return distances

    def get_decomposition(self):
        """
        Returns:
            The cluster centers
        """
        return self.centers

    def get_latent_features(self):
        """
        :return: centers
        """
        return self.centers

    def transform(self, data_matrix):
        """
        :param data_matrix: matrix to transform (query_objects, num_features)
        :return: (query_objects, num_centers) matrix holding the manhattan
                 distance of every query row to every center
        """
        # Row i / column j is the distance of query i to center j, the same
        # layout as new_object_map.
        return self._l1_to_centers(data_matrix, self.centers)

    def get_obj_weight_pairs(self):
        """
        :return: the stored (weight, index) pairs
        """
        return self.sub_wt_pairs

    def save(self, folder):
        """
        Save Kmeans to given folder as centers.csv, new_object_map.csv,
        weight.csv and sub_wt_pairs.csv
        """
        pd.DataFrame(self.centers).to_csv(os.path.join(folder, "centers.csv"), index=False)
        pd.DataFrame(self.new_object_map).to_csv(os.path.join(folder, "new_object_map.csv"), index=False)
        pd.DataFrame(self.weight).to_csv(os.path.join(folder, "weight.csv"), index=False)
        pd.DataFrame(self.sub_wt_pairs).to_csv(os.path.join(folder, "sub_wt_pairs.csv"), index=False)

    def get_top_k_matches(self, k, xq):
        """
        :param k: number of matches to return
        :param xq: single query vector
        :return: result of manhattan() between the stored object map and the
                 transformed query
        """
        return manhattan(self.new_object_map, k, self.transform([xq]))



# Instantiate

In [7]:
# Small 4 x 10 sample matrix used by the example cells.
A = np.array(
    [
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
        [21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
        [42, 44, 46, 48, 50, 52, 54, 56, 58, 60],
    ]
)

# LDA class calls

In [None]:
# Fit an LDA wrapper on A with 3 components, save it to the current
# directory and query it. The class is named Lda, and the query uses the
# object created in this cell.
lda_f = Lda(A, 3)
lda_f.save("")
lda_f.get_top_k_matches(2, A[0])

In [None]:
# Display the pairs stored on the fitted LDA wrapper.
lda_f.get_obj_weight_pairs()

In [None]:
# Reload the LDA wrapper from the current directory and run the same query.
# The class is named Lda.
lda_q = Lda("")
lda_q.get_top_k_matches(2, A[0])

# PCA class calls

In [None]:
# Print np.cov(A) for reference, build a Pca with k=3 on A, then multiply the
# three factors returned by get_decomposition back together.
print(np.cov(A))
pca = Pca(3,A)
np.dot(pca.get_decomposition()[0], np.dot(pca.get_decomposition()[1],pca.get_decomposition()[2]))

In [None]:
# Product of the two latent-feature factors, shown next to the stored pairs.
np.dot(*pca.get_latent_features()), pca.get_obj_weight_pairs()

In [None]:
# Ask for the 2 closest stored objects to the first row of A.
pca.get_top_k_matches(2, A[0])

# SVD calls

In [None]:
# Build an Svd on A with k=3, then multiply the three factors returned by
# get_decomposition back together.
svd = Svd(A,3)
np.dot(svd.get_decomposition()[0], np.dot(svd.get_decomposition()[1], svd.get_decomposition()[2]))

In [None]:
# Product of the two latent-feature factors U and S.
np.dot(*svd.get_latent_features())

In [None]:
# Ask for the 2 closest stored objects to the first row of A.
svd.get_top_k_matches(2, A[0])

# Kmeans calls

# Wrapper function

In [8]:
def perform_dimensionality_reductions(matrix, k, technique):
  """
  Build the requested reduction, save it to the current directory and return
  its stored object/weight pairs.
  :param matrix: Data matrix to be reduced
  :param k: Number of reduced features
  :param technique: "pca" or "svd"; any other value selects LDA
  :return: the value returned by the reduction's get_obj_weight_pairs()
  """
  if technique == "pca":
    # Pca unpacks its arguments as (k, data_matrix).
    obj = Pca(k, matrix)
  elif technique == "svd":
    # Svd and Lda unpack their arguments as (data_matrix, k).
    obj = Svd(matrix, k)
  else:
    obj = Lda(matrix, k)
  obj.save("")
  # Call the accessor; returning it uncalled hands back a bound method.
  return obj.get_obj_weight_pairs()

SyntaxError: ignored