# In [1]:
import numpy as np
import os
from scipy.io import loadmat

# In [17]:
class PNC_dataset(object):
    """Load PNC fMRI recordings from a MATLAB file and derive connectivity features.

    The MATLAB file is read with ``scipy.io.loadmat`` and must contain a
    variable named ``PNC_dic``; each of its entries is accessed through the
    ``meta`` and ``fmri`` fields (see ``select_subset`` and ``load_PNC_data``).
    ``return_all`` is the one-call entry point: it loads the data, builds the
    per-subject correlation features and the subject-by-subject similarity
    matrices.
    """

    def __init__(self, PNC_dic = os.path.join(r'C:\Users\lxiao1\Desktop\GangQu\Dataset\PNC\Raw\fmri\PNC_all'),
                 meta_log=[True, True], fmri_log=[True, True, True], sig=1, method = 'Gaussian', off_diagnal=True):
        """
        Store the configuration; no file is read here.

        input:
                PNC_dic: path of the .mat file handed to loadmat
                meta_log: flags compared against each subject's stored meta_log
                fmri_log: flags for the emoid, nback, rest paradigms, respectively
                sig: scale passed to Gaussian_sim
                method: 'Gaussian' or 'Cosine', forwarded to similarity
                off_diagnal: forwarded to get_FCN (True keeps the lower triangle only)
        """
        self.PNC_dic = PNC_dic
        # The list defaults in the signature are created once and shared by every
        # call. Store copies so that mutating one instance's flags can neither
        # leak into later instances nor into the caller's own list.
        self.meta_log = list(meta_log) if isinstance(meta_log, list) else meta_log
        self.fmri_log = list(fmri_log) if isinstance(fmri_log, list) else fmri_log
        self.sig = sig
        self.method = method
        self.off_diagnal = off_diagnal

    def select_subset(self, meta_log=[True, True], fmri_log=[True, True, True]):
        """
        Load the .mat file at self.PNC_dic and keep the matching subjects.

        A subject is kept when its stored ``meta_log`` flags equal ``meta_log``
        element-wise AND its stored ``fmri_log`` flags equal ``fmri_log``
        element-wise. The file is re-read on every call.

        input:
                meta_log: flags to match against each subject's meta_log
                fmri_log: flags to match against each subject's fmri_log
                          (emoid, nback, rest paradigm, respectively)
        return:
                the entries of the loaded 'PNC_dic' variable that matched
        """
        records = loadmat(self.PNC_dic)['PNC_dic']

        selected = []
        for idx in range(len(records)):
            subject = records[idx]

            stored_meta = subject['meta'].item()['meta_log'].item().flatten()
            if not all(stored_meta == meta_log):
                continue

            stored_fmri = subject['fmri'].item()['fmri_log'].item().flatten()
            if all(stored_fmri == fmri_log):
                selected.append(idx)

        return records[selected]

    def load_PNC_data(self):
        """
        Return the fMRI signals and labels of the selected subjects.

        Subjects are selected with self.meta_log / self.fmri_log. Unless both
        are exactly the all-True lists, the subjects whose flags are all True
        are appended as well (they carry every paradigm and label too).

        NOTE(review): the original documentation described meta_log as flags
        for [PVRT, WRAT], but the code pairs meta_log[0] with 'WRAT' and
        meta_log[1] with 'PVRT' -- confirm which order the data uses.

        return:
                fmri:  dictionary keyed by paradigm name, each value an
                       N*N_ROI*N_time array (N_ROI and N_time are taken from
                       the first selected subject)
                label: dictionary keyed by label name ('age' plus the enabled
                       ones of 'WRAT', 'PVRT'), each value an N*1 array
                data:  the selected entries as returned by select_subset
        """
        meta_flags = self.meta_log
        fmri_flags = self.fmri_log

        subjects = self.select_subset(meta_log=meta_flags, fmri_log=fmri_flags)

        wants_everything = meta_flags == [True, True] and fmri_flags == [True, True, True]
        if not wants_everything:
            # Fully-populated subjects also satisfy a partial request.
            complete = self.select_subset()
            subjects = np.concatenate((subjects, complete), axis=0)

        n_subjects = len(subjects)

        paradigms = [name for idx, name in enumerate(['emoid', 'nback', 'rest'])
                     if fmri_flags[idx]]
        label_names = ['age'] + [name for idx, name in enumerate(['WRAT', 'PVRT'])
                                 if meta_flags[idx]]

        # Pre-allocate the output arrays; the signal shape is read from subject 0.
        fmri = {}
        for para in paradigms:
            n_roi, n_time = np.squeeze(subjects[0]['fmri'].item()[para].item()).shape
            fmri[para] = np.zeros((n_subjects, n_roi, n_time))

        label = {name: np.zeros((n_subjects, 1)) for name in label_names}

        # Copy every subject's signals and labels into its row.
        for row in range(n_subjects):
            subject = subjects[row]

            for para in paradigms:
                fmri[para][row, :, :] = np.squeeze(subject['fmri'].item()[para].item())

            for name in label_names:
                label[name][row, :] = subject['meta'].item()[name].item().item()

        return fmri, label, subjects

    def Cos_sim(self, x1, x2):
        """
        Calculate the angular (normalized cosine) similarity.

        Input:
            x1, x2 are vectors of the same shape
        Output:
            1 - arccos(cos(x1, x2)) / pi: 1 for parallel vectors, 0.5 for
            orthogonal ones, 0 for opposite ones. A zero vector still yields
            nan because its norm is 0.
        """
        cosine = np.sum(np.multiply(x1, x2)) / (np.linalg.norm(x1) * np.linalg.norm(x2))
        # Rounding can push the cosine of (anti-)parallel vectors marginally
        # outside [-1, 1], where arccos returns nan; clamp it back first.
        cosine = np.clip(cosine, -1.0, 1.0)

        return 1 - np.arccos(cosine) / np.pi

    def Gaussian_sim(self, x1, x2, sig):
        """
        Calculate the similarity exp(-||x1 - x2|| / sig).

        Input:
            x1, x2 are arrays of the same shape, sig is the scale
        Output:
            the similarity; 1 when x1 equals x2 and decaying with the distance.
            Note the exponent uses the plain (not squared) Euclidean norm.
        """
        scaled_distance = np.linalg.norm(x1 - x2) / sig

        return np.exp(-scaled_distance)

    def similarity(self, data, method, sig):
        """
        Calculate the pairwise similarity between the rows of data.

        input:
                data: N*d numpy array, N is number of subjects, d is dimension of features
                method: 'Gaussian' or 'Cosine'
                sig: scale for the Gaussian similarity (ignored for 'Cosine')
        output:
                sim: symmetric N*N similarity matrix with ones on the diagonal
        raises:
                RuntimeError if data is not a numpy array or method is unknown
        """
        from itertools import combinations

        if not isinstance(data, np.ndarray):
            raise RuntimeError('input data should be numpy array')

        n_samples, _ = data.shape

        if method not in ('Gaussian', 'Cosine'):
            raise RuntimeError('method not supported')

        # combinations() only yields pairs with i < j, so the diagonal has to be
        # initialised explicitly: every sample has similarity 1 with itself.
        sim = np.eye(n_samples)

        for i, j in combinations(range(n_samples), 2):
            if method == 'Gaussian':
                value = self.Gaussian_sim(data[i, :], data[j, :], sig)
            else:
                value = self.Cos_sim(data[i, :], data[j, :])

            sim[i, j] = value
            sim[j, i] = value

        return sim

    def get_FCN(self, signal, off_diagnal):
        """
        Calculate the functional connectivity (ROI correlation matrix) per subject.

        Input:
            signal: N*N_ROI*N_time 3D array
            off_diagnal: if True keep only the lower triangle of each
                         correlation matrix (the diagonal is included),
                         otherwise keep the full flattened matrix
        Output:
            if off_diagnal:
                FCN: N*d   d = (N_ROI^2 + N_ROI) / 2
            else:
                FCN: N*d   d = N_ROI^2
            nan correlations (e.g. from a constant ROI signal) are set to 0.
        """
        n_subjects, n_roi, _ = signal.shape

        if off_diagnal:
            n_features = n_roi * (n_roi + 1) // 2
        else:
            n_features = n_roi ** 2

        FCN = np.zeros((n_subjects, n_features))

        for idx in range(n_subjects):
            corr = np.corrcoef(signal[idx, :, :])
            corr[np.isnan(corr)] = 0

            if off_diagnal:
                FCN[idx, :] = corr[np.tril_indices(n_roi)]
            else:
                FCN[idx, :] = corr.reshape(-1)

        return FCN

    def cal_inclass_sim(self, fmri, method, sig):
        """
        Build connectivity features and subject similarity for every paradigm.

        input:
                fmri: dictionary keyed by paradigm, each value an N*N_ROI*N_time array
                method, sig: forwarded to similarity
        return:
                fmri_sim: dictionary keyed by paradigm with the N*N similarity matrix
                features: dictionary keyed by paradigm with the N*d output of get_FCN
        """
        fmri_sim = {}
        features = {}

        for para, signal in fmri.items():
            feature = self.get_FCN(signal, off_diagnal=self.off_diagnal)
            features[para] = feature
            fmri_sim[para] = self.similarity(feature, method=method, sig=sig)

        return fmri_sim, features

    def return_all(self):
        """
        Load the data and compute every derived quantity in one call.

        return:
                all_data: dictionary with 'features', 'fmri_sim' and 'label'
                raw_data: dictionary with 'raw_fmri', 'label' and 'dataall'
                          (the selected entries as loaded from the .mat file)
        """
        raw_fmri, label, data = self.load_PNC_data()
        fmri_sim, features = self.cal_inclass_sim(fmri=raw_fmri, method=self.method, sig=self.sig)

        all_data = {
            'features': features,
            'fmri_sim': fmri_sim,
            'label': label,
        }
        raw_data = {
            'raw_fmri': raw_fmri,
            'label': label,
            'dataall': data,
        }
        return all_data, raw_data