# Brain Regions Represented by Features
In this notebook, we find the brain region that overlaps the most with each feature.
<br>That means the brain region is represented by that feature.
<br>Then, we check how many brain regions are repeated across features.

In [None]:
import numpy as np
import pandas as pd

In [None]:
'''Clustering each feature to 2 clusters to find the active regions'''

def cluster_festures_separately(features_arr):
    '''
    Runs an independent two-cluster K-means on every feature (column) of the
    input and collects the resulting cluster labels column by column.

    Args:
        features_arr: numpy array
            The numpy array of features (samples * features).

    Returns:
        labels_arr: numpy array
            Cluster labels, same shape and dtype as features_arr
            (samples * features); column j holds the labels of feature j.
    '''

    # NOTE(review): KMeans is not imported in the visible cells; presumably it
    # is sklearn.cluster.KMeans brought into scope elsewhere - confirm.

    # Every feature is split into exactly two groups
    n_clusters = 2

    # Output buffer; zeros_like keeps the shape and dtype of the input
    labels_arr = np.zeros_like(features_arr)

    # Rows of the transposed array are the individual feature columns
    for feature_n, feature_values in enumerate(features_arr.T):

        # K-means expects a 2D (samples * 1) input
        feature = feature_values.reshape(-1, 1)

        # Fixed random_state keeps the clustering reproducible between runs
        kmeans = KMeans(n_clusters = n_clusters, random_state = 0)
        kmeans.fit_predict(feature)

        # Storing the labels of this feature in its own column
        labels_arr[:, feature_n] = kmeans.labels_

    return labels_arr

In [None]:
'''Computing DICE index of brain regions and clusterings'''

def _dice_score(n_overlap, size_a, size_b):
    '''Returns the DICE score 2*|A & B| / (|A| + |B|), or NaN when both sets are empty.'''
    total = size_a + size_b
    if total == 0:
        return np.nan
    return (2 * n_overlap) / total


def _pick_cluster_label(labels, feature, cluster_selection):
    '''Returns the label of the cluster to score for the 'intensity' and 'count' methods.'''
    if cluster_selection == 'intensity':
        mean_intensities = []
        for label_val in range(2):
            cluster_values = feature[labels == label_val]
            n_nonzero = np.count_nonzero(cluster_values)
            # Mean over the non-zero values of the cluster only; a cluster
            # without non-zero values can never be selected.
            if n_nonzero:
                mean_intensities.append(cluster_values.sum() / n_nonzero)
            else:
                mean_intensities.append(-np.inf)
        return int(np.argmax(mean_intensities))

    # 'count': return the label VALUE of the smaller cluster, not its position
    # in the list of unique values (they differ when a label is missing).
    label_values, label_counts = np.unique(labels, return_counts = True)
    return label_values[np.argmin(label_counts)]


def region_feature_overlap(labels_arr, features_arr, cluster_selection = 'best_overlap'):
    '''
    This function takes the labels array (samples * features) and measures the DICE score
    of the selected cluster of each feature with every brain region listed in
    'files/overlap_template_df.csv'.

    Args:
        labels_arr: numpy array
            Array of labeled features (samples * features).

        features_arr: numpy array
            Array of features (samples * features).

        cluster_selection: {'best_overlap', 'count', 'intensity'}, default = 'best_overlap'
            The method for choosing the label (0 or 1) for each labeled cluster.
                best_overlap: for each region, keeps the higher DICE score of the two clusters.
                count: chooses the smaller one based on the count of values.
                intensity: chooses the label with the higher mean of non-zero intensities.

    Returns:
        overlap_df: Pandas DataFrame
            The template DataFrame with one extra column 'DICE_feature_<i>' per feature,
            holding the DICE score of every brain region with that labeled feature.

    Raises:
        ValueError: if cluster_selection is not one of the supported methods.
    '''

    # Failing early instead of silently returning all-NaN columns
    valid_selections = ('best_overlap', 'count', 'intensity')
    if cluster_selection not in valid_selections:
        raise ValueError(
            "cluster_selection must be one of " + str(valid_selections)
            + ", got " + repr(cluster_selection)
        )

    # Making a desired dataframe
    df_path = 'files/overlap_template_df.csv'
    overlap_df = pd.read_csv(df_path, index_col = 'Unnamed: 0')
    region_ids = overlap_df['ant_id'].to_numpy()

    # Number of features
    total_features = features_arr.shape[1]

    # Inputting the indices path
    indices_path = 'files/mask_indices.npy'

    # Loading the anatomy file (flattened, as only element-wise overlap is needed)
    ant_file = 'files/allen_annot200_mask.npy'
    ant_vec = np.load(ant_file).ravel()

    # Region sizes do not depend on the feature, so they are counted once
    region_sizes = [np.count_nonzero(ant_vec == region_id) for region_id in region_ids]

    # Scores are gathered positionally (regions * features) and written to the
    # DataFrame as whole columns; this avoids chained assignment, which does
    # not reliably write back to the DataFrame.
    scores = np.full((len(region_ids), total_features), np.nan)

    for i in range(total_features):

        # Selecting the label and feature
        labels = labels_arr[:, i]

        # Reconstructing the labels once per feature.
        # NOTE(review): assumes reconstruct_ABA is deterministic and free of
        # side effects, and returns an array with as many elements as the
        # anatomy mask - confirm.
        labels_rc = np.asarray(reconstruct_ABA(labels, indices_path, mirror = False)).ravel()

        # Selecting the cluster ID(s) to score for this feature
        if cluster_selection == 'best_overlap':
            candidate_ids = (0, 1)
        else:
            candidate_ids = (_pick_cluster_label(labels, features_arr[:, i], cluster_selection),)

        # Masking the clusters together with their sizes
        clusters = []
        for cluster_ID in candidate_ids:
            cluster_mask = labels_rc == cluster_ID
            clusters.append((cluster_mask, np.count_nonzero(cluster_mask)))

        for row, region_id in enumerate(region_ids):

            # Masking the brain by the region ID
            region_mask = ant_vec == region_id

            # Keeping the best DICE score among the candidate clusters
            scores[row, i] = max(
                _dice_score(np.count_nonzero(region_mask & cluster_mask),
                            region_sizes[row], cluster_size)
                for cluster_mask, cluster_size in clusters
            )

    # Adding one DICE column per feature, in feature order
    for i in range(total_features):
        overlap_df['DICE_feature_'+str(i)] = scores[:, i]

    return overlap_df

In [None]:
# Reading the feature matrix (samples * features) from disk
features_arr = np.load('files/SFT_100features.npy')

# The number of features is the number of columns
n_features = features_arr.shape[1]

# Labeling every feature column on its own
labels_arr = cluster_festures_separately(features_arr)

# DICE score of every brain region against every labeled feature
overlap_df = region_feature_overlap(
    labels_arr,
    features_arr,
    cluster_selection = 'best_overlap',
)

In [None]:
'''Choosing the top regions and DICE scores'''

# Set of the acronyms of all brain regions that come out as a top match
acronyms_set = set()

# Removing the columns that are not used below
dropped_columns = ['full_ant_index','ish_id','ant_id','region_overlap_ratio','jac_overlap_ratio','DICE_neg_jcb_anat']
DICE_df = overlap_df.drop(columns = dropped_columns)

# Best region and its DICE score, one entry per feature
region_list = []
DICE_list = []

for feature_n in range(n_features):

    score_column = 'DICE_feature_' + str(feature_n)

    # Ranking the regions by this feature's DICE score (NaN scores go last)
    sorted_df = DICE_df.sort_values(
        by = [score_column], ascending = False, na_position = 'last'
    ).reset_index(drop = True)

    # After the index reset, label slice ':0' is exactly the top-ranked row
    best_rows = sorted_df.loc[:0]
    region_list += best_rows['acronym'].tolist()
    DICE_list += best_rows[score_column].tolist()

# Table of the top region per feature
top_df = pd.DataFrame({
    'acronym': region_list,
    'DICE_score': DICE_list,
    'feature_ID': range(n_features),
})

# Ordering the features from the highest to the lowest top DICE score
top_sorted_df = top_df.sort_values(
    by = ['DICE_score'], ascending = False, na_position = 'last'
).reset_index(drop = True)

# Recording every distinct top region
acronyms_set.update(top_sorted_df['acronym'].unique())