In [1]:
import os
import math
import pickle
# import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# from pymatgen.io import ase
# from ase import atoms
# from ase.io import read, write

from tqdm import notebook as tqdm

from scipy.sparse import csr_matrix
from scipy.sparse import hstack
# from sklearn.decomposition import IncrementalPCA

from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import fcluster
from scipy.cluster import hierarchy
import scipy.spatial.distance as ssd
from sklearn import metrics as skmetrics

%matplotlib widget

In [2]:
class Agglomerative_Clustering:
    """
    A class to handle agglomerative clustering for each of the features. 

    ...

    Attributes
    ----------
    valid_features_df_path : str
        The location of the valid_features_df.pkl file. 
    
    labels_df_path : str
        The location of the labels_df.pkl file. 

    saved_features_path : str
        The directory where the sparse features are saved.
        
    results_df_path : str
        The location where the results should be saved: results_df.pkl.  
        
    valid_features_df : pd.DataFrame
        A dataframe that contains booleans indicating whether or not a structure-feature combination is error free. 
        
    labels_df : pd.DataFrame
        A dataframe containing the labeled data (i.e., conductivity and BVSE values). 
        
    results_df : pd.DataFrame
        A dataframe containing the results from the agglomerative clustering. 
        
    feature_map : list
        A list of all the feature files in the saved_features_path directory. These should be the sparse representations. 

    Methods
    -------
    list_features():
        Set the mode that the class operates in. Each move corresponds to one of the nine structure representations. 
        
    feature_merger(selection, multiplication_factor, valid_features_df, labels_df, feature_map, common_labels):
        Takes the requested features, multiplies them by a given factor, and merges them together into a combined feature vector. 
        
    distance_matrix_calculator(sparse_features):
        Uses skmetrics.pairwise_distances_chunked to calculate the distance matrix on the sparse represenatation of the feature vector. 
    
    augmented_dendrogram(*args, **kwargs):
        A wrapper for scipy.cluster.hierarchy.dendrogram that allows for dendrogram customization. 
    
    clustering(ddata, linkage_matrix, periods=2):
        Applies scipy.cluster.hierarchy.fcluster to the linkage matrix to return the cluster labels for each level of clustering. 
    
    conductivity_variance_calculation(valid_labels_df, cluster_sets):
        Given the cluster assignment at each clustering level, the intracluster conductivity variance is calculated from the labels. 
    
    bvse_variance_calculation(valid_labels_df, cluster_sets):
        Given the cluster assignment at each clustering level, the intracluster BVSE variance is calculated from the labels. 
    
    save_results(selection, multiplier, cluster_sets, conductivity_variance_by_cluster, bvse_variance_by_cluster, ddata, linkage_matrix):
        Save the agglomerative clustering results in a pandas dataframe using pickle.
    
    run_clustering(feature_selector_list, multiplier_list, common_labels=True):
        A wrapper which runs the entire agglomerative clustering routine using the methods above. 
        
    """
    def __init__(self, valid_features_df_path, labels_df_path, saved_features_path, results_df_path):
        self.valid_features_df_path = valid_features_df_path
        self.labels_df_path = labels_df_path
        self.saved_features_path = saved_features_path
        self.results_df_path = results_df_path
        
        # load in the valid_features_df
        open_file = open(self.valid_features_df_path, 'rb')
        self.valid_features_df = pickle.load(open_file)
        open_file.close()

        # load in the labels_df
        open_file = open(self.labels_df_path, 'rb')
        self.labels_df = pickle.load(open_file)
        open_file.close()
        
        # grab all features from the repository
        self.feature_map = []
        files = os.listdir(self.saved_features_path)
        for file in files:
            self.feature_map.append(file)
        
        # either create a fresh results_df or load one ine
        try:
            # load in the results_df
            open_file = open(self.results_df_path, 'rb')
            self.results_df = pickle.load(open_file)
            open_file.close()
        except:
            self.results_df = pd.DataFrame(columns=['feature0', 'multiplier0', 'feature1', 'multiplier1', 'conductivity_variance', 'bvse_variance', 'cluster_sets', 'ddata', 'linkage_matrix'])

    def list_features(self):
        """
        Prints feature_map attribute with the associated indices. 
        This allows the user to choose which features will be used. 
        """      
        for idx, feature in enumerate(self.feature_map):
            print("{} - {}".format(idx, feature[0:-4]))
            
   
    def feature_merger(self, selection, multiplication_factor, valid_features_df, labels_df, feature_map, common_labels = True):
        """
        The function takes a 1D list of selected features and a 1D list of multiplication factors. Each feature
        is multiplied by the corresponding entry in the multiplication list. The multiplied features are then merged to 
        form a composite feature vector. A sparse representation is returned. 

        Parameters
        ----------
        selection : list,1D
            A list of the features that will be merged. The list should be of the form: [x] or [x, y].
            The former representation simply returns the feature x.
            The latter representation would combine features x and y togther to make a composite feature vector.  
            The numbers are mapped to their feature through the feature_map list (described below). 

        multiplication_factor : list,1D
            A list of multipliers that will be applied to the selected features. This allows the features
            to be mixed in different ratios.  

        valid_features_df : pd.DataFrame
            The pandas dataframe containing boolean entries for all feature-structure combinations. And also
            a final column that is the logical.AND of all the columns (i.e., the structures that will work
            for every single feature representation). 

        feature_map : list
            The list of sparse features saved in the feature directory. The function uses this to map
            the numeric selections into the desired features. 

        common_labels : boolean
            If true, then the structures that are safe for all features representations will be used. 
            If false, then the structures that are safe only for the supplied features (in the selection list)
            will be used. 

        Returns
        -------
        sparse_features : scipy.sparse.csr.csr_matrix
            The combined features of interest in a sparse matrix. 

        valid_labels_df : pd.DataFrame
            A dataframe only containing the rows that are valid for the feature representation. 
        """      
        sparse_features = []
        feature_names = []
        for idx, feature_number in enumerate(tqdm.tqdm(selection, desc='Merging Features')):
            feature_path = feature_map[feature_number]
            feature_names.append(feature_path[0:-4])
            save_path = '{}{}'.format(self.saved_features_path, feature_path)
            open_file = open(save_path, 'rb')
            temp_instance = pickle.load(open_file)
            open_file.close()
            sparse_features.append(temp_instance*multiplication_factor[idx])

        if common_labels:
            valid_rows = valid_features_df.compiled
        else: 
            valid_rows = valid_features_df.loc[:, feature_names].all(axis=1)

        sparse_features = hstack(sparse_features, format='csr')[valid_rows]
        valid_labels_df = labels_df.loc[valid_rows, :].copy()

        return sparse_features, valid_labels_df
    
    
    def distance_matrix_calculator(self, sparse_features):
        """
        Function to take the sparse features and calculate the distance matrix. The distance matrix
        can be used to construct the agglomerative clustering representation. 

        Parameters
        ----------
        sparse_features : scipy.sparse.csr.csr_matrix
            The combined features of interest in a sparse matrix. 

        Returns
        -------
        ssd.squareform(distance_matrix) : np.array
            The squareform of the distance matrix. 
            """      
        distance = skmetrics.pairwise_distances_chunked(sparse_features, n_jobs=63, working_memory=1000)
        distance_matrix = next(distance)

        while True:
            try:
                distance_matrix = np.concatenate((distance_matrix, next(distance)), axis=0)
            except StopIteration:
                break  # Iterator exhausted: stop the loop

        distance_matrix = (distance_matrix+distance_matrix.T)/2
        return ssd.squareform(distance_matrix)
    
    
    def augmented_dendrogram(self, *args, **kwargs):
        """
        A wrapper for the scipy.cluster.hierarchy.dendrogram method. The dendrogram method is used to 
        construct the agglomerative dendrogram after applying the scipy.cluster.hierarchy.linkage method to the distance matrix. 
        
        Returns
        -------
        ddata : np.array
            The agglomerative dendrogram found by applying the scipy.cluster.hierarchy.dendrogram
            method to the linkage matrix.
        """ 
        max_d = kwargs.pop('max_d', None)
        if max_d and 'color_threshold' not in kwargs:
            kwargs['color_threshold'] = max_d
        annotate_above = kwargs.pop('annotate_above', 0)

        ddata = dendrogram(*args, **kwargs)

        if not kwargs.get('no_plot', False):
            plt.title('Hierarchical Clustering Dendrogram (truncated)')
            plt.xlabel('sample index or (cluster size)')
            plt.ylabel('distance')
            for i, d, c in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
                x = 0.5 * sum(i[1:3])
                y = d[1]
                if y > annotate_above:
                    plt.plot(x, y, 'o', c=c)
            if max_d:
                plt.axhline(y=max_d, c='k')
        return ddata
    
    
    def clustering(self, ddata, linkage_matrix, periods=2):
        """
        The function uses the agglomerative dendrogram and the linkage matrix to determine which structures are labeled
        with which clusters at each level of clustering. 

        Parameters
        ----------
        ddata : np.array
            The agglomerative dendrogram found by applying the scipy.cluster.hierarchy.dendrogram
            method to the linkage matrix.

        linkage_matrix : np.array
            The linkage matrix found by applying the scipy.cluster.hierarchy.linkage method to
            the distance matrix. 

        periods : int
            Used to calculate the midpoint positions in the agglomerative dendrogram. Should be set 
            to 2 for branches that split in two. 

        Returns
        -------
        cluster_sets : np.array
            The assignment of each structure to a cluster, for each level of clustering. 
        """
        dcoord = np.array(ddata['dcoord'])
        offset_points = np.sort(dcoord[:,1])[::-1]
        weights = np.ones(periods) / periods
        average_midpoints =  np.convolve(offset_points, weights, mode='valid')

        # create an initial row where everything is in cluster #1
        cluster_sets = [np.ones(np.shape(linkage_matrix)[0]+1, dtype=np.int32)]

        for max_d in average_midpoints:
            clusters = fcluster(linkage_matrix, max_d, criterion='distance')
            cluster_sets.append(clusters)

        cluster_sets = np.array(cluster_sets)

        return cluster_sets
    
    
    def conductivity_variance_calculation(self, valid_labels_df, cluster_sets): 
        """
        Given the assignment of labeled structures into each cluster, this method
        calculates the intracluster conductivity variance. The calculation is done 
        for each level of clustering. A frozen state strategy is used:
        (1) calculations start at the lowest level of clustering (i.e. 2 clusters)
        (2) at each level the partial variance is calculated for each label and then saved in a 'current_state' column
        (3) the partial variances will be summed to give the total intracluster variance
        (4) however, before summation, the labels are examined to see if any have been sorted into a cluster by themselves
            (i.e. without any other labels).
        (5) labels sorted by themselves would result in 0 partial variance. Instead, these labels are reverted to the 
            'previous_state' which is just the 'current_state' column from the previous clustering level
        
        Parameters
        ----------
        valid_labels_df : pd.DataFrame
            A dataframe only containing the rows that are valid for the feature representation.

        cluster_sets : np.array
            The assignment of each structure to a cluster, for each level of clustering. 

        Returns
        -------
        variance_by_cluster : np.array
            The intracluster conductivity variance at each level of clustering. 
        """
        # create four new columns in the dataframe
        valid_labels_df['log_conductivity'] = None
        valid_labels_df['partial_variance'] = 0
        valid_labels_df['partial_variance_previous_state'] = 0
        valid_labels_df['cluster'] = 0

        # take the log10 conductivity and populate the 'log_conductivity' column
        mask = valid_labels_df['conductivity']>0
        temp_df = valid_labels_df.loc[mask, 'conductivity']
        valid_labels_df.loc[mask, 'log_conductivity'] = temp_df.apply(lambda x: np.log10(x))

        # make a list to store the calculations in
        variance_by_cluster = []

        # iterate through the clustering sets, starting at 1 cluster
        for cluster_set in tqdm.tqdm(cluster_sets, desc='Calculating Conductivity Variance'):
            # label all the rows with their cluster
            valid_labels_df['cluster'] = cluster_set

            # groupby method to count the number of labels in each cluster
            cluster_counts = valid_labels_df.groupby('cluster').count()['conductivity']

            # groupby method to determine the log_conductivity mean for each cluster
            cluster_means = valid_labels_df.groupby('cluster')['log_conductivity'].mean()

             # helper function for calculating the partial variance. Used by pd.DataFrame.apply() in the variance_calculation function
            def partial_variance_helper(row, cluster_means, cluster_counts):
                return (row['log_conductivity']-cluster_means[row['cluster']])**2

            # calculate the partial variance for each row with a conductivity value
            mask = valid_labels_df['conductivity']>0
            temp_df = valid_labels_df[mask]
            valid_labels_df.loc[mask, 'partial_variance'] = temp_df.apply(partial_variance_helper,  axis=1, args=([cluster_means, cluster_counts]))

            # create a dataframe just with the variance labels
            variance_df = valid_labels_df[valid_labels_df['conductivity']>0].loc[:, ['cluster', 'partial_variance']]

            # group the variance_df by cluster
            cluster_grouping = variance_df.groupby('cluster')

            # find the clusters that contain <=1 label. These are the frozen clusters. Return the index for the labels in the frozen clusters. 
            frozen_groups = variance_df.groupby('cluster').count()[(variance_df.groupby('cluster').count()<=1).values]
            frozen_idx = [idx for sublist in [cluster_grouping.groups[x].values.tolist() for x in frozen_groups.index.values] for idx in sublist]

            # return all frozen rows to their previous value
            valid_labels_df.loc[frozen_idx, 'partial_variance'] = valid_labels_df.loc[frozen_idx, 'partial_variance_previous_state']

            # calculate the intracluster conductivity variance. This is just the sum of the partial variance column
            variance_by_cluster.append(valid_labels_df.partial_variance.sum())

            # update the frozen state before moving to the next iteration
            valid_labels_df.loc[:, 'partial_variance_previous_state'] = valid_labels_df.loc[:, 'partial_variance']

        return variance_by_cluster

    
    def bvse_variance_calculation(self, valid_labels_df, cluster_sets):
        """
        Given the assignment of labeled structures into each cluster, this method
        calculates the intracluster bvse variance. The calculation is done 
        for each level of clustering. A frozen state strategy is used:
        (1) calculations start at the lowest level of clustering (i.e. 2 clusters)
        (2) at each level the partial variance is calculated for each label and then saved in a 'current_state' column
        (3) the partial variances will be summed to give the total intracluster variance
        (4) however, before summation, the labels are examined to see if any have been sorted into a cluster by themselves
            (i.e. without any other labels).
        (5) labels sorted by themselves would result in 0 partial variance. Instead, these labels are reverted to the 
            'previous_state' which is just the 'current_state' column from the previous clustering level
        
        Parameters
        ----------
        valid_labels_df : pd.DataFrame
            A dataframe only containing the rows that are valid for the feature representation.

        cluster_sets : np.array
            The assignment of each structure to a cluster, for each level of clustering. 

        Returns
        -------
        variance_by_cluster : np.array
            The intracluster bvse variance at each level of clustering. 
        """
        # create four new columns in the dataframe
        valid_labels_df['partial_variance'] = 0
        valid_labels_df['partial_variance_previous_state'] = 0
        valid_labels_df['cluster'] = 0

        # make a list to store the calculations in
        variance_by_cluster = []

        # iterate through the clustering sets, starting at 1 cluster
        for cluster_set in tqdm.tqdm(cluster_sets, desc='Calculating BVSE Variance'):
            # label all the rows with their cluster
            valid_labels_df['cluster'] = cluster_set

            # groupby method to count the number of labels in each cluster
            cluster_counts = valid_labels_df.groupby('cluster').count()['BVSE']

            # groupby method to determine the BVSE mean for each cluster
            cluster_means = valid_labels_df.groupby('cluster')['BVSE'].mean()

             # helper function for calculating the partial variance. Used by pd.DataFrame.apply() in the variance_calculation function
            def partial_variance_helper(row, cluster_means, cluster_counts):
                return (row['BVSE']-cluster_means[row['cluster']])**2

            # calculate the partial variance for each row with a BVSE value
            mask = valid_labels_df['BVSE']>0
            temp_df = valid_labels_df[mask]
            valid_labels_df.loc[mask, 'partial_variance'] = temp_df.apply(partial_variance_helper,  axis=1, args=([cluster_means, cluster_counts]))

            # create a dataframe just with the variance labels
            variance_df = valid_labels_df[valid_labels_df['BVSE']>0].loc[:, ['cluster', 'partial_variance']]

            # group the variance_df by cluster
            cluster_grouping = variance_df.groupby('cluster')

            # find the clusters that contain <=1 label. These are the frozen clusters. Return the index for the labels in the frozen clusters. 
            frozen_groups = variance_df.groupby('cluster').count()[(variance_df.groupby('cluster').count()<=1).values]
            frozen_idx = [idx for sublist in [cluster_grouping.groups[x].values.tolist() for x in frozen_groups.index.values] for idx in sublist]

            # return all frozen rows to their previous value
            valid_labels_df.loc[frozen_idx, 'partial_variance'] = valid_labels_df.loc[frozen_idx, 'partial_variance_previous_state']

            # calculate the intracluster BVSE variance. This is just the sum of the partial variance column
            variance_by_cluster.append(valid_labels_df.partial_variance.sum())

            # update the frozen state before moving to the next iteration
            valid_labels_df.loc[:, 'partial_variance_previous_state'] = valid_labels_df.loc[:, 'partial_variance']

        return variance_by_cluster
    
    
    def save_results(self, selection, multiplier, cluster_sets, conductivity_variance_by_cluster, bvse_variance_by_cluster, ddata, linkage_matrix):
        """
        The function saves all the relevant outputs from the agglomerative clustering process 
        in a pandas dataframe using pickle. Save the pickled dataframe at the self.results_df_path attribute. 

        Parameters
        ----------
        selection : list
            The feature(s) that were combined. 
            
        selection : list
            The multiplication factor for each feature in the selection.  

        cluster_sets : np.array
            The assignment of each structure to a cluster, for each level of clustering. 

        conductivity_variance_by_cluster : np.array
            The intracluster conductivity variance at each level of clustering. 
            
        bvse_variance_by_cluster : np.array
            The intracluster bvse variance at each level of clustering. 
       
        ddata : np.array
            The agglomerative dendrogram found by applying the scipy.cluster.hierarchy.dendrogram
            method to the linkage matrix.

        linkage_matrix : np.array
            The linkage matrix found by applying the scipy.cluster.hierarchy.linkage method to
            the distance matrix. 
        """
        if len(self.results_df)==0:
            new_row_idx = 0
        else:
            new_row_idx = self.results_df.index.max()+1

        for idx, feature in enumerate(selection):
            self.results_df.at[new_row_idx, 'feature{}'.format(idx)] = self.feature_map[feature][0:-4]

        for idx, multiple in enumerate(multiplier):
            self.results_df.at[new_row_idx, 'multiplier{}'.format(idx)] = multiple

        self.results_df.at[new_row_idx, 'conductivity_variance'] = conductivity_variance_by_cluster
        self.results_df.at[new_row_idx, 'bvse_variance'] = bvse_variance_by_cluster
        self.results_df.at[new_row_idx, 'cluster_sets'] = cluster_sets
        self.results_df.at[new_row_idx, 'ddata'] = ddata
        self.results_df.at[new_row_idx, 'linkage_matrix'] = linkage_matrix
        
        save_path = os.path.join(os.getcwd(), self.results_df_path)
        save_file = open(save_path, 'wb')
        pickle.dump(self.results_df, save_file)
        save_file.close()
    
   
    def run_clustering(self, feature_selector_list, multiplier_list, common_labels=True):
        """
        A wrapper to run through the entire agglomerative clustering process, by calling the methods above.
        The function takes a 2D list of desired features and a 2D list of multiplication factors. For each zipped pair in the list 
        the features are multiplied by the multiplication factors and then merged. 

        Parameters
        ----------
        feature_selctor_list : list,2D
            A list of the features that will be merged. The list should be of the form: [[1,2], [1,3]] or [[1], [2], [3]].
            The former representation would combine features 1&2 during the first iteration and then 1&3 in
            the next iteration. The latter representation will simply return the features 1, 2, and 3 sequentially. 
            The numbers are mapped to their feature through the feature_map list. 

        multiplier_list : list,2D
            A list of multipliers that will be applied to the selected features. This allows the features
            to be mixed in different ratios.  

        common_labels : boolean
            If true, then the structures that are safe for all features representations will be used. 
            If false, then the structures that are safe only for the supplied features (in the feature selector list)
            will be used. 
        """
        for selection, multiplier in tqdm.tqdm(zip(feature_selector_list, multiplier_list), desc='Iterating Through Selections', total=len(feature_selector_list)):
            print('-' * 50)
            print('\033[1mWorking on selection = {} with multiplier = {}\033[0m'.format(selection, multiplier))
            sparse_features, valid_labels_df = self.feature_merger(selection, multiplier, self.valid_features_df, self.labels_df, self.feature_map, common_labels=common_labels)
            distance_matrix = self.distance_matrix_calculator(sparse_features)
            linkage_matrix = hierarchy.linkage(distance_matrix, 'ward')
            
            max_clustering = 300
            ddata = self.augmented_dendrogram(
                linkage_matrix,
                truncate_mode='lastp',
                p=max_clustering+1,
                leaf_rotation=90.,
                leaf_font_size=3.,
                show_contracted=False,
                annotate_above=5,  # useful in small plots so annotations don't overlap
                max_d=50, #where this is the distance cutoff
                above_threshold_color='grey',
                no_plot = True
            )

            cluster_sets = self.clustering(ddata, linkage_matrix)
            conductivity_variance_by_cluster = self.conductivity_variance_calculation(valid_labels_df, cluster_sets)
            bvse_variance_by_cluster = self.bvse_variance_calculation(valid_labels_df, cluster_sets)

            self.save_results(selection, multiplier, cluster_sets, conductivity_variance_by_cluster, bvse_variance_by_cluster, ddata, linkage_matrix)
            

### 5a. Instantiate the Agglomerative_Clustering class

In [3]:
ag = Agglomerative_Clustering(valid_features_df_path = 'semi-supervised_supporting_files/valid_features_df.pkl', \
                              labels_df_path = 'semi-supervised_supporting_files/labeled_data_BVSE.pkl', \
                              saved_features_path = 'sparse_features/', \
                              results_df_path='semi-supervised_supporting_files/ac_results_df.pkl')

### 5b. List the features in the saved directory

In [4]:
ag.list_features()

0 - SOAP_features_partialS_outer_rcut-3_nmax-5_lmax-3_mode-structure_CAN
1 - scm_features_mode-structure


### 5c. Feed a 2D list of features (by index) and multipliers for agglomerative clustering
The class can work either on single features or by combining features with a mixing ratio. 
For single features, the lists should be of the forms:
* feature_selector_list = [[x], [y], [z]]
* multiplier_list = [[1], [1], [1]]

This notation will apply agglomerative clustering to each supplied feature, seperately. An agglomerative clustering output will be saved for x, then y, then z. The multiplier value doesn't actually matter when only one feature is used.  

For combined feature vectors, the lists should be of the forms:
* feature_selector_list = [[x1,x2], [y1,y2], [z1,z2]]
* multiplier_list = [[a1,a2], [b1,b2], [c1,c2]]

In the example notation, agglomerative clustering will be applied three times on the following compositive feature vectors:
* a1(x1) concatenated with a2(x2)
* b1(y1) concatenated with b2(y2)
* c1(z1) concatenated with c2(z2)

__!!!WARNING!!!__ Addition of new features to the saved_features_path will likely change the valid features. This is true because new features may cause different structures to throw errors when the featurize is applied. If the valid features change after a new feature is added then previous conductivity or BVSE variance calculations cannot be directly compared to any new conductivity or bvse variance calculations.  

In [9]:
ag.run_clustering(feature_selector_list=[[0], [1]], \
                  multiplier_list=[[1], [1]], common_labels=True)

Iterating Through Selections:   0%|          | 0/2 [00:00<?, ?it/s]

--------------------------------------------------
[1mWorking on selection = [0] with multiplier = [1][0m


Merging Features:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Conductivity Variance:   0%|          | 0/300 [00:00<?, ?it/s]

Calculating BVSE Variance:   0%|          | 0/300 [00:00<?, ?it/s]

--------------------------------------------------
[1mWorking on selection = [1] with multiplier = [1][0m


Merging Features:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Conductivity Variance:   0%|          | 0/300 [00:00<?, ?it/s]

Calculating BVSE Variance:   0%|          | 0/300 [00:00<?, ?it/s]

In [10]:
ag.run_clustering(feature_selector_list=[[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[0,1],[0,1]], \
                  multiplier_list=[[1, 1E-5],[1, 2E-5], [1, 3E-5], [1, 4E-5], [1, 5E-5], [1, 6E-5], [1, 7E-5], [1, 8E-5], [1, 9E-5], [1, 1E-5]], common_labels=True)

Iterating Through Selections:   0%|          | 0/10 [00:00<?, ?it/s]

--------------------------------------------------
[1mWorking on selection = [0, 1] with multiplier = [1, 1e-05][0m


Merging Features:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Conductivity Variance:   0%|          | 0/300 [00:00<?, ?it/s]

Calculating BVSE Variance:   0%|          | 0/300 [00:00<?, ?it/s]

--------------------------------------------------
[1mWorking on selection = [0, 1] with multiplier = [1, 2e-05][0m


Merging Features:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Conductivity Variance:   0%|          | 0/300 [00:00<?, ?it/s]

Calculating BVSE Variance:   0%|          | 0/300 [00:00<?, ?it/s]

--------------------------------------------------
[1mWorking on selection = [0, 1] with multiplier = [1, 3e-05][0m


Merging Features:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Conductivity Variance:   0%|          | 0/300 [00:00<?, ?it/s]

Calculating BVSE Variance:   0%|          | 0/300 [00:00<?, ?it/s]

--------------------------------------------------
[1mWorking on selection = [0, 1] with multiplier = [1, 4e-05][0m


Merging Features:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Conductivity Variance:   0%|          | 0/300 [00:00<?, ?it/s]

Calculating BVSE Variance:   0%|          | 0/300 [00:00<?, ?it/s]

--------------------------------------------------
[1mWorking on selection = [0, 1] with multiplier = [1, 5e-05][0m


Merging Features:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Conductivity Variance:   0%|          | 0/300 [00:00<?, ?it/s]

Calculating BVSE Variance:   0%|          | 0/300 [00:00<?, ?it/s]

--------------------------------------------------
[1mWorking on selection = [0, 1] with multiplier = [1, 6e-05][0m


Merging Features:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Conductivity Variance:   0%|          | 0/300 [00:00<?, ?it/s]

Calculating BVSE Variance:   0%|          | 0/300 [00:00<?, ?it/s]

--------------------------------------------------
[1mWorking on selection = [0, 1] with multiplier = [1, 7e-05][0m


Merging Features:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Conductivity Variance:   0%|          | 0/300 [00:00<?, ?it/s]

Calculating BVSE Variance:   0%|          | 0/300 [00:00<?, ?it/s]

--------------------------------------------------
[1mWorking on selection = [0, 1] with multiplier = [1, 8e-05][0m


Merging Features:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Conductivity Variance:   0%|          | 0/300 [00:00<?, ?it/s]

Calculating BVSE Variance:   0%|          | 0/300 [00:00<?, ?it/s]

--------------------------------------------------
[1mWorking on selection = [0, 1] with multiplier = [1, 9e-05][0m


Merging Features:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Conductivity Variance:   0%|          | 0/300 [00:00<?, ?it/s]

Calculating BVSE Variance:   0%|          | 0/300 [00:00<?, ?it/s]

--------------------------------------------------
[1mWorking on selection = [0, 1] with multiplier = [1, 1e-05][0m


Merging Features:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Conductivity Variance:   0%|          | 0/300 [00:00<?, ?it/s]

Calculating BVSE Variance:   0%|          | 0/300 [00:00<?, ?it/s]

In [11]:
ag.results_df.head()

Unnamed: 0,feature0,multiplier0,feature1,multiplier1,conductivity_variance,bvse_variance,cluster_sets,ddata,linkage_matrix
0,SOAP_features_partialS_outer_rcut-3_nmax-5_lma...,1,scm_features_mode-structure,1e-06,"[2185.983729796786, 1872.088143709971, 1872.06...","[52634.77398885612, 52077.38652050582, 51357.6...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","{'icoord': [[5.0, 5.0, 15.0, 15.0], [35.0, 35....","[[20630.0, 22671.0, 0.0, 2.0], [18022.0, 18071..."
1,SOAP_features_partialS_outer_rcut-3_nmax-5_lma...,1,scm_features_mode-structure,2e-06,"[2185.983729796786, 1872.088143709971, 1872.06...","[52634.77398885612, 52077.38652050582, 51357.6...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","{'icoord': [[5.0, 5.0, 15.0, 15.0], [35.0, 35....","[[20630.0, 22671.0, 0.0, 2.0], [18022.0, 18071..."
2,SOAP_features_partialS_outer_rcut-3_nmax-5_lma...,1,scm_features_mode-structure,0.001,"[2185.983729796786, 2079.520029468446, 2079.52...","[52634.77398885612, 51027.43413048695, 51026.9...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","{'icoord': [[15.0, 15.0, 25.0, 25.0], [5.0, 5....","[[20630.0, 22671.0, 0.0, 2.0], [18022.0, 18071..."
3,SOAP_features_partialS_outer_rcut-3_nmax-5_lma...,1,scm_features_mode-structure,0.002,"[2185.983729796786, 2185.983729796786, 2185.98...","[52634.77398885612, 52633.46227067928, 52633.4...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","{'icoord': [[15.0, 15.0, 25.0, 25.0], [35.0, 3...","[[20630.0, 22671.0, 0.0, 2.0], [18022.0, 18071..."
4,SOAP_features_partialS_outer_rcut-3_nmax-5_lma...,1,scm_features_mode-structure,0.003,"[2185.983729796786, 2185.983729796786, 2185.98...","[52634.77398885612, 52633.46227067928, 52633.4...","[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...","{'icoord': [[25.0, 25.0, 35.0, 35.0], [15.0, 1...","[[20630.0, 22671.0, 0.0, 2.0], [18022.0, 18071..."
