In [13]:
import os
import io
import sys
import re
import time
import math
import pickle

import numpy as np
import pandas as pd

from numpy import nan as Nan
from numpy import inf as inf
from tqdm import notebook as tqdm
from scipy.sparse import csr_matrix

### 4a. Introduction
With >25,000 structures and many different featurizers, we expect that there will be some incompatibilities. Some featurizer-structure combinations will return NaN, Null, or Inf values. It is important to consider that these errors will cause some feature representations to lose some of the labels. 

To compare the efficacy of agglomerative clustering between feature representations we need to ensure that the same labels are always used. This notebook does the following:

1. Finds all rows (structures) in the structures_df where Nan, Null, or Inf values exist. 
2. Creates a boolean array for each feature representation to indicate which rows are valid. 
3. Merges all the boolean arrays to find a subset of structures that work for all the features. 

Finally, the notebook will save a sparse representation of each feature. This is done because some of the feature vectors are extremely large. Creating a sparse representation can speed up computations.  

In [14]:
def nan_and_inf_finder(features):
    """
    Flag the rows of a dense feature array that contain NaN or Inf entries.

    Parameters
    ----------
    features : np.array
        A 2-D feature representation with one row per structure.

    Returns
    -------
    lost_features_count : int
        The number of rows that contain at least one NaN or Inf value.

    valid_features : np.array
        Boolean mask with one entry per row; True marks a row that is free
        of NaN and Inf values.
    """
    # a row is invalid as soon as a single entry is NaN or +/-Inf
    has_nan = np.any(np.isnan(features), axis=1)
    has_inf = np.any(np.isinf(features), axis=1)
    invalid_rows = has_nan | has_inf
    return invalid_rows.sum(), ~invalid_rows

In [15]:
def nan_and_inf_finder_SOAP(features):
    """
    Function to find the NaN or Inf values in the SOAP feature representation.
    Because of SOAP's immense size, it is always saved as a sparse matrix. Thus this
    function is required to specifically handle SOAP.

    Parameters
    ----------
    features : scipy.sparse.csr.csr_matrix
        A feature representation for each structure (one row per structure).

    Returns
    -------
    lost_features_count : int
        A count of all rows that contain at least one NaN or Inf value.

    valid_features : np.array
        Boolean mask with one entry per row; True marks a row without any
        NaN or Inf values.
    """
    n_rows = features.shape[0]
    valid_features = np.ones(n_rows, dtype=bool)

    # Fast path: only explicitly stored entries can be NaN/Inf (implicit zeros
    # are finite), so a clean data buffer means every row is valid.
    if np.isfinite(features.data).all():
        return 0, valid_features

    # Slow path: the COO view pairs every stored value with its row index, which
    # lets us mark exactly the rows that hold a non-finite entry.
    coo = features.tocoo(copy=False)
    bad_entries = ~np.isfinite(coo.data)
    valid_features[coo.row[bad_entries]] = False

    lost_features_count = int(n_rows - valid_features.sum())
    return lost_features_count, valid_features
        

In [30]:
def save_sparse_features(features, filename):
    """
    Function to save a sparse (CSR) representation of a feature array. The file is
    pickled as '<filename>.pkl' inside the 'sparse_features' directory of the
    current working directory; the directory is created if it does not exist.

    Parameters
    ----------
    features : np.array
        A feature representation for each structure.
        
    filename: str
        The original filename for the feature (without extension).
    """ 
    sparse_features = csr_matrix(features)
    
    # save the sparse representation
    save_path = os.path.join(os.getcwd(), 'sparse_features/{}.pkl'.format(filename))
    # make sure the output directory is there so a fresh checkout can run the notebook
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    # the context manager closes the handle even if pickling fails
    with open(save_path, 'wb') as save_file:
        pickle.dump(sparse_features, save_file)

### 4b. Iterate over all the files in the features directory
1. Check for NaN, Null, or Inf values. 
2. Compile a list of valid structures across all features. 
3. Save a sparse representation of each feature in the sparse_features directory. 

In [31]:
# Boolean validity mask per feature file, keyed by the file name without extension.
valid_masks = {}

# os.listdir returns entries in arbitrary order; sorting keeps the column order reproducible
files = sorted(os.listdir('features/'))
for file in files:

    # skip Jupyter's checkpoint directory entirely so no stale mask is recorded for it
    if 'ipynb_checkpoints' in file:
        continue

    # remove the .npy extension
    filename = os.path.splitext(file)[0]

    # NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted feature files.
    with open('features/{}'.format(file), 'rb') as feature_file:
        loaded = np.load(feature_file, allow_pickle=True)

    if 'SOAP' in file:
        # SOAP is stored as a pickled sparse matrix wrapped in a 0-d object array;
        # .item() unwraps it (the old .all() trick returns a bool on NumPy >= 2).
        features = csr_matrix(loaded.item())
        lost_features_count, valid_features = nan_and_inf_finder_SOAP(features)
    else:
        features = loaded
        lost_features_count, valid_features = nan_and_inf_finder(features)

    # save a sparse representation for each feature (already-sparse input is kept as CSR)
    save_sparse_features(features, filename)

    valid_masks[filename] = valid_features
    print("{} rows are lost in the feature: {}".format(lost_features_count, file))

# one boolean column per feature representation, one row per structure
valid_features_df = pd.DataFrame(valid_masks)

0 rows are lost in the feature: scm_features_mode-structure.npy
0 rows are lost in the feature: SOAP_features_partialS_outer_rcut-3_nmax-5_lmax-3_mode-structure_CAN.npy


In [32]:
# preview the first rows of the per-feature validity masks
valid_features_df.head()

Unnamed: 0,scm_features_mode-structure,SOAP_features_partialS_outer_rcut-3_nmax-5_lmax-3_mode-structure_CAN
0,True,True
1,True,True
2,True,True
3,True,True
4,True,True


In [33]:
# A row is kept only when every feature representation marked it as valid.
valid_features_df['compiled'] = valid_features_df.all(axis=1)

total_rows = len(valid_features_df)
rows_kept = valid_features_df['compiled'].sum()
print("{} rows will be lost because of Nan or Inf entries.".format(total_rows - rows_kept))

0 rows will be lost because of Nan or Inf entries.


### 4c. Determine how many labels will be lost

In [34]:
# open the labeled data file
# NOTE: pickle.load can execute arbitrary code; only load files produced by this project.
save_path = os.path.join(os.getcwd(), 'semi-supervised_supporting_files/labels_df.pkl')
with open(save_path, 'rb') as open_file:
    labels_df = pickle.load(open_file)

# grab the index positions for the labels
idx_of_labels = labels_df[labels_df.conductivity>0].index

# grab the index positions for rows without any feature errors
idx_of_valid_features = valid_features_df[valid_features_df['compiled']].index

# keep the label indices that also survive the feature-error filter (vectorised membership test)
idx_of_valid_labels = idx_of_labels[idx_of_labels.isin(idx_of_valid_features)].tolist()
print('After deleting the rows, {}/{} labels will remain in the dataset.'.format(len(idx_of_valid_labels), len(idx_of_labels)))

After deleting the rows, 220/220 labels will remain in the dataset.


In [35]:
# persist the validity masks (including the 'compiled' column) for later use
save_path = os.path.join(os.getcwd(), 'semi-supervised_supporting_files/valid_features_df.pkl')
# the context manager guarantees the file is flushed and closed even if pickling fails
with open(save_path, 'wb') as save_file:
    pickle.dump(valid_features_df, save_file)