In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score
import os
import pandas as pd
import numpy as np
from sklearn.utils import check_random_state
from tqdm import tqdm
import matplotlib.pyplot as plt
import glob

# Load data (resolve the feature and label CSV paths)

In [None]:
# Resolve the input CSV locations relative to the notebook's working directory:
# both files sit in the `3_generate_features` folder one level above it.
path = os.getcwd()
par = os.path.abspath(os.path.join(path, os.pardir))

data_path, label_path = (
    os.path.join(par, '3_generate_features', csv_name)
    for csv_name in (
        'dimensionless_cropped_final_feature_array.csv',
        'final_label_array.csv',
    )
)

# Cell output: True when the feature CSV exists at the resolved location.
os.path.isfile(data_path)


# Preprocess features: label binarization, class balancing, and random-forest helpers

In [None]:
def multiclass_to_binary(labels, most_common_id):
    """Collapse multiclass labels into a one-vs-all binary target.

    Parameters
    ----------
    labels : array-like
        Class identifiers, any shape (may be empty).
    most_common_id : scalar
        Identifier of the class treated as positive.

    Returns
    -------
    numpy.ndarray
        Integer array with the same shape as ``labels``: 1 where the label
        equals ``most_common_id`` and 0 everywhere else.
    """
    # A direct element-wise comparison replaces the former np.vectorize(lambda)
    # wrapper, which ran one Python call per element and raised ValueError on
    # empty input.
    positive_mask = np.asarray(labels) == most_common_id
    return np.asarray(positive_mask).astype(int)


def subsample_data(features, labels_1vsall, n_subsample, seed=None):
    """Build a balanced data set by undersampling the negative class.

    Every positive row (label == 1) is kept; ``n_subsample`` negative rows are
    drawn at random without replacement and appended after the positives.

    Parameters
    ----------
    features : array-like, 2-D
        Feature matrix with one row per sample.
    labels_1vsall : array-like, 1-D
        Binary labels aligned with the rows of ``features``.
    n_subsample : int
        Number of negative rows to draw. Pass the positive count to get a
        50/50 class split. If it exceeds the number of negatives available,
        all negatives are returned.
    seed : None, int, or numpy.random.Generator, optional
        Forwarded to ``numpy.random.default_rng``. The default ``None`` keeps
        the previous behaviour (fresh entropy on every call); pass a value to
        make the subsample reproducible.

    Returns
    -------
    SR_features : numpy.ndarray
        Positive rows followed by the sampled negative rows.
    SR_targets : numpy.ndarray
        Labels matching ``SR_features`` row for row.
    """
    rng = np.random.default_rng(seed)

    features = np.asarray(features)
    labels_1vsall = np.asarray(labels_1vsall)

    #Dominant class boolean index
    positive_class_mask = labels_1vsall == 1

    #Pool of negative examples to draw from
    negative_features = features[~positive_class_mask, :]
    negative_targets = labels_1vsall[~positive_class_mask]

    #Random subset of the negative pool: the first n_subsample entries of a shuffled index
    subsample_idx = rng.permutation(negative_targets.size)[:n_subsample]

    #Positives first, then the sampled negatives
    SR_features = np.concatenate(
        (features[positive_class_mask, :], negative_features[subsample_idx, :]), axis=0
    )
    SR_targets = np.concatenate(
        (labels_1vsall[positive_class_mask], negative_targets[subsample_idx]), axis=0
    )

    return SR_features, SR_targets


def run_random_forest(features, labels_1vsall, random_state=None):
    """Cross-validate a random forest and save the fold-averaged feature importances.

    Runs 10-fold stratified cross-validation. In each fold a fresh
    ``RandomForestClassifier`` is fitted on the training split and precision
    and recall are computed on the held-out split. The feature importances
    are averaged over the folds and written, sorted from most to least
    important, to
    ``features/{feature_list_number}_iteration_feature_{len(feature_names)}_features.csv``.

    This function reads two module-level names that the calling cell must set
    beforehand: ``feature_names`` (one name per column of ``features``) and
    ``feature_list_number`` (the iteration number used in the file name).

    Parameters
    ----------
    features : numpy.ndarray, 2-D
        Feature matrix with one row per sample.
    labels_1vsall : numpy.ndarray, 1-D
        Binary labels aligned with the rows of ``features``.
    random_state : optional
        Passed to every ``RandomForestClassifier``. The default ``None``
        keeps the previous unseeded behaviour.

    Returns
    -------
    (float, float)
        Mean precision and mean recall over the folds.

    Raises
    ------
    ValueError
        If ``feature_names`` does not have one entry per feature column.
    """
    # Fail before the expensive cross-validation instead of silently pairing
    # importances with the wrong names.
    n_columns = np.shape(features)[1]
    if len(feature_names) != n_columns:
        raise ValueError(
            f'feature_names has {len(feature_names)} entries but features has {n_columns} columns'
        )

    #Stratified K fold (maintain class balance)
    skf = StratifiedKFold(n_splits=10)

    cv_precisions = []
    cv_recalls = []
    fold_importances = []

    for train_idx, test_idx in skf.split(features, labels_1vsall):
        #Fit naive rf model on the training split
        naive_rf = RandomForestClassifier(random_state=random_state)
        naive_rf.fit(features[train_idx], labels_1vsall[train_idx])

        #Score on the held-out split
        held_out_labels = labels_1vsall[test_idx]
        predicted_labels = naive_rf.predict(features[test_idx])

        cv_precisions.append(precision_score(held_out_labels, predicted_labels))
        cv_recalls.append(recall_score(held_out_labels, predicted_labels))
        fold_importances.append(np.array(naive_rf.feature_importances_))

    mean_importances = np.mean(np.array(fold_importances), axis=0)

    # The ascending argsort followed by a descending sort_values is kept as in
    # the original so the row order of the written CSV (including ties) is unchanged.
    sorted_indices = np.argsort(mean_importances)
    feature_info = pd.DataFrame(data={
        'Name': list(np.array(feature_names)[sorted_indices]),
        'Importances': mean_importances[sorted_indices],
    }).sort_values(by='Importances', ascending=False)

    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs('features', exist_ok=True)
    feature_info.to_csv(f'features/{feature_list_number}_iteration_feature_{len(feature_names)}_features.csv')
    return np.mean(cv_precisions), np.mean(cv_recalls)

In [None]:
# Iterative feature pruning: each pass trains on the surviving features and
# writes their importances to `features/`; the next pass keeps only the features
# whose importance lies above the 25th percentile of the latest file.
# NOTE: files already present in `features/` are treated as earlier iterations,
# so clear that folder to restart from the full feature set.
nfeats, precs, recalls = [], [], []

# ---- Loop-invariant setup: read the inputs and build the binary target once ----
feat_df = pd.read_csv(data_path, index_col=0)
label_df = pd.read_csv(label_path)

#Dataframe convert label into categorical variable for classification
#Then convert labels into numpy array
label_name = 'Prototype'
label_df[label_name] = pd.Categorical(label_df[label_name])
label_df['numeric_label'] = label_df[label_name].cat.codes
labels = label_df['numeric_label'].to_numpy()

#Convert features to numpy and remember the full list of column names
features = feat_df.to_numpy()
all_feature_names = np.array(feat_df.columns)

#Category code of the class treated as positive, then relabel to 1-vs-all
id_ofinterest = 203
labels_1vsall = multiclass_to_binary(labels, most_common_id=id_ofinterest)

#Number of positive examples; the same number of negatives is sampled each pass
n_positive_class = np.sum(labels_1vsall)


def feature_list_number_func(filename):
    """Return the leading iteration number from an importance-file path."""
    # basename handles both '/' and the platform's own path separator
    return int(os.path.basename(filename).split('_')[0])


# Do-while loop: at least one pass always runs; stop once a single feature is left.
while True:
    # Only pick up files that follow the naming scheme written by run_random_forest
    previous_features = glob.glob('features/*_iteration_feature_*_features.csv')
    previous_features_sorted = sorted(previous_features, key=feature_list_number_func)

    if len(previous_features_sorted) == 0:
        feature_list_number = 1
        feature_names = all_feature_names
    else:
        previous_feature_data = pd.read_csv(previous_features_sorted[-1]) # read most recent feature reduction iteration
        feature_list_number = feature_list_number_func(previous_features_sorted[-1]) + 1 # get current feature reduction iteration
        cutoff = previous_feature_data['Importances'].quantile(q=0.25)
        previous_feature_keep = previous_feature_data[previous_feature_data['Importances'] > cutoff] # features with importances in the top 75%
        previous_feature_names = previous_feature_keep['Name']

        keep = np.isin(all_feature_names, previous_feature_names) # identify feature names that need to be kept
        feature_names = all_feature_names[keep]

    # The strict '>' cutoff can remove everything (e.g. a single remaining
    # feature, or all importances tied); there is nothing left to train on.
    if len(feature_names) == 0:
        print('No features left above the importance cutoff; stopping.')
        break

    print(f'{len(feature_names)} features remaining')

    ### Train, and get feature importances
    subsampled_features, subsampled_targets = subsample_data(feat_df[feature_names].to_numpy(),labels_1vsall,n_positive_class)
    prec, recall = run_random_forest(subsampled_features, subsampled_targets)

    ### Record
    nfeats.append(len(feature_names))
    precs.append(prec)
    recalls.append(recall)

    if len(feature_names) <= 1:
        break

In [None]:
# Gather the per-iteration pruning metrics into one table and save it.
stat_df = pd.DataFrame({'n_features': nfeats, 'precision': precs, 'recall': recalls})
# index=False is the documented way to leave the row index out of the CSV
stat_df.to_csv('pruning_stats.csv', index=False)

In [7]:
# Display the pruning statistics table (one row per recorded pruning iteration).
stat_df

Unnamed: 0,n_features,precision,recall
0,1043,0.941123,0.975172
1,782,0.959942,0.981708
2,586,0.950145,0.980405
3,439,0.953843,0.982361
4,329,0.944195,0.981712
5,246,0.945538,0.981712
6,184,0.942125,0.979756
7,138,0.947991,0.97844
8,103,0.948063,0.981708
9,77,0.948434,0.982361
