# Filter clinician feature sets.

Candidate feature sets inspired by a clinician review are filtered based on their entropy. Further details are in this notebook's parent notebook "UNSEEN filter feature sets.ipynb".

## Imports

In [1]:
import os
import itertools
import sklearn.metrics

## Define functions if needed

In [2]:
if "mutlinomRepresentation" not in dir(os):
    # NOTE(review): this guard inspects the attributes of the `os` module
    # rather than the notebook namespace, so it looks like it is always
    # true and the function is redefined on every run -- confirm intent.
    def mutlinomRepresentation(var_vals):
        """Return the multinomial category of every patient record.

        Each distinct combination of component-feature values found in
        ``var_vals`` is given an integer label, numbered from 0 in order of
        first appearance, and every row is mapped to the label of its
        combination.

        Parameters
        ----------
        var_vals : pandas.DataFrame
            n-by-m table of n patients and m component features.

        Returns
        -------
        tuple
            ``(featureSet, next_iter)``. On success ``featureSet`` is a
            pandas Series of length n holding the category labels and
            ``next_iter`` is ``False``. If any component feature has more
            than two distinct values, an error message is printed and
            ``(0, True)`` is returned so that the caller can skip this
            feature set.
        """
        # Every component feature must take two or fewer distinct values
        # (drop_duplicates keeps a missing value as a value of its own).
        # All columns are checked, including the last one.
        for i_col, col_name in enumerate(var_vals.columns.values):
            unique_feature_vals = var_vals.iloc[:, i_col].drop_duplicates()
            if len(unique_feature_vals) > 2:
                print("\n** Error: At least one of the",
                      "component features has more than",
                      "two values so the multinomial",
                      "representation will not be computed.**\n")
                print(i_col, "th variable:", col_name)
                return 0, True

        # One row per observed combination of component-feature values;
        # the fresh 0..k-1 index doubles as the category label.
        feature_combins = var_vals.drop_duplicates().reset_index(drop = True)
        feature_combins['multinom_vals'] = feature_combins.index

        # Attach the label to every patient record. A left merge keeps the
        # rows of var_vals in their original order.
        labelled = pandas.merge(
            var_vals,
            feature_combins,
            how = 'left',
            on = list(var_vals.columns.values)
        )

        return labelled['multinom_vals'], False

In [3]:
if "featuresetmi" not in dir(os):
    # NOTE(review): this guard inspects the attributes of the `os` module
    # rather than the notebook namespace, so it looks like it is always
    # true and the function is redefined on every run -- confirm intent.
    def featuresetmi(featureSet_array,
                     casenessVector,
                     m = None,
                     savelocation = None,
                     representation = None):
        """Compute mutual information between feature sets and caseness.

        Every combination of ``m`` feature columns of ``featureSet_array``
        is collapsed into a single feature-set variable, its mutual
        information with the caseness variable is computed, and the
        feature sets that pass the filter are written to CSV files in
        batches of ten rows.

        Uses ``numpy``, ``pandas``, ``scipy.stats``, ``math``, ``itertools``
        and ``sklearn.metrics`` from the enclosing namespace, and
        ``mutlinomRepresentation`` for the "multi" representation.

        Parameters
        ----------
        featureSet_array : pandas.DataFrame
            One row per patient with a 'person_id' column; every other
            column is treated as a candidate feature.
        casenessVector : pandas.DataFrame
            'person_id' plus the caseness variable as its last column.
        m : int, optional
            Order of the feature sets: 1 (individuals), 2 (pairs) or
            3 (triplets). Defaults to 1.
        savelocation : str, optional
            Prefix of the output files. The file name is appended directly
            to it (no path separator is inserted). Defaults to
            "Mutual information saves/" followed by the order label.
        representation : str, optional
            "all" (a record is True only if all component features are
            truthy) or "multi" (multinomial category of the combination of
            component values). Defaults to "all".

        Returns
        -------
        None
            Results are written to disk and progress is printed. Invalid
            ``m`` or ``representation`` prints an error and returns early.
        """
        # ## Assess argument validity.

        # Check order of feature set. If not provided,
        # default to m = 1.
        order_labels = {1: "Individuals", 2: "Pairs", 3: "Triplets"}
        if m is None:
            order_int = 1
            print("\nNo value for m provided." +
                  "\n...Default value of m = 1 will be used.")
        elif m in (1, 2, 3):
            order_int = int(m)
        else:
            print("\n** Error: Integer value between 1",
                  "and 3 not supplied for m.**\n")
            return
        order_label = order_labels[order_int]

        # Check and set save location.
        if savelocation is None:
            savelocation = "Mutual information saves/" + order_label
            print("\nNo save location provided." +
                  "\n...Defaulting to ~/" + savelocation)

        # ## Check representation. If not provided,
        # ## default to the ALL representation.
        if representation is None:
            representation_label = "ALLrepresentation"
            print("\nNo representation provided." +
                  "\n...Defaulting to '" + representation_label + "' representation.")
        elif representation == "all":
            representation_label = "ALLrepresentation"
        elif representation == "multi":
            representation_label = "MULTIrepresentation"
        else:
            print("\n** Error: Representation value from",
                  "{'all', 'multi'} not provided.**\n")
            return

        print("\n\n\n****************************************")
        print("Calculating mutual information values...")

        # Define the m-way tuples of feature names as a numpy array. Each
        # row names the component features of one feature set.
        feature_cols = \
            featureSet_array.columns[featureSet_array.columns != 'person_id']
        combins = \
            numpy.asarray(list(itertools.combinations(feature_cols, order_int)))

        # Ensure feature-set and caseness values are matched for person_id.
        # After the merge the caseness variable is the last column.
        full_array = featureSet_array.merge(casenessVector, on = 'person_id')
        caseness_vals = full_array.iloc[:, -1]

        # Column layout shared by every batch of results.
        result_columns = ['Feature set', 'Mutual information']
        featureSet_MI = pandas.DataFrame(columns = result_columns)

        # Everything in the output file name except the batch number.
        file_prefix = (savelocation +
                       order_label + "_" +
                       representation_label + "_" +
                       "_batch")

        # Instantiate batch number.
        batch = 0

        # Instantiate tally of feature sets that are dropped by the filter.
        drop_tally = 0

        # Define entropy of the particular caseness variable.
        entropy_caseness = \
            scipy.stats.entropy(casenessVector.iloc[:,-1].value_counts(),
                                base = math.e)

        # ## Loop through the feature sets.
        for feature_names in combins:

            # Define a vector indicating the feature set value.
            var_vals = full_array[feature_names]
            if representation_label == "ALLrepresentation":
                fs_val = var_vals.all(axis = 1)
            else:
                fs_val, next_iter = mutlinomRepresentation(var_vals)
                if next_iter:
                    # A component feature is not binary; skip this set.
                    continue

            # Calculate the mutual information between the feature set and
            # caseness variable.
            f_MI = sklearn.metrics.mutual_info_score(fs_val, caseness_vals)

            # NOTE(review): mutual information cannot exceed the entropy of
            # the caseness variable, so this test drops every feature set
            # that does not determine caseness exactly. Confirm whether the
            # feature set's own entropy was meant to be compared instead.
            if f_MI < entropy_caseness:
                drop_tally += 1
                continue

            # Name the feature set after its component features.
            # NOTE(review): the original naming step was not present in this
            # cell (name_var was never assigned); a hyphen-joined name is
            # used here -- confirm the expected format.
            name_var = "-".join(str(name) for name in feature_names)

            # Store the name and mutual information value.
            featureSet_MI.loc[len(featureSet_MI)] = name_var, f_MI

            if len(featureSet_MI) > 9:
                # Increment batch.
                batch += 1

                # Make an interim save of results.
                featureSet_MI.to_csv(file_prefix + str(batch) + ".csv",
                                     index = False)

                # Instantiate new storage.
                featureSet_MI = pandas.DataFrame(columns = result_columns)

        # Increment counter (counted even when nothing is left to save).
        batch += 1

        # Final save.
        if len(featureSet_MI) != 0:
            featureSet_MI.to_csv(file_prefix + str(batch) + ".csv",
                                 index = False)

        # Feedback messages.
        print("...\n")
        print(str(batch), "batch(es) of feature sets processed.")
        print(str(drop_tally), "/",
              str(len(combins)),
              "feature sets dropped due to low entropy.")
        print("****************************************")

## Load clinician feature-set array

Here, we run the notebook that creates the clinician feature-set array. We will then save the feature-set array as "my_featureSet_array", so that the remaining syntax in this notebook is common for all feature-set sources.


It is assumed that the caseness variables have already been created in the parent notebook.

In [4]:
%%capture
%run ./"UNSEEN_create_clinician_feature_sets.ipynb"
# Alias the clinician feature-set array under the generic name used by the
# rest of this notebook (fs_clinician is presumably created by the notebook
# run above -- verify).
my_featureSet_array = fs_clinician

In [7]:
#my_featureSet_array.head()

Unnamed: 0,person_id,UPSI,UPSICYP,teenagePregnancy,attemptedSuicide,attemptedSuicideCYP,selfHarm,selfHarmCYP,CAMHSrefsAndDisch,IAPTrevolvingDoor,...,CYPmentalHealthConcern,FamilyConcerns,AdulthoodConcerns,AccessToHealthcare,Meds_PsychosisAndRelated,Meds_hypnoticsAndAnxiolytics,annualCountOfUniqueAntidepressants,newAntidepressentThreeMonths,MentalHealthTreatments,RecurringMentalSymptoms
0,5,0,0,0,0,0,0,0,0,0,...,False,False,False,False,0,0,,0,False,False
1,6,0,0,0,0,0,0,0,0,0,...,False,False,False,False,0,0,,0,False,False
2,29,0,0,0,0,0,0,0,0,0,...,False,False,False,False,0,0,,0,False,False
3,33,0,0,0,0,0,0,0,0,0,...,False,False,False,False,0,0,,0,False,False
4,39,0,0,1,0,0,0,0,0,0,...,False,False,False,False,0,0,,0,False,False


## Filter feature sets.

### 1. Mutual information of individual feature sets and the caseness variables.

In [8]:
# Set the order of the composite: 1 = individual, 2 = pair, 3 = triplet.
# This value is passed as `m` to the featuresetmi() calls in this section.
m = 1

#### 1.1. Multinomial caseness

##### 1.1.1. ALL representation.

In [9]:
# Mutual information of the order-`m` feature sets with the 'CMHD'
# caseness column, using the ALL representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD']],
    m=m,
    representation="all",
)


No save location provided.
...Defaulting to ~/Mutual information saves/Individuals



****************************************
Calculating mutual information values...
...

1 batch(es) of feature sets processed.
46 / 46 feature sets dropped due to low entropy.
****************************************


##### 1.1.2. MULTI representation.

In [None]:
# Mutual information of the order-`m` feature sets with the 'CMHD'
# caseness column, using the MULTI representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD']],
    m=m,
    representation="multi",
)


No save location provided.
...Defaulting to ~/Mutual information saves/Triplets



****************************************
Calculating mutual information values...

** Error: At least one of the component features has more than two values so the multinomial representation will not be computed.**

1 th variable: annualCountOfUniqueAntidepressants

** Error: At least one of the component features has more than two values so the multinomial representation will not be computed.**

1 th variable: annualCountOfUniqueAntidepressants

** Error: At least one of the component features has more than two values so the multinomial representation will not be computed.**

1 th variable: annualCountOfUniqueAntidepressants

** Error: At least one of the component features has more than two values so the multinomial representation will not be computed.**

1 th variable: annualCountOfUniqueAntidepressants

** Error: At least one of the component features has more than two values so the multinomial repr

#### 1.2. Definitive caseness

##### 1.2.1. ALL representation.

In [10]:
# Mutual information of the order-`m` feature sets with the
# 'CMHD_dx_and_rx' caseness column, using the ALL representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD_dx_and_rx']],
    m=m,
    representation="all",
)


No save location provided.
...Defaulting to ~/Mutual information saves/Individuals



****************************************
Calculating mutual information values...
...

1 batch(es) of feature sets processed.
46 / 46 feature sets dropped due to low entropy.
****************************************


##### 1.2.2. MULTI representation.

In [None]:
# Mutual information of the order-`m` feature sets with the
# 'CMHD_dx_and_rx' caseness column, using the MULTI representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD_dx_and_rx']],
    m=m,
    representation="multi",
)

#### 1.3. Possible caseness

##### 1.3.1. ALL representation.

In [11]:
# Mutual information of the order-`m` feature sets with the
# 'CMHD_rx_not_dx' caseness column, using the ALL representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD_rx_not_dx']],
    m=m,
    representation="all",
)


No save location provided.
...Defaulting to ~/Mutual information saves/Individuals



****************************************
Calculating mutual information values...
...

1 batch(es) of feature sets processed.
46 / 46 feature sets dropped due to low entropy.
****************************************


##### 1.3.2. MULTI representation.

In [None]:
# Mutual information of the order-`m` feature sets with the
# 'CMHD_rx_not_dx' caseness column, using the MULTI representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD_rx_not_dx']],
    m=m,
    representation="multi",
)

#### 1.4. No caseness (i.e. control group)

##### 1.4.1. ALL representation.

In [12]:
# Mutual information of the order-`m` feature sets with the
# 'CMHD_control' caseness column, using the ALL representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD_control']],
    m=m,
    representation="all",
)


No save location provided.
...Defaulting to ~/Mutual information saves/Individuals



****************************************
Calculating mutual information values...
...

1 batch(es) of feature sets processed.
46 / 46 feature sets dropped due to low entropy.
****************************************


##### 1.4.2. MULTI representation.

In [None]:
# Mutual information of the order-`m` feature sets with the
# 'CMHD_control' caseness column, using the MULTI representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD_control']],
    m=m,
    representation="multi",
)

### 2. Mutual information of pair-composite feature sets and the caseness variables.

In [13]:
# Set the order of the composite: 1 = individual, 2 = pair, 3 = triplet.
# This value is passed as `m` to the featuresetmi() calls in this section.
m = 2

#### 2.1. Multinomial caseness

##### 2.1.1. ALL representation.

In [14]:
# Mutual information of the order-`m` feature sets with the 'CMHD'
# caseness column, using the ALL representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD']],
    m=m,
    representation="all",
)


No save location provided.
...Defaulting to ~/Mutual information saves/Pairs



****************************************
Calculating mutual information values...
...

1 batch(es) of feature sets processed.
1035 / 1035 feature sets dropped due to low entropy.
****************************************


##### 2.1.2. MULTI representation.

In [None]:
# Mutual information of the order-`m` feature sets with the 'CMHD'
# caseness column, using the MULTI representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD']],
    m=m,
    representation="multi",
)

#### 2.2. Definitive caseness

##### 2.2.1. ALL representation.

In [15]:
# Mutual information of the order-`m` feature sets with the
# 'CMHD_dx_and_rx' caseness column, using the ALL representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD_dx_and_rx']],
    m=m,
    representation="all",
)


No save location provided.
...Defaulting to ~/Mutual information saves/Pairs



****************************************
Calculating mutual information values...
...

1 batch(es) of feature sets processed.
1035 / 1035 feature sets dropped due to low entropy.
****************************************


##### 2.2.2. MULTI representation.

In [None]:
# Mutual information of the order-`m` feature sets with the
# 'CMHD_dx_and_rx' caseness column, using the MULTI representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD_dx_and_rx']],
    m=m,
    representation="multi",
)

#### 2.3. Possible caseness

##### 2.3.1. ALL representation.

In [16]:
# Mutual information of the order-`m` feature sets with the
# 'CMHD_rx_not_dx' caseness column, using the ALL representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD_rx_not_dx']],
    m=m,
    representation="all",
)


No save location provided.
...Defaulting to ~/Mutual information saves/Pairs



****************************************
Calculating mutual information values...
...

1 batch(es) of feature sets processed.
1035 / 1035 feature sets dropped due to low entropy.
****************************************


##### 2.3.2. MULTI representation.

In [None]:
# Mutual information of the order-`m` feature sets with the
# 'CMHD_rx_not_dx' caseness column, using the MULTI representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD_rx_not_dx']],
    m=m,
    representation="multi",
)

#### 2.4. No caseness (i.e. control group)

##### 2.4.1. ALL representation.

In [17]:
# Mutual information of the order-`m` feature sets with the
# 'CMHD_control' caseness column, using the ALL representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD_control']],
    m=m,
    representation="all",
)


No save location provided.
...Defaulting to ~/Mutual information saves/Pairs



****************************************
Calculating mutual information values...
...

1 batch(es) of feature sets processed.
1035 / 1035 feature sets dropped due to low entropy.
****************************************


##### 2.4.2. MULTI representation.

In [None]:
# Mutual information of the order-`m` feature sets with the
# 'CMHD_control' caseness column, using the MULTI representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD_control']],
    m=m,
    representation="multi",
)

### 3. Mutual information of triplet-composite feature sets and the caseness variables.

In [6]:
# Set the order of the composite: 1 = individual, 2 = pair, 3 = triplet.
# This value is passed as `m` to the featuresetmi() calls in this section.
m = 3

#### 3.1. Multinomial caseness

##### 3.1.1. ALL representation.

In [19]:
# Mutual information of the order-`m` feature sets with the 'CMHD'
# caseness column, using the ALL representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD']],
    m=m,
    representation="all",
)


No save location provided.
...Defaulting to ~/Mutual information saves/Triplets



****************************************
Calculating mutual information values...
...

1 batch(es) of feature sets processed.
15180 / 15180 feature sets dropped due to low entropy.
****************************************


##### 3.1.2. MULTI representation.

In [None]:
# Mutual information of the order-`m` feature sets with the 'CMHD'
# caseness column, using the MULTI representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD']],
    m=m,
    representation="multi",
)

#### 3.2. Definitive caseness

##### 3.2.1. ALL representation.

In [20]:
# Mutual information of the order-`m` feature sets with the
# 'CMHD_dx_and_rx' caseness column, using the ALL representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD_dx_and_rx']],
    m=m,
    representation="all",
)


No save location provided.
...Defaulting to ~/Mutual information saves/Triplets



****************************************
Calculating mutual information values...
...

1 batch(es) of feature sets processed.
15180 / 15180 feature sets dropped due to low entropy.
****************************************


##### 3.2.2. MULTI representation.

In [None]:
# Mutual information of the order-`m` feature sets with the
# 'CMHD_dx_and_rx' caseness column, using the MULTI representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD_dx_and_rx']],
    m=m,
    representation="multi",
)

#### 3.3. Possible caseness

##### 3.3.1. ALL representation.

In [21]:
# Mutual information of the order-`m` feature sets with the
# 'CMHD_rx_not_dx' caseness column, using the ALL representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD_rx_not_dx']],
    m=m,
    representation="all",
)


No save location provided.
...Defaulting to ~/Mutual information saves/Triplets



****************************************
Calculating mutual information values...
...

1 batch(es) of feature sets processed.
15180 / 15180 feature sets dropped due to low entropy.
****************************************


##### 3.3.2. MULTI representation.

In [None]:
# Mutual information of the order-`m` feature sets with the
# 'CMHD_rx_not_dx' caseness column, using the MULTI representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD_rx_not_dx']],
    m=m,
    representation="multi",
)

#### 3.4. No caseness (i.e. control group)

##### 3.4.1. ALL representation.

In [7]:
# Mutual information of the order-`m` feature sets with the
# 'CMHD_control' caseness column, using the ALL representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD_control']],
    m=m,
    representation="all",
)


No save location provided.
...Defaulting to ~/Mutual information saves/Triplets



****************************************
Calculating mutual information values...
...

1 batch(es) of feature sets processed.
15180 / 15180 feature sets dropped due to low entropy.
****************************************


##### 3.4.2. MULTI representation.

In [None]:
# Mutual information of the order-`m` feature sets with the
# 'CMHD_control' caseness column, using the MULTI representation.
featuresetmi(
    featureSet_array=my_featureSet_array,
    casenessVector=caseness_array[['person_id', 'CMHD_control']],
    m=m,
    representation="multi",
)