In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale

In [228]:
from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import numpy as np

IC* Algorithm with Missing Data

SkeletonFinder

PairwiseConditionalIndependenceTester
    - variables to compare
    - conditioning set
    
    independent(cutoff?): boolean

ImmoralitiesFinder

DataSplitter

DataGenerator

ConditionalIndependenceTester

In [5]:
df = pd.DataFrame({ 'hello': ['a','b','c'], 'hi': ['1', '2', '3']})

In [227]:
class ConditionallyIndependentDataGenerator:
    """
        @param data [pandas.DataFrame] Part of the columns are the conditioning_set_names 
            and swappables as columns. 
        
        @param swappables [List[String]] The variable we're copying from half of the data 
        set.
        
        @param conditioning_set_names [List[String]] List of String names of what we're 
        conditioning on (i.e. Z in the example above). Defaults to empty list.
        
        Main idea is to generate a data set that satisfies X _||_ Y | Z: X is independent of
        Y given Z, which is satisfied if f(x|z)f(y|z)f(z) = f(x,y,z). We're able to do this
        by splitting the data set into two. For the first data set, find the 1-Nearest 
        Neighbors from the second data set using z. Then pick the Y from the second data set 
        and set that as the new Y for the first data set. This amounts to sampling f(y | z).
        We already have f(x|z) and f(z) in the first data set, so by returning the first data 
        set, we have f(x|z)f(y|z)f(z) = f(x,y,z).
    """
    def __init__(self, data, swappables, conditioning_set_names=[]):
        self.swappables = swappables
        self.conditioning_set_names = conditioning_set_names
        self.data = data
        
        half_length = int(data.shape[0] / 2)
            
        self.data_1 = data.iloc[:half_length].copy()
        self.data_2 = data.iloc[half_length:self.__end_index__()].copy()
        
        assert len(swappables) >= 1
        assert self.data_1.shape[0] == self.data_2.shape[0]
        

    def generate(self):
        """
            Generates a conditionally independent data set.

            @return [pandas.DataFrame] DataFrame with half the length of the original data passed in
            the init method.
        """
        columns = self.data_1.columns 
        not_swappables = list(set(columns) - set(self.swappables))
            
        if len(self.conditioning_set_names) == 0:
            # shuffle the swappable part to break dependence between "swappable" and 
            # "not swappable" columns
            
            return pd.concat(
                [
                    self.data_1[not_swappables].reset_index(drop=True),
                    self.data_1[self.swappables]\
                        .sample(n=self.data_1.shape[0]).reset_index(drop=True)
                ],
                axis=1
            )
        
        data_1_cond_set = self.data_1[self.conditioning_set_names].copy()
        data_2_cond_set = self.data_2[self.conditioning_set_names].copy()
    
        nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').\
            fit(data_1_cond_set.values)
        
        _, indices = nbrs.kneighbors(data_2_cond_set.values)
        
        data_2_nearest_neighbors = self.data_2.iloc[indices.reshape(1, indices.shape[0])[0]]
        
        data_1_concatables = self.data_1[not_swappables]
        data_2_concatables = data_2_nearest_neighbors[self.swappables]
        
        return pd.concat(
            [
                data_1_concatables.reset_index(drop=True),
                data_2_concatables.reset_index(drop=True)
            ],
            axis=1
        )
    
    def __end_index__(self):
        """
            Meant to help enforce equal sizes for the two data sets to make later steps 
            (e.g swapping) easier.
        """
        if self.data.shape[0] % 2 != 0:
            return self.data.shape[0] - 1
        else: 
            return self.data.shape[0]
        

In [224]:
class ConditionalIndependenceTester:
    """
        @param data [pandas.DataFrame] 
        @param comparables_1 [List[Strings]] List of variable names.
        @param comparables_2 [List[Strings]] List of variable names.
        @param conditioning_set = [List[Strings]] The list of variable names 
            corresponding to the conditioning set.
    """
    def __init__(self, data, comparables_1, comparables_2, conditioning_set=[]):
        self.data = data
        self.comparables_1 = comparables_1
        self.comparables_2 = comparables_2
        self.conditioning_set = conditioning_set
        
        assert len(self.comparables_1) >= 1
        assert len(self.comparables_2) >= 1
        

    def is_independent(self):
        """
            Tests the conditional independence X _||_ Y | Z.

            X represents comparables_1 param (passed in to the init method).
            Y represents comparables_2 param (passed in to the init method).
            Z represents the conditioning_set (passed in to the init method).
                Could be empty.

            @return boolean
        """
        data_1_length = int(self.data.shape[0] / 3)
        data_1 = self.data.iloc[0:data_1_length].copy()
        data_2 = self.data.iloc[data_1_length:]
        
        cid_generator = ConditionallyIndependentDataGenerator(
            data=data_2, 
            swappables=self.comparables_2
        )
        
        conditionally_indep_data = cid_generator.generate()
        
        data_1['label'] = 1
        conditionally_indep_data['label'] = 0
        
        train_and_test_data = pd.concat([
            data_1,
            conditionally_indep_data
        ])
        
        predictors = list(set(train_and_test_data.columns) - set(['label']))
        
        X = train_and_test_data[predictors].values
        y = train_and_test_data['label']
        
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.5, stratify=y
        )
        
        clf = RandomForestClassifier()
        clf.fit(X_train, y_train)
        return clf.score(X_test, y_test) <= 0.54

In [225]:
def percent_independent(results):
    """
        @param results [List[Boolean]] True if independent, False if dependent
        
        @return percent of true count / results count
    """
    
    return results.sum() / len(results)

In [23]:
def test_a_bunch(func, num_times=1000, size=1000):
    results = []
    
    for i in range(num_times):
        results.append(func(size))
    
    return np.array(results)

In [24]:
def dependent_two_vars(size=1000):
    x = np.random.normal(size=size)
    u = np.random.normal(size=size)
    y = x + u
    uncond_indep_df = pd.DataFrame({'x': x, 'y': y})

    tester = ConditionalIndependenceTester(data=uncond_indep_df, comparables_1=['x'], comparables_2=['y'])
    return tester.is_independent()

In [25]:
def independent_two_vars(size=1000):
    x = np.random.normal(size=size)
    y = np.random.normal(size=size)
    df = pd.DataFrame({'x': x, 'y': y})

    tester = ConditionalIndependenceTester(data=df, comparables_1=['x'], comparables_2=['y'])
    return tester.is_independent()

Test cases:



* $X \perp Y | Z$
    - fork
    - chain
    
* $X \not\perp Y | Z$
    - collider case
    - fork with unobserved confounding
    - chain with unobserved confounding

* Variable type
    - Discrete variables
    - Continuous variables
    - Mix of Discrete and Continuous variables

* Positivity


In [None]:
percent_independent(test_a_bunch(func=dependent_two_vars, size=10000))

In [321]:
percent_independent(test_a_bunch(func=independent_two_vars, size=10000))

1.0

In [315]:
percent_independent(test_a_bunch(func=dependent_two_vars, size=1000))

0.034

In [316]:
percent_independent(test_a_bunch(func=independent_two_vars, size=1000))

0.917

In [318]:
percent_independent(test_a_bunch(func=dependent_two_vars, size=100))

0.415

In [319]:
percent_independent(test_a_bunch(func=independent_two_vars, size=100))

0.659

In [294]:
dep_two_vars_results.sum()

0

In [226]:
x = np.random.normal(size=1000)
y = np.random.normal(size=1000)
uncond_indep_df = pd.DataFrame({'x': x, 'y': y})

tester = ConditionalIndependenceTester(data=uncond_indep_df, comparables_1=['x'], comparables_2=['y'])
tester.is_independent()

True

In [218]:
class EuclideanDistanceDataPreprocessor:
    """
        @param data [pandas.DataFrame] 
            Each column represents data from a variable.  
        @param continuous_var_names [List[String]]
            Names of variables that are continuous.
        @param multi_choice_categorical_var_names [List[String]]
            Names of variables where each there could be multiple
            values per row.
                e.g. Race: Someone could be Black ([1 0]).
                    Another could be Asian ([0 1]).
                    And another could identify as being both Black & Asian ([1 1]).
        @param single_choice_categorical_var_names [List[String]]
            Names of variables. Makes sense when there's only one option 
            possible per category.
                e.g. Height:
                    - Under 5 ft.
                    - Between 5 ft. and 6 ft.
                    - Over 6 ft.
                    
        This class is meant to help prepare data for the ConditionalIndependenceTester 
        class, which uses Nearest Neighbor methods. Nearest Neighbor methods have a 
        notion of distance. We vectorize the data so we can do nearest neighbor methods
        on them.
        
        This Preprocessor makes data amenable to nearest neighbor methods that use
        the Euclidean distance as the distance metric.
        
        It does this by: 
        
        1. Representing all columns as vectors of floats (i.e. no strings). 
        
        2. Scaling the variables so that their effects on the distance 
            metric is about the same.

        Here's the strategy that is being used here to implement that:
        
        For continuous data, we will scale the data between 0 and 1.
        
        For non-ordinal categorical data, there are two situations to consider:
            - single-choice categorical columns
            - multi-choice categorical columns
         
        For single-choice categorical columns, we could dummify / one-hot-encode 
        them. Once they are one-hot encoded, we could scale the
        one-hot-encoding by 1/sqrt(2) so that categorical data weighs the 
        same as a continuous variable.
        
        For multi-choice categorical columns, we could dummify / one-hot-encode
        them, just like we did for the single-choice categorical columns. 
        However, instead of scaling by 1/sqrt(2), we scale by some function that
        takes into account the cardinality of the variable: 1/sqrt(m), where m
        is the cardinality of the multi-choice variable. This scaling will make 
        multi-choice categorical columns be weighed the same as other types of 
        variables.
        
        Example:
        
        Single-choice example; Let's say there's a single-choice variable such as 
        a discretized version of Height (e.g. one could be below 5 feet, be between
        5 feet and 6 feet, or be 6 feet and greater).
        
        In terms of the worst distance possible between two people, let's say
        one individual identifies as below 5 feet, and another identifies
        as above 6 feet. The former will be represented by:
        
            [1, 0, 0] * 1 / sqrt(2) = [0.707, 0, 0]
        
        While the latter will be represented by:
        
            [0, 0, 1] * 1 / sqrt(2) = [0, 0, 0.707]
            
        The Euclidean distance between the two would be sqrt(2(0.707-0)^2) = 1.
            
            
        Multi-choice example: Let's say there's a multi-choice variable such as
        Race is 3. In this contrived example, let's say that variable can take 
        one or more values in [Black, Asian, White].
        
        In terms of the worst distance possible between two people, let's say
        one individual identifies as Black, Asian, and White, and another identifies
        as none of those. The former will be represented by:
        
            [1, 1, 1] * 1 / sqrt(3) = [0.577, 0.577, 0.577]
        
        While the latter will be represented by:
        
            [0, 0, 0] * 1 / sqrt(3) = [0, 0, 0]
            
        The Euclidean distance between the two would be sqrt(3(0.577-0)^2) = 1.
        
        Finally, let's consider the continuous case. Let's say there's a variable called
        Weight, which is some continuous variable between 10 & 300 pounds. Let's say that
        one individual is 10 lbs and another is 300 lbs. The first individual will be
        represented as 0, and the other as 1. So the Euclidean distance between the two is 
        
            sqrt((1-0)^2)= 1
            
        As we can see from the two scenarios, the scaling factor enables us to make
        each variable have the same impact on the Euclidean distance metric as other
        variables, regardless of the variable being discrete or continuous.
    """
    
    SINGLE_CHOICE_SCALER = 0.707
    
    def __init__(
        self, 
        data, 
        continuous_var_names, 
        multi_choice_categorical_var_names, 
        single_choice_categorical_var_names
    ):
        self.data = data
        self.continuous_var_names = continuous_var_names
        self.multi_choice_categorical_var_names = \
            multi_choice_categorical_var_names
        self.single_choice_categorical_var_names = \
            single_choice_categorical_var_names\
            
    def preprocess(self):
        """
            @return [pandas.DataFrame] where each row is a vector that is amenable to 
            computing the Euclidean distance between that and other rows.

            For continuous variables, their names would stay the same (e.g. if 
            "weight" was passed in the list of continuous_var_names, then we would see
            "weight" as a column name. Values would be scaled between 0 and 1.)

            For single-choice categorical variables, their dummy-fied names would be 
            appended by the values (e.g. if there's a single-choice categorical 
            variable named "favorite color" and the possible answers are "red", 
            "white", and "green", then we will have three columns: 
                - "favorite color | red", 
                - "favorite color | white", and 
                - "favorite color | green"

            For multi-choice categorical variables, their dummy-fied names would be 
            appended by the values (e.g. if there's a multi-choice categorical 
            variable named "favorite colors" and the possible answers are "red", 
            "white", and "green", then we will have three columns: 
                - "favorite color | red", 
                - "favorite color | white", and 
                - "favorite color | green"
        """
        self._dummify_categorical_columns()
        self._scale_single_choice_cat_columns()
        self._scale_multi_choice_cat_columns()
        self._scale_continuous_columns()
        
        return pd.concat(
            [
                self.single_choice_cat_columns,
                self.multi_choice_cat_columns,
                self.continuous_columns
            ],
            axis=1
        )

    def _dummify_categorical_columns(self):
        self.multi_choice_cat_columns = pd.DataFrame()
        self.single_choice_cat_columns = pd.DataFrame()
        
        for multi_choice_categorical_var_name in self.multi_choice_categorical_var_names:
                
            
            self.multi_choice_cat_columns = pd.concat(
                [
                    self.multi_choice_cat_columns,
                    self.data[multi_choice_categorical_var_name]\
                        .explode()\
                        .str\
                        .get_dummies()\
                        .sum(level=0)\
                        .add_prefix(multi_choice_categorical_var_name + ' | ')
                ],
                axis=1
            )
            
        for single_choice_categorical_var_name in self.single_choice_categorical_var_names:
            self.single_choice_cat_columns = pd.concat(
                [
                    self.single_choice_cat_columns,
                    pd.get_dummies(
                        self.data[single_choice_categorical_var_name], 
                        prefix=single_choice_categorical_var_name, 
                        prefix_sep=' | '
                    )
                ],
                axis=1
            )
    
    def _scale_single_choice_cat_columns(self):
        self.single_choice_cat_columns = self.single_choice_cat_columns * self.SINGLE_CHOICE_SCALER
        
    def _scale_multi_choice_cat_columns(self):        
        df = pd.DataFrame()
        
        for multi_choice_categorical_var_name in self.multi_choice_categorical_var_names:
            sub_columns = self.multi_choice_cat_columns.columns[
                self\
                    .multi_choice_cat_columns\
                    .columns\
                    .str\
                    .contains(multi_choice_categorical_var_name + ' | ')
            ]
            
            scaler = 1.0 / np.sqrt(len(sub_columns))
            
            df = pd.concat(
                [
                    df,
                    self.multi_choice_cat_columns * scaler
                ], 
                axis=1
            )
            
        self.multi_choice_cat_columns = df
        
    def _scale_continuous_columns(self):
        self.continuous_columns = \
            pd.DataFrame(
                minmax_scale(
                    self.data[self.continuous_var_names]
                ), 
                columns=self.continuous_var_names
            )

In [221]:
continuous_var_names = ['continuous']
multi_choice_categorical_var_names = ['cat_2']
single_choice_categorical_var_names = ['cat_1']

In [222]:
preprocessor = EuclideanDistanceDataPreprocessor(
    continuous_var_names=continuous_var_names,
    data=sample_df,
    multi_choice_categorical_var_names=multi_choice_categorical_var_names,
    single_choice_categorical_var_names=single_choice_categorical_var_names
)

preprocessor.preprocess()

Unnamed: 0,cat_1 | blue,cat_1 | red,cat_1 | white,cat_2 | a,cat_2 | b,cat_2 | c,continuous
0,0.0,0.707,0.0,0.57735,0.0,0.0,0.935169
1,0.0,0.0,0.707,0.0,0.57735,0.0,0.0
2,0.707,0.0,0.0,0.57735,0.57735,0.57735,1.0
