In [None]:
from folktables import ACSDataSource, BasicProblem, generate_categories
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, matthews_corrcoef, confusion_matrix
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from scipy.stats import  pearsonr
import scipy
# seaborn is optional here: it is only used to restyle / draw the figures.
try:
    import seaborn as sns
    sns.set()
except ImportError:
    # Only a missing package is tolerated; a bare `except:` would also hide
    # KeyboardInterrupt, SystemExit and genuine bugs raised during set-up.
    print("Run without Seaborn")

### 1. Load and Preprocess the data
We are going to work with the [Folktables](https://github.com/socialfoundations/folktables#quick-start-examples) dataset (*you have already worked with it*).

1. As last week, we are still predicting the *Total person's income* (I've binarized it with `target_transform=lambda x: x > 25000`).
2. Today, we are going to implement two methods for data debiasing: [Fair PCA](https://deepai.org/publication/efficient-fair-pca-for-fair-representation-learning) and [A Geometric Solution to Fair Representations](https://dl.acm.org/doi/10.1145/3375627.3375864).
3. We are going to evaluate the performance on two sensitive features: `SEX` and `RAC1P` (we will consider only *Whites* and *African-Americans*)
4. I updated the filtering method `adult_filter` to keep the specified groups.


In [129]:
# 2018 one-year ACS person-level survey, restricted to California records.
data_source = ACSDataSource(
    survey_year='2018',
    horizon='1-Year',
    survey='person',
)
acs_data = data_source.get_data(
    states=["CA"],
    download=True,
)


def adult_filter(data):
    """Select the rows that pass the Adult-style cleaning rules.

    The Adult documentation describes the original extraction (Barry Becker,
    1994 Census database) with the conditions
    ((AAGE>16) && (AGI>100) && (AFNLWGT>1)&& (HRSWK>0)).

    The conditions applied here are: AGEP > 16, PINCP > 100, WKHP > 0,
    PWGTP >= 1 and, additionally, RAC1P < 3 (keep only Whites and
    African-Americans).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns AGEP, PINCP, WKHP, PWGTP and RAC1P.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` satisfying every condition, in original order.
    """
    # Rows with a missing value in any of these columns compare as False
    # and are therefore dropped, exactly as with one filter per step.
    keep_row = (
        (data['AGEP'] > 16)
        & (data['PINCP'] > 100)
        & (data['WKHP'] > 0)
        & (data['PWGTP'] >= 1)
        & (data["RAC1P"] < 3)
    )
    return data[keep_row]


# Prediction task: binary income label (PINCP above 25000) from demographic
# and employment features, with SEX and RAC1P reported as group attributes.
ACSIncomeNew = BasicProblem(
    features=[
        'AGEP',
        'COW',
        'SCHL',
        'MAR',
        'RELP',
        'WKHP',
        'PWGTP',
        'SEX',
        'RAC1P',
    ],
    target='PINCP',
    target_transform=lambda x: x > 25000,
    group=['SEX', 'RAC1P'],
    preprocess=adult_filter,
    # `nan` must be passed by keyword: the second positional argument of
    # np.nan_to_num is `copy`, so `np.nan_to_num(x, -1)` replaced NaN with 0.0
    # instead of the intended -1.
    postprocess=lambda x: np.nan_to_num(x, nan=-1),
)

In [130]:
# Fetch the ACS variable definitions and build the category mapping
# for the features of the task.
definition_df = data_source.get_definitions(download=True)
categories = generate_categories(
    features=ACSIncomeNew.features,
    definition_df=definition_df,
)

# dummies=True one-hot encodes the categorical features.
features, labels, groups = ACSIncomeNew.df_to_pandas(
    acs_data,
    categories=categories,
    dummies=True,
)
features.head()

Unnamed: 0,AGEP,WKHP,PWGTP,"COW_Employee of a private for-profit company or business, or of an individual, for wages, salary, or commissions","COW_Employee of a private not-for-profit, tax-exempt, or charitable organization",COW_Federal government employee,"COW_Local government employee (city, county, etc.)","COW_Self-employed in own incorporated business, professional practice or farm","COW_Self-employed in own not incorporated business, professional practice, or farm",COW_State government employee,...,RELP_Parent-in-law,RELP_Reference person,RELP_Roomer or boarder,RELP_Son-in-law or daughter-in-law,RELP_Stepson or stepdaughter,RELP_Unmarried partner,SEX_Female,SEX_Male,RAC1P_Black or African American alone,RAC1P_White alone
0,21.0,20.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,65.0,8.0,33.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,33.0,40.0,53.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,18.0,18.0,106.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,27.0,50.0,23.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


#### 1.1. Pre-processing
We encoded the categorical features using *one-hot encoding*: 
1. For example, column `SEX` is split into two columns `SEX_Male` and `SEX_Female`. 
2. You can argue that one of these columns is redundant (when one attribute is equal to 1, the other is equal to 0). 
3. You can extend this line of thought to a multi-category setting. Let's assume I have a feature with three categories *dog*, *cat* and *monkey* (which are represented using *one-hot encoding*). If both *cat* and *dog* are equal to 0, then *monkey* is equal to 1. In theory, I can drop the *monkey* category without losing any information. 
4. Depending on your task, you might want to keep these columns, but today we are going to assume that those columns are *redundant*. 

In [131]:
# Drop the "redundant" columns: one reference dummy per one-hot encoded variable.
redundant_columns = [
    "RAC1P_White alone",
    "SEX_Male",
    "SCHL_1 or more years of college credit, no degree",
    "MAR_Divorced",
    "RELP_Adopted son or daughter",
    'COW_Working without pay in family business or farm',
]
# `features` is reassigned in place, so without errors="ignore" re-running this
# cell would raise a KeyError because the columns are already gone.
features = features.drop(columns=redundant_columns, errors="ignore")

# List the positional index of every remaining column derived from a
# protected attribute (SEX or RAC1P).
print("Columns with the protected features:")
for i, f in enumerate(features.columns):
    if ("RAC1P" in f) or ("SEX" in f):
        print(f"Column ID: {i} ({f})")

Columns with the protected features:
Column ID: 54 (SEX_Female)
Column ID: 55 (RAC1P_Black or African American alone)


In [132]:

# 70/30 shuffled split with a fixed seed; features and labels are passed as
# plain arrays, the group frame is split alongside them.
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    features.values,
    labels.values.reshape(-1),
    groups,
    test_size=0.3,
    random_state=0,
    shuffle=True,
)

N = 20000 ### I am subsampling because it is slow on my machine

# Keep at most the first N rows of every piece of the split.
X_train, y_train, group_train, X_test, y_test, group_test = (
    part[:N]
    for part in (X_train, y_train, group_train, X_test, y_test, group_test)
)

### 2. Correlations in Data
Let's look at the correlations in our data. For the sake of time, we are going to assume only *linear interactions* between the features and just use **Pearson's Correlation Coefficient**. Since we have one-hot encoded the categorical variables, it is more or less OK to use **Pearson's Correlation**, though in your projects I advise using more appropriate measures such as Cramér's V for categorical-categorical correlation -- see [dython](https://shakedzy.xyz/dython/modules/nominal/).
1. Normalize the continuous features, i.e. the first three columns (`AGEP`, `WKHP`, `PWGTP`).
2. Use `scipy.stats.pearsonr` to estimate correlations in your data. This function outputs *correlation* and *p-value*. 
3. Use `seaborn.heatmap` to plot correlations (only plot correlations with the significant *p-values*)
4. What features correlate with the protected features?

In [135]:
n_features = X_train.shape[1]
alpha = 0.05 ## significance level
# Bonferroni correction for multiple testing: the significance level is divided
# by n_features^2 / 2, roughly the number of distinct feature pairs.
corrected_alpha = alpha / (n_features * n_features / 2)

# Square matrices to be filled with the pairwise correlation coefficients
# and their p-values.
corr = np.zeros((n_features, n_features))
p = np.zeros_like(corr)

##############################
######### Your code here
##############################


In [136]:
#########################
### Visualise full correlation matrix
#########################
#plt.figure(figsize=(15,15))
#sns.heatmap(... mask= p > corrected_alpha)
#plt.show()

In [138]:
#########################
### Visualise columns of the correlation matrix that are associated only with protected features (see Lecture Slides)
#########################

#### 3. Simple Logistic Regression (by dropping the protected attributes)
1. Drop the protected attributes from the training and test data
2. Let's train a simple logistic regression (**WITHOUT** any penalty). If your model does not converge, then increase `max_iter`. 
3. Use your favourite **fairness metric** (e.g. False Positive Rate) to see how your model performs based on the protected feature (you have 2 groups for `SEX` and 2 groups for `RAC1P`) - use `group_test`. 
4. Use your favourite **evaluation metric** to find out how your model performed in general.

In [140]:
####################
##### YOUR CODE HERE
####################

#### 4. Fair Representation
Now, we are going to implement the method from [A Geometric Solution to Fair Representations](https://dl.acm.org/doi/10.1145/3375627.3375864): we want to remove protected features from the dataset, plus, remove any existing correlations.
1. Let's split our pre-processed `X_train` into chunks that contain only protected features (aka columns associated with `SEX` and `RAC1P`) and non-protected features.
2. Implement a method that outputs debiased data representation:
    1. Find the orthonormal basis spanned by the columns of **protected** features  -- use `scipy.linalg.orth`
    2. Project **non-protected** features onto this basis (in the paper: $ \mathbf{P}_f\mathbf{x}_i$  )
    3. Complete the solution according to $\mathbf{r}_i = \mathbf{x}_i - \mathbf{P}_f\mathbf{x}_i$ 
3. Look at the correlations between the **newly acquired non-protected features** and the original protected features. What do you see?
    1. Change the fairness penalty term and describe how it changes the correlation structure in data.
4. Train a logistic regression model (WITHOUT any penalty) with the **newly acquired non-protected** features. 
    1. According to the paper - the coefficients of the model (aka betas) that you obtain should be debiased - and you do not have to transform your test set (i.e. project it into unbiased space).
    2. Use the non-protected features of the test set to evaluate the performance of the model (with fairness penalty of 1) on different protected subgroups. What do you see?

In [234]:
## last columns of our data contains the protected features
#protected = ...[:,54:] 
#nonprotected = ...[:,:54]

In [235]:
def debias_data(nonprotected, protected):
    """Remove from the non-protected features everything linearly explained by the protected ones.

    Follows the three steps of the geometric fair-representation recipe:
    build an orthonormal basis of the column space of ``protected``, project
    each non-protected column onto it, and return the residual
    ``x - P_f x``.

    Parameters
    ----------
    nonprotected : array-like, shape (n_samples, n_features)
        Columns to be debiased.
    protected : array-like, shape (n_samples,) or (n_samples, n_protected)
        Protected attribute column(s).

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``nonprotected`` whose columns are orthogonal
        to every column of ``protected``.

    Raises
    ------
    ValueError
        If the two inputs do not have the same number of rows.

    Notes
    -----
    Orthogonality is with respect to the plain dot product. It coincides with
    zero Pearson correlation only when the columns are mean-centred.
    """
    # Local import: the notebook only does `import scipy`, which does not
    # guarantee that the `linalg` submodule is loaded on every SciPy version.
    from scipy.linalg import orth

    nonprotected = np.asarray(nonprotected, dtype=float)
    protected = np.asarray(protected, dtype=float)
    if protected.ndim == 1:
        protected = protected[:, np.newaxis]
    if nonprotected.shape[0] != protected.shape[0]:
        raise ValueError(
            "nonprotected and protected must have the same number of rows "
            f"(got {nonprotected.shape[0]} and {protected.shape[0]})"
        )

    # Orthonormal basis of span(protected), shape (n_samples, rank).
    basis = orth(protected)
    # P_f x = B (B^T x); multiplying in this order avoids forming the
    # (n_samples x n_samples) projection matrix explicitly.
    projection = basis @ (basis.T @ nonprotected)
    return nonprotected - projection



#### 5. Fair PCA
We are going to implement the method from [Efficient fair PCA for fair representation learning](https://deepai.org/publication/efficient-fair-pca-for-fair-representation-learning). Here, we will use dimensionality reduction to remove any existing proxies associated with the protected features
1. We are going to work with the pre-processed `X_train` (I'll further refer to it as $X$).
2. Create a matrix $Z$ that contains protected features. **Z = [SEX, RAC1P]**. 
3. Remove mean from each column of $Z$. 
4. Find an orthonormal basis of the null space of $\mathbf{Z}^T\mathbf{X}$ - use `scipy.linalg.null_space`. This is our matrix $\mathbf{R}$ (columns are basis vectors).
5. Now we need to find orthonormal eigenvectors of $\mathbf{R}^T\mathbf{X}^T\mathbf{X}\mathbf{R}$ - use `scipy.linalg.eig` (it outputs both eigenvalues and eigenvectors; the order of the eigenvalues is not guaranteed, so sort the eigenvectors by decreasing eigenvalue — since the matrix is symmetric, `scipy.linalg.eigh` is an alternative that returns them in ascending order). Let's use the 30 eigenvectors with the largest eigenvalues. Now we have matrix $\mathbf{L}$ (columns are eigenvectors).
6. Finally, we can find the projection matrix $\mathbf{U} = \mathbf{R}\mathbf{L}$.
7. Now, we can use $\mathbf{U}$ to project our data into fair space. $\mathbf{X}' = \mathbf{XU}$
8. Let's see if the projected columns of $\mathbf{X}'$ are correlated with columns of $\mathbf{Z}$.
9. Use $\mathbf{X}'$ to train another logistic regression WITHOUT any penalty. 
10. See how the model performs on different groups (based on the protected features). Don't forget to use the projection matrix $\mathbf{U}$ to project `X_test` into the fair dimensions.

In [None]:
####################
##### YOUR CODE HERE
####################