<a href="https://colab.research.google.com/github/Cumberkid/Learning-the-Optimal-Solution-Path/blob/main/experiments/fair-regression/notebooks/01 Preprocessing Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

## Law School Admission Bar Passage Dataset
We use the Law School Admission Bar Passage dataset originally collected for a study called 'LSAC National Longitudinal Bar Passage Study' by Linda Wightman in 1998 and then port it over an object of the Pytorch dataset class so it can be read by data_loader.

Description of the dataset (classification): https://www.kaggle.com/datasets/danofer/law-school-admissions-bar-passage?resource=download

In [None]:
data_dir = '/content/Learning-the-Optimal-Solution-Path/experiments/fair-regression/data/'

In [4]:
csv_file_path = data_dir + 'bar_pass_prediction.csv'
df = pd.read_csv(csv_file_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22407 entries, 0 to 22406
Data columns (total 39 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   decile1b                 20803 non-null  float64
 1   decile3                  20803 non-null  float64
 2   ID                       22407 non-null  int64  
 3   decile1                  21315 non-null  float64
 4   sex                      22402 non-null  float64
 5   race                     22391 non-null  float64
 6   cluster                  22311 non-null  float64
 7   lsat                     22407 non-null  float64
 8   ugpa                     22407 non-null  float64
 9   zfygpa                   21423 non-null  float64
 10  DOB_yr                   22357 non-null  float64
 11  grad                     22404 non-null  object 
 12  zgpa                     21118 non-null  float64
 13  bar1                     22407 non-null  object 
 14  bar1_yr               

In [5]:
print("Feature Names:", "', '".join(df.columns))

Feature Names: decile1b', 'decile3', 'ID', 'decile1', 'sex', 'race', 'cluster', 'lsat', 'ugpa', 'zfygpa', 'DOB_yr', 'grad', 'zgpa', 'bar1', 'bar1_yr', 'bar2', 'bar2_yr', 'fulltime', 'fam_inc', 'age', 'gender', 'parttime', 'male', 'race1', 'race2', 'Dropout', 'other', 'asian', 'black', 'hisp', 'pass_bar', 'bar', 'bar_passed', 'tier', 'index6040', 'indxgrp', 'indxgrp2', 'dnn_bar_pass_prediction', 'gpa


In [6]:
df = df.dropna()
len(df)

20427

In [7]:
from sklearn.preprocessing import OneHotEncoder

# Assuming 'categorical_column' is a column that you want to one-hot encode
# Replace 'categorical_column' with the actual name of the categorical column
categorical_column = ['grad', 'gender', 'race1', 'race2', 'Dropout', 'indxgrp', 'indxgrp2']

# Create a DataFrame with the categorical column
categorical_df = df[categorical_column]

# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, drop='first')

# Fit and transform the categorical column
one_hot_encoded = encoder.fit_transform(categorical_df)

# Convert the one-hot encoded result to a DataFrame
one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out(categorical_column))


# Concatenate the one-hot encoded DataFrame with the original DataFrame, dropping the original categorical column
df_encoded = pd.concat([df.drop(columns=categorical_column), one_hot_encoded_df], axis=1)

df_encoded = df_encoded.dropna()


In [8]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18668 entries, 0 to 20426
Data columns (total 53 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   decile1b                 18668 non-null  float64
 1   decile3                  18668 non-null  float64
 2   ID                       18668 non-null  float64
 3   decile1                  18668 non-null  float64
 4   sex                      18668 non-null  float64
 5   race                     18668 non-null  float64
 6   cluster                  18668 non-null  float64
 7   lsat                     18668 non-null  float64
 8   ugpa                     18668 non-null  float64
 9   zfygpa                   18668 non-null  float64
 10  DOB_yr                   18668 non-null  float64
 11  zgpa                     18668 non-null  float64
 12  bar1                     18668 non-null  object 
 13  bar1_yr                  18668 non-null  float64
 14  bar2                  

In [9]:
print("Feature Names:", "', '".join(df_encoded.columns))

Feature Names: decile1b', 'decile3', 'ID', 'decile1', 'sex', 'race', 'cluster', 'lsat', 'ugpa', 'zfygpa', 'DOB_yr', 'zgpa', 'bar1', 'bar1_yr', 'bar2', 'bar2_yr', 'fulltime', 'fam_inc', 'age', 'parttime', 'male', 'other', 'asian', 'black', 'hisp', 'pass_bar', 'bar', 'bar_passed', 'tier', 'index6040', 'dnn_bar_pass_prediction', 'gpa', 'gender_male', 'race1_black', 'race1_hisp', 'race1_other', 'race1_white', 'race2_b white', 'race2_c other', 'indxgrp_b 400-460', 'indxgrp_c 460-520', 'indxgrp_d 520-580', 'indxgrp_e 580-640', 'indxgrp_f 640-700', 'indxgrp_g 700+', 'indxgrp2_b 400-460', 'indxgrp2_c 460-520', 'indxgrp2_d 520-580', 'indxgrp2_e 580-640', 'indxgrp2_f 640-700', 'indxgrp2_g 700-760', 'indxgrp2_h 760-820', 'indxgrp2_i 820+


In [10]:
# Choose the features to be used
cols = ['decile1b', 'decile3', 'ID', 'decile1', 'sex', 'race', 'cluster', 'lsat',
        'ugpa', 'zfygpa', 'DOB_yr', 'zgpa', 'fulltime', 'fam_inc', 'age', 'parttime',
        'male', 'other', 'asian', 'black', 'hisp', 'tier', 'index6040', 'gpa',
        'gender_male', 'race1_black', 'race1_hisp', 'race1_other', 'race1_white',
        'race2_b white', 'race2_c other', 'indxgrp_b 400-460', 'indxgrp_c 460-520',
        'indxgrp_d 520-580', 'indxgrp_e 580-640', 'indxgrp_f 640-700', 'indxgrp_g 700+',
        'indxgrp2_b 400-460', 'indxgrp2_c 460-520', 'indxgrp2_d 520-580', 'indxgrp2_e 580-640',
        'indxgrp2_f 640-700', 'indxgrp2_g 700-760', 'indxgrp2_h 760-820', 'indxgrp2_i 820+']

In [11]:
X = df_encoded[cols]
y = df_encoded['pass_bar']

Since the whole dataset is big, we take 1000 random subsamples.

In [12]:
sp = df_encoded.sample(n=1000, random_state=2024)

The subsampled dataset is also highly imbalanced.

In [13]:
passed = sp[sp['pass_bar'] == 1]
len(passed)

956

In [14]:
X_sp = sp[cols]
y_sp = sp['pass_bar']

In [15]:
pd.DataFrame(X_sp).describe()

Unnamed: 0,decile1b,decile3,ID,decile1,sex,race,cluster,lsat,ugpa,zfygpa,...,indxgrp_f 640-700,indxgrp_g 700+,indxgrp2_b 400-460,indxgrp2_c 460-520,indxgrp2_d 520-580,indxgrp2_e 580-640,indxgrp2_f 640-700,indxgrp2_g 700-760,indxgrp2_h 760-820,indxgrp2_i 820+
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,5.563,5.562,13798.149,5.743,1.578,6.559,2.769,37.0043,3.2431,0.08512,...,0.171,0.673,0.003,0.022,0.032,0.097,0.171,0.227,0.226,0.22
std,2.848182,2.835171,7996.022171,2.790831,0.494126,1.301619,1.345157,5.349013,0.355976,0.933104,...,0.376697,0.469352,0.054717,0.146757,0.176088,0.296106,0.376697,0.419102,0.418448,0.414454
min,1.0,1.0,2.0,1.0,1.0,1.0,1.0,18.0,2.2,-2.55,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,3.0,6802.0,3.0,1.0,7.0,1.0,34.0,3.0,-0.54,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,6.0,6.0,14149.5,6.0,2.0,7.0,3.0,37.0,3.3,0.105,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,8.0,8.0,20442.5,8.0,2.0,7.0,4.0,41.0,3.5,0.73,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,10.0,10.0,27447.0,10.0,2.0,8.0,6.0,48.0,3.9,2.76,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
# preprocess data
from sklearn.preprocessing import StandardScaler, RobustScaler

scaler = StandardScaler()
scaler = RobustScaler()
scaler.fit(X_sp)
X_sp = scaler.transform(X_sp)

pd.DataFrame(X_sp).describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,44
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,-0.0874,-0.0876,-0.025758,-0.0514,-0.422,-0.441,-0.077,0.000614,-0.1138,-0.01565354,...,0.171,-0.327,0.003,0.022,0.032,0.097,0.171,0.227,0.226,0.22
std,0.569636,0.567034,0.586197,0.558166,0.494126,1.301619,0.448386,0.764145,0.711953,0.7347278,...,0.376697,0.469352,0.054717,0.146757,0.176088,0.296106,0.376697,0.419102,0.418448,0.414454
min,-1.0,-1.0,-1.037169,-1.0,-1.0,-6.0,-0.666667,-2.714286,-2.2,-2.090551,...,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.6,-0.6,-0.538653,-0.6,-1.0,0.0,-0.666667,-0.428571,-0.6,-0.507874,...,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-6.071532e-18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.4,0.4,0.461347,0.4,0.0,0.0,0.333333,0.571429,0.4,0.492126,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.8,0.8,0.974854,0.8,0.0,1.0,1.0,1.571429,1.2,2.090551,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


We examine the performance of a predictor trained by an "out-of-the-box" logistic regression from scikit-learn.

In [17]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

In [18]:
clf = LogisticRegression(solver='lbfgs', max_iter=100).fit(X_sp, y_sp)

In [19]:
accuracy_score(y_sp, clf.predict(X_sp))

0.96

In [20]:
from sklearn.metrics import confusion_matrix

# ?confusion_matrix

This naive predictor gives a highly imbalanced prediction, which has a high false positive rate (FPR).

In [21]:
cm = confusion_matrix(y_sp, clf.predict(X_sp))
cm

array([[  6,  38],
       [  2, 954]])

In [22]:
TPR =  cm.ravel()[3] / (cm.ravel()[3] + cm.ravel()[2] )                               #TPR = TP / P = TP / (TP + FN)
print('TPR is: %.4f' % TPR)

FPR = cm.ravel()[1] / (cm.ravel()[1] + cm.ravel()[0] )                         #FPR = FP / N = FP / (FP + TN)
print('FPR is: %.4f' % FPR)

TPR is: 0.9979
FPR is: 0.8636


In [23]:
X = np.array(X_sp)
y = np.array(y_sp)

In [25]:
df_X = pd.DataFrame(X, columns=cols)
df_y = pd.DataFrame(y, columns=['pass_bar'])

# Save the DataFrame to a CSV file
df_X.to_csv('X_processed.csv', index=False)
df_y.to_csv('y_processed.csv', index=False)