<a href="https://colab.research.google.com/github/Consmart18/Debiasing-Student-Algorithms/blob/main/Virgin_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Debiasing algorithms for student progress monitoring
### Evaluating fairness of the virgin models (i.e. models with no debiasing stragegy applied).

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the data
projectdata_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Project files/studentinfo_updated_150.csv')

In [4]:
# drop student identifier
df = projectdata_df.drop(['id_student', 'score_tma3', 'score_tma4', 'score_tma5', 'sum_interaction_90',
                          'sum_interaction_120', 'sum_interaction_150',], axis=1)

In [5]:
# fill missing values in imd_band and convert to label
mapping = {'0-10%': 1, '10-20%': 2, '20-30%': 3, '30-40%': 4, '40-50%': 5,
           '50-60%': 6, '60-70%': 7, '70-80%': 8, '80-90%': 9, '90-100%': 10}
df['imd_band'] = df['imd_band'].map(mapping)

# fill missing values with mode
df['imd_band'].fillna(df['imd_band'].mode()[0], inplace=True)

# convert age_band to label as well
mapping = {'0-35': 0, '35-55': 1, '55<=': 1}
df['age_band'] = df['age_band'].map(mapping)

In [6]:

# One-hot encode the 'code_module' column
df = pd.get_dummies(df, columns=['code_module'])

# Split the 'code_presentation' column into 'year' and 'term'
df['year'] = df['code_presentation'].str[:4]
df['term'] = df['code_presentation'].str[4:]

# Label encode the 'term' column
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['term'] = le.fit_transform(df['term'])

# Drop the 'code_presentation' column
df = df.drop(columns=['code_presentation'])

In [7]:
df['year'] = df['year'].astype(int)

In [8]:
# Change gender and disability to binary

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])
df['disability'] = le.fit_transform(df['disability'])

In [9]:
# one-hot encode region and highest education

df = pd.get_dummies(df, columns=['region'], prefix = ['Reg'])

df = pd.get_dummies(df, columns=['highest_education'], prefix = ['Edu'])

In [10]:
# Combine pass and distinction to be 'Pass'; also combine Fail & Withdrawn to be 'At Risk'
df['final_result'] = df['final_result'].replace(['Pass', 'Distinction'], 'Pass')
df['final_result'] = df['final_result'].replace(['Fail', 'Withdrawn'], 'At Risk')
# final_result to binary
mapping = {'Pass': 0, 'At Risk': 1}
df['final_result'] = df['final_result'].map(mapping)

In [11]:
X = df.drop('final_result', axis=1)  # Features
y = df['final_result']  # Target variable

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train the model
lr = LogisticRegression(C=1, penalty='l2', solver='liblinear', max_iter=3000)
lr.fit(X_train, y_train)

# Make predictions
y_pred_lr = lr.predict(X_test)

# Print out accuracy
print('Logistic Regression accuracy:', accuracy_score(y_test, y_pred_lr))

Logistic Regression accuracy: 0.7786868480261813


In [None]:
pip install holisticai

In [14]:
def compute_fairness(sensitive_feature):

  # Group data by sensitive feature
  group_a = (X_test[sensitive_feature] == 1).values
  group_b = (X_test[sensitive_feature] == 0).values

  # Calculate ABROCA
  from holisticai.bias.metrics import abroca
  abroca_value = abroca(group_a, group_b, lr.predict_proba(X_test)[:, 1], y_test)
  print(f'ABROCA value: {abroca_value}')

  # Calculate Average Odds Difference
  from holisticai.bias.metrics import average_odds_diff
  aod_value = average_odds_diff(group_a, group_b, y_pred_lr, y_test)
  print(f'Average Odds Difference value: {aod_value}')

  # Calculate Equal Opportunity Difference
  from holisticai.bias.metrics import equal_opportunity_diff
  eod_value = equal_opportunity_diff(group_a, group_b, y_pred_lr, y_test)
  print(f'Equal Opportunity Difference value: {eod_value}')

In [15]:
compute_fairness('gender')

ABROCA value: 0.023879272633336335
Average Odds Difference value: 0.03867455700632337
Equal Opportunity Difference value: 0.06569408523113474


In [16]:
compute_fairness('disability')

ABROCA value: -0.00366140020476613
Average Odds Difference value: 0.1268815049844077
Equal Opportunity Difference value: 0.10674794722788428


In [17]:
compute_fairness('age_band')

ABROCA value: -0.0018113535708093043
Average Odds Difference value: -0.03572590059447479
Equal Opportunity Difference value: -0.03022592396649948


### Gradient Boosting

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [19]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient Boosting
gb = GradientBoostingClassifier(learning_rate=0.1, max_depth=4, min_samples_leaf=50, n_estimators=240, random_state=42)
gb.fit(X_train, y_train)

# Make predictions
y_pred_gb = gb.predict(X_test)

# Print out accuracy
print('Gradient Boosting accuracy:', accuracy_score(y_test, y_pred_gb))

Gradient Boosting accuracy: 0.8009817958682757


In [20]:
def compute_fairness1(sensitive_feature):

  # Group data by gender
  group_a = (X_test[sensitive_feature] == 1).values
  group_b = (X_test[sensitive_feature] == 0).values

  # Calculate ABROCA
  from holisticai.bias.metrics import abroca
  abroca_value = abroca(group_a, group_b, gb.predict_proba(X_test)[:, 1], y_test)
  print(f'ABROCA value: {abroca_value}')

  # Calculate Average Odds Difference
  from holisticai.bias.metrics import average_odds_diff
  aod_value = average_odds_diff(group_a, group_b, y_pred_gb, y_test)
  print(f'Average Odds Difference value: {aod_value}')

  # Calculate Equal Opportunity Difference
  from holisticai.bias.metrics import equal_opportunity_diff
  eod_value = equal_opportunity_diff(group_a, group_b, y_pred_gb, y_test)
  print(f'Equal Opportunity Difference value: {eod_value}')

In [21]:
compute_fairness1('gender')

ABROCA value: 0.00919327296763206
Average Odds Difference value: 0.030460925628085153
Equal Opportunity Difference value: 0.04188903439814495


In [22]:
compute_fairness1('age_band')

ABROCA value: 0.010929679459563135
Average Odds Difference value: -0.04251131969733572
Equal Opportunity Difference value: -0.023461313220187408


In [23]:
compute_fairness1('disability')

ABROCA value: 4.272996665322104e-05
Average Odds Difference value: 0.07830895147560069
Equal Opportunity Difference value: 0.06229020410767061


### Random Forest

In [24]:
from sklearn.ensemble import RandomForestClassifier
# Random Forest
rf = RandomForestClassifier(max_depth=15, min_samples_leaf=2, min_samples_split=5, n_estimators=200, random_state=42)
rf.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf.predict(X_test)

# Print out accuracy
print('Random Forest accuracy:', accuracy_score(y_test, y_pred_rf))

Random Forest accuracy: 0.7944364900797709


In [25]:
def compute_fairness2(sensitive_feature):

  # Group data by gender
  group_a = (X_test[sensitive_feature] == 1).values  # assuming 1 represents 'male'
  group_b = (X_test[sensitive_feature] == 0).values  # assuming 0 represents 'female'

  # Calculate ABROCA
  from holisticai.bias.metrics import abroca
  abroca_value = abroca(group_a, group_b, rf.predict_proba(X_test)[:, 1], y_test)
  print(f'ABROCA value: {abroca_value}')

  # Calculate Average Odds Difference
  from holisticai.bias.metrics import average_odds_diff
  aod_value = average_odds_diff(group_a, group_b, y_pred_rf, y_test)
  print(f'Average Odds Difference value: {aod_value}')

  # Calculate Equal Opportunity Difference
  from holisticai.bias.metrics import equal_opportunity_diff
  eod_value = equal_opportunity_diff(group_a, group_b, y_pred_rf, y_test)
  print(f'Equal Opportunity Difference value: {eod_value}')

In [26]:
compute_fairness2('gender')

ABROCA value: 0.011016568194867515
Average Odds Difference value: 0.027222656521885866
Equal Opportunity Difference value: 0.037707669878003225


In [27]:
compute_fairness2('disability')

ABROCA value: 0.0010926757142766252
Average Odds Difference value: 0.05273230113442286
Equal Opportunity Difference value: 0.04727831124369286


In [28]:
compute_fairness2('age_band')

ABROCA value: 0.004582464076055115
Average Odds Difference value: -0.03813503369202035
Equal Opportunity Difference value: -0.01616614846647646
