In [117]:
# Importing the basics
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch.nn.functional as F
import matplotlib.pyplot as plt

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

TEST_PROPORTION = 0.15
VAL_PROPORTION = 0.1
FILE_PATH = "clean_dataset.csv"

cpu


In [118]:
#Job resume categorization, callback or no callback, categorize without attributes to bias it such as names or gender,
#dataset is randomized and sent to real job applications that gave or did not give callbacks.

#A test set where we can verify there is no bias to name or gender would be great, only based on experience or schooling, since there is a chance there could
#be bias in the callbacks in the data

#Maybe a more general model by combining similar datasets? As this one is strictly 2 cities in the USA during 2001-2002

#If the vector at the end can be linked back to either name or gender or [insert other attribute that should not have bias], the AI is still biased.

#If anyone is reading this, would it be possible to drop the chosen dataset's/datasets' zip file/s in the discord?

In [119]:
# Read Dataset
data = pd.read_csv(FILE_PATH)

# Drop zipcodes
data = data.drop(["ZipCode","Ethnicity"], axis=1)

# One hot encode categorical attributes.
columns = ["Industry", "Citizen"]
data = pd.get_dummies(data=data, columns=columns, dtype=int)
ones = 0
zeros = 0
for index, row in data.iterrows():
    if row['Gender']==1:
        ones+=1
    else:
        zeros+=1

ones2=0
zeros2=0
for index, row in data.iterrows():
    if row['Gender']==1:
        if ones2 > min(ones,zeros):
            #print(index)
            data = data.drop(index=index)
        ones2+=1
    else:
        if zeros2 > min(ones,zeros):
            #print(index)
            data = data.drop(index=index)
        zeros2+=1


ones = 0
zeros = 0
for index, row in data.iterrows():
    if row['Gender']==1:
        ones+=1
    else:
        zeros+=1

print("ones:",ones)
print("zeros:",zeros)


# To pytorch.tensor format
X_tensor = torch.tensor(
    data.drop(["Approved", "Gender"], axis=1).values, dtype=torch.float32
)
y_tensor_approved = F.one_hot(torch.tensor(data["Approved"].values, dtype=torch.int64))
y_tensor_gender = F.one_hot(torch.tensor(data["Gender"].values, dtype=torch.int64))

# Define a custom dataset
class Custom_Dataset(Dataset):
    def __init__(self, features, targets_credit, targets_gender):
        self.features = features
        self.targets_credit = targets_credit
        self.targets_gender = targets_gender

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.targets_credit[idx], self.targets_gender[idx]

# Init Dataset
dataset = Custom_Dataset(X_tensor, y_tensor_approved, y_tensor_gender)

# Calculate Dataset proportions
test_size = int(TEST_PROPORTION * len(dataset))
val_size = int(VAL_PROPORTION * len(dataset))
train_size = len(dataset) - test_size - val_size

# Creates datasets
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size, test_size]
)

# Creates DataLoaders.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=16)

ones: 211
zeros: 210


In [120]:
# CREDIT CARD ANN
class ANN_approved(nn.Module):
    def __init__(self):
        super().__init__()
        self.act = nn.LeakyReLU()
        self.drop = nn.Dropout(p=0.4)

        self.l1 = nn.Linear(27, 32)
        self.l2 = nn.Linear(32, 16)
        self.l3 = nn.Linear(16, 2)

        self.soft = nn.Softmax(1)

    def forward(self, x):
        x = self.l1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.l2(x)
        out = self.act(x)
        out = self.drop(out)
        out = self.soft(out)
        out = self.l3(out)
        return out, x
    

class Custom_ANN_approved_loss(nn.Module):
    def __init__(self):
        super(Custom_ANN_approved_loss, self).__init__()

    def forward(self, output_credit, output_gender, target):
        c=1
        loss = (-output_credit[range(target.shape[0]), (target).argmax(dim=1)].log()-c*0.5*(output_gender[range(target.shape[0]), target.argmax(dim=1)]-output_gender[range(target.shape[0]), target.argmax(dim=1)]**2).log()).mean()
        return loss
    


In [121]:
# GENDER BIAS ANN
class ANN(nn.Module):
    def __init__(self):
        super().__init__()
        self.act = nn.LeakyReLU()
        self.drop = nn.Dropout(p=0.5)

        self.l1 = nn.Linear(16, 32)
        self.l2 = nn.Linear(32, 32)
        self.l3 = nn.Linear(32, 16)
        self.l4 = nn.Linear(16, 2)

        self.soft = nn.Softmax(1)

    def forward(self, x):
        x = self.l1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.l2(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.l3(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.l4(x)
        x = self.soft(x)
        return x
    

class Custom_ANN_loss(nn.Module):
    def __init__(self):
        super(Custom_ANN_loss, self).__init__()

    def forward(self, output_gender, target):
        loss = (-0.5*(output_gender[range(target.shape[0]), target.argmax(dim=1)]-output_gender[range(target.shape[0]), target.argmax(dim=1)]**2).log()).mean()
        return loss


In [122]:
def train_and_validation(model_credit,model_gender,train_loader,val_loader,criterion_credit,criterion_gender,optimizer_credit,optimizer_gender,epochs, k1 = 1, k2 = 1):
    credit_accuracy = []
    gender_accuracy = []
    
    for epoch in range(epochs):
        print(epoch,"/",epochs)
        # Training
        for batch_nr, (features,label_credit,label_gender) in enumerate(train_loader):
            # Run models
            predictions_credit, last_features = model_credit(features)
            predictions_gender = model_gender(last_features)

            # Step credit
            optimizer_credit.zero_grad()
            loss_credit = criterion_credit(predictions_credit, predictions_gender, label_credit)
            loss_credit.backward()
            optimizer_credit.step()
            
            # Run models
            predictions_credit, last_features = model_credit(features)
            predictions_gender = model_gender(last_features)

            # Step gender
            optimizer_gender.zero_grad()
            loss_gender = criterion_gender(predictions_gender, label_gender)
            loss_gender.backward()
            optimizer_gender.step()

        # Validation
        correct_credit = 0
        correct_gender = 0
        total_credit = 0
        total_gender = 0

        for batch_nr, (features,label_credit,label_gender) in enumerate(val_loader):
            # Run models
            predictions_credit, last_features = model_credit(features)
            predictions_gender = model_gender(last_features)

            correct_credit += torch.sum(predictions_credit.argmax()==label_credit.argmax())
            correct_gender += torch.sum(predictions_gender.argmax()==label_gender.argmax())

            total_credit += label_credit.shape[0]
            total_gender += label_gender.shape[0]
        
        print("credit accuracy:",correct_credit/total_credit)
        credit_accuracy.append(correct_credit/total_credit)
        print("gender accuracy:",correct_gender/total_gender)
        gender_accuracy.append(correct_gender/total_gender)

    return credit_accuracy,gender_accuracy

In [124]:
epochs = 2000

model_credit = ANN_approved()
model_gender = ANN()
criterion_credit = Custom_ANN_approved_loss()
criterion_gender = Custom_ANN_loss()
optimizer_credit = torch.optim.Adam(model_credit.parameters())
optimizer_gender = torch.optim.Adam(model_gender.parameters())

train_and_validation(model_credit=model_credit,
                     model_gender=model_gender,
                     train_loader=train_loader,
                     val_loader=val_loader,
                     criterion_credit=criterion_credit,
                     criterion_gender=criterion_gender,
                     optimizer_credit=optimizer_credit,
                     optimizer_gender=optimizer_gender,
                     epochs=epochs,
                     )

0 / 2000
credit accuracy: tensor(0.)
gender accuracy: tensor(0.0476)
1 / 2000
credit accuracy: tensor(0.0238)
gender accuracy: tensor(0.0714)
2 / 2000
credit accuracy: tensor(0.0238)
gender accuracy: tensor(0.0238)
3 / 2000
credit accuracy: tensor(0.0714)
gender accuracy: tensor(0.0238)
4 / 2000
credit accuracy: tensor(0.)
gender accuracy: tensor(0.0238)
5 / 2000
credit accuracy: tensor(0.)
gender accuracy: tensor(0.0476)
6 / 2000
credit accuracy: tensor(0.0238)
gender accuracy: tensor(0.)
7 / 2000
credit accuracy: tensor(0.0714)
gender accuracy: tensor(0.0476)
8 / 2000
credit accuracy: tensor(0.0476)
gender accuracy: tensor(0.0714)
9 / 2000
credit accuracy: tensor(0.0476)
gender accuracy: tensor(0.0476)
10 / 2000
credit accuracy: tensor(0.0476)
gender accuracy: tensor(0.0714)
11 / 2000
credit accuracy: tensor(0.0238)
gender accuracy: tensor(0.0714)
12 / 2000
credit accuracy: tensor(0.0238)
gender accuracy: tensor(0.0476)
13 / 2000
credit accuracy: tensor(0.)
gender accuracy: tensor(0.

KeyboardInterrupt: 