In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#models
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim

#model evaluation
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, average_precision_score, roc_curve, auc

import pickle


# load data

In [12]:
# #load full data
# data = pd.read_csv('data/compas-scores-train.csv')  

# #split train-test 70-30
# train, test = train_test_split(data, test_size=0.30, random_state=123)

# #save splits
# train.to_csv('data/compas-scores-train-70.csv', index=False)
# test.to_csv('data/compas-scores-test-30.csv', index=False)

In [13]:
#load split data
data_train = pd.read_csv('data/compas-scores-train-70.csv')  
data_test = pd.read_csv('data/compas-scores-test-30.csv')  

display(data_train.head())

print('data, train:', data_train.shape)
print('data, test:', data_test.shape)

#split into X, y
X_train = data_train.loc[:, data_train.columns != 'risk']
y_train = data_train['risk']

X_test = data_test.loc[:, data_test.columns != 'risk']
y_test = data_test['risk']


X=X_train
y=y_train
print('----- TRAIN -----')
print('X, shape:', X.shape)
print('y, shape:', y.shape)
print('#class1: ', sum(y), f', prop = {sum(y)/len(y)}')
print('#class0:', sum(y==0), f', prop = {sum(y==0)/len(y)}')

X=X_test
y=y_test
print('----- TEST -----')
print('X, shape:', X.shape)
print('y, shape:', y.shape)
print('#class1: ', sum(y), f', prop = {sum(y)/len(y)}')
print('#class0:', sum(y==0), f', prop = {sum(y==0)/len(y)}')

Unnamed: 0,age,two_year_recid,priors_count,length_of_stay,c_charge_degree_F,sex_Female,race,risk
0,28,1,6,80,1,0,1,0
1,25,1,9,87,1,0,0,0
2,33,1,5,0,1,1,1,0
3,30,0,0,6,1,0,0,1
4,37,1,0,1,0,0,1,1


data, train: (3455, 8)
data, test: (1482, 8)
----- TRAIN -----
X, shape: (3455, 7)
y, shape: (3455,)
#class1:  2805 , prop = 0.8118668596237337
#class0: 650 , prop = 0.18813314037626627
----- TEST -----
X, shape: (1482, 7)
y, shape: (1482,)
#class1:  1217 , prop = 0.8211875843454791
#class0: 265 , prop = 0.1788124156545209


# model 1: logistic regression

In [14]:
#model 1: logistic regression

#fit model
model_logistic = LogisticRegression(penalty='none').fit(X_train, y_train)

#save model
model_filename = 'models/model_logistic.pkl'
# pickle.dump(model_logistic, open(model_filename, 'wb'))

#load model
model_logistic = pickle.load(open(model_filename, 'rb'))

In [15]:
def evaluate_model(y_true, y_pred, y_prob_class1):
    #overall
    print('acccuracy:', accuracy_score(y_true, y_pred))
    fpr, tpr, thresholds = roc_curve(y_true, y_prob_class1) #y_prob
    print('AUC:', auc(fpr, tpr))
    
    #class 1
    print('\n***class 1***')
    pos_label = 1
    print('recall:', recall_score(y_true, y_pred, pos_label=pos_label))
    print('precision:', precision_score(y_true, y_pred, pos_label=pos_label))
    print('F1 score:', f1_score(y_true, y_pred, pos_label=pos_label))
    print('AP:', average_precision_score(y_true, y_pred, pos_label=pos_label))
    
    #class 0
    print('\n***class 0***')
    pos_label = 0
    print('recall:', recall_score(y_true, y_pred, pos_label=pos_label))
    print('precision:', precision_score(y_true, y_pred, pos_label=pos_label))
    print('F1 score:', f1_score(y_true, y_pred, pos_label=pos_label))
    print('AP:', average_precision_score(y_true, y_pred, pos_label=pos_label))

In [16]:
#evaluate model

###training set
X=X_train
y=y_train
print('----- TRAIN -----')
evaluate_model(y_true=y, 
               y_pred=model_logistic.predict(X), 
               y_prob_class1=model_logistic.predict_proba(X)[:, 1])

###test set
X=X_test
y=y_test
print('\n----- TEST -----')
evaluate_model(y_true=y, 
               y_pred=model_logistic.predict(X), 
               y_prob_class1=model_logistic.predict_proba(X)[:, 1])

----- TRAIN -----
acccuracy: 0.851519536903039
AUC: 0.8473292197998081

***class 1***
recall: 0.9693404634581105
precision: 0.8642720915448188
F1 score: 0.9137960006721559
AP: 0.8626653714217482

***class 0***
recall: 0.34307692307692306
precision: 0.7216828478964401
F1 score: 0.465067778936392
AP: 0.15370693417579961

----- TEST -----
acccuracy: 0.8394062078272605
AUC: 0.8261949427140663

***class 1***
recall: 0.9556285949055053
precision: 0.8634001484780994
F1 score: 0.9071762870514821
AP: 0.8615271176948937

***class 0***
recall: 0.30566037735849055
precision: 0.6
F1 score: 0.4049999999999999
AP: 0.14950255980394675


# Model 2: gradient boosted tree

In [17]:
#model 2: gradient boosted tree

#train model
model_gb = XGBClassifier(n_estimators=50, random_state=12345, use_label_encoder=False, eval_metric='logloss')
#use_label_encoder=False, remove warning (no impact on model performance)
#eval_metric='logloss', remove warning (this is the default, but need to specify)
model_gb.fit(X_train, y_train)

#save model
model_filename = 'models/model_gb.pkl'
# pickle.dump(model_gb, open(model_filename, 'wb'))

#load model
model_gb = pickle.load(open(model_filename, 'rb'))


In [18]:
#evaluate model

###training set
X=X_train
y=y_train
print('----- TRAIN -----')
evaluate_model(y_true=y, 
               y_pred=model_gb.predict(X), 
               y_prob_class1=model_gb.predict_proba(X)[:, 1])

###test set
X=X_test
y=y_test
print('\n----- TEST -----')
evaluate_model(y_true=y, 
               y_pred=model_gb.predict(X), 
               y_prob_class1=model_gb.predict_proba(X)[:, 1])

----- TRAIN -----
acccuracy: 0.9186685962373372
AUC: 0.9534114904703139

***class 1***
recall: 0.983957219251337
precision: 0.9212283044058746
F1 score: 0.9515600758489916
AP: 0.9194738427248776

***class 0***
recall: 0.6369230769230769
precision: 0.9019607843137255
F1 score: 0.7466185752930568
AP: 0.14842652350151977

----- TEST -----
acccuracy: 0.8346828609986505
AUC: 0.8172757011519202

***class 1***
recall: 0.9268693508627773
precision: 0.8785046728971962
F1 score: 0.9020391843262695
AP: 0.8743130370047535

***class 0***
recall: 0.41132075471698115
precision: 0.5505050505050505
F1 score: 0.4708423326133909
AP: 0.1450710352240761


# Model 3: random forest

In [19]:
#model 3: random forest

#train model
model_rf = RandomForestClassifier(n_estimators=50, random_state=12345)
model_rf.fit(X_train, y_train)

#save model
model_filename = 'models/model_rf.pkl'
# pickle.dump(model_rf, open(model_filename, 'wb'))

#load model
model_rf = pickle.load(open(model_filename, 'rb'))


In [20]:
#evaluate model

###training set
X=X_train
y=y_train
print('----- TRAIN -----')
evaluate_model(y_true=y, 
               y_pred=model_rf.predict(X), 
               y_prob_class1=model_rf.predict_proba(X)[:, 1])

###test set
X=X_test
y=y_test
print('\n----- TEST -----')
evaluate_model(y_true=y, 
               y_pred=model_rf.predict(X), 
               y_prob_class1=model_rf.predict_proba(X)[:, 1])

----- TRAIN -----
acccuracy: 0.9797395079594791
AUC: 0.9976999862882215

***class 1***
recall: 0.9935828877005347
precision: 0.9816836914406482
F1 score: 0.9875974486180014
AP: 0.9805939577605396

***class 0***
recall: 0.92
precision: 0.9707792207792207
F1 score: 0.9447077409162716
AP: 0.17454779383091312

----- TEST -----
acccuracy: 0.8164642375168691
AUC: 0.7941334242879956

***class 1***
recall: 0.9063270336894002
precision: 0.8747026169706582
F1 score: 0.8902340597255851
AP: 0.8696897051224493

***class 0***
recall: 0.4037735849056604
precision: 0.4841628959276018
F1 score: 0.44032921810699593
AP: 0.14690533959875374


# Model 4: neural network

In [21]:
#model 4: neural network

In [22]:
#create COMPASDataset class
class COMPASDataset(Dataset):
    
    def __init__(self, X, y):
        self.X = X.to_numpy()
        self.y = y.to_numpy()
        
    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx, :], self.y[idx]
    
#create datasets
train_ds = COMPASDataset(X_train, y_train)
test_ds = COMPASDataset(X_test, y_test)

#create dataloaders
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=64)

In [23]:
#create model class
class FFNN(nn.Module):
    def __init__(self, input_size, hidden_size, seed=12345):
        super().__init__()
        
        torch.manual_seed(seed)
        
        #variables
        self.input_size = input_size
        self.hidden_size = hidden_size
        #layers architecture
        self.linear_layer1 = nn.Linear(self.input_size, self.hidden_size)
        self.linear_layer2 = nn.Linear(self.hidden_size, self.hidden_size*2)
        self.linear_layer3 = nn.Linear(self.hidden_size*2, self.hidden_size)
        self.linear_layer4 = nn.Linear(self.hidden_size, 1)
        
    def forward(self, inputs):
        out = self.linear_layer1(inputs)
        out = nn.functional.relu(out)
        out = self.linear_layer2(out)
        out = nn.functional.relu(out)
        out = self.linear_layer3(out)
        out = nn.functional.relu(out)
        out = self.linear_layer4(out)
        out = torch.sigmoid(out)
        return out
    
    def predict_proba(self, X):
        X = torch.tensor(X).type(torch.FloatTensor)
        class1_probs = self.forward(X).detach().numpy()
        class0_probs = 1-class1_probs
        return np.hstack((class0_probs, class1_probs))
    
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)


In [24]:
#train model

#model
model = FFNN(input_size=X_train.shape[1], hidden_size=50)

n_epochs = 20
seed = 12345
torch.manual_seed(seed)
learning_rate = 0.001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.BCELoss()

#function to calculate accuracy
def compute_n_correct(y_true, pred_prob): 
    y_pred = (pred_prob.squeeze()>0.5)*1
    return sum(y_pred == y_true)

train_losses = []
train_accs = []
#val_losses = []
#val_accs = []


#loop through epochs and batches
for epoch in range(n_epochs):
    running_loss = 0
    running_n_correct = 0
    
    for batch_input, batch_output in train_dl:
        batch_input = batch_input.type(torch.FloatTensor)
        batch_output = batch_output.type(torch.FloatTensor)
        #forward pass: compute model output and loss
        preds = model(batch_input)
        loss = loss_fn(preds.squeeze(), batch_output)
        #backward pass
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        #store metrics
        running_loss += loss.cpu().item()
        n_correct = compute_n_correct(y_true=batch_output, pred_prob=preds)
        running_n_correct += n_correct
    
    train_losses.append(running_loss / len(train_ds))
    train_accs.append(running_n_correct / len(train_ds))
    print('-'*20)
    print(f'Epoch {epoch+1}/{n_epochs} Train Loss: {running_loss / len(train_ds)}')
    print(f'Epoch {epoch+1}/{n_epochs} Train Accuracy: {running_n_correct / len(train_ds)}')


--------------------
Epoch 1/20 Train Loss: 0.006740891208869504
Epoch 1/20 Train Accuracy: 0.8040521144866943
--------------------
Epoch 2/20 Train Loss: 0.005784285055573183
Epoch 2/20 Train Accuracy: 0.8356005549430847
--------------------
Epoch 3/20 Train Loss: 0.005639109809043266
Epoch 3/20 Train Accuracy: 0.8425470590591431
--------------------
Epoch 4/20 Train Loss: 0.005677875397691851
Epoch 4/20 Train Accuracy: 0.840520977973938
--------------------
Epoch 5/20 Train Loss: 0.005605095197943979
Epoch 5/20 Train Accuracy: 0.8364688754081726
--------------------
Epoch 6/20 Train Loss: 0.00557932937800798
Epoch 6/20 Train Accuracy: 0.8419681787490845
--------------------
Epoch 7/20 Train Loss: 0.005564298251085792
Epoch 7/20 Train Accuracy: 0.8448625206947327
--------------------
Epoch 8/20 Train Loss: 0.005549409869093281
Epoch 8/20 Train Accuracy: 0.8454414010047913
--------------------
Epoch 9/20 Train Loss: 0.005531572566708677
Epoch 9/20 Train Accuracy: 0.840520977973938
----

In [25]:
#save model
model_filename = 'models/model_nn.pkl'
# torch.save(model, model_filename)

#load model
model_nn = torch.load(model_filename)


In [26]:
#evaluate model

###training set
X=train_ds.X
y=train_ds.y
print('----- TRAIN -----')
evaluate_model(y_true=y, 
               y_pred=model.predict(X), 
               y_prob_class1=model.predict_proba(X)[:, 1])

###test set
X=test_ds.X
y=test_ds.y
print('\n----- TEST -----')
evaluate_model(y_true=y, 
               y_pred=model.predict(X), 
               y_prob_class1=model.predict_proba(X)[:, 1])

----- TRAIN -----
acccuracy: 0.8468885672937772
AUC: 0.855534896476073

***class 1***
recall: 0.9743315508021391
precision: 0.8567398119122257
F1 score: 0.9117597998331942
AP: 0.8555879928160505

***class 0***
recall: 0.2969230769230769
precision: 0.7283018867924528
F1 score: 0.4218579234972677
AP: 0.15658400315189577

----- TEST -----
acccuracy: 0.8421052631578947
AUC: 0.8368831490984636

***class 1***
recall: 0.9622021364009861
precision: 0.8616629874908021
F1 score: 0.9091614906832299
AP: 0.8601331037236001

***class 0***
recall: 0.29056603773584905
precision: 0.6260162601626016
F1 score: 0.3969072164948454
AP: 0.15009779002689555


# Model 5: logistic regression as NN

In [34]:
#create model class
class LogisticRegressionNN(nn.Module):
    def __init__(self, input_size, seed=12345):
        super().__init__()
        
        torch.manual_seed(seed)
        
        #variables
        self.input_size = input_size
        #layers
        self.linear_layer = nn.Linear(self.input_size, 1)
        
    def forward(self, inputs):
        out = self.linear_layer(inputs)
        out = torch.sigmoid(out)
        return out
    
    def predict_proba(self, X):
        X = torch.tensor(X).type(torch.FloatTensor)
        class1_probs = self.forward(X).detach().numpy()
        class0_probs = 1-class1_probs
        return np.hstack((class0_probs, class1_probs))
    
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)



In [35]:
#logistic regression parameters, from logistic regression
print(model_logistic.coef_)
print(model_logistic.intercept_)


[[ 0.09747653 -0.81363522 -0.18923409 -0.00689584 -0.35678913  0.05127832
  -0.63332291]]
[0.44116897]


In [36]:
#change NN model weights to logistic regression coefficient

#instantiate model
model_nn_logistic = LogisticRegressionNN(input_size=X_train.shape[1])

#change NN model weights
lr_coefs = torch.tensor(model_logistic.coef_, requires_grad=True).type(torch.FloatTensor)
lr_intercept = torch.tensor(model_logistic.intercept_, requires_grad=True).type(torch.FloatTensor)

model_nn_logistic.linear_layer.weight = nn.Parameter(lr_coefs)
model_nn_logistic.linear_layer.bias = nn.Parameter(lr_intercept)

print(model_nn_logistic.linear_layer.weight)
print(model_nn_logistic.linear_layer.bias)

Parameter containing:
tensor([[ 0.0975, -0.8136, -0.1892, -0.0069, -0.3568,  0.0513, -0.6333]],
       requires_grad=True)
Parameter containing:
tensor([0.4412], requires_grad=True)


In [37]:
#save model
model_filename = 'models/model_nn_logistic.pkl'
torch.save(model_nn_logistic, model_filename)

#load model
model_nn_logistic = torch.load(model_filename)


In [38]:
#evaluate model -- same results as logistic regression model (model #1)

###training set
X=train_ds.X
y=train_ds.y
print('----- TRAIN -----')
evaluate_model(y_true=y, 
               y_pred=model.predict(X), 
               y_prob_class1=model.predict_proba(X)[:, 1])

###test set
X=test_ds.X
y=test_ds.y
print('\n----- TEST -----')
evaluate_model(y_true=y, 
               y_pred=model.predict(X), 
               y_prob_class1=model.predict_proba(X)[:, 1])

----- TRAIN -----
acccuracy: 0.8468885672937772
AUC: 0.855534896476073

***class 1***
recall: 0.9743315508021391
precision: 0.8567398119122257
F1 score: 0.9117597998331942
AP: 0.8555879928160505

***class 0***
recall: 0.2969230769230769
precision: 0.7283018867924528
F1 score: 0.4218579234972677
AP: 0.15658400315189577

----- TEST -----
acccuracy: 0.8421052631578947
AUC: 0.8368831490984636

***class 1***
recall: 0.9622021364009861
precision: 0.8616629874908021
F1 score: 0.9091614906832299
AP: 0.8601331037236001

***class 0***
recall: 0.29056603773584905
precision: 0.6260162601626016
F1 score: 0.3969072164948454
AP: 0.15009779002689555
