In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import dice_ml
from dice_ml.utils import helpers
from itertools import combinations
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
import gower
import random as rd
from numpy import vstack
import torch, torch.nn as nn
from torch.utils.data import DataLoader, Dataset, Subset
from torch import Tensor
from statistics import mean
from torch.utils.data import random_split

In [2]:
np.set_printoptions(suppress=True)

In [3]:
df = pd.read_csv('german_credit_data.csv')

In [4]:
risk_value_map = {'good':1, 'bad':0}
df['Risk'] = df['Risk'].map(risk_value_map)

In [5]:
del df['ID']
df = df.fillna(df['Saving accounts'].value_counts().index[0])
df = df.fillna(df['Checking account'].value_counts().index[0])

In [6]:
df['Risk'].value_counts()

1    700
0    300
Name: Risk, dtype: int64

In [7]:
dataset = df.copy()
target = dataset["Risk"]
train_dataset, test_dataset, y_train, y_test = train_test_split(dataset,
                                                                target,
                                                                test_size=0.2,
                                                                random_state=0,
                                                                stratify=target)

X_train = train_dataset.drop('Risk', axis=1)
y_train = train_dataset.loc[:, train_dataset.columns == 'Risk']
X_test = test_dataset.drop('Risk', axis=1)
y_test = test_dataset.loc[:, test_dataset.columns == 'Risk']

In [8]:
numerical = ['Age','Job','Credit amount', 'Duration']
categorical = X_train.columns.difference(numerical)

In [9]:
numeric_transformer =  MinMaxScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
         ('num', numeric_transformer, numerical),
        ('cat', categorical_transformer, categorical)])
 
logistic_pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])
clf = logistic_pipe.fit(X_train, y_train.values.ravel())
accuracy = logistic_pipe.score(X_test,y_test)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 71.50


In [10]:
d = dice_ml.Data(dataframe=train_dataset, continuous_features=['Age','Job','Credit amount', 'Duration'], outcome_name='Risk')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data_df[feature] = self.data_df[feature].apply(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data_df[feature] = self.data_df[feature].astype(


In [11]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_predicted = clf.predict(X_test)
print(confusion_matrix(y_test,y_predicted))
print(classification_report(y_test,y_predicted))
print(accuracy_score(y_test, y_predicted))

[[  8  52]
 [  5 135]]
              precision    recall  f1-score   support

           0       0.62      0.13      0.22        60
           1       0.72      0.96      0.83       140

    accuracy                           0.71       200
   macro avg       0.67      0.55      0.52       200
weighted avg       0.69      0.71      0.64       200

0.715


In [12]:
m = dice_ml.Model(model=clf, backend="sklearn")

exp = dice_ml.Dice(d, m, method="random")

In [13]:
cfs=[]

for index, row in dataset.iterrows():
    e1 = exp.generate_counterfactuals(row.iloc[:-1].to_frame().T, total_CFs=4, desired_class="opposite")
    cfs.append([row.to_frame().T,e1.cf_examples_list[0].final_cfs_df])

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  4.99it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.83it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.73it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.43it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.68it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  8.82it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.80it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 10.34it/s]
100%|███████████████████████████████████

In [14]:
e1.visualize_as_dataframe(show_only_changes=True)

Query instance (original outcome : 0)


Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,27,male,2,own,moderate,moderate,4576,45,car,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,-,female,-,-,-,-,-,-,radio/TV,1
1,40.0,-,0.0,-,-,-,-,-,-,1
2,-,female,-,-,-,-,-,13.0,-,1
3,-,-,-,-,-,-,-,17.0,-,1


In [15]:
temp=[]


for item in cfs:
    item[0]['cf'] = 0
    item[1]['cf'] = 1
    ex_class = item[0]['Risk']
    cf_class = int(1-ex_class)
    try:
        cf_filtered = item[1][item[1]['Risk']==cf_class].reset_index(drop=True)
        bestcf_id = gower.gower_topn(item[0], cf_filtered, n = 1)['index'][0]
        bestcf = cf_filtered[(bestcf_id):(bestcf_id+1)]
        temp.append(item[0])
        temp.append(bestcf)
    except:
        continue


temp_df = pd.concat(temp).reset_index(drop=True)
temp_onehot = pd.get_dummies(temp_df, columns=['Sex','Housing','Saving accounts','Checking account','Purpose'], prefix = ['Sex','Housing','Saving accounts','Checking account','Purpose'])



In [16]:
X = temp_onehot[temp_onehot['cf']==0].drop(columns=['cf','Risk']).copy()
Xprime = temp_onehot[temp_onehot['cf']==1].drop(columns=['cf','Risk']).copy()

In [17]:
matches = int(len(temp_onehot)/2)

In [18]:
comb = list(combinations(range(matches),2))
a_ind_list = []
b_ind_list = []
c_ind_list = []
d_ind_list = []

for (ex,cf) in comb:
            a_ind_list.append(ex * 2)
            b_ind_list.append(ex * 2 + 1)
            c_ind_list.append(cf * 2)
            d_ind_list.append(cf * 2 + 1)
            
A_df = X.loc[a_ind_list,:].reset_index(drop=True)
B_df = Xprime.loc[b_ind_list,:].reset_index(drop=True)
C_df = X.loc[c_ind_list,:].reset_index(drop=True)
D_df = Xprime.loc[d_ind_list,:].reset_index(drop=True)


In [19]:
column_list = A_df.columns

In [20]:
A_df.to_csv (r'data\A.csv', index = False, header=False)
B_df.to_csv (r'data\B.csv', index = False, header=False)
C_df.to_csv (r'data\C.csv', index = False, header=False)
D_df.to_csv (r'data\D.csv', index = False, header=False)

In [21]:
print(A_df.shape,B_df.shape,C_df.shape,D_df.shape)

(263175, 24) (263175, 24) (263175, 24) (263175, 24)


# Dataset Class / Analogy Solver

In [22]:
class CFDataset(Dataset):
    def __init__(self, path):
        self.A = pd.read_csv(path+'A.csv').values[:, :].astype('float32')
        self.B = pd.read_csv(path+'B.csv').values[:, :].astype('float32')
        self.C = pd.read_csv(path+'C.csv').values[:, :].astype('float32')
        self.D = pd.read_csv(path+'D.csv').values[:, :].astype('float32')
 
    def __len__(self):
        return len(self.A)
 
    def __getitem__(self, idx):
        return [self.A[idx], self.B[idx], self.C[idx], self.D[idx]]
    
    def get_splits(self, n_test=0.2):
        test_size = round(n_test * len(self.A))
        train_size = len(self.A) - test_size
        return random_split(self, [train_size, test_size])

In [23]:
class AnalogyRegression(nn.Module):
    
    def __init__(self, nfeatures):
        super().__init__()
        self.nfeatures = nfeatures
        self.proj = nn.Linear(self.nfeatures, self.nfeatures)
        self.ab = nn.Linear(2 * self.nfeatures, 2 * self.nfeatures)
        self.ac = nn.Linear(2 * self.nfeatures, 2 * self.nfeatures)
        self.d = nn.Linear(4 * self.nfeatures, self.nfeatures)
        self.proj_end = nn.Linear(self.nfeatures, self.nfeatures)

    def forward(self, a, b, c):
        a = self.proj(a)
        b = self.proj(b)
        c = self.proj(c)
        ab = self.ab(torch.cat([a, b], dim = -1))
        ac = self.ab(torch.cat([a, c], dim = -1))
        d = self.d(torch.cat([ab, ac], dim = -1))
        d = self.proj_end(d)
        return d

In [24]:
def enrich(a, b, c, d):
    yield a, b, c, d
    yield c, d, a, b
    yield c, a, d, b
    yield d, b, c, a
    yield d, c, b, a
    yield b, a, d, c
    yield b, d, a, c
    yield a, c, b, d

In [25]:
def prepare_data(path):
    dataset = CFDataset(path)
    train, test = dataset.get_splits()
    train_dl = DataLoader(train, batch_size=42, shuffle=True)
    test_dl = DataLoader(test, batch_size=42, shuffle=False)
    return train_dl, test_dl

In [26]:
def train_solver(trainloader,regression_model,epochs):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    
    optimizer = torch.optim.Adam(regression_model.parameters(), lr=1e-3)
    criterion = nn.MSELoss()
    regression_model.to(device)
   
    for epoch in range(epochs):
            losses = []
            for i, (a,b,c,d) in enumerate(train_dl):
                optimizer.zero_grad()
                loss = torch.tensor(0).to(device).float()
                for a, b, c, d in enrich(a, b, c, d):
                    d_pred = regression_model(a, b, c)
                    loss = criterion(d_pred, d)
                loss.backward()
                optimizer.step()


In [27]:
def predict(a,b,c, regression_model):
    a = Tensor([a])
    b = Tensor([b])
    c = Tensor([c])
    d_pred = regression_model(a,b,c)
    d_pred = d_pred.detach().numpy()
    return d_pred

In [28]:
def undummify(df, prefix_sep="_"):
    df.columns = column_list
    cols2collapse = {
        item.split(prefix_sep)[0]: (prefix_sep in item) for item in df.columns
    }
    series_list = []
    for col, needs_to_collapse in cols2collapse.items():
        if needs_to_collapse:
            undummified = (
                df.filter(like=col)
                .idxmax(axis=1)
                .apply(lambda x: x.split(prefix_sep, maxsplit=1)[1])
                .rename(col)
            )
            series_list.append(undummified)
        else:
            series_list.append(df[col])
    undummified_df = pd.concat(series_list, axis=1)
    undummified_df = undummified_df[X_train.columns]
    return undummified_df

In [29]:
def evaluate_model(test_dl,regression_model,clf):
    cf_goodness=[]
    cf_classmatch=[]
    for i,(a,b,c,d) in enumerate(test_dl):
        d_pred = regression_model(a,b,c)
        d_pred = pd.DataFrame(d_pred.detach().numpy())
        c_df = pd.DataFrame(c).astype("float") 
        d_pred = np.round(d_pred)
        diff = c_df == d_pred
        cf_quality = 1/(~diff).sum(axis=1)
        cf_goodness.append(mean(cf_quality))
        class_match = clf.predict(undummify(d_pred))==clf.predict(undummify(c_df))
        cf_classmatch.append((~class_match).sum()/len(c_df))
    return(np.mean(cf_goodness),np.mean(cf_classmatch))   

In [30]:
path = 'data/'
train_dl, test_dl = prepare_data(path)
print(len(train_dl.dataset), len(test_dl.dataset))

210539 52635


In [31]:
regression_model = AnalogyRegression(nfeatures=24)
train_solver(train_dl,regression_model, 100)

In [32]:
evaluate_model(test_dl,regression_model,clf)

(0.08666582179827191, 0.9140971114655325)