# Load Data and preprocess

In [1]:
import sys
import os
# Make the repository root (the parent of the notebook directory) importable
# so the `src.*` packages resolve. The membership check keeps the cell
# idempotent: re-running it does not stack duplicate entries on sys.path.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
# Optional setup: run "%pip install ucimlrepo" once if the package is missing from the kernel's environment.

In [3]:
from src.data.load_data import load_input_data
from src.data.load_data import preprocess_data

In [4]:
# Load the train/test split plus the dataset metadata, variable descriptions
# and the full dataset frame in a single call.
(
    X_train,
    X_test,
    y_train,
    y_test,
    metadata,
    variables,
    datasetdf,
) = load_input_data()

In [5]:
import pandas as pd

# Columns handed to preprocess_data as the numeric features.
num_cols = ['Attribute2', 'Attribute5', 'Attribute8', 'Attribute11',
            'Attribute13', 'Attribute16', 'Attribute18']

# Keep independent snapshots of the unprocessed frames. Plain assignment would
# only alias the same objects, so any in-place work inside preprocess_data
# would also alter the "raw" data that the rule-extraction cells rely on.
X_train_raw, X_test_raw = X_train.copy(), X_test.copy()

# NOTE: this cell rebinds X_train / X_test / datasetdf to their preprocessed
# versions. Re-running it without re-running the load cell first would feed
# already-preprocessed frames back in.
X_train, X_test, datasetdf = preprocess_data(X_train, X_test, datasetdf, num_cols)
#pd.set_option('display.max_columns', None)
#print(X_train.head())

# Train model

In [6]:
from src.model.train_model import train_classifier

In [7]:
# Fit the classifier on the preprocessed training data with a fixed seed.
clf = train_classifier(
    X_train,
    y_train,
    n_estimators=100,
    random_state=42,
)

# Test model

In [8]:
from src.model.test_model import test_classifier_prediction

In [9]:
# Evaluate the fitted model on the held-out split; the helper returns the
# predictions together with the report and the accuracy value.
(
    y_pred,
    classification_report,
    accuracy_score,
) = test_classifier_prediction(clf, X_test, y_test)

In [10]:
# Show the accuracy value returned by test_classifier_prediction.
print(accuracy_score)

0.76


In [11]:
# Show the per-class precision / recall / f1 report returned by
# test_classifier_prediction (printed so the table layout is preserved).
print(classification_report)

              precision    recall  f1-score   support

           1       0.76      0.95      0.85       210
           2       0.74      0.31      0.44        90

    accuracy                           0.76       300
   macro avg       0.75      0.63      0.64       300
weighted avg       0.76      0.76      0.72       300


### User constraints / Ontology constraints

In [12]:
# Import the rule-generation helpers by name instead of `import *`, so it is
# clear where each function used below comes from and nothing else leaks into
# the notebook namespace.
from src.ontology.generate_rules import (
    combined_features,
    extract_statistical_rules,
    summarize_features,
    top_features,
)

In [13]:
# Split the unprocessed training features by class label:
# label 1 -> "positive" rows, label 2 -> "negative" rows.
positive_mask = y_train == 1
negative_mask = y_train == 2

X_train_positive = X_train_raw[positive_mask]
X_train_negative = X_train_raw[negative_mask]

In [14]:
# Summarize every feature over the label-1 training rows. In the recorded
# output, categorical features map to category proportions and numeric
# features to mean / min / max.
positive_profile = summarize_features(X_train_positive)

In [15]:
# Bare expression: the notebook renders the label-1 profile as the cell output.
positive_profile

{'Attribute1': {'A14': 0.49795918367346936,
  'A12': 0.23265306122448978,
  'A11': 0.20408163265306123,
  'A13': 0.0653061224489796},
 'Attribute2': {'mean': np.float64(19.240816326530613),
  'min': np.int64(4),
  'max': np.int64(60)},
 'Attribute3': {'A32': 0.5040816326530613,
  'A34': 0.3448979591836735,
  'A33': 0.09183673469387756,
  'A31': 0.03469387755102041,
  'A30': 0.024489795918367346},
 'Attribute4': {'A43': 0.2938775510204082,
  'A40': 0.23061224489795917,
  'A42': 0.17346938775510204,
  'A41': 0.12448979591836734,
  'A49': 0.09183673469387756,
  'A46': 0.03469387755102041,
  'A45': 0.02040816326530612,
  'A44': 0.014285714285714285,
  'A410': 0.00816326530612245,
  'A48': 0.00816326530612245},
 'Attribute5': {'mean': np.float64(2983.004081632653),
  'min': np.int64(338),
  'max': np.int64(15653)},
 'Attribute6': {'A61': 0.5387755102040817,
  'A65': 0.21428571428571427,
  'A62': 0.11224489795918367,
  'A63': 0.06938775510204082,
  'A64': 0.0653061224489796},
 'Attribute7': 

In [16]:
# Same per-feature summary, computed over the label-2 training rows, for
# comparison with the label-1 profile.
negative_profile = summarize_features(X_train_negative)

In [17]:
# Bare expression: the notebook renders the label-2 profile as the cell output.
negative_profile

{'Attribute1': {'A11': 0.4380952380952381,
  'A12': 0.3523809523809524,
  'A14': 0.1523809523809524,
  'A13': 0.05714285714285714},
 'Attribute2': {'mean': np.float64(24.333333333333332),
  'min': np.int64(6),
  'max': np.int64(60)},
 'Attribute3': {'A32': 0.5476190476190477,
  'A34': 0.17142857142857143,
  'A33': 0.10476190476190476,
  'A31': 0.1,
  'A30': 0.0761904761904762},
 'Attribute4': {'A40': 0.2761904761904762,
  'A43': 0.2,
  'A42': 0.19523809523809524,
  'A49': 0.11428571428571428,
  'A46': 0.08571428571428572,
  'A41': 0.07142857142857142,
  'A45': 0.02857142857142857,
  'A44': 0.014285714285714285,
  'A410': 0.009523809523809525,
  'A48': 0.004761904761904762},
 'Attribute5': {'mean': np.float64(3709.233333333333),
  'min': np.int64(433),
  'max': np.int64(15945)},
 'Attribute6': {'A61': 0.7285714285714285,
  'A62': 0.12380952380952381,
  'A65': 0.09047619047619047,
  'A63': 0.03333333333333333,
  'A64': 0.023809523809523808},
 'Attribute7': {'A73': 0.3333333333333333,
  '

In [18]:
# Derive per-feature rules from the label-1 training rows. In the recorded
# output, numeric features map to a (lower, upper) tuple and some categorical
# features map to a single category code.
p_statistical = extract_statistical_rules(X_train_positive)

In [19]:
# Bare expression: display the extracted statistical rules.
p_statistical

{'Attribute2': (np.float64(6.0), np.float64(36.0)),
 'Attribute5': (np.float64(929.9), np.float64(6473.9)),
 'Attribute8': (np.float64(1.0), np.float64(4.0)),
 'Attribute10': 'A101',
 'Attribute11': (np.float64(1.0), np.float64(4.0)),
 'Attribute13': (np.float64(24.0), np.float64(51.0)),
 'Attribute14': 'A143',
 'Attribute15': 'A152',
 'Attribute16': (np.float64(1.0), np.float64(2.0)),
 'Attribute18': (np.float64(1.0), np.float64(2.0)),
 'Attribute20': 'A201'}

In [20]:
# Rank the preprocessed (one-hot encoded) training columns with the fitted
# model; repeats and seed are fixed so the ranking is reproducible.
p_permutation_imp_top_features = top_features(
    clf,
    X_train,
    y_train,
    n_repeats=5,
    random_state=42,
)

In [21]:
# Bare expression: display the selected top feature (column) names.
p_permutation_imp_top_features

['Attribute1_A14',
 'Attribute2',
 'Attribute5',
 'Attribute13',
 'Attribute6_A65',
 'Attribute14_A143']

In [22]:
# Merge the top-feature list with the statistical rules into one constraint
# map. In the recorded output this keeps every statistical rule and adds
# 'Attribute1': 'A14' and 'Attribute6': 'A65' from the top-feature list.
p = combined_features(p_permutation_imp_top_features,p_statistical)

In [23]:
# Bare expression: display the combined constraint map.
p

{'Attribute2': (np.float64(6.0), np.float64(36.0)),
 'Attribute5': (np.float64(929.9), np.float64(6473.9)),
 'Attribute8': (np.float64(1.0), np.float64(4.0)),
 'Attribute10': 'A101',
 'Attribute11': (np.float64(1.0), np.float64(4.0)),
 'Attribute13': (np.float64(24.0), np.float64(51.0)),
 'Attribute14': 'A143',
 'Attribute15': 'A152',
 'Attribute16': (np.float64(1.0), np.float64(2.0)),
 'Attribute18': (np.float64(1.0), np.float64(2.0)),
 'Attribute20': 'A201',
 'Attribute1': 'A14',
 'Attribute6': 'A65'}

In [25]:
# Partition the constraint map by value type:
#   tuple -> numeric range constraint, str -> categorical value constraint.
p_num = dict(
    (feature, bounds) for feature, bounds in p.items() if isinstance(bounds, tuple)
)
p_cat = dict(
    (feature, category) for feature, category in p.items() if isinstance(category, str)
)
#print(p_num)
#print(p_cat)

{'Attribute2': (np.float64(6.0), np.float64(36.0)), 'Attribute5': (np.float64(929.9), np.float64(6473.9)), 'Attribute8': (np.float64(1.0), np.float64(4.0)), 'Attribute11': (np.float64(1.0), np.float64(4.0)), 'Attribute13': (np.float64(24.0), np.float64(51.0)), 'Attribute16': (np.float64(1.0), np.float64(2.0)), 'Attribute18': (np.float64(1.0), np.float64(2.0))}

{'Attribute10': 'A101', 'Attribute14': 'A143', 'Attribute15': 'A152', 'Attribute20': 'A201', 'Attribute1': 'A14', 'Attribute6': 'A65'}


### Final outputs of Input section of pipeline
The values below are example inputs chosen for experimentation.

In [26]:
# Import the builder under an alias. Binding its result to the name
# `desired_space` (which later cells read) would otherwise overwrite the
# function itself and make this cell fail when re-run.
from src.utilities.desired_space import desired_space as build_desired_space

# Test instance: the first row of X_test, sliced so it stays a one-row DataFrame.
x = X_test[0:1]

# p_num / p_cat from the previous cell are used as-is; no reassignment needed.

# All feature vectors with outcome 2, taken from datasetdf, which was also
# passed through preprocess_data earlier in the notebook.
desired_space = build_desired_space(2, datasetdf)

# Column subsets of the preprocessed training frame, selected by dtype:
# bool columns are treated as categorical, int64/float64 columns as numeric.
cat_f = X_train.select_dtypes(include=['bool'])  # categorical features
num_f = X_train.select_dtypes(include=['int64', 'float64'])  # numerical features
# Protected features: personal status and sex (X_train.head() shows only 3 of
# the 5 possible values as columns). The commas matter: without them Python
# concatenates adjacent string literals into a single bogus column name.
protect_f = ['Attribute9_A92', 'Attribute9_A93', 'Attribute9_A94']
# Short aliases used by the counterfactual routines below.
features = X_train.columns  # all features
t = 2                       # desired outcome
f = clf                     # black-box model for which we want to find a CF
X = X_train                 # training data

# Per-feature step size for the single-feature perturbation method.
# Credit amount uses a tiny step because the feature values are normalized.
step = {'Attribute5': 0.01}

### Derived variables

In [14]:
# Feature names taken from the constraint map p (presumably the "features to
# change"; confirm against how `intervals` uses it).
# Note: dict.keys() is a live view, so it reflects later changes to p.
f2change = p.keys()

# UFCE

### Feature Selection: Mutual Information (MI)

In [15]:
from itertools import combinations
from src.utilities.mutual_information import mi_score
from src.utilities.mutual_information import MI
# Candidate pairs: every unordered pair of model-input columns.
feature_pairs = list(combinations(tuple(features), 2))
print(feature_pairs[:5])

# MI is computed over all available features of the preprocessed training
# frame. The original authors compute it over the entire, unpreprocessed X;
# preprocessing appears necessary here, and it also works with the boolean
# one-hot columns.
mi_list = MI(feature_pairs, X_train)

# First five entries (the original note calls these the "top 5").
print(mi_list[:5])

[('Attribute2', 'Attribute5'), ('Attribute2', 'Attribute8'), ('Attribute2', 'Attribute11'), ('Attribute2', 'Attribute13'), ('Attribute2', 'Attribute16')]
[('Attribute9_A92', 'Attribute9_A93'), ('Attribute3_A32', 'Attribute3_A34'), ('Attribute2', 'Attribute5'), ('Attribute17_A172', 'Attribute17_A173'), ('Attribute12_A124', 'Attribute15_A153')]


### Nearest Neighbour: KD Tree

In [16]:
from src.utilities.nearest_neighbours import FNN
# Neighbours of x inside the desired space. The radius is set arbitrarily and
# can be tuned to control how close neighbours must be to x. Author's note:
# a Euclidean distance of 3.5 is roughly a 0.5 difference on each of the 48
# features, and yields 3 data points here.
nn = FNN(
    desired_space,
    x,
    3.5,
)
print(nn)

     Attribute2  Attribute5  Attribute8  Attribute11  Attribute13  \
652    0.270317   -0.335901    0.945404    -1.660121     0.870748   
622    2.277975    0.266786    0.945404    -0.747967     0.231260   
301    1.274146    0.225634    0.945404    -1.660121     0.596682   

     Attribute16  Attribute18  Attribute1_A12  Attribute1_A13  Attribute1_A14  \
652    -0.724565    -0.434114           False           False           False   
622    -0.724565    -0.434114           False           False            True   
301    -0.724565    -0.434114            True           False           False   

     ...  Attribute12_A124  Attribute14_A142  Attribute14_A143  \
652  ...             False             False              True   
622  ...             False             False             False   
301  ...             False             False              True   

     Attribute15_A152  Attribute15_A153  Attribute17_A172  Attribute17_A173  \
652              True             False             Fa

### Subspace: Intervals

In [17]:
from src.utilities.nearest_neighbours import intervals
# Build the per-feature search intervals from the neighbours, the constraint
# map, the feature names to change and the test instance.
subspace = intervals(
    nn,
    p,
    f2change,
    x,
)
print(subspace)  # the author marked this output as correct

0
1
{'Attribute5': [0, np.float64(0.26678599187094654)]}


### CF Generator: Single Feature

In [18]:
from src.utilities.UFCE import SF
# Single-feature counterfactual search for instance x.
# NOTE(review): cat_f is the DataFrame returned by select_dtypes, not a list of
# column names; confirm that this is what SF expects. The recorded run
# printed None.
cf = SF(
    x,
    X,
    cat_f,
    p,
    f,
    t,
    step,
)
print(cf)

None


### CF Generator: Double Features

In [None]:
from src.utilities.UFCE import DF
# Double-feature counterfactual search: uses the MI-ranked feature pairs, the
# search subspace, the categorical / numerical subsets and the protected
# feature list.
z = DF(
    X,
    x,
    subspace,
    mi_list,
    cat_f,
    num_f,
    features,
    protect_f,
    f,
    t,
)