In [2]:
# Mount Google Drive into this Colab session under /content/drive so that the
# dataset path read later in the notebook is available.
# force_remount=True is passed so the mount is requested again even when a
# previous mount may still be present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras import Sequential
from keras import layers


In [5]:
# Read the Titanic CSV from the mounted Drive folder. The raw frame is kept
# under its own name because later cells look passengers up in it by
# PassengerId.
train_file_path = '/content/drive/MyDrive/influence-analysis/dataset/titanic.csv'
original_df = pd.read_csv(filepath_or_buffer=train_file_path)
original_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [6]:
# Derive the working frame from the raw one without touching the raw frame:
# PassengerId becomes ID, and the Name, Ticket and Cabin columns are dropped.
df = (
    original_df
    .rename(columns={'PassengerId': 'ID'})
    .drop(columns=["Name", "Ticket", "Cabin"])
)
df

Unnamed: 0,ID,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.2500,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.9250,S
3,4,1,1,female,35.0,1,0,53.1000,S
4,5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,13.0000,S
887,888,1,1,female,19.0,0,0,30.0000,S
888,889,0,3,female,,1,2,23.4500,S
889,890,1,1,male,26.0,0,0,30.0000,C


In [7]:
# Count the missing values in every column of the working frame.
missing_per_column = df.isna().sum()
print(missing_per_column)

ID            0
Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64


In [8]:
# Keep only the rows whose Embarked value is present.
has_embarked = df['Embarked'].notna()
df = df[has_embarked]
df

Unnamed: 0,ID,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.2500,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.9250,S
3,4,1,1,female,35.0,1,0,53.1000,S
4,5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,13.0000,S
887,888,1,1,female,19.0,0,0,30.0000,S
888,889,0,3,female,,1,2,23.4500,S
889,890,1,1,male,26.0,0,0,30.0000,C


In [9]:
# NOTE(review): this cell repeats the previous cell's statement verbatim. Once
# the rows with a missing Embarked value are gone, running dropna again removes
# nothing, so this cell looks redundant and is a candidate for deletion.
df = df.dropna(subset=['Embarked'])
df

Unnamed: 0,ID,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.2500,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.9250,S
3,4,1,1,female,35.0,1,0,53.1000,S
4,5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,13.0000,S
887,888,1,1,female,19.0,0,0,30.0000,S
888,889,0,3,female,,1,2,23.4500,S
889,890,1,1,male,26.0,0,0,30.0000,C


In [10]:
# Record which ages were missing as a 0/1 column (Missage), then replace the
# missing ages themselves with 0 so the Age column contains no NaN.
age_was_missing = df['Age'].isna()
df = df.assign(
    Missage=age_was_missing.astype(int),
    Age=df['Age'].fillna(0),
)
df

Unnamed: 0,ID,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Missage
0,1,0,3,male,22.0,1,0,7.2500,S,0
1,2,1,1,female,38.0,1,0,71.2833,C,0
2,3,1,3,female,26.0,0,0,7.9250,S,0
3,4,1,1,female,35.0,1,0,53.1000,S,0
4,5,0,3,male,35.0,0,0,8.0500,S,0
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,13.0000,S,0
887,888,1,1,female,19.0,0,0,30.0000,S,0
888,889,0,3,female,0.0,1,2,23.4500,S,1
889,890,1,1,male,26.0,0,0,30.0000,C,0


In [11]:
# Re-count the missing values per column after the cleaning steps.
remaining_missing = df.isna().sum()
print(remaining_missing)

ID          0
Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Missage     0
dtype: int64


In [12]:
# Replace the string values of Sex and Embarked with integer codes. Each
# column gets its own fitted LabelEncoder, which stays available afterwards
# as sex_trans / Emb_trans.
sex_trans = LabelEncoder().fit(df['Sex'])
df['Sex'] = sex_trans.transform(df['Sex'])

Emb_trans = LabelEncoder().fit(df['Embarked'])
df['Embarked'] = Emb_trans.transform(df['Embarked'])

df

Unnamed: 0,ID,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Missage
0,1,0,3,1,22.0,1,0,7.2500,2,0
1,2,1,1,0,38.0,1,0,71.2833,0,0
2,3,1,3,0,26.0,0,0,7.9250,2,0
3,4,1,1,0,35.0,1,0,53.1000,2,0
4,5,0,3,1,35.0,0,0,8.0500,2,0
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,1,27.0,0,0,13.0000,2,0
887,888,1,1,0,19.0,0,0,30.0000,2,0
888,889,0,3,0,0.0,1,2,23.4500,2,1
889,890,1,1,1,26.0,0,0,30.0000,0,0


In [13]:
# Standardize the Age and Fare columns with a StandardScaler fitted on them.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# done later in the notebook, so held-out rows contribute to its statistics.
Normalize = StandardScaler()
Normalize_cols = ["Age", "Fare"]
scaled_values = Normalize.fit(df[Normalize_cols]).transform(df[Normalize_cols])
df[Normalize_cols] = scaled_values
df

Unnamed: 0,ID,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Missage
0,1,0,3,1,-0.099150,1,0,-0.500240,2,0
1,2,1,1,0,0.812389,1,0,0.788947,0,0
2,3,1,3,0,0.128735,0,0,-0.486650,2,0
3,4,1,1,0,0.641476,1,0,0.422861,2,0
4,5,0,3,1,0.641476,0,0,-0.484133,2,0
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,1,0.185706,0,0,-0.384475,2,0
887,888,1,1,0,-0.270063,0,0,-0.042213,2,0
888,889,0,3,0,-1.352516,1,2,-0.174084,2,1
889,890,1,1,1,0.128735,0,0,-0.042213,0,0


In [14]:
# Separate the target from the features. The passenger ID is not dropped for
# good: it is divided by 1e7 and appended as the LAST feature column, which is
# how later cells recover a passenger from a sample (last value * 1e7).
y = df['Survived']
X = df.drop(columns=['Survived'])

# IDs as a float32 column vector, scaled down by 1e7.
IDs = X['ID'].values.astype(np.float32).reshape(-1, 1)
IDs = IDs / 1e7

# All remaining feature columns as float32, then the scaled ID column on the right.
X_withNotID = X.drop(columns=['ID']).values.astype(np.float32)
X_withNotID = np.concatenate((X_withNotID, IDs), axis=1)

# Two-column one-hot encoding of the Survived target.
y_oneHot = to_categorical(y.values, num_classes=2)

X_withNotID.shape

(889, 9)

In [15]:
# Display the shape of the one-hot encoded target matrix.
y_oneHot.shape

(889, 2)

In [16]:
# Hold out 20% of the rows as a test set; random_state=42 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X_withNotID,
    y_oneHot,
    test_size=0.2,
    random_state=42,
)
# Print the shape of each split, in the order train/test features, then labels.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)


(711, 9)
(178, 9)
(711, 2)
(178, 2)


In [17]:
# Wrap each split as an unbatched tf.data dataset of (features, one-hot label)
# pairs; batching is applied by the cells that consume them.
train_ds, test_ds = (
    tf.data.Dataset.from_tensor_slices(pair)
    for pair in ((X_train, y_train), (X_test, y_test))
)

In [18]:
# Print the number of elements in the train and test datasets.
for dataset in (train_ds, test_ds):
    print(len(dataset))

711
178


In [19]:
# Small fully connected classifier over the feature matrix.
# The input width is taken from X_train instead of being hard-coded, and it is
# declared with an explicit Input layer: passing input_shape= to the first
# Dense layer is what makes Keras emit the warning at the start of training.
# The last layer is a 2-unit sigmoid, so the model outputs probabilities (not
# logits); any loss evaluated on its outputs must use from_logits=False.
model = Sequential([
        layers.Input(shape=(X_train.shape[1],)),
        layers.Dense(32, activation='relu'),
        layers.Dense(16, activation='relu'),
        layers.Dense(2, activation='sigmoid')
    ])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train for 10 epochs in batches of 32, validating on the held-out split, then
# report [loss, accuracy] on that same split.
model.fit(train_ds.batch(32), epochs=10, validation_data=test_ds.batch(32), verbose=2)
model.evaluate(test_ds.batch(32), verbose=2)

Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


23/23 - 2s - 77ms/step - accuracy: 0.5345 - loss: 0.6921 - val_accuracy: 0.6124 - val_loss: 0.6655
Epoch 2/10
23/23 - 0s - 6ms/step - accuracy: 0.6174 - loss: 0.6522 - val_accuracy: 0.6124 - val_loss: 0.6396
Epoch 3/10
23/23 - 0s - 2ms/step - accuracy: 0.6273 - loss: 0.6300 - val_accuracy: 0.6236 - val_loss: 0.6191
Epoch 4/10
23/23 - 0s - 2ms/step - accuracy: 0.6526 - loss: 0.6106 - val_accuracy: 0.6629 - val_loss: 0.5992
Epoch 5/10
23/23 - 0s - 2ms/step - accuracy: 0.6934 - loss: 0.5913 - val_accuracy: 0.6966 - val_loss: 0.5810
Epoch 6/10
23/23 - 0s - 2ms/step - accuracy: 0.7131 - loss: 0.5724 - val_accuracy: 0.7528 - val_loss: 0.5639
Epoch 7/10
23/23 - 0s - 2ms/step - accuracy: 0.7243 - loss: 0.5525 - val_accuracy: 0.7528 - val_loss: 0.5450
Epoch 8/10
23/23 - 0s - 2ms/step - accuracy: 0.7412 - loss: 0.5315 - val_accuracy: 0.7697 - val_loss: 0.5291
Epoch 9/10
23/23 - 0s - 2ms/step - accuracy: 0.7567 - loss: 0.5138 - val_accuracy: 0.8034 - val_loss: 0.5172
Epoch 10/10
23/23 - 0s - 2ms/

[0.5078285932540894, 0.8089887499809265]

In [20]:
from deel.influenciae.common import InfluenceModel, ExactIHVP
from deel.influenciae.influence import FirstOrderInfluenceCalculator
from deel.influenciae.utils import ORDER
from deel.influenciae.trac_in import TracIn
from keras.losses import BinaryCrossentropy
import warnings
# Install an 'ignore' filter for all Python warnings from this point on.
# NOTE(review): this presumably also hides deprecation/compatibility warnings
# from TensorFlow, Keras and influenciae that could help diagnose the failures
# seen in later cells - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [26]:
# First-order influence explanation: for a handful of test passengers, list
# the training passengers with the highest influence on their prediction.
#
# NOTE(review): the traceback recorded under this cell is raised inside
# influenciae's hessian_sum, when a Keras variable is converted to NumPy
# during graph tracing. That looks like an incompatibility between the
# installed Keras and influenciae versions - confirm against the environment;
# the name/indexing fixes in this cell do not address it.

# The model ends in a sigmoid layer, so its outputs are probabilities and the
# loss must not treat them as logits. Reduction is disabled so that the loss
# is returned per sample.
unreduced_loss = BinaryCrossentropy(from_logits=False, reduction=tf.keras.losses.Reduction.NONE)
influence_model = InfluenceModel(model, start_layer=-1, loss_function=unreduced_loss)
ihvp_calculator = ExactIHVP(influence_model, train_dataset=train_ds.shuffle(100).batch(4))
influence_calculator = FirstOrderInfluenceCalculator(influence_model, train_ds.batch(8), ihvp_calculator)

# Explain the first five test samples, one sample per batch.
samples_to_explain = test_ds.take(5).batch(1)
explanation_ds = influence_calculator.top_k(samples_to_explain, train_ds.batch(8), k = 3, order = ORDER.DESCENDING)


for (sample, label), top_k_values, top_k_samples in explanation_ds.as_numpy_iterator():
  # The last feature holds the passenger ID divided by 1e7; undo that scaling
  # and round to recover the integer ID.
  sample_id = int(round(float(sample[0][-1]) * 1e7))
  sample_original = original_df[original_df['PassengerId'] == sample_id]
  print(f"\nTest Sample ID: {sample_id}")
  print("Original Sample from DataFrame:")
  print(sample_original[["Survived"]])
  influential_ids = [int(round(float(s[-1]) * 1e7)) for s in top_k_samples[0]]
  for i, (inf_id, score) in enumerate(zip(influential_ids, top_k_values[0])):
    inf_sample_original = original_df[original_df['PassengerId'] == inf_id]
    print(f"Influential Sample {i + 1} -> ID: {inf_id}, Influence Score: {score}")
    print(inf_sample_original[["Survived"]])


NotImplementedError: in user code:

    File "/usr/local/lib/python3.11/dist-packages/deel/influenciae/common/inverse_hessian_vector_product.py", line 220, in hessian_sum  *
        tape_hess.watch(weights)
    File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/common/variables.py", line 372, in __repr__
        value = backend.core.convert_to_numpy(self._value)
    File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/core.py", line 155, in convert_to_numpy
        return np.array(x)

    NotImplementedError: numpy() is only available when eager execution is enabled.


In [29]:
# Retrain the classifier one epoch at a time and keep one InfluenceModel per
# checkpoint (initial weights + after every epoch) for TracIn.
# The input width is taken from X_train and declared with an explicit Input
# layer rather than input_shape= on the first Dense layer.
model = Sequential([
        layers.Input(shape=(X_train.shape[1],)),
        layers.Dense(32, activation='relu'),
        layers.Dense(16, activation='relu'),
        layers.Dense(2, activation='sigmoid')
    ])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

epochs = 10
# The model ends in a sigmoid layer, so the loss receives probabilities, not
# logits. Reduction is disabled so that the loss is returned per sample.
unreduced_loss_fn = BinaryCrossentropy(from_logits=False, reduction=tf.keras.losses.Reduction.NONE)


def snapshot_as_influence_model(source_model):
  """Wrap a frozen copy of ``source_model``'s current weights in an InfluenceModel.

  The copy is required because ``source_model`` keeps training afterwards:
  wrapping it directly would make every checkpoint share the final weights.
  """
  frozen_model = tf.keras.models.clone_model(source_model)
  frozen_model.set_weights(source_model.get_weights())
  return InfluenceModel(frozen_model, start_layer=-1, loss_function=unreduced_loss_fn)


# Checkpoint 0: the untrained weights.
model_list = [snapshot_as_influence_model(model)]
for i in range(epochs):
  model.fit(train_ds.batch(32), epochs=1, validation_data=test_ds.batch(32), verbose=2)
  # Checkpoint i + 1: the weights as they are after this epoch.
  model_list.append(snapshot_as_influence_model(model))
model.evaluate(test_ds.batch(32), verbose=2)

23/23 - 1s - 52ms/step - accuracy: 0.5837 - loss: 0.6863 - val_accuracy: 0.6404 - val_loss: 0.6630
23/23 - 0s - 2ms/step - accuracy: 0.6245 - loss: 0.6489 - val_accuracy: 0.6404 - val_loss: 0.6376
23/23 - 0s - 2ms/step - accuracy: 0.6329 - loss: 0.6257 - val_accuracy: 0.6517 - val_loss: 0.6202
23/23 - 0s - 2ms/step - accuracy: 0.6484 - loss: 0.6105 - val_accuracy: 0.6798 - val_loss: 0.6058
23/23 - 0s - 2ms/step - accuracy: 0.6779 - loss: 0.5956 - val_accuracy: 0.7022 - val_loss: 0.5894
23/23 - 0s - 3ms/step - accuracy: 0.7004 - loss: 0.5782 - val_accuracy: 0.7191 - val_loss: 0.5692
23/23 - 0s - 2ms/step - accuracy: 0.7201 - loss: 0.5586 - val_accuracy: 0.7303 - val_loss: 0.5475
23/23 - 0s - 2ms/step - accuracy: 0.7511 - loss: 0.5390 - val_accuracy: 0.7697 - val_loss: 0.5273
23/23 - 0s - 2ms/step - accuracy: 0.7792 - loss: 0.5217 - val_accuracy: 0.7921 - val_loss: 0.5100
23/23 - 0s - 2ms/step - accuracy: 0.7904 - loss: 0.5074 - val_accuracy: 0.8034 - val_loss: 0.4965
6/6 - 0s - 2ms/step

[0.4965496361255646, 0.8033707737922668]

In [31]:
# TracIn explanation over the per-epoch checkpoints in model_list: for a
# handful of test passengers, list the most influential training passengers.
#
# NOTE(review): 0.01 is passed as the checkpoints' learning rate, while the
# model is compiled with optimizer='adam' and no explicit rate - confirm that
# this value is the intended one.
# NOTE(review): the traceback recorded under this cell is raised inside
# influenciae's batch_jacobian_tensor, when Keras variables are converted to
# NumPy during graph tracing. That looks like an incompatibility between the
# installed Keras and influenciae versions - confirm against the environment;
# the name/indexing fixes in this cell do not address it.
influence_calculator = TracIn(model_list, 0.01)

# Explain the first five test samples, one sample per batch.
samples_to_explain = test_ds.take(5).batch(1)
explanation_ds = influence_calculator.top_k(samples_to_explain, train_ds.batch(8), k = 3, order = ORDER.DESCENDING)

for (sample, label), top_k_values, top_k_samples in explanation_ds.as_numpy_iterator():
  # The last feature holds the passenger ID divided by 1e7; undo that scaling
  # and round to recover the integer ID.
  sample_id = int(round(float(sample[0][-1]) * 1e7))
  sample_original = original_df[original_df['PassengerId'] == sample_id]
  print(f"\nTest Sample ID: {sample_id}")
  print("Original Sample from DataFrame:")
  print(sample_original[["Survived"]])
  influential_ids = [int(round(float(s[-1]) * 1e7)) for s in top_k_samples[0]]
  for i, (inf_id, score) in enumerate(zip(influential_ids, top_k_values[0])):
    inf_sample_original = original_df[original_df['PassengerId'] == inf_id]
    print(f"Influential Sample {i + 1} -> ID: {inf_id}, Influence Score: {score}")
    print(inf_sample_original[["Survived"]])

NotImplementedError: in user code:

    File "/usr/local/lib/python3.11/dist-packages/deel/influenciae/utils/tf_operations.py", line 272, in map_fun_device  *
        result = map_fun(*args)
    File "/usr/local/lib/python3.11/dist-packages/deel/influenciae/common/base_influence.py", line 263, in None  *
        lambda *batch: (batch, self._compute_influence_vector(batch))
    File "/usr/local/lib/python3.11/dist-packages/deel/influenciae/trac_in/tracin.py", line 69, in _compute_influence_vector  *
        g_train = model.batch_jacobian_tensor(train_samples)
    File "/usr/local/lib/python3.11/dist-packages/deel/influenciae/common/model_wrappers.py", line 275, in batch_jacobian_tensor  *
        jacobians = BaseInfluenceModel._jacobian(self.model, self.weights, self.loss_function,
    File "/usr/local/lib/python3.11/dist-packages/tensorflow/core/function/polymorphism/function_type.py", line 583, in canonicalize_to_monomorphic
        _make_validated_mono_param(name, arg, poly_parameter.kind,
    File "/usr/local/lib/python3.11/dist-packages/tensorflow/core/function/polymorphism/function_type.py", line 522, in _make_validated_mono_param
        mono_type = trace_type.from_value(value, type_context)
    File "/usr/local/lib/python3.11/dist-packages/tensorflow/core/function/trace_type/trace_type_builder.py", line 162, in from_value
        return default_types.List(*(from_value(c, context) for c in value))
    File "/usr/local/lib/python3.11/dist-packages/tensorflow/core/function/trace_type/trace_type_builder.py", line 162, in <genexpr>
        return default_types.List(*(from_value(c, context) for c in value))
    File "/usr/local/lib/python3.11/dist-packages/tensorflow/core/function/trace_type/trace_type_builder.py", line 185, in from_value
        ndarray = value.__array__()
    File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/common/variables.py", line 415, in __array__
        return np.asarray(self.value.__array__(dtype))

    NotImplementedError: numpy() is only available when eager execution is enabled.
