In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_selection import chi2, SelectKBest, f_classif, mutual_info_classif
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import tensorflow.keras.backend as k
from tensorflow.keras import Model, Input, layers
import lime
from lime import lime_tabular
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dense, Flatten, Dropout

In [None]:
df_dataClass = pd.read_csv('DataClass.csv')

In [None]:
df_dataClass.head()

In [None]:
df_dataClass.shape

In [None]:
df_dataClass.dtypes

In [None]:
# Encode the target column as integer codes; `unique_lables` keeps the original
# values, positioned by the code assigned to each of them.
label_codes, unique_lables = pd.factorize(df_dataClass['is_data_class'])
df_dataClass['numeric_label'] = label_codes
df_dataClass.head()

In [None]:
df_dataClass = df_dataClass.drop(columns=['is_data_class'])

In [None]:
df_dataClass.head()

In [None]:
# Report whether any cell of the frame is below zero.
negative_values = (df_dataClass < 0).to_numpy().any()
message = "Dataset contains negative values!" if negative_values else "No negative values found."
print(message)

In [None]:
# Keep only the rows in which every column is >= 0.
rows_without_negatives = df_dataClass.ge(0).all(axis=1)
df_dataClass_cleaned = df_dataClass.loc[rows_without_negatives]

In [None]:
df_dataClass_cleaned.shape

In [None]:
# Separate the encoded target column from the feature columns.
y = df_dataClass_cleaned['numeric_label']
x = df_dataClass_cleaned.drop('numeric_label', axis=1)

Model Accuracy Without Feature Selection

train test split

In [None]:
selected_independent_df = x
selected_independent_df

In [None]:
independent_array = selected_independent_df.to_numpy()
independent_array

In [None]:
selected_dependent_df = y
selected_dependent_df

In [None]:
dependent_array = selected_dependent_df.to_numpy()
dependent_array

In [None]:
# 80/20 hold-out split, stratified on the label so both parts keep the class ratio.
x_train, x_test, y_train, y_test = train_test_split(
    independent_array,
    dependent_array,
    test_size=0.2,
    random_state=42,
    stratify=dependent_array,
)

K Fold Cross Validation

In [None]:
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

ADDING ATTENTION LAYER TO THE MODEL

In [None]:
def attention_block(inputs):
    """Gate ``inputs`` with learned softmax weights over its last axis.

    A ``Dense`` layer with as many units as ``inputs`` has entries on its last
    axis, followed by a softmax activation, produces the weights; these are
    multiplied element-wise with ``inputs``.

    Args:
        inputs: Keras tensor whose last dimension is statically known.

    Returns:
        Keras tensor with the same shape as ``inputs``.
    """
    # Read the size from the tensor itself rather than through the legacy
    # backend helper ``k.int_shape``.
    n_channels = inputs.shape[-1]
    attention_probs = layers.Dense(n_channels, activation='softmax')(inputs)
    return layers.multiply([inputs, attention_probs])

In [None]:
# Fold counter and one accuracy list per data split, filled by the CV loop.
train_acc_per_fold, val_acc_per_fold, test_acc_per_fold = [], [], []
fold_no = 1

In [None]:
# Stratified k-fold training of the Conv1D + attention network on all features.
#
# The result lists are (re)created here so that re-running this cell starts
# from a clean state instead of appending a second set of scores.
train_acc_per_fold = []
val_acc_per_fold = []
test_acc_per_fold = []

# Number of feature columns fed to the network.
n_features = x_train.shape[1]

for fold_no, (train, val) in enumerate(cv.split(x_train, y_train), start=1):

    print('     ')
    print(f'Training for fold {fold_no}: ')

    # Release the previous fold's Keras state before building a fresh model.
    k.clear_session()

    # NOTE(review): x_train is 2-D (samples, features) while the network input
    # is (features, 1); this relies on Keras adding the trailing axis — confirm
    # for the installed Keras version.
    input_layer = Input(shape=(n_features, 1))
    conv1 = Conv1D(128, kernel_size=1, activation='relu')(input_layer)
    conv2 = Conv1D(64, kernel_size=1, activation='tanh')(conv1)
    attention_output = attention_block(conv2)
    flatten = Flatten()(attention_output)
    dense1 = Dense(64, activation='relu')(flatten)
    dropout3 = Dropout(0.4)(dense1)
    output = Dense(1, activation='sigmoid')(dropout3)
    model = Model(inputs=input_layer, outputs=output)

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    history = model.fit(
        x_train[train], y_train[train],
        epochs=32,
        batch_size=2,
        validation_data=(x_train[val], y_train[val])
    )

    # evaluate() returns [loss, accuracy]; accuracy is stored as a percentage.
    scores_train = model.evaluate(x_train[train], y_train[train])
    train_acc_per_fold.append(scores_train[1] * 100)

    scores_val = model.evaluate(x_train[val], y_train[val])
    val_acc_per_fold.append(scores_val[1] * 100)

    scores_test = model.evaluate(x_test, y_test)
    test_acc_per_fold.append(scores_test[1] * 100)

# After the loop, `model` and `history` refer to the LAST fold only; the cells
# below (curves, confusion matrix, report) therefore describe that fold.



In [None]:
# Loss curves of the most recently trained model (the `history` object).
loss, val_loss = (history.history[key] for key in ('loss', 'val_loss'))
epochs = range(1, len(loss) + 1)

fig, ax = plt.subplots()
ax.plot(epochs, loss, 'y', label='Training loss')
ax.plot(epochs, val_loss, 'r', label='Validation loss')
ax.set_title('Training and Validation loss')
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.legend()
plt.show()

In [None]:
# Accuracy curves of the most recently trained model; `epochs` comes from the
# loss-plot cell above.
acc, val_acc = (history.history[key] for key in ('accuracy', 'val_accuracy'))

fig, ax = plt.subplots()
ax.plot(epochs, acc, 'y', label='Training acc')
ax.plot(epochs, val_acc, 'r', label='Validation acc')
ax.set_title('Training and Validation accuracy')
ax.set_xlabel("Epochs")
ax.set_ylabel("Accuracy")
ax.legend()
plt.show()

In [None]:
# Per-fold training accuracy (percent), numbered from 1.
# The label previously read "wihtout"; it now matches the validation/test cells.
for serial, acc in enumerate(train_acc_per_fold, start=1):
    print(f"{serial}. Training Accuracy with attention layer without FS: ", acc)

In [None]:
# Per-fold validation accuracy, numbered from 1.
for serial, acc in enumerate(val_acc_per_fold, start=1):
    print(f"{serial}. Validation Accuracy with attention layer without FS: ", acc)

In [None]:
# Per-fold accuracy on the held-out test split, numbered from 1.
for serial, acc in enumerate(test_acc_per_fold, start=1):
    print(f"{serial}. Test Accuracy with attention layer without FS: ", acc)

In [None]:
print("Training accuracy with attention layer without FS: ", np.mean(train_acc_per_fold))

In [None]:
print("Validation accuracy with attention layer without FS: ", np.mean(val_acc_per_fold))

In [None]:
print("Testing accuracy with attention layer without FS: ", np.mean(test_acc_per_fold))

drawing the confusion matrix

In [None]:
# Sigmoid outputs of the current `model`, thresholded at 0.5 into class ids.
y_pred = model.predict(x_test)
y_pred_class = np.where(y_pred > 0.5, 1, 0).astype("int32")

In [None]:
# Confusion matrix of the current `model` on the test split.
cm = confusion_matrix(y_test, y_pred_class, labels=[0, 1])

# fmt='d' prints the cell counts as plain integers; the default format switches
# to scientific notation once a count reaches three digits.
ax = sns.heatmap(cm, cmap='Greens', annot=True, fmt='d')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion matrix (test set)')
plt.show()


Evaluation Metrics

In [None]:
print(classification_report(y_test, y_pred_class))

chi square technique

In [None]:
chi_score = chi2(x,y)

In [None]:
# chi_score[0] holds the chi-square statistic per feature; rank them from the
# highest to the lowest. A new sorted Series is built instead of sorting in
# place, so the cell gives the same result every time it is run.
chi_values = pd.Series(chi_score[0], index=x.columns).sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(16, 6))
chi_values.plot.bar(ax=ax)
ax.set_title('Chi-square statistic per feature')
ax.set_xlabel('Feature')
ax.set_ylabel('Chi-square statistic')
plt.show()

In [None]:
# Keep the 25 features with the highest chi-square score.
chi_selector = SelectKBest(score_func=chi2, k=25)
chi_selector.fit(x, y)
x_chi_selected = chi_selector.transform(x)
chi_support = chi_selector.get_support()
chi_features_dataClass = x.columns[chi_support]

In [None]:
print("selected features using chi2: ", chi_features_dataClass)

correlation technique

In [None]:
cor_dataClass = df_dataClass_cleaned.corr()
cor_dataClass

In [None]:
# Absolute correlation of every column with the label, strongest first.
target = cor_dataClass['numeric_label'].abs().sort_values(ascending=False)
target

In [None]:
# Columns whose absolute correlation with the label exceeds 0.23. The label
# column itself is part of `target`, so it is still included at this point.
is_strongly_correlated = target > 0.23
corr_features_dataClass = target.loc[is_strongly_correlated]
corr_features_dataClass

In [None]:
len(corr_features_dataClass)

In [None]:
# Names of the correlation-selected features, without the label column.
# The label is removed by name rather than by position, so the result stays
# correct even if another column ties with it and sorts ahead of it.
corr_feature_list_dataClass = [
    feature
    for feature in corr_features_dataClass.index
    if feature != 'numeric_label'
]
corr_feature_list_dataClass

In [None]:
len(corr_feature_list_dataClass)

anova f-test

In [None]:
# Keep the 25 features with the highest ANOVA F-score.
fvalue_selector = SelectKBest(score_func=f_classif, k=25)
fvalue_selector.fit(x, y)
x_kbest = fvalue_selector.transform(x)
x_kbest

In [None]:
selected_features_anova_mask = fvalue_selector.get_support()
selected_features_anova_dataClass = x.columns[selected_features_anova_mask]
selected_features_anova_dataClass

mutual information gain

In [None]:
# Mutual information between each feature and the label. The estimator is
# randomised, so a fixed seed is passed to make the scores repeatable.
mutual_info = mutual_info_classif(x, y, random_state=42)
mutual_info

In [None]:
# Label the scores with the feature names and show the 20 highest.
mutual_info = pd.Series(mutual_info, index=x.columns)
mutual_info.sort_values(ascending=False).head(20)

In [None]:
def seeded_mutual_info(features, labels):
    """Score features with mutual information using a fixed random seed."""
    return mutual_info_classif(features, labels, random_state=42)


# Keep the 25 features with the highest mutual information. The seeded scorer
# makes the selected set identical from run to run.
select_mutual = SelectKBest(seeded_mutual_info, k=25)
select_mutual.fit(x, y)
select_mutual_dataClass = x.columns[select_mutual.get_support()]
select_mutual_dataClass

intersection approach (features selected by all four methods)

In [None]:
# Features chosen by every one of the four selection methods. The result is
# sorted because set iteration order is not stable between kernel sessions.
common_features = sorted(
    set(chi_features_dataClass)
    & set(selected_features_anova_dataClass)
    & set(select_mutual_dataClass)
    & set(corr_feature_list_dataClass)
)
print(
    "Common features (intersection of chi-square, ANOVA F-test, "
    "mutual information and correlation):",
    common_features,
)

In [None]:
len(common_features)

voting approach

In [None]:
# Count how many of the four selection methods picked each feature.
feature_groups = (
    chi_features_dataClass,
    selected_features_anova_dataClass,
    select_mutual_dataClass,
    corr_feature_list_dataClass,
)
all_features_dataClass = [feature for group in feature_groups for feature in group]
voted_features_dataClass = Counter(all_features_dataClass)
voted_features_dataClass

In [None]:
# A feature is kept when at least three of the four methods voted for it.
best_features_dataClass = []
for feature, count in voted_features_dataClass.items():
    if count >= 3:
        best_features_dataClass.append(feature)
print("Best features through voting:", best_features_dataClass)

In [None]:
len(best_features_dataClass)

train test split

In [None]:
selected_independent_df = df_dataClass_cleaned[best_features_dataClass]
selected_independent_df

In [None]:
independent_array = selected_independent_df.to_numpy()
independent_array

In [None]:
selected_dependent_df = df_dataClass_cleaned['numeric_label']
selected_dependent_df

In [None]:
dependent_array = selected_dependent_df.to_numpy()
dependent_array

In [None]:
# 80/20 hold-out split of the voted feature subset, stratified on the label.
x_train, x_test, y_train, y_test = train_test_split(
    independent_array,
    dependent_array,
    test_size=0.2,
    random_state=42,
    stratify=dependent_array,
)

K Fold Cross Validation

In [None]:
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

Deep Learning Model

In [None]:
# Fold counter and one accuracy list per data split, filled by the CV loop.
train_acc_per_fold, val_acc_per_fold, test_acc_per_fold = [], [], []
fold_no = 1

In [None]:
# Stratified k-fold training of the plain Conv1D network on the voted features.
#
# The result lists are (re)created here so that re-running this cell starts
# from a clean state instead of appending a second set of scores.
train_acc_per_fold = []
val_acc_per_fold = []
test_acc_per_fold = []

# Number of feature columns fed to the network.
n_features = x_train.shape[1]

for fold_no, (train, val) in enumerate(cv.split(x_train, y_train), start=1):

    print('     ')
    print(f'Training for fold {fold_no}: ')

    # Release the previous fold's Keras state before building a fresh model.
    k.clear_session()

    # NOTE(review): x_train is 2-D (samples, features) while the network input
    # is (features, 1); this relies on Keras adding the trailing axis — confirm
    # for the installed Keras version.
    model = Sequential([
        Conv1D(128, kernel_size=1, activation='relu', input_shape=(n_features, 1)),
        Conv1D(64, kernel_size=1, activation='tanh'),
        Flatten(),
        Dense(64, activation='relu'),
        Dropout(0.4),
        Dense(1, activation='sigmoid'),
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    history = model.fit(
        x_train[train], y_train[train],
        epochs=32,
        batch_size=2,
        validation_data=(x_train[val], y_train[val])
    )

    # evaluate() returns [loss, accuracy]; accuracy is stored as a percentage.
    scores_train = model.evaluate(x_train[train], y_train[train])
    train_acc_per_fold.append(scores_train[1] * 100)

    scores_val = model.evaluate(x_train[val], y_train[val])
    val_acc_per_fold.append(scores_val[1] * 100)

    scores_test = model.evaluate(x_test, y_test)
    test_acc_per_fold.append(scores_test[1] * 100)

# After the loop, `model` and `history` refer to the LAST fold only; the cells
# below (curves, confusion matrix, report, LIME) therefore describe that fold.



In [None]:
# Loss curves of the most recently trained model (the `history` object).
loss, val_loss = (history.history[key] for key in ('loss', 'val_loss'))
epochs = range(1, len(loss) + 1)

fig, ax = plt.subplots()
ax.plot(epochs, loss, 'y', label='Training loss')
ax.plot(epochs, val_loss, 'r', label='Validation loss')
ax.set_title('Training and Validation loss')
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.legend()
plt.show()

In [None]:
# Accuracy curves of the most recently trained model; `epochs` comes from the
# loss-plot cell above.
acc, val_acc = (history.history[key] for key in ('accuracy', 'val_accuracy'))

fig, ax = plt.subplots()
ax.plot(epochs, acc, 'y', label='Training acc')
ax.plot(epochs, val_acc, 'r', label='Validation acc')
ax.set_title('Training and Validation accuracy')
ax.set_xlabel("Epochs")
ax.set_ylabel("Accuracy")
ax.legend()
plt.show()

In [None]:
# Per-fold training accuracy, numbered from 1.
for serial, acc in enumerate(train_acc_per_fold, start=1):
    print(f"{serial}. Training Accuracy: ", acc)

In [None]:
# Per-fold validation accuracy, numbered from 1.
for serial, acc in enumerate(val_acc_per_fold, start=1):
    print(f"{serial}. Validation Accuracy: ", acc)

In [None]:
# Per-fold accuracy on the held-out test split, numbered from 1.
# These scores come from the plain CNN trained above, so the label no longer
# says "with attention layer" and matches the training/validation cells.
for serial, acc in enumerate(test_acc_per_fold, start=1):
    print(f"{serial}. Test Accuracy: ", acc)

In [None]:
print("Training accuracy: ", np.mean(train_acc_per_fold))

In [None]:
print("Validation accuracy: ", np.mean(val_acc_per_fold))

In [None]:
print("Testing accuracy: ", np.mean(test_acc_per_fold))

drawing the confusion matrix

In [None]:
# Sigmoid outputs of the current `model`, thresholded at 0.5 into class ids.
y_pred = model.predict(x_test)
y_pred_class = np.where(y_pred > 0.5, 1, 0).astype("int32")

In [None]:
# Confusion matrix of the current `model` on the test split.
cm = confusion_matrix(y_test, y_pred_class, labels=[0, 1])

# fmt='d' prints the cell counts as plain integers; the default format switches
# to scientific notation once a count reaches three digits.
ax = sns.heatmap(cm, cmap='Greens', annot=True, fmt='d')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion matrix (test set)')
plt.show()


Evaluation Metrics

In [None]:
print(classification_report(y_test, y_pred_class))

Lime implementation

In [None]:
# Class names in the order of the integer codes in `numeric_label`.
# pd.factorize numbers the values by order of first appearance, so the names
# are taken from `unique_lables` rather than assumed to be False -> 0, True -> 1.
output_class_names = [str(label) for label in unique_lables]

# LIME perturbs samples randomly; a fixed seed keeps the explanation repeatable.
interpretor = lime_tabular.LimeTabularExplainer(
    x_train,
    class_names=output_class_names,
    feature_names=best_features_dataClass,
    mode='classification',
    random_state=42
)

In [None]:
prediction_case = 4

In [None]:
def predict_class_probabilities(samples):
    """Return an (n_samples, 2) array of [P(class 0), P(class 1)] for LIME.

    The network ends in a single sigmoid unit, so ``model.predict`` yields one
    column holding P(class 1). LIME's classification mode expects one column
    per class, so the complementary probability is added as column 0.
    """
    positive = model.predict(samples)
    return np.hstack([1.0 - positive, positive])


# Explain the prediction for one test row using every selected feature.
exp = interpretor.explain_instance(
    x_test[prediction_case],
    predict_class_probabilities,
    num_features=len(best_features_dataClass),
    top_labels=2
)
exp.show_in_notebook(show_table=True)

In [None]:
y_test[prediction_case]

ADDING ATTENTION LAYER TO THE MODEL

In [None]:
# NOTE: this notebook also defines attention_block in an earlier cell; running
# this cell rebinds the name to the definition below.
def attention_block(inputs):
    """Gate ``inputs`` with learned softmax weights over its last axis.

    A ``Dense`` layer with as many units as ``inputs`` has entries on its last
    axis, followed by a softmax activation, produces the weights; these are
    multiplied element-wise with ``inputs``.

    Args:
        inputs: Keras tensor whose last dimension is statically known.

    Returns:
        Keras tensor with the same shape as ``inputs``.
    """
    # Read the size from the tensor itself rather than through the legacy
    # backend helper ``k.int_shape``.
    n_channels = inputs.shape[-1]
    attention_probs = layers.Dense(n_channels, activation='softmax')(inputs)
    return layers.multiply([inputs, attention_probs])

In [None]:
# Fold counter and one accuracy list per data split, filled by the CV loop.
train_acc_per_fold, val_acc_per_fold, test_acc_per_fold = [], [], []
fold_no = 1

In [None]:
# Stratified k-fold training of the Conv1D + attention network on the voted
# features.
#
# The result lists are (re)created here so that re-running this cell starts
# from a clean state instead of appending a second set of scores.
train_acc_per_fold = []
val_acc_per_fold = []
test_acc_per_fold = []

# Number of feature columns fed to the network.
n_features = x_train.shape[1]

for fold_no, (train, val) in enumerate(cv.split(x_train, y_train), start=1):

    print('     ')
    print(f'Training for fold {fold_no}: ')

    # Release the previous fold's Keras state before building a fresh model.
    k.clear_session()

    # NOTE(review): x_train is 2-D (samples, features) while the network input
    # is (features, 1); this relies on Keras adding the trailing axis — confirm
    # for the installed Keras version.
    input_layer = Input(shape=(n_features, 1))
    conv1 = Conv1D(128, kernel_size=1, activation='relu')(input_layer)
    conv2 = Conv1D(64, kernel_size=1, activation='tanh')(conv1)
    attention_output = attention_block(conv2)
    flatten = Flatten()(attention_output)
    dense1 = Dense(64, activation='relu')(flatten)
    dropout3 = Dropout(0.4)(dense1)
    output = Dense(1, activation='sigmoid')(dropout3)
    model = Model(inputs=input_layer, outputs=output)

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    history = model.fit(
        x_train[train], y_train[train],
        epochs=32,
        batch_size=2,
        validation_data=(x_train[val], y_train[val])
    )

    # evaluate() returns [loss, accuracy]; accuracy is stored as a percentage.
    scores_train = model.evaluate(x_train[train], y_train[train])
    train_acc_per_fold.append(scores_train[1] * 100)

    scores_val = model.evaluate(x_train[val], y_train[val])
    val_acc_per_fold.append(scores_val[1] * 100)

    scores_test = model.evaluate(x_test, y_test)
    test_acc_per_fold.append(scores_test[1] * 100)

# After the loop, `model` and `history` refer to the LAST fold only; the cells
# below (curves, confusion matrix, report, LIME) therefore describe that fold.



In [None]:
# Loss curves of the most recently trained model (the `history` object).
loss, val_loss = (history.history[key] for key in ('loss', 'val_loss'))
epochs = range(1, len(loss) + 1)

fig, ax = plt.subplots()
ax.plot(epochs, loss, 'y', label='Training loss')
ax.plot(epochs, val_loss, 'r', label='Validation loss')
ax.set_title('Training and Validation loss')
ax.set_xlabel("Epochs")
ax.set_ylabel("Loss")
ax.legend()
plt.show()

In [None]:
# Accuracy curves of the most recently trained model; `epochs` comes from the
# loss-plot cell above.
acc, val_acc = (history.history[key] for key in ('accuracy', 'val_accuracy'))

fig, ax = plt.subplots()
ax.plot(epochs, acc, 'y', label='Training acc')
ax.plot(epochs, val_acc, 'r', label='Validation acc')
ax.set_title('Training and Validation accuracy')
ax.set_xlabel("Epochs")
ax.set_ylabel("Accuracy")
ax.legend()
plt.show()

In [None]:
# Per-fold training accuracy, numbered from 1.
for serial, acc in enumerate(train_acc_per_fold, start=1):
    print(f"{serial}. Training Accuracy with attention layer: ", acc)

In [None]:
# Per-fold validation accuracy, numbered from 1.
for serial, acc in enumerate(val_acc_per_fold, start=1):
    print(f"{serial}. Validation Accuracy with attention layer: ", acc)

In [None]:
# Per-fold accuracy on the held-out test split, numbered from 1.
for serial, acc in enumerate(test_acc_per_fold, start=1):
    print(f"{serial}. Test Accuracy with attention layer: ", acc)

In [None]:
print("Training accuracy with attention layer: ", np.mean(train_acc_per_fold))

In [None]:
print("Validation accuracy with attention layer: ", np.mean(val_acc_per_fold))

In [None]:
print("Testing accuracy with attention layer: ", np.mean(test_acc_per_fold))

drawing the confusion matrix

In [None]:
# Sigmoid outputs of the current `model`, thresholded at 0.5 into class ids.
y_pred = model.predict(x_test)
y_pred_class = np.where(y_pred > 0.5, 1, 0).astype("int32")

In [None]:
# Confusion matrix of the current `model` on the test split.
cm = confusion_matrix(y_test, y_pred_class, labels=[0, 1])

# fmt='d' prints the cell counts as plain integers; the default format switches
# to scientific notation once a count reaches three digits.
ax = sns.heatmap(cm, cmap='Greens', annot=True, fmt='d')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
ax.set_title('Confusion matrix (test set)')
plt.show()


Evaluation Metrics

In [None]:
print(classification_report(y_test, y_pred_class))

Lime Implementation For Attention Layer Model

In [None]:
# Class names in the order of the integer codes in `numeric_label`.
# pd.factorize numbers the values by order of first appearance, so the names
# are taken from `unique_lables` rather than assumed to be False -> 0, True -> 1.
output_class_names = [str(label) for label in unique_lables]

# LIME perturbs samples randomly; a fixed seed keeps the explanation repeatable.
interpretor = lime_tabular.LimeTabularExplainer(
    x_train,
    class_names=output_class_names,
    feature_names=best_features_dataClass,
    mode='classification',
    random_state=42
)

In [None]:
prediction_case = 4

In [None]:
def predict_class_probabilities(samples):
    """Return an (n_samples, 2) array of [P(class 0), P(class 1)] for LIME.

    The network ends in a single sigmoid unit, so ``model.predict`` yields one
    column holding P(class 1). LIME's classification mode expects one column
    per class, so the complementary probability is added as column 0.
    """
    positive = model.predict(samples)
    return np.hstack([1.0 - positive, positive])


# Explain the attention model's prediction for one test row using every
# selected feature.
exp = interpretor.explain_instance(
    x_test[prediction_case],
    predict_class_probabilities,
    num_features=len(best_features_dataClass),
    top_labels=2
)
exp.show_in_notebook(show_table=True)

In [None]:
y_test[prediction_case]

In [None]:
# Import from tensorflow.keras, like every other Keras import in this notebook,
# so the plotting utility comes from the same Keras package that built `model`.
from tensorflow.keras.utils import plot_model

In [None]:
plot_model(model,to_file='my_model.png')