Step 1: Import the Python libraries and define the functions needed for the experiments.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from scipy.spatial import cKDTree
import warnings
warnings.filterwarnings("ignore")

# function to calculate epsilon-representativeness of a subset.
def find_epsilon(X,y,X_res,y_res):
    """Return the epsilon-representativeness of a subset w.r.t. the full set.

    For every class label found in ``y``, each full-set point of that class
    is matched to its nearest subset point of the same class under the
    Chebyshev (L-infinity) distance. Epsilon is the largest of these
    nearest-neighbour distances over all classes.

    Parameters
    ----------
    X : 2-D array, one row per sample of the full set.
    y : 1-D array of class labels aligned with the rows of ``X``.
    X_res : 2-D array, one row per sample of the subset.
    y_res : 1-D array of class labels aligned with the rows of ``X_res``.

    Returns
    -------
    float
        The largest nearest-neighbour distance; ``0.0`` when no class of
        ``y`` has any point in the subset.

    Notes
    -----
    A class that is present in ``y`` but has no point in the subset is
    skipped, i.e. it does not contribute to epsilon.
    """
    epsilon = 0.0
    for cl in np.unique(y):
        subset_points = X_res[y_res == cl]
        if subset_points.shape[0] == 0:
            # No subset point of this class: nothing to query against.
            continue
        # p=np.inf selects the Chebyshev metric; query returns (distances, indices).
        distances, _ = cKDTree(subset_points).query(X[y == cl], p=np.inf)
        # Vectorised reduction instead of the builtin max over an ndarray.
        epsilon = max(epsilon, float(distances.max()))
    return epsilon

# function to generate a subset
def reduce(X,y,perc,seed):
    """Draw a random subset holding a fraction ``perc`` of ``(X, y)``.

    The data are shuffled with ``seed`` as the random state; only the
    "train" side of the split is returned, the complement is discarded.
    """
    parts = train_test_split(X, y, train_size=perc, shuffle=True, random_state=seed)
    # parts is ordered as [X_kept, X_rest, y_kept, y_rest].
    return parts[0], parts[2]

# function for the metric that measures how much the feature-importance orderings of two trees differ (0 means identical orderings; larger values mean less similar).
def compute_similarity_importanceFeatures(importance1, importance2):
    """Mean absolute rank displacement between two feature orderings.

    Both arguments are sequences of feature identifiers sorted by
    importance. For each feature in ``importance2`` the absolute difference
    between its position there and its position in ``importance1`` is
    accumulated, and the total is divided by the number of features.
    A value of 0 means the two orderings are identical; larger values mean
    the orderings differ more.

    Parameters
    ----------
    importance1, importance2 : sequence
        Feature identifiers (lists, tuples, NumPy arrays, ...). The
        identifiers must be hashable.

    Returns
    -------
    float
        The average displacement; ``0.0`` for two empty orderings.

    Raises
    ------
    ValueError
        If the sequences differ in length, or if an element of
        ``importance2`` does not occur in ``importance1``.
    """
    if len(importance1) != len(importance2):
        raise ValueError("The importance vectors must have the same length.")

    n_features = len(importance1)
    if n_features == 0:
        # Nothing to compare; avoid dividing by zero.
        return 0.0

    # Position of every feature in the first ordering, built once so each
    # lookup is O(1). setdefault keeps the first occurrence, like list.index.
    position_in_first = {}
    for position, feature in enumerate(importance1):
        position_in_first.setdefault(feature, position)

    total_distance = 0
    for position, feature in enumerate(importance2):
        if feature not in position_in_first:
            raise ValueError(f"{feature!r} is not in the first importance vector.")
        total_distance += abs(position_in_first[feature] - position)

    return total_distance / n_features

We load the data. The columns "N" and "m" are dropped because they are constant and therefore carry no information. MinMaxScaler is then used to scale and translate each feature individually into a given range, in this case between zero and one.

In [2]:
# Read the raw table; the 'collision' column is the classification target.
df = pd.read_excel('collision.xlsx')
y = df['collision'].to_numpy()

# Keep only the predictors: drop the target together with 'N' and 'm'.
df = df.drop(columns=['N', 'm', 'collision'])
feature_names = df.columns

# Rescale every feature with MinMaxScaler.
# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split performed in the next cell, so test-set min/max values influence the
# scaling. Consider fitting it on the training part only.
scaler = MinMaxScaler()
X = scaler.fit_transform(df.to_numpy())

We split the dataset into a training set, which contains 75% of the data, and a test set, which contains the remaining 25%. The split is stratified on the class label.

In [3]:
# Stratified, seeded 75/25 split of the scaled data.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=1, shuffle=True, stratify=y)
print(X_train.shape)
print(y_train.shape)
# The top-level pd.value_counts() is deprecated in recent pandas releases (and
# its warning is hidden by the global warnings filter); use the Series method.
print(pd.Series(y_train).value_counts())
print(X_test.shape)
print(y_test.shape)
print(pd.Series(y_test).value_counts())

(80407, 23)
(80407,)
1    52011
0    28396
Name: count, dtype: int64
(26803, 23)
(26803,)
1    17337
0     9466
Name: count, dtype: int64


Generate two random subsets, each containing 10% of the training set, and compute their epsilon-representativeness with respect to the training set.

In [4]:
# Both subsets use the same sampling fraction and differ only in the seed.
perc1 = perc2 = 0.1

# Subset 1 (seed 2) and its epsilon with respect to the training set.
X1, y1 = reduce(X_train, y_train, perc1, 2)
epsilon1 = find_epsilon(X_train, y_train, X1, y1)
print(f"Epsilon between Train set and Subset 1: {epsilon1}")

# Subset 2 (seed 7) and its epsilon with respect to the training set.
X2, y2 = reduce(X_train, y_train, perc2, 7)
epsilon2 = find_epsilon(X_train, y_train, X2, y2)
print(f"Epsilon between Train set and Subset 2: {epsilon2}")

Epsilon between Train set and Subset 1: 0.5394747548330112
Epsilon between Train set and Subset 2: 0.6554193881839727


# 1 Case: Decision Tree
We create one decision tree (DT) per set, all with the same parameters, train each on its corresponding set, and evaluate its accuracy and macro F1-score on the test set.

In [5]:
# Three identically configured trees: one for the whole training set and one
# for each random subset.
treeC = DecisionTreeClassifier(max_depth= 10,random_state=0)
tree1 = DecisionTreeClassifier(max_depth= 10,random_state=0)
tree2 = DecisionTreeClassifier(max_depth= 10,random_state=0)
treeC.fit(X_train, y_train)
tree1.fit(X1, y1)
tree2.fit(X2, y2)

# Every model is evaluated on the same held-out test set.
predictedC = treeC.predict(X_test)
predicted1 = tree1.predict(X_test)
predicted2 = tree2.predict(X_test)

cl_repC = classification_report(y_test, predictedC, output_dict=True, zero_division = 0)
cl_rep1 = classification_report(y_test, predicted1, output_dict=True, zero_division = 0)
cl_rep2 = classification_report(y_test, predicted2, output_dict=True, zero_division = 0)

# All three lines share the same "Accuracy: ..., F1-S: ..." layout.
print(f'Tree train with whole training set: Accuracy: {cl_repC["accuracy"]}, F1-S: {cl_repC["macro avg"]["f1-score"]}')
print(f'Tree train with subset 1: Accuracy: {cl_rep1["accuracy"]}, F1-S: {cl_rep1["macro avg"]["f1-score"]}')
print(f'Tree train with subset 2: Accuracy: {cl_rep2["accuracy"]}, F1-S: {cl_rep2["macro avg"]["f1-score"]}')

Tree train with whole training set: Accuracy: 0.874081259560497, F1-S: 0.861833961861421
Tree train with subset 1: Accuracy 0.8412491139051599, F1-S: 0.82687963782284
Tree train with subset 2: Accuracy: 0.8397940529045256, F1-S: 0.8248421209101444


Compute the feature-importance vectors and the similarity between the resulting feature orderings.

In [6]:
# Feature-importance vectors of the three fitted decision trees.
importancesDTC, importancesDT1, importancesDT2 = (
    treeC.feature_importances_,
    tree1.feature_importances_,
    tree2.feature_importances_,
)
feature_names = df.columns

# Feature indices ordered from most to least important.
sorted_indexDTC = np.argsort(importancesDTC)[::-1]
sorted_indexDT1 = np.argsort(importancesDT1)[::-1]
sorted_indexDT2 = np.argsort(importancesDT2)[::-1]

# The same orderings expressed as feature names.
sorted_featuresDTC = feature_names[sorted_indexDTC].tolist()
sorted_featuresDT1 = feature_names[sorted_indexDT1].tolist()
sorted_featuresDT2 = feature_names[sorted_indexDT2].tolist()

print("Order of importance of the features:")
print("DT trained with whole Trainig set : ", sorted_featuresDTC)
print("DT trained with subset 1: ", sorted_featuresDT1)
print("DT trained with subset 2: ", sorted_featuresDT2)

# Compare the ordering of each subset tree against the full-training-set tree.
similarity_feature_importanceDTC_1 = compute_similarity_importanceFeatures(
    sorted_featuresDTC, sorted_featuresDT1
)
similarity_feature_importanceDTC_2 = compute_similarity_importanceFeatures(
    sorted_featuresDTC, sorted_featuresDT2
)
print(similarity_feature_importanceDTC_1)
print(similarity_feature_importanceDTC_2)

Order of importance of the features:
DT trained with whole Trainig set :  ['Int_dv7', 'F0', 'Int_dd4', 'freq', 'prob', 'd0', 'Int_dv4', 'ampl', 'KInt', 'd_ms', 'Int_dd7', 'Int_dv2', 'Int_dd5', 'Int_dv3', 'Int_dv1', 'Int_dv6', 'Int_dd6', 'Int_dd2', 'Int_dd3', 'Int_dv5', 'Fresponse', 'duration', 'v0']
DT trained with subset 1:  ['Int_dv7', 'freq', 'Int_dd4', 'F0', 'prob', 'KInt', 'ampl', 'd0', 'Int_dv4', 'd_ms', 'Int_dd7', 'Int_dv5', 'Int_dd5', 'Int_dv1', 'Int_dv2', 'Int_dv6', 'Int_dv3', 'Int_dd3', 'Fresponse', 'duration', 'Int_dd6', 'Int_dd2', 'v0']
DT trained with subset 2:  ['Int_dv7', 'F0', 'Int_dd4', 'freq', 'KInt', 'ampl', 'Int_dv4', 'prob', 'd0', 'd_ms', 'Int_dd7', 'Int_dv1', 'Int_dd5', 'Int_dv3', 'Int_dd3', 'Int_dd2', 'Int_dd6', 'Fresponse', 'Int_dv2', 'v0', 'Int_dv5', 'duration', 'Int_dv6']
1.7391304347826086
1.826086956521739


Feature-importance ordering for the DTs trained with the training
set and with the random subsets. The number indicates the position in the importance order. For
example, the most important feature for the DT trained on the training set is Int_dv7.

In [7]:
# Rank (1 = most important) of every feature under each of the three trees,
# with rows in the original column order. dtype=object keeps the column dtype
# of a frame that is created empty and then filled cell by cell.
importances_dt = pd.DataFrame(
    {
        'Training set': [sorted_featuresDTC.index(name) + 1 for name in feature_names],
        'Subset 1': [sorted_featuresDT1.index(name) + 1 for name in feature_names],
        'Subset 2': [sorted_featuresDT2.index(name) + 1 for name in feature_names],
    },
    index=feature_names,
    dtype=object,
)
importances_dt

Unnamed: 0,Training set,Subset 1,Subset 2
F0,2,4,2
d_ms,10,10,10
d0,6,8,9
v0,23,23,20
prob,5,5,8
Int_dv1,15,14,12
Int_dv2,12,15,19
Int_dv3,14,17,14
Int_dv4,7,9,7
Int_dv5,20,12,21


# 2 case: XGBoost

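We create one gradient boosting classifier per set (scikit-learn's GradientBoostingClassifier, referred to as XGBoost in this notebook), all with the same parameters, train each on its corresponding set, and evaluate its accuracy and macro F1-score on the test set.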

In [8]:
# NOTE: despite the "xgboost" variable names, these models are scikit-learn
# GradientBoostingClassifier instances. Each one uses numero+1 estimators.
numero=24
xgboostC = GradientBoostingClassifier(n_estimators = numero+1,max_depth= 10,random_state=1)
xgboost1 = GradientBoostingClassifier(n_estimators = numero+1,max_depth= 10,random_state=1)
xgboost2 = GradientBoostingClassifier(n_estimators = numero+1,max_depth= 10,random_state=1)
xgboostC.fit(X_train, y_train)
xgboost1.fit(X1, y1)
xgboost2.fit(X2, y2)

# Every model is evaluated on the same held-out test set.
predictedXC = xgboostC.predict(X_test)
predictedX1 = xgboost1.predict(X_test)
predictedX2 = xgboost2.predict(X_test)

cl_repXC = classification_report(y_test, predictedXC, output_dict=True, zero_division = 0)
cl_repX1 = classification_report(y_test, predictedX1, output_dict=True, zero_division = 0)
cl_repX2 = classification_report(y_test, predictedX2, output_dict=True, zero_division = 0)

print(f'Tree train with whole training set: Accuracy: {cl_repXC["accuracy"]}, F1-S: {cl_repXC["macro avg"]["f1-score"]}')
print(f'Tree train with subset 1: Accuracy: {cl_repX1["accuracy"]}, F1-S: {cl_repX1["macro avg"]["f1-score"]}')
# Fixed: this line reports subset 2 (it was labelled "subset 1" and lacked "Accuracy:").
print(f'Tree train with subset 2: Accuracy: {cl_repX2["accuracy"]}, F1-S: {cl_repX2["macro avg"]["f1-score"]}')

Tree train with whole training set: Accuracy: 0.9118009178077081, F1-S: 0.9027349285578894
Tree train with subset 1: Accuracy: 0.8821773682050517, F1-S: 0.8689580586517132
Tree train with subset 1: 0.8764690519717943, F1-S: 0.8634504475031339


Compute the feature-importance vectors and the similarity between the resulting feature orderings.

In [9]:
# Feature-importance vectors of the three fitted gradient boosting models.
importancesXGC = xgboostC.feature_importances_
importancesXG1 = xgboost1.feature_importances_
importancesXG2 = xgboost2.feature_importances_
feature_names=df.columns

# Feature indices ordered from most to least important.
sorted_indexXGC = np.argsort(importancesXGC)[::-1]
sorted_indexXG1 = np.argsort(importancesXG1)[::-1]
sorted_indexXG2 = np.argsort(importancesXG2)[::-1]

# The same orderings expressed as feature names.
sorted_featuresXGC = [feature_names[i] for i in sorted_indexXGC]
sorted_featuresXG1 = [feature_names[i] for i in sorted_indexXG1]
sorted_featuresXG2 = [feature_names[i] for i in sorted_indexXG2]

# Labels fixed: these are the gradient boosting models, not the decision trees.
print("Order of importance of the features:")
print("Gradient boosting trained with whole Training set : ", sorted_featuresXGC)
print("Gradient boosting trained with subset 1: ", sorted_featuresXG1)
print("Gradient boosting trained with subset 2: ", sorted_featuresXG2)

# Compare the ordering of each subset model against the full-training-set model.
similarity_feature_importanceXGC_1 = compute_similarity_importanceFeatures(sorted_featuresXGC, sorted_featuresXG1)
similarity_feature_importanceXGC_2 = compute_similarity_importanceFeatures(sorted_featuresXGC, sorted_featuresXG2)
print(similarity_feature_importanceXGC_1)
print(similarity_feature_importanceXGC_2)

Order of importance of the features:
DT trained with whole Trainig set :  ['Int_dv7', 'F0', 'freq', 'Int_dd4', 'd0', 'prob', 'ampl', 'KInt', 'Int_dv4', 'd_ms', 'Int_dd7', 'Int_dd5', 'Int_dv1', 'Int_dv3', 'Int_dv6', 'Int_dv2', 'Int_dd6', 'Int_dv5', 'Int_dd2', 'Int_dd3', 'duration', 'Fresponse', 'v0']
DT trained with subset 1:  ['Int_dv7', 'freq', 'F0', 'Int_dd4', 'prob', 'KInt', 'd0', 'ampl', 'Int_dv4', 'd_ms', 'Int_dd7', 'Int_dd5', 'Int_dv1', 'Int_dv3', 'Int_dv2', 'Int_dv5', 'Int_dv6', 'Int_dd6', 'Int_dd2', 'Int_dd3', 'Fresponse', 'duration', 'v0']
DT trained with subset 2:  ['Int_dv7', 'F0', 'freq', 'Int_dd4', 'KInt', 'ampl', 'prob', 'Int_dv4', 'd0', 'd_ms', 'Int_dd7', 'Int_dv1', 'Int_dv3', 'Int_dd2', 'Int_dd5', 'Int_dd3', 'Fresponse', 'Int_dd6', 'Int_dv5', 'Int_dv6', 'Int_dv2', 'duration', 'v0']
0.6956521739130435
1.826086956521739


Feature-importance ordering for XGBoost trained with the training
set and with the random subsets. The number indicates the position in the importance order. For
example, the most important feature for XGBoost trained on the training set is Int_dv7.

In [10]:
# Rank (1 = most important) of every feature under each of the three gradient
# boosting models, with rows in the original column order. dtype=object keeps
# the column dtype of a frame that is created empty and then filled cell by cell.
importances_xg = pd.DataFrame(
    {
        'Training set': [sorted_featuresXGC.index(name) + 1 for name in feature_names],
        'Subset 1': [sorted_featuresXG1.index(name) + 1 for name in feature_names],
        'Subset 2': [sorted_featuresXG2.index(name) + 1 for name in feature_names],
    },
    index=feature_names,
    dtype=object,
)
importances_xg

Unnamed: 0,Training set,Subset 1,Subset 2
F0,2,3,2
d_ms,10,10,10
d0,5,7,9
v0,23,23,23
prob,6,5,7
Int_dv1,13,13,12
Int_dv2,16,15,21
Int_dv3,14,14,13
Int_dv4,9,9,8
Int_dv5,18,16,19
