In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from scipy.spatial import cKDTree
import random
import warnings
from scipy.stats import spearmanr
warnings.filterwarnings("ignore")


def resultado_correlacion(variable1,variable2):
    """Print the Spearman rank correlation of two paired samples and its verdict.

    Parameters
    ----------
    variable1, variable2 : array-like
        Paired observations, forwarded unchanged to ``scipy.stats.spearmanr``.

    Returns
    -------
    None
        The coefficient, the p-value and a one-line interpretation are
        written to standard output. The "significant" message is wrapped in
        ANSI green escape codes.
    """
    rho, p = spearmanr(variable1, variable2)

    # Report the raw statistics first.
    print(f"Coeficiente de correlación de Spearman: {rho}")
    print(f"P-valor: {p}")

    # Anything that is not strictly below 0.05 (including a NaN p-value)
    # is reported as "not enough evidence".
    if not p < 0.05:
        print("No hay evidencia suficiente para rechazar la hipótesis nula de no correlación.")
        return
    print("\033[92mLa correlación es estadísticamente significativa.\033[0m")

def find_epsilon(X,y,X_res,y_res):
    """Return the largest same-class nearest-neighbour distance from X to X_res.

    For every class label found in ``y``, each sample of that class in ``X``
    is matched to its nearest sample of the same class in ``X_res`` under the
    L-infinity (Chebyshev) norm. The result is the maximum of those
    nearest-neighbour distances over all samples and classes.

    Parameters
    ----------
    X : array-like, 2-D
        Reference samples, one row per sample.
    y : array-like, 1-D
        Class label of each row of ``X``.
    X_res : array-like, 2-D
        Reduced sample set, with the same number of columns as ``X``.
    y_res : array-like, 1-D
        Class label of each row of ``X_res``.

    Returns
    -------
    number
        The maximum distance found, or ``0`` when no class of ``y`` has any
        sample in ``X_res``.

    Notes
    -----
    A class of ``y`` with no sample in ``X_res`` is skipped; it does not
    contribute to the result (it is *not* treated as infinitely far away).
    """
    # Accept lists / pandas objects as well as ndarrays.
    X, y = np.asarray(X), np.asarray(y)
    X_res, y_res = np.asarray(X_res), np.asarray(y_res)

    epsilon = 0
    for cl in np.unique(y):
        A = X_res[y_res == cl]
        if A.shape[0] == 0:
            # No representative of this class in the reduced set: skip it.
            continue
        B = X[y == cl]
        nearest, _ = cKDTree(A).query(B, p=np.inf)
        # Vectorised maximum: the built-in max() would walk the distance
        # array one element at a time in Python.
        epsilon = max(epsilon, nearest.max())
    return epsilon

def reduce(X,y,perc,random_state=None):
    """Draw a random subsample of ``(X, y)`` and return it.

    Parameters
    ----------
    X : array-like
        Samples, one row per sample.
    y : array-like
        Labels aligned with the rows of ``X``.
    perc : float
        Passed to ``train_test_split`` as ``train_size``; the callers in this
        notebook pass a fraction of the rows to keep.
    random_state : int, RandomState instance or None, default=None
        Forwarded to ``train_test_split``. Pass an int to obtain the same
        subsample on every call; ``None`` keeps the previous behaviour of an
        unseeded shuffle.

    Returns
    -------
    X_red, y_red
        The selected samples and their labels. The complementary part of the
        split is discarded.
    """
    X_red, _, y_red, _ = train_test_split(
        X, y, train_size=perc, shuffle=True, random_state=random_state
    )
    return X_red, y_red

def compute_similarity_importanceFeatures(importance1, importance2):
    """Return the mean rank displacement between two orderings of the same items.

    Every item of ``importance2`` is located in ``importance1`` (first
    occurrence), and the absolute difference between its two positions is
    averaged over all items.

    Despite the function name, the value behaves as a *distance*: ``0`` means
    both orderings are identical, and larger values mean the orderings differ
    more.

    Parameters
    ----------
    importance1 : sequence
        Reference ordering (e.g. feature names sorted by importance).
    importance2 : sequence
        Ordering to compare; must contain the same items as ``importance1``.
        Any sized iterable is accepted (list, tuple, ndarray, pandas Index).

    Returns
    -------
    float
        Average absolute position difference; ``0.0`` for empty inputs.

    Raises
    ------
    ValueError
        If the two sequences differ in length, or if an item of
        ``importance2`` does not occur in ``importance1``.
    """
    reference = list(importance1)
    candidate = list(importance2)
    if len(reference) != len(candidate):
        raise ValueError("The importance vectors must have the same length.")
    if not reference:
        # Nothing to compare; avoid dividing by zero.
        return 0.0

    # Map each item to its first position once, instead of a linear
    # list.index() search per item.
    first_position = {}
    for rank, item in enumerate(reference):
        try:
            first_position.setdefault(item, rank)
        except TypeError:
            # Unhashable item: it is resolved by the linear fallback below.
            pass

    total_distance = 0
    for rank, item in enumerate(candidate):
        try:
            reference_rank = first_position[item]
        except (KeyError, TypeError):
            # Linear fallback; raises ValueError when the item is absent.
            reference_rank = reference.index(item)
        total_distance += abs(reference_rank - rank)

    return total_distance / len(reference)

In [2]:
# Load the dataset; 'collision' is the target, 'N' and 'm' are dropped and
# every remaining column is used as a feature.
df = pd.read_excel('collision.xlsx')
y = df.pop('collision').to_numpy()
df = df.drop(columns=['N', 'm'])
feature_names = df.columns

# Min-max scale the features.
# NOTE(review): the scaler is fitted on all rows, i.e. before the
# train/test split performed in the next cell.
scaler = MinMaxScaler()
X = scaler.fit_transform(df.to_numpy())

In [3]:
# Stratified 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1, shuffle=True, stratify=y
)

# Report the shape and class balance of each split.
# The top-level pd.value_counts() is deprecated (and its warning is hidden by
# warnings.filterwarnings("ignore")), so Series.value_counts() is used; the
# printed output is the same.
print(X_train.shape)
print(y_train.shape)
print(pd.Series(y_train).value_counts())
print(X_test.shape)
print(y_test.shape)
print(pd.Series(y_test).value_counts())

(80407, 23)
(80407,)
1    52011
0    28396
Name: count, dtype: int64
(26803, 23)
(26803,)
1    17337
0     9466
Name: count, dtype: int64


In [4]:
# Experiment 1 (decision tree): for `numero` random subsamples of the training
# set, record (a) the epsilon between the full training set and the subsample
# and (b) how far the subsample's feature-importance ranking is from the
# ranking obtained on the full training set.

# Seed both RNGs so the experiment is repeatable: `random` drives the
# subsample fraction, and NumPy's global RNG is what scikit-learn uses
# whenever random_state is None (the split inside reduce() and the
# per-iteration tree below).
random.seed(0)
np.random.seed(0)

numero = 100
epsilons=[]
similitud_feature_order=[]

# Reference ranking: tree fitted on the whole training set.
treeC = DecisionTreeClassifier(max_depth= 10,random_state=0)
treeC.fit(X_train, y_train)
importancesDTC = treeC.feature_importances_
feature_names=df.columns
sorted_indexDTC = np.argsort(importancesDTC)[::-1]  # most important first
sorted_featuresDTC = [feature_names[idx] for idx in sorted_indexDTC]

for i in range(numero):
    # Keep a random fraction between 5% and 70% of the training rows.
    numero_aleatorio = random.uniform(0.05, 0.7)
    X1,y1 = reduce(X_train,y_train,numero_aleatorio)
    epsilon = find_epsilon(X_train,y_train,X1,y1)
    epsilons.append(epsilon)
    print(f"Iter {i} Epsilon between Train set and Subset 1: {epsilon}")

    # Ranking obtained from the subsample only.
    tree = DecisionTreeClassifier(max_depth= 10)
    tree.fit(X1, y1)
    importancesDT = tree.feature_importances_
    sorted_indexDT = np.argsort(importancesDT)[::-1]
    # `idx` (not `i`) so the loop counter is not shadowed.
    sorted_featuresDT = [feature_names[idx] for idx in sorted_indexDT]
    similarity_feature_importanceDTC_x = compute_similarity_importanceFeatures(sorted_featuresDTC, sorted_featuresDT)
    similitud_feature_order.append(similarity_feature_importanceDTC_x)

Iter 0 Epsilon between Train set and Subset 1: 0.4555555555555555
Iter 1 Epsilon between Train set and Subset 1: 0.6394197529748549
Iter 2 Epsilon between Train set and Subset 1: 0.6247741302591019
Iter 3 Epsilon between Train set and Subset 1: 0.45226721943499726
Iter 4 Epsilon between Train set and Subset 1: 0.45866275862068967
Iter 5 Epsilon between Train set and Subset 1: 0.5444444444444444
Iter 6 Epsilon between Train set and Subset 1: 0.5866108828647378
Iter 7 Epsilon between Train set and Subset 1: 0.5394747548330112
Iter 8 Epsilon between Train set and Subset 1: 0.5333333333333333
Iter 9 Epsilon between Train set and Subset 1: 0.4209119322185219
Iter 10 Epsilon between Train set and Subset 1: 0.45912329280247877
Iter 11 Epsilon between Train set and Subset 1: 0.5394747548330112
Iter 12 Epsilon between Train set and Subset 1: 0.5333333333333333
Iter 13 Epsilon between Train set and Subset 1: 0.5866108828647378
Iter 14 Epsilon between Train set and Subset 1: 0.45571428571428574
I

In [5]:
# Spearman correlation between the per-iteration epsilons and the
# feature-ranking distances collected by the decision-tree loop above.
resultado_correlacion(epsilons,similitud_feature_order)


Coeficiente de correlación de Spearman: 0.5119634899212578
P-valor: 5.20219529703728e-08
[92mLa correlación es estadísticamente significativa.[0m


In [6]:
# Experiment 2 (gradient boosting): same protocol as the decision-tree
# experiment, with a GradientBoostingClassifier as the model. The number of
# iterations `numero` is the one defined in the decision-tree cell.

# Seed both RNGs so the experiment is repeatable: `random` drives the
# subsample fraction, and NumPy's global RNG is what scikit-learn uses
# whenever random_state is None (the split inside reduce() and the models
# fitted below).
random.seed(0)
np.random.seed(0)

epsilons=[]
similitud_feature_order=[]

# Reference ranking: model fitted on the whole training set.
treeC = GradientBoostingClassifier(n_estimators = 25,max_depth= 10)
treeC.fit(X_train, y_train)
importancesDTC = treeC.feature_importances_
feature_names=df.columns
sorted_indexDTC = np.argsort(importancesDTC)[::-1]  # most important first
sorted_featuresDTC = [feature_names[idx] for idx in sorted_indexDTC]

for i in range(numero):
    # Keep a random fraction between 5% and 70% of the training rows.
    numero_aleatorio = random.uniform(0.05, 0.7)
    X1,y1 = reduce(X_train,y_train,numero_aleatorio)
    epsilon = find_epsilon(X_train,y_train,X1,y1)
    epsilons.append(epsilon)
    print(f"Iter {i} Epsilon between Train set and Subset 1: {epsilon}")

    # Ranking obtained from the subsample only.
    tree = GradientBoostingClassifier(n_estimators = 25,max_depth= 10)
    tree.fit(X1, y1)
    importancesDT = tree.feature_importances_
    sorted_indexDT = np.argsort(importancesDT)[::-1]
    # `idx` (not `i`) so the loop counter is not shadowed.
    sorted_featuresDT = [feature_names[idx] for idx in sorted_indexDT]
    similarity_feature_importanceDTC_x = compute_similarity_importanceFeatures(sorted_featuresDTC, sorted_featuresDT)
    similitud_feature_order.append(similarity_feature_importanceDTC_x)

Iter 0 Epsilon between Train set and Subset 1: 0.5558589707667974
Iter 1 Epsilon between Train set and Subset 1: 0.4555555555555555
Iter 2 Epsilon between Train set and Subset 1: 0.527402931418001
Iter 3 Epsilon between Train set and Subset 1: 0.45789799796190545
Iter 4 Epsilon between Train set and Subset 1: 0.5558589707667974
Iter 5 Epsilon between Train set and Subset 1: 0.4555555555555555
Iter 6 Epsilon between Train set and Subset 1: 0.6487142857142858
Iter 7 Epsilon between Train set and Subset 1: 0.43293551724137935
Iter 8 Epsilon between Train set and Subset 1: 0.4441820015373684
Iter 9 Epsilon between Train set and Subset 1: 0.4666666666666667
Iter 10 Epsilon between Train set and Subset 1: 0.5820724892104867
Iter 11 Epsilon between Train set and Subset 1: 0.5333333333333333
Iter 12 Epsilon between Train set and Subset 1: 0.5333333333333333
Iter 13 Epsilon between Train set and Subset 1: 0.49234441664275475
Iter 14 Epsilon between Train set and Subset 1: 0.5495995302926728
Ite

In [7]:
# Spearman correlation between the per-iteration epsilons and the
# feature-ranking distances collected by the gradient-boosting loop above.
resultado_correlacion(epsilons,similitud_feature_order)

Coeficiente de correlación de Spearman: 0.672669210392699
P-valor: 1.793736584993926e-14
[92mLa correlación es estadísticamente significativa.[0m
