# **IMPORTS:**

In [None]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
!pip install dalex -U

In [None]:
!pip install lime

In [None]:
#imports necesarios
import pandas as pd
import numpy as np
from scipy.stats import ttest_rel
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import permutation_importance
import statsmodels.api as sm
import dalex as dx
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from imblearn.ensemble import BalancedRandomForestClassifier, RUSBoostClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix, ConfusionMatrixDisplay, roc_curve, roc_auc_score, RocCurveDisplay

# **Dataset preprocesado:**

In [None]:
# Load the dataset: features and labels live in two separate CSV files on
# Google Drive (paths are the Colab mount point).
data=pd.read_csv("/content/drive/MyDrive/TFM/Merge/python_data.csv")
label=pd.read_csv("/content/drive/MyDrive/TFM/Merge/python_label.csv")
# Disabled alternative that reads the same two files from a local Windows path:
#file_path_data = r'D:\Descargas\merge conflict\python_data.csv'
#file_path_label = r'D:\Descargas\merge conflict\python_label.csv'
#data = pd.read_csv(file_path_data)
#label= pd.read_csv(file_path_label)

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
data.head() #comprobar el dataframe

In [None]:
label.head()

In [None]:
# Look for missing values (NaN) and for '?' placeholders, first in the
# features and then in the labels.
datos_con_nulos = data.isnull().values.any()
print("¿Existen valores nulos en los datos?:", datos_con_nulos)
datos_con_interrogante = (data == '?').values.any()
print("¿Existen valores  '?'  en los datos?:", datos_con_interrogante)
etiquetas_con_nulos = label.isnull().values.any()
print("¿Existen valores nulos en las etiquetas?:", etiquetas_con_nulos)
etiquetas_con_interrogante = (label == '?').values.any()
print("¿Existen valores  '?'  en las etiquetas?:", etiquetas_con_interrogante)

In [None]:
# Report the dimensions of both frames.
for tabla in (data, label):
    print(tabla.shape)

# Column names of the feature frame (the dependent variable lives in `label`).
feature_names = data.columns.tolist()
print(feature_names)
print(label.columns.tolist())

In [None]:
# Calcular la frecuencia de cada clase
num_clases = label['is_conflict'].value_counts()
print(num_clases)

In [None]:
# Combinamos los datos con las etiquetas en un solo df
df = pd.concat([data, label], axis=1)

In [None]:
df.head()

In [None]:
df.info() #comprobar el tipo de variables y su cantidad. Parece que no hay ninguna variable objeto

In [None]:
df.describe() #obtener algunas estadísitcas de cada atributo

In [None]:
# file_renamed and file_copied seem to hold a single value; confirm it by
# listing the distinct values of each column.
val_renamed = df['file_renamed'].value_counts()
val_copied = df['file_copied'].value_counts()
print("Para file_renamed:\n", val_renamed)
print("Para file_copied:\n", val_copied)

# A column is constant when its standard deviation is zero. The names are
# taken from the index of the std Series itself, so the result stays correct
# even if df contains non-numeric columns (a mask built from describe() only
# lines up with df.columns when every column is numeric).
desviaciones = df.std(numeric_only=True)
columnas_constantes = desviaciones[desviaciones == 0].index.tolist()
columnas_constantes

In [None]:
# Both features are constant, so they are removed from the frame.
df = df.drop(columns=['file_renamed', 'file_copied'])

In [None]:
# Actualizamos los nombres de las caracteristicas
feature_names=list(df)
feature_names.remove('is_conflict')
print(feature_names)

In [None]:
correlation_matrix=df.corr()
print(correlation_matrix) #coeficientes de correlación

In [None]:
# Draw the correlation matrix as a heatmap on a 12x12 inch figure.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(
    correlation_matrix,
    annot=False,
    cmap='coolwarm',
    fmt=".2f",
    annot_kws={"size": 7},
    linewidths=0.5,
    ax=ax,
)
ax.set_title("Matriz de correlación")
plt.show()

In [None]:
# Attribute pairs whose absolute correlation exceeds the threshold.
umbral_correlacion = 0.80
columnas = correlation_matrix.columns
coeficientes = correlation_matrix.to_numpy()

# Visit each unordered pair exactly once by walking the strict upper
# triangle (row-major order, i.e. the same order as a nested i < j loop).
# Reading from the NumPy array avoids one .iloc lookup per cell.
filas, cols = np.triu_indices(len(columnas), k=1)
high_correlation_pairs = [
    (columnas[i], columnas[j])
    for i, j in zip(filas, cols)
    if abs(coeficientes[i, j]) > umbral_correlacion
]

# Report every highly correlated pair.
for pair in high_correlation_pairs:
    print("Correlación alta entre: ", pair[0], "y", pair[1])

# Flatten the pairs into a list of attributes, dropping duplicates while
# keeping first-seen order (dict keys preserve insertion order).
atributos = list(dict.fromkeys(nombre for par in high_correlation_pairs for nombre in par))

print("Lista de atributos con correlación alta: ", atributos)

**VALIDACIÓN CRUZADA PARA COMPROBAR EL EFECTO DE LOS PARÁMETROS:**

In [None]:
# Baseline (untuned) classifiers to evaluate; all share the same seed.
semilla = 42
rf = RandomForestClassifier(random_state=semilla)
brf = BalancedRandomForestClassifier(random_state=semilla)
gb = GradientBoostingClassifier(random_state=semilla)
ada = AdaBoostClassifier(random_state=semilla)
rus = RUSBoostClassifier(random_state=semilla)
modelos = [rf, brf, gb, ada, rus]

In [None]:
# Ablation study: for every highly correlated attribute, compare the 10-fold
# CV accuracy of each base model with and without that attribute, and run a
# paired t-test on the per-fold scores.

# The data and the split do not depend on the attribute being tested, so
# they are prepared once instead of on every iteration.
X = df[feature_names]
Y = df['is_conflict']
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.7, test_size=0.3, random_state=42, stratify=Y)

# The "with attribute" scores only depend on the model: the estimators are
# seeded and cv=10 uses fixed folds, so they are computed once per model
# (on first use) and reused for every attribute.
cv_scores_base = {}

for i in atributos:
    print("\n##########################################################################################################")
    print("\nPARA EL ATRIBUTO: ", i)

    # Training set without the attribute under study.
    x_train_sin = x_train.drop(i, axis=1)

    for pos, j in enumerate(modelos):
        # With the attribute (cached baseline).
        if pos not in cv_scores_base:
            cv_scores_base[pos] = cross_val_score(j, x_train, y_train, cv=10, scoring='accuracy')
        cv_scores_con = cv_scores_base[pos]

        # Without the attribute.
        cv_scores_sin = cross_val_score(j, x_train_sin, y_train, cv=10, scoring='accuracy')

        # Paired t-test over the fold scores.
        # NOTE(review): yields nan when both score vectors are identical.
        t_statistic, p_value = ttest_rel(cv_scores_con, cv_scores_sin)

        media_con = np.mean(cv_scores_con)
        media_sin = np.mean(cv_scores_sin)

        # Show the results.
        print("-------------------------------------------------------------------------------------------------------------")
        print("El modelo: ", j)
        print("Valor de validación cruzada (con atributo):", cv_scores_con)
        print("Valor de validación cruzada (sin atributo):", cv_scores_sin)
        print("Valor medio con atributo es: ", media_con, "\nEl valor medio sin atributo es: ", media_sin)
        if media_con > media_sin:
            print("El atributo mejora el desempeño del modelo, con una diferencia de: ", media_con - media_sin)
        else:
            print("El atributo no mejora el desempeño del modelo, con una diferencia de: ", media_con - media_sin)

        print("Estadístico t:", t_statistic)
        print("p-valor:", p_value)

In [None]:
# Remove the attribute that is too correlated with, and less relevant than,
# its pair.
# NOTE(review): list.remove raises ValueError if this cell is re-run after
# 'messages_mean' has already been removed.
feature_names.remove('messages_mean')
print(feature_names)

In [None]:
from google.colab.data_table import DataTable
DataTable.max_columns = 30

df_display=df.drop("messages_mean", axis=1)
df_display.describe()

In [None]:
# Tomar los valores de la variable dependiente (y) y las independientes (x)
X=df[feature_names]
Y=df['is_conflict']

In [None]:
#dividir en conjunto de entrenamiento y test 70%-30%
x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.7, test_size=0.3, random_state=42, stratify=Y)

In [None]:
# Verify that the stratified split kept the class proportions: absolute and
# relative class frequencies for the train and test targets, side by side.
num_train = y_train.value_counts()
porc_train = y_train.value_counts(normalize=True)
num_test = y_test.value_counts()
porc_test = y_test.value_counts(normalize=True)
comparacion = pd.DataFrame(
    {
        '% Train': porc_train,
        'Num Train': num_train,
        '% Test': porc_test,
        'Num Test': num_test,
    }
)
print(comparacion)

# **Algoritmos:**

### **RandomForest:**

In [None]:
# Exhaustive hyper-parameter search for the class-weighted random forest,
# scored by F1 with 10-fold CV on the training split.
modelo = RandomForestClassifier(random_state=42, class_weight='balanced')
params = {
    'n_estimators': [10, 25, 50, 75, 100, 200],
    'min_samples_split': [2, 3, 5, 10, 20],
    'min_samples_leaf': [1, 3, 5, 10, 20],
    'max_depth': [1, 3, 5, 10, 20, 30],
}
# 900 candidates x 10 folds = 9000 independent fits: n_jobs=-1 spreads them
# over all available cores without changing the scores.
grid = GridSearchCV(modelo, params, cv=10, scoring='f1', verbose=1, n_jobs=-1)
grid.fit(x_train, y_train)

In [None]:
print(grid.best_params_)
print(grid.best_estimator_)
print(grid.best_score_)

In [None]:
rf = RandomForestClassifier(class_weight='balanced', max_depth=30, min_samples_leaf=5, random_state=42)

In [None]:
modelo_rf=rf.fit(x_train,y_train)

In [None]:
y_pred_rf = modelo_rf.predict(x_test)

**EVALUACIÓN MODELO:**

In [None]:
#comprobación resultados
df_pred = pd.DataFrame({'Actual': y_test.squeeze(), 'Predicted': y_pred_rf.squeeze()})
print(df_pred)

In [None]:
# Matriz de confusion
cm_rf = confusion_matrix(y_test, y_pred_rf)
cm_display = ConfusionMatrixDisplay(cm_rf).plot()

In [None]:
# Hold-out metrics of the random forest on the test split.
accuracy, recall, precision, f1 = (
    metrica(y_test, y_pred_rf)
    for metrica in (accuracy_score, recall_score, precision_score, f1_score)
)
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1-score:", f1)

In [None]:
modelo_rf.score(x_test, y_test)

In [None]:
modelo_rf.score(x_train, y_train)

In [None]:
# AUROC
y_score = modelo_rf.predict_proba(x_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=modelo_rf.classes_[1])
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()

In [None]:
auc=roc_auc_score(y_test, y_score)
auc

**CLASES INVERTIDAS**

In [None]:
# Flip true and predicted labels with 1 - value (assumes a 0/1 encoding) and
# pair them in a frame; the flipped truth gets a fresh 0..n-1 index.
y_test_inv = pd.Series(1 - y_test.values)
y_pred_rf_inv = 1 - y_pred_rf
df_pred_inv = pd.DataFrame(
    {
        'Actual': y_test_inv.squeeze(),
        'Predicted': y_pred_rf_inv.squeeze(),
    }
)

In [None]:
df_pred.head(20)

In [None]:
df_pred_inv.head(20)

In [None]:
# Random forest metrics recomputed on the flipped labels.
accuracy_inv, recall_inv, precision_inv, f1_inv = (
    metrica(y_test_inv, y_pred_rf_inv)
    for metrica in (accuracy_score, recall_score, precision_score, f1_score)
)
print("Accuracy:", accuracy_inv)
print("Recall:", recall_inv)
print("Precision:", precision_inv)
print("F1-score:", f1_inv)

In [None]:
# Start the inverted-class summary table with the random forest row.
score_inv_result = pd.DataFrame(
    [
        {
            "Modelo": "RandomForest",
            "Accuracy": accuracy_inv,
            "Precision": precision_inv,
            "Recall": recall_inv,
            "F1-score": f1_inv,
        }
    ]
)

**VALIDACIÓN CRUZADA**

In [None]:
# Manual stratified k-fold cross-validation of the tuned random forest on
# the full dataset, tracking the metrics most common in the literature.
n_folds = 10
skf = StratifiedKFold(n_splits=n_folds)

# One list of per-fold scores for each metric.
acc_sc, prec_sc, rcll_sc, f1_sc = [], [], [], []

for train_index, test_index in skf.split(X, Y):
    # Partition this fold into training and validation data.
    X_train, Y_train = X.iloc[train_index], Y.iloc[train_index]
    X_test, Y_test = X.iloc[test_index], Y.iloc[test_index]

    # Train a fresh model on the fold and predict its validation part.
    cv_rf = RandomForestClassifier(class_weight='balanced', max_depth=30, min_samples_leaf=5, random_state=42)
    Y_pred = cv_rf.fit(X_train, Y_train).predict(X_test)

    # Score the fold and record each metric.
    accuracy = accuracy_score(Y_test, Y_pred)
    acc_sc.append(accuracy)
    recall = recall_score(Y_test, Y_pred)
    rcll_sc.append(recall)
    precision = precision_score(Y_test, Y_pred)
    prec_sc.append(precision)
    f1 = f1_score(Y_test, Y_pred)
    f1_sc.append(f1)

# Average every metric over the folds.
mean_acc, mean_prec, mean_rcll, mean_f1 = (
    np.mean(puntuaciones) for puntuaciones in (acc_sc, prec_sc, rcll_sc, f1_sc)
)

print(f'CV Accuracy: {mean_acc:.4f}')
print(f'CV Precision: {mean_prec:.4f}')
print(f'CV Recall: {mean_rcll:.4f}')
print(f'CV F1-Score: {mean_f1:.4f}')

In [None]:
cv_results = cross_validate(rf, X, Y, cv=10, scoring=('accuracy','precision', 'recall','f1'))

In [None]:
sorted(cv_results.keys())

In [None]:
print("Accuracy:\n", cv_results['test_accuracy'], "\n Con una media de: ", np.mean(cv_results['test_accuracy']))
print("Precision:\n" , cv_results['test_precision'], "\n Con una media de: ", np.mean(cv_results['test_precision']))
print("Recall:\n", cv_results['test_recall'], "\n Con una media de: ", np.mean(cv_results['test_recall']))
print("F1-score:\n", cv_results['test_f1'], "\n Con una media de: ", np.mean(cv_results['test_f1']))

In [None]:
# Start the cross-validation summary table with the random forest row
# (mean of each metric over the folds).
score_result = pd.DataFrame(
    [
        {
            "Modelo": "RandomForest",
            "Accuracy": np.mean(cv_results['test_accuracy']),
            "Precision": np.mean(cv_results['test_precision']),
            "Recall": np.mean(cv_results['test_recall']),
            "F1-score": np.mean(cv_results['test_f1']),
        }
    ]
)

### **BalancedRandomForest:**

In [None]:
# Exhaustive hyper-parameter search for the balanced random forest, scored
# by F1 with 10-fold CV on the training split.
modelo = BalancedRandomForestClassifier(random_state=42)
params = {
    'n_estimators': [10, 25, 50, 75, 100, 200],
    'min_samples_split': [2, 3, 5, 10, 20],
    'min_samples_leaf': [1, 3, 5, 10, 20],
    'max_depth': [1, 3, 5, 10, 20, 30],
    'class_weight': [None, 'balanced'],
}
# 1800 candidates x 10 folds of independent fits: n_jobs=-1 spreads them
# over all available cores without changing the scores.
grid = GridSearchCV(modelo, params, cv=10, scoring='f1', verbose=1, n_jobs=-1)
grid.fit(x_train, y_train)

In [None]:
print(grid.best_params_)
print(grid.best_estimator_)
print(grid.best_score_)

In [None]:
brf = BalancedRandomForestClassifier(class_weight='balanced', n_estimators=10,random_state=42)

In [None]:
modelo_brf=brf.fit(x_train,y_train)

In [None]:
y_pred_brf = modelo_brf.predict(x_test)

**EVALUACIÓN MODELO:**

In [None]:
#comprobación resultados
df_pred = pd.DataFrame({'Actual': y_test.squeeze(), 'Predicted': y_pred_brf.squeeze()})
print(df_pred)

In [None]:
# Matriz de confusion
cm_brf = confusion_matrix(y_test, y_pred_brf)
cm_display = ConfusionMatrixDisplay(cm_brf).plot()

In [None]:
# Hold-out metrics of the balanced random forest on the test split.
accuracy, recall, precision, f1 = (
    metrica(y_test, y_pred_brf)
    for metrica in (accuracy_score, recall_score, precision_score, f1_score)
)
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1-score:", f1)

In [None]:
modelo_brf.score(x_test, y_test)

In [None]:
modelo_brf.score(x_train, y_train)

In [None]:
# AUROC
y_score = modelo_brf.predict_proba(x_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=modelo_brf.classes_[1])
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()

In [None]:
auc=roc_auc_score(y_test, y_score)
auc

**CLASES INVERTIDAS**

In [None]:
# Flip true and predicted labels with 1 - value (assumes a 0/1 encoding) and
# pair them in a frame; the flipped truth gets a fresh 0..n-1 index.
y_test_inv = pd.Series(1 - y_test.values)
y_pred_brf_inv = 1 - y_pred_brf
df_pred_inv = pd.DataFrame(
    {
        'Actual': y_test_inv.squeeze(),
        'Predicted': y_pred_brf_inv.squeeze(),
    }
)

In [None]:
df_pred.head(20)

In [None]:
df_pred_inv.head(20)

In [None]:
# Balanced random forest metrics recomputed on the flipped labels.
accuracy_inv, recall_inv, precision_inv, f1_inv = (
    metrica(y_test_inv, y_pred_brf_inv)
    for metrica in (accuracy_score, recall_score, precision_score, f1_score)
)
print("Accuracy:", accuracy_inv)
print("Recall:", recall_inv)
print("Precision:", precision_inv)
print("F1-score:", f1_inv)

In [None]:
# Add the balanced random forest row to the inverted-class summary table.
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# produces the same result.
fila_brf_inv = pd.DataFrame([{"Modelo":"BalancedRF","Accuracy":accuracy_inv, "Precision":precision_inv, "Recall":recall_inv, "F1-score":f1_inv}])
score_inv_result = pd.concat([score_inv_result, fila_brf_inv], ignore_index=True)

**VALIDACIÓN CRUZADA**

In [None]:
# Manual stratified k-fold cross-validation of the tuned balanced random
# forest on the full dataset, tracking the metrics most common in the
# literature.
n_folds = 10
skf = StratifiedKFold(n_splits=n_folds)

# One list of per-fold scores for each metric.
acc_sc, prec_sc, rcll_sc, f1_sc = [], [], [], []

for train_index, test_index in skf.split(X, Y):
    # Partition this fold into training and validation data.
    X_train, Y_train = X.iloc[train_index], Y.iloc[train_index]
    X_test, Y_test = X.iloc[test_index], Y.iloc[test_index]

    # Train a fresh model on the fold and predict its validation part.
    cv_brf = BalancedRandomForestClassifier(class_weight='balanced', n_estimators=10,random_state=42)
    Y_pred = cv_brf.fit(X_train, Y_train).predict(X_test)

    # Score the fold and record each metric.
    accuracy = accuracy_score(Y_test, Y_pred)
    acc_sc.append(accuracy)
    recall = recall_score(Y_test, Y_pred)
    rcll_sc.append(recall)
    precision = precision_score(Y_test, Y_pred)
    prec_sc.append(precision)
    f1 = f1_score(Y_test, Y_pred)
    f1_sc.append(f1)

# Average every metric over the folds.
mean_acc, mean_prec, mean_rcll, mean_f1 = (
    np.mean(puntuaciones) for puntuaciones in (acc_sc, prec_sc, rcll_sc, f1_sc)
)

print(f'CV Accuracy: {mean_acc:.4f}')
print(f'CV Precision: {mean_prec:.4f}')
print(f'CV Recall: {mean_rcll:.4f}')
print(f'CV F1-Score: {mean_f1:.4f}')

In [None]:
cv_results = cross_validate(brf, X, Y, cv=10, scoring=('accuracy','precision', 'recall','f1'))

In [None]:
sorted(cv_results.keys())

In [None]:
print("Accuracy:\n", cv_results['test_accuracy'], "\n Con una media de: ", np.mean(cv_results['test_accuracy']))
print("Precision:\n" , cv_results['test_precision'], "\n Con una media de: ", np.mean(cv_results['test_precision']))
print("Recall:\n", cv_results['test_recall'], "\n Con una media de: ", np.mean(cv_results['test_recall']))
print("F1-score:\n", cv_results['test_f1'], "\n Con una media de: ", np.mean(cv_results['test_f1']))

In [None]:
# Add the balanced random forest row (mean CV metrics) to the summary table.
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# produces the same result.
fila_brf = pd.DataFrame([{"Modelo":"BalancedRF","Accuracy":np.mean(cv_results['test_accuracy']), "Precision":np.mean(cv_results['test_precision']), "Recall":np.mean(cv_results['test_recall']), "F1-score":np.mean(cv_results['test_f1'])}])
score_result = pd.concat([score_result, fila_brf], ignore_index=True)

### **GradientBoosting:**

In [None]:
# Exhaustive hyper-parameter search for gradient boosting, scored by F1 with
# 10-fold CV on the training split.
modelo = GradientBoostingClassifier(random_state=42)
params = {
    'n_estimators': [10, 25, 50, 75, 100, 200],
    'min_samples_split': [2, 3, 5, 10, 20],
    'min_samples_leaf': [1, 3, 5, 10, 20],
    'max_depth': [1, 3, 5, 10, 20, 30],
}
# 900 candidates x 10 folds = 9000 independent fits: n_jobs=-1 spreads them
# over all available cores without changing the scores.
grid = GridSearchCV(modelo, params, cv=10, scoring='f1', verbose=1, n_jobs=-1)
grid.fit(x_train, y_train)

In [None]:
print(grid.best_params_)
print(grid.best_estimator_)
print(grid.best_score_)

In [None]:
gb = GradientBoostingClassifier(max_depth=10, min_samples_leaf=10, n_estimators=200, random_state=42)

In [None]:
modelo_gb=gb.fit(x_train,y_train)

In [None]:
y_pred_gb = modelo_gb.predict(x_test)

**EVALUACIÓN MODELO:**

In [None]:
#comprobación resultados
df_pred = pd.DataFrame({'Actual': y_test.squeeze(), 'Predicted': y_pred_gb.squeeze()})
print(df_pred)

In [None]:
# Matriz de confusion
cm_gb = confusion_matrix(y_test, y_pred_gb)
cm_display = ConfusionMatrixDisplay(cm_gb).plot()

In [None]:
# Hold-out metrics of gradient boosting on the test split.
accuracy, recall, precision, f1 = (
    metrica(y_test, y_pred_gb)
    for metrica in (accuracy_score, recall_score, precision_score, f1_score)
)
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1-score:", f1)

In [None]:
modelo_gb.score(x_test, y_test)

In [None]:
modelo_gb.score(x_train, y_train)

In [None]:
# AUROC
y_score = modelo_gb.predict_proba(x_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=modelo_gb.classes_[1])
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()

In [None]:
auc=roc_auc_score(y_test, y_score)
auc

**CLASES INVERTIDAS**

In [None]:
# Flip true and predicted labels with 1 - value (assumes a 0/1 encoding) and
# pair them in a frame; the flipped truth gets a fresh 0..n-1 index.
y_test_inv = pd.Series(1 - y_test.values)
y_pred_gb_inv = 1 - y_pred_gb
df_pred_inv = pd.DataFrame(
    {
        'Actual': y_test_inv.squeeze(),
        'Predicted': y_pred_gb_inv.squeeze(),
    }
)

In [None]:
df_pred.head(20)

In [None]:
df_pred_inv.head(20)

In [None]:
# Gradient boosting metrics recomputed on the flipped labels.
accuracy_inv, recall_inv, precision_inv, f1_inv = (
    metrica(y_test_inv, y_pred_gb_inv)
    for metrica in (accuracy_score, recall_score, precision_score, f1_score)
)
print("Accuracy:", accuracy_inv)
print("Recall:", recall_inv)
print("Precision:", precision_inv)
print("F1-score:", f1_inv)

In [None]:
# Add the gradient boosting row to the inverted-class summary table.
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# produces the same result.
fila_gb_inv = pd.DataFrame([{"Modelo":"GradientBoost","Accuracy":accuracy_inv, "Precision":precision_inv, "Recall":recall_inv, "F1-score":f1_inv}])
score_inv_result = pd.concat([score_inv_result, fila_gb_inv], ignore_index=True)

**VALIDACIÓN CRUZADA**

In [None]:
# Manual stratified k-fold cross-validation of the tuned gradient boosting
# model on the full dataset, tracking the metrics most common in the
# literature.
n_folds = 10
skf = StratifiedKFold(n_splits=n_folds)

# One list of per-fold scores for each metric.
acc_sc, prec_sc, rcll_sc, f1_sc = [], [], [], []

for train_index, test_index in skf.split(X, Y):
    # Partition this fold into training and validation data.
    X_train, Y_train = X.iloc[train_index], Y.iloc[train_index]
    X_test, Y_test = X.iloc[test_index], Y.iloc[test_index]

    # Train a fresh model on the fold and predict its validation part.
    cv_gb = GradientBoostingClassifier(max_depth=10, min_samples_leaf=10, n_estimators=200, random_state=42)
    Y_pred = cv_gb.fit(X_train, Y_train).predict(X_test)

    # Score the fold and record each metric.
    accuracy = accuracy_score(Y_test, Y_pred)
    acc_sc.append(accuracy)
    recall = recall_score(Y_test, Y_pred)
    rcll_sc.append(recall)
    precision = precision_score(Y_test, Y_pred)
    prec_sc.append(precision)
    f1 = f1_score(Y_test, Y_pred)
    f1_sc.append(f1)

# Average every metric over the folds.
mean_acc, mean_prec, mean_rcll, mean_f1 = (
    np.mean(puntuaciones) for puntuaciones in (acc_sc, prec_sc, rcll_sc, f1_sc)
)

print(f'CV Accuracy: {mean_acc:.4f}')
print(f'CV Precision: {mean_prec:.4f}')
print(f'CV Recall: {mean_rcll:.4f}')
print(f'CV F1-Score: {mean_f1:.4f}')

In [None]:
cv_results = cross_validate(gb, X, Y, cv=10, scoring=('accuracy','precision', 'recall','f1'))

In [None]:
sorted(cv_results.keys())

In [None]:
print("Accuracy:\n", cv_results['test_accuracy'], "\n Con una media de: ", np.mean(cv_results['test_accuracy']))
print("Precision:\n" , cv_results['test_precision'], "\n Con una media de: ", np.mean(cv_results['test_precision']))
print("Recall:\n", cv_results['test_recall'], "\n Con una media de: ", np.mean(cv_results['test_recall']))
print("F1-score:\n", cv_results['test_f1'], "\n Con una media de: ", np.mean(cv_results['test_f1']))

In [None]:
# Add the gradient boosting row (mean CV metrics) to the summary table.
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# produces the same result.
fila_gb = pd.DataFrame([{"Modelo":"GradientBoosting","Accuracy":np.mean(cv_results['test_accuracy']), "Precision":np.mean(cv_results['test_precision']), "Recall":np.mean(cv_results['test_recall']), "F1-score":np.mean(cv_results['test_f1'])}])
score_result = pd.concat([score_result, fila_gb], ignore_index=True)

### **AdaBoost:**

In [None]:
# Exhaustive hyper-parameter search for AdaBoost, scored by F1 with 10-fold
# CV on the training split.
modelo = AdaBoostClassifier(random_state=42)
params = {
    'n_estimators': [5, 10, 25, 50, 75, 100, 200],
    'learning_rate': [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 3.0, 5.0],
    # NOTE(review): 'SAMME.R' is deprecated since scikit-learn 1.4; confirm
    # the installed version still accepts it.
    'algorithm': ['SAMME', 'SAMME.R'],
}
# 168 candidates x 10 folds of independent fits: n_jobs=-1 spreads them over
# all available cores without changing the scores.
grid = GridSearchCV(modelo, params, cv=10, scoring='f1', verbose=1, n_jobs=-1)
grid.fit(x_train, y_train)

In [None]:
print(grid.best_params_)
print(grid.best_estimator_)
print(grid.best_score_)

In [None]:
ada=AdaBoostClassifier(learning_rate=1.2, n_estimators=200, random_state=42)

In [None]:
modelo_ada=ada.fit(x_train,y_train)

In [None]:
y_pred_ada = modelo_ada.predict(x_test)

**EVALUACIÓN MODELO:**

In [None]:
#comprobación resultados
df_pred = pd.DataFrame({'Actual': y_test.squeeze(), 'Predicted': y_pred_ada.squeeze()})
print(df_pred)

In [None]:
# Matriz de confusion
cm_ada = confusion_matrix(y_test, y_pred_ada)
cm_display = ConfusionMatrixDisplay(cm_ada).plot()

In [None]:
# Hold-out metrics of AdaBoost on the test split.
accuracy, recall, precision, f1 = (
    metrica(y_test, y_pred_ada)
    for metrica in (accuracy_score, recall_score, precision_score, f1_score)
)
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1-score:", f1)

In [None]:
modelo_ada.score(x_test, y_test)

In [None]:
modelo_ada.score(x_train, y_train)

In [None]:
# AUROC
y_score = modelo_ada.predict_proba(x_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=modelo_ada.classes_[1])
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()

In [None]:
auc=roc_auc_score(y_test, y_score)
auc

**CLASES INVERTIDAS**

In [None]:
# Flip true and predicted labels with 1 - value (assumes a 0/1 encoding) and
# pair them in a frame; the flipped truth gets a fresh 0..n-1 index.
y_test_inv = pd.Series(1 - y_test.values)
y_pred_ada_inv = 1 - y_pred_ada
df_pred_inv = pd.DataFrame(
    {
        'Actual': y_test_inv.squeeze(),
        'Predicted': y_pred_ada_inv.squeeze(),
    }
)

In [None]:
df_pred.head(20)

In [None]:
df_pred_inv.head(20)

In [None]:
# AdaBoost metrics recomputed on the flipped labels.
accuracy_inv, recall_inv, precision_inv, f1_inv = (
    metrica(y_test_inv, y_pred_ada_inv)
    for metrica in (accuracy_score, recall_score, precision_score, f1_score)
)
print("Accuracy:", accuracy_inv)
print("Recall:", recall_inv)
print("Precision:", precision_inv)
print("F1-score:", f1_inv)

In [None]:
# Add the AdaBoost row to the inverted-class summary table.
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# produces the same result.
fila_ada_inv = pd.DataFrame([{"Modelo":"AdaBoost","Accuracy":accuracy_inv, "Precision":precision_inv, "Recall":recall_inv, "F1-score":f1_inv}])
score_inv_result = pd.concat([score_inv_result, fila_ada_inv], ignore_index=True)

**VALIDACIÓN CRUZADA**

In [None]:
# Manual stratified k-fold cross-validation of the tuned AdaBoost model on
# the full dataset, tracking the metrics most common in the literature.
n_folds = 10
skf = StratifiedKFold(n_splits=n_folds)

# One list of per-fold scores for each metric.
acc_sc, prec_sc, rcll_sc, f1_sc = [], [], [], []

for train_index, test_index in skf.split(X, Y):
    # Partition this fold into training and validation data.
    X_train, Y_train = X.iloc[train_index], Y.iloc[train_index]
    X_test, Y_test = X.iloc[test_index], Y.iloc[test_index]

    # Train a fresh model on the fold and predict its validation part.
    cv_ada = AdaBoostClassifier(learning_rate=1.2, n_estimators=200, random_state=42)
    Y_pred = cv_ada.fit(X_train, Y_train).predict(X_test)

    # Score the fold and record each metric.
    accuracy = accuracy_score(Y_test, Y_pred)
    acc_sc.append(accuracy)
    recall = recall_score(Y_test, Y_pred)
    rcll_sc.append(recall)
    precision = precision_score(Y_test, Y_pred)
    prec_sc.append(precision)
    f1 = f1_score(Y_test, Y_pred)
    f1_sc.append(f1)

# Average every metric over the folds.
mean_acc, mean_prec, mean_rcll, mean_f1 = (
    np.mean(puntuaciones) for puntuaciones in (acc_sc, prec_sc, rcll_sc, f1_sc)
)

print(f'CV Accuracy: {mean_acc:.4f}')
print(f'CV Precision: {mean_prec:.4f}')
print(f'CV Recall: {mean_rcll:.4f}')
print(f'CV F1-Score: {mean_f1:.4f}')

In [None]:
cv_results = cross_validate(ada, X, Y, cv=10, scoring=('accuracy','precision', 'recall','f1'))

In [None]:
sorted(cv_results.keys())

In [None]:
print("Accuracy:\n", cv_results['test_accuracy'], "\n Con una media de: ", np.mean(cv_results['test_accuracy']))
print("Precision:\n" , cv_results['test_precision'], "\n Con una media de: ", np.mean(cv_results['test_precision']))
print("Recall:\n", cv_results['test_recall'], "\n Con una media de: ", np.mean(cv_results['test_recall']))
print("F1-score:\n", cv_results['test_f1'], "\n Con una media de: ", np.mean(cv_results['test_f1']))

In [None]:
# Add the AdaBoost row (mean CV metrics) to the summary table.
# DataFrame.append was removed in pandas 2.0; pd.concat with a one-row frame
# produces the same result.
fila_ada = pd.DataFrame([{"Modelo":"AdaBoost","Accuracy":np.mean(cv_results['test_accuracy']), "Precision":np.mean(cv_results['test_precision']), "Recall":np.mean(cv_results['test_recall']), "F1-score":np.mean(cv_results['test_f1'])}])
score_result = pd.concat([score_result, fila_ada], ignore_index=True)

### **RUSBoost:**

In [None]:
# Exhaustive hyper-parameter search for RUSBoost, scored by F1 with 10-fold
# CV on the training split.
modelo = RUSBoostClassifier(random_state=42)
params = {
    'n_estimators': [5, 10, 25, 50, 75, 100, 200],
    'learning_rate': [0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0, 3.0, 5.0],
    # NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases;
    # confirm the installed imbalanced-learn version still accepts it.
    'algorithm': ['SAMME', 'SAMME.R'],
    'replacement': [False, True],
}
# 336 candidates x 10 folds of independent fits: n_jobs=-1 spreads them over
# all available cores without changing the scores.
grid = GridSearchCV(modelo, params, cv=10, scoring='f1', verbose=1, n_jobs=-1)
grid.fit(x_train, y_train)

In [None]:
print(grid.best_params_)
print(grid.best_estimator_)
print(grid.best_score_)

In [None]:
rus =RUSBoostClassifier(learning_rate=1.6, n_estimators=10, random_state=42, replacement=False, algorithm='SAMME.R')

In [None]:
modelo_rus=rus.fit(x_train,y_train)

In [None]:
y_pred_rus = modelo_rus.predict(x_test)

**EVALUACIÓN MODELO:**

In [None]:
#comprobación resultados
df_pred = pd.DataFrame({'Actual': y_test.squeeze(), 'Predicted': y_pred_rus.squeeze()})
print(df_pred)

In [None]:
# Matriz de confusion
cm_rus = confusion_matrix(y_test, y_pred_rus)
cm_display = ConfusionMatrixDisplay(cm_rus).plot()

In [None]:
# Hold-out metrics used to judge the RUSBoost predictions on the test split.
accuracy = accuracy_score(y_test, y_pred_rus)
recall = recall_score(y_test, y_pred_rus)
precision = precision_score(y_test, y_pred_rus)
f1 = f1_score(y_test, y_pred_rus)

for _nombre, _valor in (
    ("Accuracy:", accuracy),
    ("Recall:", recall),
    ("Precision:", precision),
    ("F1-score:", f1),
):
    print(_nombre, _valor)

In [None]:
# Model score on the test split (presumably mean accuracy, as for
# scikit-learn-style classifiers - confirm).
modelo_rus.score(x_test, y_test)

In [None]:
# Same score on the training split, for comparison with the test value above.
modelo_rus.score(x_train, y_train)

In [None]:
# AUROC
# Scores are column 1 of predict_proba; the ROC curve treats classes_[1] as
# the positive label.
y_score = modelo_rus.predict_proba(x_test)[:,1]
fpr, tpr, _ = roc_curve(y_test, y_score, pos_label=modelo_rus.classes_[1])
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()

In [None]:
# Area under the ROC curve computed from the same scores, then displayed.
auc=roc_auc_score(y_test, y_score)
auc

**CLASES INVERTIDAS**

In [None]:
# Swap the two classes (0 <-> 1) in both the true labels and the RUSBoost
# predictions so the metrics can be recomputed for the other class.
y_test_inv = pd.Series(1 - y_test.values)
y_pred_rus_inv = 1 - y_pred_rus
_columnas_inv = {
    'Actual': y_test_inv.squeeze(),
    'Predicted': y_pred_rus_inv.squeeze(),
}
df_pred_inv = pd.DataFrame(_columnas_inv)

In [None]:
df_pred.head(20)

In [None]:
df_pred_inv.head(20)

In [None]:
# Metrics recomputed on the class-inverted labels and predictions.
accuracy_inv = accuracy_score(y_test_inv, y_pred_rus_inv)
recall_inv = recall_score(y_test_inv, y_pred_rus_inv)
precision_inv = precision_score(y_test_inv, y_pred_rus_inv)
f1_inv = f1_score(y_test_inv, y_pred_rus_inv)

for _nombre, _valor in (
    ("Accuracy:", accuracy_inv),
    ("Recall:", recall_inv),
    ("Precision:", precision_inv),
    ("F1-score:", f1_inv),
):
    print(_nombre, _valor)

In [None]:
# Add the class-inverted RUSBoost metrics as a new row of the summary table.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# on both old and new pandas versions.
_fila_rus_inv = {
    "Modelo": "RUSBoost",
    "Accuracy": accuracy_inv,
    "Precision": precision_inv,
    "Recall": recall_inv,
    "F1-score": f1_inv,
}
score_inv_result = pd.concat([score_inv_result, pd.DataFrame([_fila_rus_inv])], ignore_index=True)

**VALIDACIÓN CRUZADA**

In [None]:
# Hand-rolled stratified cross-validation of RUSBoost over the full X/Y.
n_folds = 10

# Per-fold scores for the metrics commonly reported in the literature
acc_sc, prec_sc, rcll_sc, f1_sc = [], [], [], []

# Stratified splitter (no shuffling is requested)
skf = StratifiedKFold(n_splits=n_folds)

for train_index, test_index in skf.split(X, Y):
    # Slice this fold's training and evaluation partitions
    X_train, Y_train = X.iloc[train_index], Y.iloc[train_index]
    X_test, Y_test = X.iloc[test_index], Y.iloc[test_index]

    # A fresh model per fold.
    # NOTE(review): replacement/algorithm are left at the library defaults
    # here, unlike the explicit `rus` definition - confirm they match.
    cv_rus = RUSBoostClassifier(learning_rate=1.6, n_estimators=10, random_state=42)
    cv_rus.fit(X_train, Y_train)
    Y_pred = cv_rus.predict(X_test)

    # Score the fold
    accuracy = accuracy_score(Y_test, Y_pred)
    recall = recall_score(Y_test, Y_pred)
    precision = precision_score(Y_test, Y_pred)
    f1 = f1_score(Y_test, Y_pred)

    # Record the fold scores
    acc_sc += [accuracy]
    prec_sc += [precision]
    rcll_sc += [recall]
    f1_sc += [f1]

# Average every metric over the folds
mean_acc, mean_prec, mean_rcll, mean_f1 = map(np.mean, (acc_sc, prec_sc, rcll_sc, f1_sc))

print(f'CV Accuracy: {mean_acc:.4f}')
print(f'CV Precision: {mean_prec:.4f}')
print(f'CV Recall: {mean_rcll:.4f}')
print(f'CV F1-Score: {mean_f1:.4f}')

In [None]:
# Cross-validate the `rus` estimator on the full X/Y with cv=10, collecting
# accuracy, precision, recall and F1 for every fold.
cv_results = cross_validate(rus, X, Y, cv=10, scoring=('accuracy','precision', 'recall','f1'))

In [None]:
sorted(cv_results.keys())

In [None]:
# Print the per-fold scores of each metric followed by their mean.
for _titulo, _clave in (
    ("Accuracy:\n", 'test_accuracy'),
    ("Precision:\n", 'test_precision'),
    ("Recall:\n", 'test_recall'),
    ("F1-score:\n", 'test_f1'),
):
    _por_fold = cv_results[_clave]
    print(_titulo, _por_fold, "\n Con una media de: ", np.mean(_por_fold))

In [None]:
# Add the RUSBoost cross-validation means as a new row of the summary table.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# on both old and new pandas versions.
_fila_rus = {
    "Modelo": "RUSBoost",
    "Accuracy": np.mean(cv_results['test_accuracy']),
    "Precision": np.mean(cv_results['test_precision']),
    "Recall": np.mean(cv_results['test_recall']),
    "F1-score": np.mean(cv_results['test_f1']),
}
score_result = pd.concat([score_result, pd.DataFrame([_fila_rus])], ignore_index=True)

# **Instancias comunes:**

In [None]:
# Build one DataFrame holding every model's test-split prediction next to the
# true label ('Etiqueta').
pred_modelos_df = pd.DataFrame({
    'RandomForest': y_pred_rf,
    'BalancedRF': y_pred_brf,
    'GradientBoost': y_pred_gb,
    'AdaBoost': y_pred_ada,
    'RUSBoost': y_pred_rus,
    'Etiqueta': y_test
})

In [None]:
pred_modelos_df

In [None]:
# Columns that hold the per-model predictions
_columnas_modelo = ['RandomForest', 'BalancedRF', 'GradientBoost', 'AdaBoost', 'RUSBoost']
_votos = pred_modelos_df[_columnas_modelo]

# Rows where every model predicted 1
filtro_1 = _votos.all(axis=1)

# Rows where no model predicted 1 (and that are not already in filtro_1)
filtro_0 = ~(_votos.any(axis=1) | filtro_1)

# Keep only the instances on which all models agree
df_comunes = pred_modelos_df[filtro_1 | filtro_0]

In [None]:
df_comunes

In [None]:
# Split the unanimous predictions into the four confusion-matrix outcomes.
_columnas_modelo = ['RandomForest', 'BalancedRF', 'GradientBoost', 'AdaBoost', 'RUSBoost']
_predicciones = pred_modelos_df[_columnas_modelo]
_todos_1 = _predicciones.eq(1).all(axis=1)   # every model predicted 1
_todos_0 = _predicciones.eq(0).all(axis=1)   # every model predicted 0
_etiqueta_1 = pred_modelos_df['Etiqueta'].eq(1)
_etiqueta_0 = pred_modelos_df['Etiqueta'].eq(0)

# .copy() makes each result an independent frame: later cells add columns to
# these frames, which on a bare slice triggers SettingWithCopyWarning.

# True positives: all models predict 1 and the label is 1
vp_df = pred_modelos_df[_todos_1 & _etiqueta_1].copy()

# True negatives: all models predict 0 and the label is 0
vn_df = pred_modelos_df[_todos_0 & _etiqueta_0].copy()

# False positives: all models predict 1 but the label is 0
fp_df = pred_modelos_df[_todos_1 & _etiqueta_0].copy()

# False negatives: all models predict 0 but the label is 1
fn_df = pred_modelos_df[_todos_0 & _etiqueta_1].copy()


In [None]:
vp_df

In [None]:
vn_df

In [None]:
fp_df

In [None]:
fn_df

In [None]:
# Index labels of the instances in each unanimous outcome group.
indices_vp = vp_df.index.tolist()
indices_vn = vn_df.index.tolist()
indices_fp = fp_df.index.tolist()
indices_fn = fn_df.index.tolist()

In [None]:
# Feature rows of the test split for each outcome group, selected by label.
vp_instancias = x_test.loc[indices_vp]
vn_instancias = x_test.loc[indices_vn]
fp_instancias = x_test.loc[indices_fp]
fn_instancias = x_test.loc[indices_fn]

**VALOR DE PREDICCION INSTANCIAS COMUNES:**

In [None]:
# For every model and every outcome group, store the probability the model
# assigned to the class it predicted: column 1 for groups predicted as 1
# (VP, FP) and column 0 for groups predicted as 0 (VN, FN).
# NOTE(review): assumes predict_proba columns are ordered [class 0, class 1]
# for every model - confirm against each model's classes_.

# Prediction values for RandomForest
valor_pred_rf_vp = modelo_rf.predict_proba(vp_instancias)
valor_pred_rf_vn = modelo_rf.predict_proba(vn_instancias)
valor_pred_rf_fp = modelo_rf.predict_proba(fp_instancias)
valor_pred_rf_fn = modelo_rf.predict_proba(fn_instancias)

proba_rf_vp = valor_pred_rf_vp[:, 1] # column 1 (positive class) for the true positives
proba_rf_vn = valor_pred_rf_vn[:, 0] # column 0 (negative class) for the true negatives
proba_rf_fp = valor_pred_rf_fp[:, 1] # column 1 (positive class) for the false positives
proba_rf_fn = valor_pred_rf_fn[:, 0] # column 0 (negative class) for the false negatives

proba_rf_vp = pd.Series(proba_rf_vp, index=vp_df.index) # wrap as pandas Series aligned with each frame's index
proba_rf_vn = pd.Series(proba_rf_vn, index=vn_df.index)
proba_rf_fp = pd.Series(proba_rf_fp, index=fp_df.index)
proba_rf_fn = pd.Series(proba_rf_fn, index=fn_df.index)

vp_df['RF V.Pred'] = proba_rf_vp # add the values to the outcome frames
vn_df['RF V.Pred'] = proba_rf_vn
fp_df['RF V.Pred'] = proba_rf_fp
fn_df['RF V.Pred'] = proba_rf_fn

# Prediction values for BalancedRandomForest
valor_pred_brf_vp = modelo_brf.predict_proba(vp_instancias)
valor_pred_brf_vn = modelo_brf.predict_proba(vn_instancias)
valor_pred_brf_fp = modelo_brf.predict_proba(fp_instancias)
valor_pred_brf_fn = modelo_brf.predict_proba(fn_instancias)

proba_brf_vp = valor_pred_brf_vp[:, 1]
proba_brf_vn = valor_pred_brf_vn[:, 0]
proba_brf_fp = valor_pred_brf_fp[:, 1]
proba_brf_fn = valor_pred_brf_fn[:, 0]

proba_brf_vp = pd.Series(proba_brf_vp, index=vp_df.index)
proba_brf_vn = pd.Series(proba_brf_vn, index=vn_df.index)
proba_brf_fp = pd.Series(proba_brf_fp, index=fp_df.index)
proba_brf_fn = pd.Series(proba_brf_fn, index=fn_df.index)

vp_df['BRF V.Pred'] = proba_brf_vp
vn_df['BRF V.Pred'] = proba_brf_vn
fp_df['BRF V.Pred'] = proba_brf_fp
fn_df['BRF V.Pred'] = proba_brf_fn

# Prediction values for GradientBoosting
valor_pred_gb_vp = modelo_gb.predict_proba(vp_instancias)
valor_pred_gb_vn = modelo_gb.predict_proba(vn_instancias)
valor_pred_gb_fp = modelo_gb.predict_proba(fp_instancias)
valor_pred_gb_fn = modelo_gb.predict_proba(fn_instancias)

proba_gb_vp = valor_pred_gb_vp[:, 1]
proba_gb_vn = valor_pred_gb_vn[:, 0]
proba_gb_fp = valor_pred_gb_fp[:, 1]
proba_gb_fn = valor_pred_gb_fn[:, 0]

proba_gb_vp = pd.Series(proba_gb_vp, index=vp_df.index)
proba_gb_vn = pd.Series(proba_gb_vn, index=vn_df.index)
proba_gb_fp = pd.Series(proba_gb_fp, index=fp_df.index)
proba_gb_fn = pd.Series(proba_gb_fn, index=fn_df.index)

vp_df['GB V.Pred'] = proba_gb_vp
vn_df['GB V.Pred'] = proba_gb_vn
fp_df['GB V.Pred'] = proba_gb_fp
fn_df['GB V.Pred'] = proba_gb_fn

# Prediction values for AdaBoost
valor_pred_ada_vp = modelo_ada.predict_proba(vp_instancias)
valor_pred_ada_vn = modelo_ada.predict_proba(vn_instancias)
valor_pred_ada_fp = modelo_ada.predict_proba(fp_instancias)
valor_pred_ada_fn = modelo_ada.predict_proba(fn_instancias)

proba_ada_vp = valor_pred_ada_vp[:, 1]
proba_ada_vn = valor_pred_ada_vn[:, 0]
proba_ada_fp = valor_pred_ada_fp[:, 1]
proba_ada_fn = valor_pred_ada_fn[:, 0]

proba_ada_vp = pd.Series(proba_ada_vp, index=vp_df.index)
proba_ada_vn = pd.Series(proba_ada_vn, index=vn_df.index)
proba_ada_fp = pd.Series(proba_ada_fp, index=fp_df.index)
proba_ada_fn = pd.Series(proba_ada_fn, index=fn_df.index)

vp_df['AB V.Pred'] = proba_ada_vp
vn_df['AB V.Pred'] = proba_ada_vn
fp_df['AB V.Pred'] = proba_ada_fp
fn_df['AB V.Pred'] = proba_ada_fn

# Prediction values for RUSBoost
valor_pred_rus_vp = modelo_rus.predict_proba(vp_instancias)
valor_pred_rus_vn = modelo_rus.predict_proba(vn_instancias)
valor_pred_rus_fp = modelo_rus.predict_proba(fp_instancias)
valor_pred_rus_fn = modelo_rus.predict_proba(fn_instancias)

proba_rus_vp = valor_pred_rus_vp[:, 1]
proba_rus_vn = valor_pred_rus_vn[:, 0]
proba_rus_fp = valor_pred_rus_fp[:, 1]
proba_rus_fn = valor_pred_rus_fn[:, 0]

proba_rus_vp = pd.Series(proba_rus_vp, index=vp_df.index)
proba_rus_vn = pd.Series(proba_rus_vn, index=vn_df.index)
proba_rus_fp = pd.Series(proba_rus_fp, index=fp_df.index)
proba_rus_fn = pd.Series(proba_rus_fn, index=fn_df.index)

vp_df['RB V.Pred'] = proba_rus_vp
vn_df['RB V.Pred'] = proba_rus_vn
fp_df['RB V.Pred'] = proba_rus_fp
fn_df['RB V.Pred'] = proba_rus_fn

In [None]:
# Row-wise mean of the five per-model prediction values, for every instance of
# each outcome frame.
_columnas_vpred = ['RF V.Pred', 'BRF V.Pred', 'GB V.Pred', 'AB V.Pred', 'RB V.Pred']

media_vp = vp_df[_columnas_vpred].mean(axis=1)
media_vn = vn_df[_columnas_vpred].mean(axis=1)
media_fp = fp_df[_columnas_vpred].mean(axis=1)
media_fn = fn_df[_columnas_vpred].mean(axis=1)

vp_df['V.Pred media'] = media_vp
vn_df['V.Pred media'] = media_vn
fp_df['V.Pred media'] = media_fp
fn_df['V.Pred media'] = media_fn

In [None]:
vp_df

In [None]:
vn_df

In [None]:
fp_df

In [None]:
fn_df

In [None]:
# Pick, for each outcome group, the index label of the instance whose mean
# prediction value is the highest, the closest to the median, and the lowest.

# Instance with the maximum mean
instancia_vp_max = vp_df['V.Pred media'].idxmax()
instancia_vn_max = vn_df['V.Pred media'].idxmax()
instancia_fp_max = fp_df['V.Pred media'].idxmax()
instancia_fn_max = fn_df['V.Pred media'].idxmax()

# Median of the mean values in each group
mediana_vp = vp_df['V.Pred media'].median()
mediana_vn = vn_df['V.Pred media'].median()
mediana_fp = fp_df['V.Pred media'].median()
mediana_fn = fn_df['V.Pred media'].median()

# Instance closest to the median. idxmin() already returns the index label,
# so there is no need to look the row up again just to read its .name.
instancia_vp_mediana = (vp_df['V.Pred media'] - mediana_vp).abs().idxmin()
instancia_vn_mediana = (vn_df['V.Pred media'] - mediana_vn).abs().idxmin()
instancia_fp_mediana = (fp_df['V.Pred media'] - mediana_fp).abs().idxmin()
instancia_fn_mediana = (fn_df['V.Pred media'] - mediana_fn).abs().idxmin()

# Instance with the minimum mean
instancia_vp_min = vp_df['V.Pred media'].idxmin()
instancia_vn_min = vn_df['V.Pred media'].idxmin()
instancia_fp_min = fp_df['V.Pred media'].idxmin()
instancia_fn_min = fn_df['V.Pred media'].idxmin()

In [None]:
# Report the selected instance labels: maximum, median and minimum per group.
for _mensaje, _instancia in (
    ("La instancia verdadero positivo máxima es: ", instancia_vp_max),
    ("La instancia verdadero negativo máxima es: ", instancia_vn_max),
    ("La instancia falso positivo máxima es: ", instancia_fp_max),
    ("La instancia falso negativo máxima es: ", instancia_fn_max),
    ("La instancia verdadero positivo en la mediana es: ", instancia_vp_mediana),
    ("La instancia verdadero negativo en la mediana es: ", instancia_vn_mediana),
    ("La instancia falso positivo en la mediana es: ", instancia_fp_mediana),
    ("La instancia falso negativo en la mediana es: ", instancia_fn_mediana),
    ("La instancia verdadero positivo minima es: ", instancia_vp_min),
    ("La instancia verdadero negativo minima es: ", instancia_vn_min),
    ("La instancia falso positivo minima es: ", instancia_fp_min),
    ("La instancia falso negativo minima es: ", instancia_fn_min),
):
    print(_mensaje, _instancia)

In [None]:
# Index labels of the test split, in row order.
lista_indices = y_test.index.tolist()

In [None]:
# Translate each selected index label into its row position within the test
# split (the position to use in the prediction ndarrays).
pos_vp_max, pos_vp_min, pos_vp_mediana = map(
    lista_indices.index, (instancia_vp_max, instancia_vp_min, instancia_vp_mediana))
pos_vn_max, pos_vn_min, pos_vn_mediana = map(
    lista_indices.index, (instancia_vn_max, instancia_vn_min, instancia_vn_mediana))
pos_fp_max, pos_fp_min, pos_fp_mediana = map(
    lista_indices.index, (instancia_fp_max, instancia_fp_min, instancia_fp_mediana))
pos_fn_max, pos_fn_min, pos_fn_mediana = map(
    lista_indices.index, (instancia_fn_max, instancia_fn_min, instancia_fn_mediana))

In [None]:
# Feature values of every selected instance, looked up by index label in the
# test split.
df_instancia_vp_max, df_instancia_vp_min, df_instancia_vp_mediana = (
    x_test.loc[etq] for etq in (instancia_vp_max, instancia_vp_min, instancia_vp_mediana))
df_instancia_vn_max, df_instancia_vn_min, df_instancia_vn_mediana = (
    x_test.loc[etq] for etq in (instancia_vn_max, instancia_vn_min, instancia_vn_mediana))
df_instancia_fp_max, df_instancia_fp_min, df_instancia_fp_mediana = (
    x_test.loc[etq] for etq in (instancia_fp_max, instancia_fp_min, instancia_fp_mediana))
df_instancia_fn_max, df_instancia_fn_min, df_instancia_fn_mediana = (
    x_test.loc[etq] for etq in (instancia_fn_max, instancia_fn_min, instancia_fn_mediana))

# **Explicabilidad:**

In [None]:
# Helper that turns a numeric effect into a sign label
def evaluar_valor(valor):
    """Return the sign label of a numeric value.

    The labels match the mapping applied to the Break-down and SHAP tables
    ({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}), so a zero effect is
    reported as "Nulo" by all three techniques instead of "Positivo".

    Args:
        valor: A number (e.g. a LIME effect).

    Returns:
        "Positivo" if valor > 0, "Negativo" if valor < 0, otherwise "Nulo"
        (zero, or a value such as NaN for which both comparisons are false).
    """
    if valor > 0:
        return "Positivo"
    if valor < 0:
        return "Negativo"
    return "Nulo"

In [None]:
# Two-level column header: one (Ranking, Signo) pair per explanation technique
columns_multi = pd.MultiIndex.from_product(
    [['Breakdown', 'Shapley', 'Lime'], ['Ranking', 'Signo']]
)

## **RandomForest:**

**FEATURE IMPORTANCE**

In [None]:
# Permutation importance of the RandomForest model on the test split,
# measured as the drop in F1.
permu = permutation_importance(modelo_rf, x_test, y_test, n_repeats=20, random_state=42, n_jobs=2, scoring='f1')

# Threshold above which a mean importance is considered significant
importance_threshold = 0.01

# Keep only the significant features (means are rounded to 2 decimals for the plot)
significant_indices = permu.importances_mean > importance_threshold
permu_importances = pd.Series(permu.importances_mean.round(2), index=feature_names)[significant_indices]
permu_std = permu.importances_std[significant_indices]

# Bar chart with the standard deviation as error bars
fig, ax = plt.subplots()
permu_importances.plot.bar(yerr=permu_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
# The importances are computed with scoring='f1', so the axis shows the F1
# decrease (the previous label said "accuracy").
ax.set_ylabel("Mean F1 decrease")
fig.tight_layout()
plt.show()

In [None]:
# Permutation importance of the RandomForest model for three metrics, drawn
# as one horizontal bar chart per metric.
scoring = ['precision', 'recall','f1']
metric_names = ['Precision', 'Recall', 'F1-score']

permu_score = permutation_importance(modelo_rf, x_test, y_test, n_repeats=20, random_state=42, scoring=scoring)
fig, axs = plt.subplots(1, 3, figsize=(15, 5))

# Threshold above which a mean importance is considered significant
importance_threshold = 0.01

for i, metric in enumerate(scoring):
    permu = permu_score[metric]

    # Positions of the features whose mean importance exceeds the threshold
    significant_indices = [j for j, media in enumerate(permu.importances_mean) if media > importance_threshold]
    sorted_feature_names = [feature_names[j] for j in significant_indices]
    importances_mean = permu.importances_mean[significant_indices]
    importances_std = permu.importances_std[significant_indices]

    # Sort ascending by importance: barh draws the first entry at the bottom,
    # so the most important feature ends up at the top of the chart.
    # (The former `[::1]` slice was a no-op and has been dropped.)
    sorted_indices = np.argsort(importances_mean)
    sorted_feature_names = [sorted_feature_names[j] for j in sorted_indices]
    importances_mean = importances_mean[sorted_indices]
    importances_std = importances_std[sorted_indices]

    # Draw this metric in its own subplot
    axs[i].barh(range(len(sorted_feature_names)), importances_mean, xerr=importances_std, align='center')
    axs[i].set_yticks(range(len(sorted_feature_names)))
    axs[i].set_yticklabels(sorted_feature_names)
    axs[i].set_xlabel('Valor Importancia')
    axs[i].set_title(f'Importancia por Permutación para {metric_names[i]}')

# Adjust the spacing between subplots and show the figure
plt.tight_layout()
plt.show()


In [None]:
# Tabulate the significant permutation importances of the RandomForest model,
# one DataFrame per metric, stored in results_global_rf.
scoring = ['precision', 'recall', 'f1']
metric_names = ['Precision', 'Recall', 'F1-score']

# Threshold above which a mean importance is considered significant
importance_threshold = 0.01

# Maps 'df_global_<metric name>' to that metric's table
results_global_rf = {}

permu_score = permutation_importance(
    modelo_rf, x_test, y_test, n_repeats=20, random_state=42, scoring=scoring
)
for i, metric in enumerate(scoring):
    permu = permu_score[metric]

    # Positions of the features whose mean importance exceeds the threshold
    significant_indices = [
        j for j, media in enumerate(permu.importances_mean) if media > importance_threshold
    ]
    sorted_feature_names = [feature_names[j] for j in significant_indices]
    importances_mean = permu.importances_mean[significant_indices]
    importances_std = permu.importances_std[significant_indices]

    # One row per significant feature, most important first
    df_exp_global = pd.DataFrame({
        'Feature': sorted_feature_names,
        'Importance_Mean': importances_mean,
        'Importance_Std': importances_std,
    }).sort_values(by='Importance_Mean', ascending=False)

    # File the table under the metric's display name
    results_global_rf[f'df_global_{metric_names[i]}'] = df_exp_global

In [None]:
results_global_rf['df_global_Precision']

In [None]:
results_global_rf['df_global_Recall']

In [None]:
results_global_rf['df_global_F1-score']

**BREAK-DOWN, SHAP Y LIME:**

In [None]:
# First build the dalex explainer for the RandomForest model, using the
# training features and labels.
exp = dx.Explainer(modelo_rf, x_train, y_train)

### **Instancia VP MAX:**

In [None]:
# Local explanations of the true-positive instance with the highest mean
# prediction value: break-down, SHAP and a surrogate (LIME) explanation.
breakdown_vp_max = exp.predict_parts(df_instancia_vp_max, type="break_down",random_state=42)
shap_vp_max = exp.predict_parts(df_instancia_vp_max, type="shap",random_state=42)
lime_vp_max = exp.predict_surrogate(df_instancia_vp_max, random_state=42)

# Result tables of each explanation.
# NOTE(review): these presumably reference the same DataFrames held by the
# explanation objects rather than copies - confirm before mutating them.
breakdown_vp_df_max = breakdown_vp_max.result
shap_vp_df_max = shap_vp_max.result
lime_vp_df_max=lime_vp_max.result

In [None]:
breakdown_vp_max.plot()

In [None]:
shap_vp_max.plot()

In [None]:
lime_vp_max.plot()

In [None]:
# Reduce the three explanations of the VP-max instance to comparable top-5
# tables with the columns Variable / Ranking / Signo.

# Labels for the numeric sign codes of the break-down and SHAP results
_signos = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}


def _top_contribuciones(tabla, n=5):
    """Return the n rows with the largest |contribution| as Variable/Ranking/Signo."""
    tabla = tabla.loc[:, ['variable_name', 'contribution', 'sign']].copy()
    tabla['sign'] = tabla['sign'].replace(_signos)
    tabla = tabla.sort_values(by='contribution', key=lambda s: s.abs(), ascending=False)
    tabla['Ranking'] = tabla['contribution'].abs().rank(ascending=False).astype(int)
    tabla = tabla.rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    return tabla[['Variable', 'Ranking', 'Signo']].head(n).reset_index(drop=True)


# Read from the explanation objects so the cell can be re-run safely.
# Break-down: drop the first and last rows instead of the hard-coded labels
# 0 and 26 (same rows for 25 variables; presumably the intercept and the final
# prediction - confirm), so the code does not depend on the feature count.
_breakdown_vars = breakdown_vp_max.result.iloc[1:-1]
breakdown_vp_df_max = _top_contribuciones(_breakdown_vars)

# SHAP: keep the last block of rows, one per variable (the original kept the
# last 25 rows).
shap_vp_df_max = _top_contribuciones(shap_vp_max.result.tail(len(_breakdown_vars)))

# LIME: work on a copy so the explainer's own result is not modified.
lime_vp_df_max = lime_vp_max.result.copy()
# LIME describes a feature as "feat <= b", "feat > a" or "a < feat <= b";
# skip an optional leading "number <" so the feature name is extracted in all
# three forms (taking the first token returned the number in the last form).
lime_vp_df_max["Variable"] = lime_vp_df_max["variable"].str.extract(
    r'^(?:-?\d+(?:\.\d+)?\s*<=?\s*)?(\S+)', expand=False)
lime_vp_df_max["Signo"] = lime_vp_df_max["effect"].apply(evaluar_valor)
lime_vp_df_max = lime_vp_df_max.sort_values(by='effect', key=lambda s: s.abs(), ascending=False)
lime_vp_df_max['Ranking'] = lime_vp_df_max['effect'].abs().rank(ascending=False).astype(int)
lime_vp_df_max = lime_vp_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_vp_df_max)
print(shap_vp_df_max)
print(lime_vp_df_max)

In [None]:
#lime_vp_max.show_in_notebook()

In [None]:
# Merge the three top-5 tables into one comparison table: one row per feature
# and, for each technique, its ranking and sign ('-' when the feature is not
# in that technique's top 5).
_tablas_tecnica = {
    'Breakdown': breakdown_vp_df_max,
    'Shapley': shap_vp_df_max,
    'Lime': lime_vp_df_max,
}

breakdown_features = list(breakdown_vp_df_max['Variable'])
shapley_features = list(shap_vp_df_max['Variable'])
lime_features = list(lime_vp_df_max['Variable'])
# dict.fromkeys removes duplicates but keeps first-seen order, so the row
# order is the same on every run (a set's order varies between sessions).
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

df_final_max = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_max['Variable'] = list(all_features)
df_final_max= df_final_max[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

for feature in df_final_max['Variable']:
    fila_destino = df_final_max['Variable'] == feature
    for tecnica, tabla in _tablas_tecnica.items():
        # Rows of this technique's table that mention the feature
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # Not in this technique's top 5
            ranking, signo = '-', '-'
        else:
            ranking = coincidencias.iloc[0]['Ranking']
            signo = coincidencias.iloc[0]['Signo']
        df_final_max.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_max.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_max

In [None]:
# Use the feature name as the row index.
# NOTE(review): done in place, so this cell is presumably not re-runnable
# without rebuilding df_final_max first.
df_final_max.set_index('Variable', inplace=True)
# Hand-written row order for the final table.
# NOTE(review): reindex keeps only the labels listed here; a feature in the
# table that is not listed is dropped, and a listed label that is missing from
# the table becomes an all-NaN row - verify the list against the current run.
nuevo_orden = ["parallel_changed_file_num", "commit_num", "file_added", "file_modified", "developer_num", "line_removed", "file_removed"]

# Rearrange the DataFrame according to the new order
df_final_max = df_final_max.reindex(nuevo_orden)

df_final_max

### **Instancia VP MEDIANA:**

In [None]:
# Local explanations of the true-positive instance closest to the median mean
# prediction value: break-down, SHAP and a surrogate (LIME) explanation.
breakdown_vp_mediana = exp.predict_parts(df_instancia_vp_mediana, type="break_down",random_state=42)
shap_vp_mediana = exp.predict_parts(df_instancia_vp_mediana, type="shap",random_state=42)
lime_vp_mediana = exp.predict_surrogate(df_instancia_vp_mediana, random_state=42)

# Result tables of each explanation.
# NOTE(review): these presumably reference the same DataFrames held by the
# explanation objects rather than copies - confirm before mutating them.
breakdown_vp_df_mediana = breakdown_vp_mediana.result
shap_vp_df_mediana = shap_vp_mediana.result
lime_vp_df_mediana=lime_vp_mediana.result

In [None]:
breakdown_vp_mediana.plot()

In [None]:
shap_vp_mediana.plot()

In [None]:
lime_vp_mediana.plot()

In [None]:
# Reduce the three explanations of the VP-median instance to comparable top-5
# tables with the columns Variable / Ranking / Signo.

# Labels for the numeric sign codes of the break-down and SHAP results
_signos = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}


def _top_contribuciones(tabla, n=5):
    """Return the n rows with the largest |contribution| as Variable/Ranking/Signo."""
    tabla = tabla.loc[:, ['variable_name', 'contribution', 'sign']].copy()
    tabla['sign'] = tabla['sign'].replace(_signos)
    tabla = tabla.sort_values(by='contribution', key=lambda s: s.abs(), ascending=False)
    tabla['Ranking'] = tabla['contribution'].abs().rank(ascending=False).astype(int)
    tabla = tabla.rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    return tabla[['Variable', 'Ranking', 'Signo']].head(n).reset_index(drop=True)


# Read from the explanation objects so the cell can be re-run safely.
# Break-down: drop the first and last rows instead of the hard-coded labels
# 0 and 26 (same rows for 25 variables; presumably the intercept and the final
# prediction - confirm), so the code does not depend on the feature count.
_breakdown_vars = breakdown_vp_mediana.result.iloc[1:-1]
breakdown_vp_df_mediana = _top_contribuciones(_breakdown_vars)

# SHAP: keep the last block of rows, one per variable (the original kept the
# last 25 rows).
shap_vp_df_mediana = _top_contribuciones(shap_vp_mediana.result.tail(len(_breakdown_vars)))

# LIME: work on a copy so the explainer's own result is not modified.
lime_vp_df_mediana = lime_vp_mediana.result.copy()
# LIME describes a feature as "feat <= b", "feat > a" or "a < feat <= b";
# skip an optional leading "number <" so the feature name is extracted in all
# three forms (taking the first token returned the number in the last form).
lime_vp_df_mediana["Variable"] = lime_vp_df_mediana["variable"].str.extract(
    r'^(?:-?\d+(?:\.\d+)?\s*<=?\s*)?(\S+)', expand=False)
lime_vp_df_mediana["Signo"] = lime_vp_df_mediana["effect"].apply(evaluar_valor)
lime_vp_df_mediana = lime_vp_df_mediana.sort_values(by='effect', key=lambda s: s.abs(), ascending=False)
lime_vp_df_mediana['Ranking'] = lime_vp_df_mediana['effect'].abs().rank(ascending=False).astype(int)
lime_vp_df_mediana = lime_vp_df_mediana[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_vp_df_mediana)
print(shap_vp_df_mediana)
print(lime_vp_df_mediana)

In [None]:
#lime_vp_mediana.show_in_notebook()

In [None]:
# Gather the variables that appear in the top-5 table of any technique.
breakdown_features = breakdown_vp_df_mediana['Variable'].tolist()
shapley_features = shap_vp_df_mediana['Variable'].tolist()
lime_features = lime_vp_df_mediana['Variable'].tolist()
all_features = list(set(breakdown_features + shapley_features + lime_features))
all_features.reverse()

# One row per variable; the technique columns come from columns_multi.
df_final_mediana = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_mediana['Variable'] = all_features
df_final_mediana = df_final_mediana[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for lista in (breakdown_features, shapley_features, lime_features, all_features):
    print(lista)

# Top-5 table of each technique, keyed by its column group in the final frame.
tablas_por_tecnica = {
    'Breakdown': breakdown_vp_df_mediana,
    'Shapley': shap_vp_df_mediana,
    'Lime': lime_vp_df_mediana,
}

for feature in df_final_mediana['Variable']:
    fila_destino = df_final_mediana['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # The variable is not in this technique's top 5: mark it with "-".
            valor_ranking, valor_signo = '-', '-'
        else:
            primera = coincidencias.iloc[0]
            valor_ranking, valor_signo = primera['Ranking'], primera['Signo']
        df_final_mediana.loc[fila_destino, (tecnica, 'Ranking')] = valor_ranking
        df_final_mediana.loc[fila_destino, (tecnica, 'Signo')] = valor_signo

df_final_mediana

In [None]:
# Use the variable names as row labels so the rows can be ordered by name.
df_final_mediana.set_index('Variable', inplace=True)
# Preferred presentation order for this instance.
nuevo_orden = ["parallel_changed_file_num", "commit_num", "developer_num", "file_added", "duration", "file_modified", "file_removed"]

# Reorder the rows following nuevo_orden without losing information:
# names in the list that are not in the frame are skipped (a plain reindex
# would create all-NaN rows for them), and variables in the frame that are
# not in the list are kept at the end (a plain reindex would drop them).
orden_presente = [v for v in nuevo_orden if v in df_final_mediana.index]
orden_restante = [v for v in df_final_mediana.index if v not in nuevo_orden]
df_final_mediana = df_final_mediana.reindex(orden_presente + orden_restante)

df_final_mediana

### **Instancia VP MIN:**

In [None]:
# Explain the VP MIN instance with Break Down and Shapley (both through
# predict_parts) and with the surrogate explainer, all seeded with 42.
breakdown_vp_min, shap_vp_min = (
    exp.predict_parts(df_instancia_vp_min, type=tipo, random_state=42)
    for tipo in ("break_down", "shap")
)
lime_vp_min = exp.predict_surrogate(df_instancia_vp_min, random_state=42)

# Keep the tabular result of each explanation for later processing.
breakdown_vp_df_min, shap_vp_df_min, lime_vp_df_min = (
    explicacion.result
    for explicacion in (breakdown_vp_min, shap_vp_min, lime_vp_min)
)

In [None]:
# Plot the Break Down explanation of the VP MIN instance.
breakdown_vp_min.plot()

In [None]:
# Plot the Shapley explanation of the VP MIN instance.
shap_vp_min.plot()

In [None]:
# Plot the surrogate (LIME) explanation of the VP MIN instance.
lime_vp_min.plot()

In [None]:
# Reduce the three raw explanation tables of the VP MIN instance to
# top-5 tables with the columns Variable / Ranking / Signo.

# --- Break Down: keep only the columns used below.
breakdown_vp_df_min = breakdown_vp_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): labels 0 and 26 are presumably the intercept and the final
# prediction rows of a 25-feature explanation -- confirm if the number of
# features changes.
breakdown_vp_df_min = breakdown_vp_df_min.drop(index=[0, 26])
breakdown_vp_df_min['sign'] = breakdown_vp_df_min['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
breakdown_vp_df_min = breakdown_vp_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- Shapley: same treatment.
shap_vp_df_min = shap_vp_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): the last 25 rows are presumably the per-feature aggregated
# contributions -- confirm against the explainer output.
shap_vp_df_min = shap_vp_df_min.tail(25)
shap_vp_df_min['sign'] = shap_vp_df_min['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_vp_df_min = shap_vp_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- LIME: extract the feature name from the textual rule.
# A rule may be "feature <= b" / "feature > b" (name is the first token) or
# a range "a < feature <= b" (name is the third token); taking the first
# token unconditionally would return the numeric lower bound for ranges.
reglas_lime = lime_vp_df_min["variable"].str.split(" ")
lime_vp_df_min["Variable"] = reglas_lime.str[2].where(reglas_lime.str[1] == "<", reglas_lime.str[0])
lime_vp_df_min["Signo"] = lime_vp_df_min["effect"].apply(evaluar_valor)
lime_vp_df_min = lime_vp_df_min.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_vp_df_min = lime_vp_df_min.drop(columns=['variable'])

# Add a ranking column by absolute contribution. method="min" keeps tied
# values on an integer rank; the default average rank would produce x.5
# values that astype(int) silently truncates.
breakdown_vp_df_min['Ranking'] = breakdown_vp_df_min['contribution'].abs().rank(method="min", ascending=False).astype(int)
breakdown_vp_df_min.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
breakdown_vp_df_min = breakdown_vp_df_min[['Variable', 'Ranking', 'contribution', 'Signo']]

shap_vp_df_min['Ranking'] = shap_vp_df_min['contribution'].abs().rank(method="min", ascending=False).astype(int)
shap_vp_df_min.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_vp_df_min = shap_vp_df_min[['Variable', 'Ranking', 'contribution', 'Signo']]

lime_vp_df_min['Ranking'] = lime_vp_df_min['effect'].abs().rank(method="min", ascending=False).astype(int)
lime_vp_df_min = lime_vp_df_min.head(5)
lime_vp_df_min = lime_vp_df_min[['Variable', 'Ranking', 'effect', 'Signo']]

# Drop the raw contribution columns; only rank and sign are reported.
breakdown_vp_df_min = breakdown_vp_df_min.drop(columns=['contribution'])
shap_vp_df_min = shap_vp_df_min.drop(columns=['contribution'])
lime_vp_df_min = lime_vp_df_min.drop(columns=['effect'])

# Keep the 5 largest contributions in absolute value (rows are already sorted).
breakdown_vp_df_min = breakdown_vp_df_min.head(5)
breakdown_vp_df_min = breakdown_vp_df_min.reset_index(drop=True)

shap_vp_df_min = shap_vp_df_min.head(5)
shap_vp_df_min = shap_vp_df_min.reset_index(drop=True)

lime_vp_df_min = lime_vp_df_min.reset_index(drop=True)

print(breakdown_vp_df_min)
print(shap_vp_df_min)
print(lime_vp_df_min)

In [None]:
#lime_vp_min.show_in_notebook()

In [None]:
# Gather the variables that appear in the top-5 table of any technique.
breakdown_features = breakdown_vp_df_min['Variable'].tolist()
shapley_features = shap_vp_df_min['Variable'].tolist()
lime_features = lime_vp_df_min['Variable'].tolist()
all_features = list(set(breakdown_features + shapley_features + lime_features))
all_features.reverse()

# One row per variable; the technique columns come from columns_multi.
df_final_min = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_min['Variable'] = all_features
df_final_min = df_final_min[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for lista in (breakdown_features, shapley_features, lime_features, all_features):
    print(lista)

# Top-5 table of each technique, keyed by its column group in the final frame.
tablas_por_tecnica = {
    'Breakdown': breakdown_vp_df_min,
    'Shapley': shap_vp_df_min,
    'Lime': lime_vp_df_min,
}

for feature in df_final_min['Variable']:
    fila_destino = df_final_min['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # The variable is not in this technique's top 5: mark it with "-".
            valor_ranking, valor_signo = '-', '-'
        else:
            primera = coincidencias.iloc[0]
            valor_ranking, valor_signo = primera['Ranking'], primera['Signo']
        df_final_min.loc[fila_destino, (tecnica, 'Ranking')] = valor_ranking
        df_final_min.loc[fila_destino, (tecnica, 'Signo')] = valor_signo

df_final_min

In [None]:
# Use the variable names as row labels so the rows can be ordered by name.
df_final_min.set_index('Variable', inplace=True)
# Preferred presentation order for this instance.
nuevo_orden = ["parallel_changed_file_num", "commit_num", "developer_num", "messages_max", "messages_median", "line_removed", "file_removed", "bug_frequency"]

# Reorder the rows following nuevo_orden without losing information:
# names in the list that are not in the frame are skipped (a plain reindex
# would create all-NaN rows for them), and variables in the frame that are
# not in the list are kept at the end (a plain reindex would drop them).
orden_presente = [v for v in nuevo_orden if v in df_final_min.index]
orden_restante = [v for v in df_final_min.index if v not in nuevo_orden]
df_final_min = df_final_min.reindex(orden_presente + orden_restante)

df_final_min

### **VP General:**

In [None]:
# Aggregate, per technique, the rankings obtained for the three explained
# instances (max, mediana, min).

# (variable, ranking) pairs collected per technique.
ranking_valores = {
    'Breakdown': [],
    'Shapley': [],
    'Lime': []
}

for tecnica in ranking_valores:
    for df_instancia in (df_final_max, df_final_mediana, df_final_min):
        for caracteristica, ranking in df_instancia[(tecnica, 'Ranking')].items():
            # "-" marks a variable outside the technique's top 5. NaN shows up
            # when a row was created by reindexing with a name that had no
            # data; int(NaN) would raise, so both are skipped.
            if pd.isna(ranking) or ranking == "-":
                continue
            ranking_valores[tecnica].append((caracteristica, int(ranking)))

# Mean ranking and number of appearances per technique and variable.
ranking_medio = {
    'Breakdown': {},
    'Shapley': {},
    'Lime': {}
}
apariciones_count = {
    'Breakdown': {},
    'Shapley': {},
    'Lime': {}
}

for tecnica, pares in ranking_valores.items():
    # Group the rankings of each variable, preserving first-seen order.
    rankings_por_variable = {}
    for caracteristica, ranking in pares:
        rankings_por_variable.setdefault(caracteristica, []).append(ranking)

    for caracteristica, valores in rankings_por_variable.items():
        ranking_medio[tecnica][caracteristica] = sum(valores) / len(valores)
        apariciones_count[tecnica][caracteristica] = len(valores)

# The collected pairs, the appearance counts and the mean rankings per
# technique are now available in their respective dictionaries.
print(ranking_valores)
print(apariciones_count)
print(ranking_medio)

In [None]:
# Techniques being compared.
techniques = ['Breakdown', 'Shapley', 'Lime']

# Column data for the summary frame, keyed by (technique, sub-column).
data_dict = {}

# Every variable ranked by at least one technique. dict.fromkeys removes
# duplicates while keeping first-seen order, so the row order is the same on
# every run (a set of strings iterates in a hash-dependent order).
all_caract = list(dict.fromkeys(
    c for technique in techniques for c in ranking_medio[technique]
))

# One "Ranking Medio" and one "Conteo" column per technique; "-" marks a
# variable the technique never ranked.
for technique in techniques:
    rank = [ranking_medio[technique].get(c, "-") for c in all_caract]
    apar = [apariciones_count[technique].get(c, "-") for c in all_caract]

    data_dict[(technique, "Ranking Medio")] = rank
    data_dict[(technique, "Conteo")] = apar

# Build the summary DataFrame.
df_resumen_rf_vp = pd.DataFrame(data_dict, index=all_caract)

df_resumen_rf_vp

In [None]:
# Columns holding the per-technique mean ranking and appearance count.
columnas_ranking = [(tech, "Ranking Medio") for tech in techniques]
columnas_conteo = [(tech, "Conteo") for tech in techniques]

# General mean ranking, ignoring the "-" placeholders. The columns are
# converted explicitly: to_numeric(errors="coerce") turns "-" into NaN and
# yields float columns, instead of relying on replace() to downcast
# object columns implicitly.
ranking_numerico = df_resumen_rf_vp[columnas_ranking].apply(pd.to_numeric, errors="coerce")
df_resumen_rf_vp[("General", "Ranking")] = ranking_numerico.mean(axis=1)

# Total number of appearances; a "-" placeholder counts as 0.
conteo_numerico = (
    df_resumen_rf_vp[columnas_conteo]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
    .astype(int)
)
df_resumen_rf_vp[("General", "Conteo Total")] = conteo_numerico.sum(axis=1)

# Show the updated summary DataFrame.
df_resumen_rf_vp

In [None]:
# Number of variables in the summary.
num_caract = len(df_resumen_rf_vp)

# Weight from the general ranking (a lower mean ranking gives a higher
# weight) and weight from the total count (more appearances give a higher
# weight). Both are kept as temporary Series instead of helper columns.
peso_rango = 1 - (df_resumen_rf_vp[("General", "Ranking")].rank(ascending=True) - 1) / num_caract
peso_conteo = df_resumen_rf_vp[("General", "Conteo Total")].rank(ascending=True) / num_caract

# Final score is the sum of both weights; the general ranking is replaced
# by the position of each variable according to that score.
puntaje = peso_rango + peso_conteo
df_resumen_rf_vp[("General", "Ranking")] = puntaje.rank(ascending=False, method="min")

df_resumen_rf_vp

In [None]:
# Current (technique, sub-column) labels of the summary frame.
columns = df_resumen_rf_vp.columns

# "General" sub-columns go first, in this fixed order.
general_subcolumns = [("General", sub) for sub in ("Ranking", "Conteo Total")]

# Every remaining column, in its current order.
technique_subcolumns = [col for col in columns if col[0] != "General"]

# New frame with "General" moved to the front.
new_columns = [*general_subcolumns, *technique_subcolumns]
df_resumen_rf_vp = df_resumen_rf_vp[new_columns]

In [None]:
# Order the summary by the general ranking (best first).
df_resumen_rf_vp = df_resumen_rf_vp.sort_values(by=("General", "Ranking"), ascending=True)

# Show numeric mean rankings with two decimals; placeholders stay untouched.
for tech in techniques:
    columna = (tech, "Ranking Medio")
    df_resumen_rf_vp[columna] = [
        f"{x:.2f}" if isinstance(x, (int, float)) else x
        for x in df_resumen_rf_vp[columna]
    ]

df_resumen_rf_vp

### **Instancia VN MAX:**

In [None]:
# Explain the VN MAX instance with Break Down and Shapley (both through
# predict_parts) and with the surrogate explainer, all seeded with 42.
breakdown_vn_max, shap_vn_max = (
    exp.predict_parts(df_instancia_vn_max, type=tipo, random_state=42)
    for tipo in ("break_down", "shap")
)
lime_vn_max = exp.predict_surrogate(df_instancia_vn_max, random_state=42)

# Keep the tabular result of each explanation for later processing.
breakdown_vn_df_max, shap_vn_df_max, lime_vn_df_max = (
    explicacion.result
    for explicacion in (breakdown_vn_max, shap_vn_max, lime_vn_max)
)

In [None]:
# Plot the Break Down explanation of the VN MAX instance.
breakdown_vn_max.plot()

In [None]:
# Plot the Shapley explanation of the VN MAX instance.
shap_vn_max.plot()

In [None]:
# Plot the surrogate (LIME) explanation of the VN MAX instance.
lime_vn_max.plot()

In [None]:
# Reduce the three raw explanation tables of the VN MAX instance to
# top-5 tables with the columns Variable / Ranking / Signo.

# --- Break Down: keep only the columns used below.
breakdown_vn_df_max = breakdown_vn_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): labels 0 and 26 are presumably the intercept and the final
# prediction rows of a 25-feature explanation -- confirm if the number of
# features changes.
breakdown_vn_df_max = breakdown_vn_df_max.drop(index=[0, 26])
breakdown_vn_df_max['sign'] = breakdown_vn_df_max['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
breakdown_vn_df_max = breakdown_vn_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- Shapley: same treatment.
shap_vn_df_max = shap_vn_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): the last 25 rows are presumably the per-feature aggregated
# contributions -- confirm against the explainer output.
shap_vn_df_max = shap_vn_df_max.tail(25)
shap_vn_df_max['sign'] = shap_vn_df_max['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_vn_df_max = shap_vn_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- LIME: extract the feature name from the textual rule.
# A rule may be "feature <= b" / "feature > b" (name is the first token) or
# a range "a < feature <= b" (name is the third token); taking the first
# token unconditionally would return the numeric lower bound for ranges.
reglas_lime = lime_vn_df_max["variable"].str.split(" ")
lime_vn_df_max["Variable"] = reglas_lime.str[2].where(reglas_lime.str[1] == "<", reglas_lime.str[0])
lime_vn_df_max["Signo"] = lime_vn_df_max["effect"].apply(evaluar_valor)
lime_vn_df_max = lime_vn_df_max.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_vn_df_max = lime_vn_df_max.drop(columns=['variable'])

# Add a ranking column by absolute contribution. method="min" keeps tied
# values on an integer rank; the default average rank would produce x.5
# values that astype(int) silently truncates.
breakdown_vn_df_max['Ranking'] = breakdown_vn_df_max['contribution'].abs().rank(method="min", ascending=False).astype(int)
breakdown_vn_df_max.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
breakdown_vn_df_max = breakdown_vn_df_max[['Variable', 'Ranking', 'contribution', 'Signo']]

shap_vn_df_max['Ranking'] = shap_vn_df_max['contribution'].abs().rank(method="min", ascending=False).astype(int)
shap_vn_df_max.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_vn_df_max = shap_vn_df_max[['Variable', 'Ranking', 'contribution', 'Signo']]

lime_vn_df_max['Ranking'] = lime_vn_df_max['effect'].abs().rank(method="min", ascending=False).astype(int)
lime_vn_df_max = lime_vn_df_max.head(5)
lime_vn_df_max = lime_vn_df_max[['Variable', 'Ranking', 'effect', 'Signo']]

# Drop the raw contribution columns; only rank and sign are reported.
breakdown_vn_df_max = breakdown_vn_df_max.drop(columns=['contribution'])
shap_vn_df_max = shap_vn_df_max.drop(columns=['contribution'])
lime_vn_df_max = lime_vn_df_max.drop(columns=['effect'])

# Keep the 5 largest contributions in absolute value (rows are already sorted).
breakdown_vn_df_max = breakdown_vn_df_max.head(5)
breakdown_vn_df_max = breakdown_vn_df_max.reset_index(drop=True)

shap_vn_df_max = shap_vn_df_max.head(5)
shap_vn_df_max = shap_vn_df_max.reset_index(drop=True)

# Range rules are parsed when "Variable" is built, so no positional
# name corrections are applied to individual rows here.
lime_vn_df_max = lime_vn_df_max.reset_index(drop=True)

print(breakdown_vn_df_max)
print(shap_vn_df_max)
print(lime_vn_df_max)

In [None]:
#lime_vn_max.show_in_notebook()

In [None]:
# Gather the variables that appear in the top-5 table of any technique.
breakdown_features = breakdown_vn_df_max['Variable'].tolist()
shapley_features = shap_vn_df_max['Variable'].tolist()
lime_features = lime_vn_df_max['Variable'].tolist()
all_features = list(set(breakdown_features + shapley_features + lime_features))
all_features.reverse()

# One row per variable; the technique columns come from columns_multi.
df_final_max = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_max['Variable'] = all_features
df_final_max = df_final_max[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for lista in (breakdown_features, shapley_features, lime_features, all_features):
    print(lista)

# Top-5 table of each technique, keyed by its column group in the final frame.
tablas_por_tecnica = {
    'Breakdown': breakdown_vn_df_max,
    'Shapley': shap_vn_df_max,
    'Lime': lime_vn_df_max,
}

for feature in df_final_max['Variable']:
    fila_destino = df_final_max['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # The variable is not in this technique's top 5: mark it with "-".
            valor_ranking, valor_signo = '-', '-'
        else:
            primera = coincidencias.iloc[0]
            valor_ranking, valor_signo = primera['Ranking'], primera['Signo']
        df_final_max.loc[fila_destino, (tecnica, 'Ranking')] = valor_ranking
        df_final_max.loc[fila_destino, (tecnica, 'Signo')] = valor_signo

df_final_max

In [None]:
# Use the variable names as row labels so the rows can be ordered by name.
df_final_max.set_index('Variable', inplace=True)
# Preferred presentation order for this instance.
nuevo_orden = ["commit_num", "parallel_changed_file_num", "duration", "file_added", "line_added", "developer_num", "file_removed", "bug_frequency"]

# Reorder the rows following nuevo_orden without losing information:
# names in the list that are not in the frame are skipped (a plain reindex
# would create all-NaN rows for them), and variables in the frame that are
# not in the list are kept at the end (a plain reindex would drop them).
orden_presente = [v for v in nuevo_orden if v in df_final_max.index]
orden_restante = [v for v in df_final_max.index if v not in nuevo_orden]
df_final_max = df_final_max.reindex(orden_presente + orden_restante)

df_final_max

### **Instancia VN MEDIANA:**

In [None]:
# Explain the VN MEDIANA instance with Break Down and Shapley (both through
# predict_parts) and with the surrogate explainer, all seeded with 42.
breakdown_vn_mediana, shap_vn_mediana = (
    exp.predict_parts(df_instancia_vn_mediana, type=tipo, random_state=42)
    for tipo in ("break_down", "shap")
)
lime_vn_mediana = exp.predict_surrogate(df_instancia_vn_mediana, random_state=42)

# Keep the tabular result of each explanation for later processing.
breakdown_vn_df_mediana, shap_vn_df_mediana, lime_vn_df_mediana = (
    explicacion.result
    for explicacion in (breakdown_vn_mediana, shap_vn_mediana, lime_vn_mediana)
)

In [None]:
# Plot the Break Down explanation of the VN MEDIANA instance.
breakdown_vn_mediana.plot()

In [None]:
# Plot the Shapley explanation of the VN MEDIANA instance.
shap_vn_mediana.plot()

In [None]:
# Plot the surrogate (LIME) explanation of the VN MEDIANA instance.
lime_vn_mediana.plot()

In [None]:
# Reduce the three raw explanation tables of the VN MEDIANA instance to
# top-5 tables with the columns Variable / Ranking / Signo.

# --- Break Down: keep only the columns used below.
breakdown_vn_df_mediana = breakdown_vn_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): labels 0 and 26 are presumably the intercept and the final
# prediction rows of a 25-feature explanation -- confirm if the number of
# features changes.
breakdown_vn_df_mediana = breakdown_vn_df_mediana.drop(index=[0, 26])
breakdown_vn_df_mediana['sign'] = breakdown_vn_df_mediana['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
breakdown_vn_df_mediana = breakdown_vn_df_mediana.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- Shapley: same treatment.
shap_vn_df_mediana = shap_vn_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): the last 25 rows are presumably the per-feature aggregated
# contributions -- confirm against the explainer output.
shap_vn_df_mediana = shap_vn_df_mediana.tail(25)
shap_vn_df_mediana['sign'] = shap_vn_df_mediana['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_vn_df_mediana = shap_vn_df_mediana.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- LIME: extract the feature name from the textual rule.
# A rule may be "feature <= b" / "feature > b" (name is the first token) or
# a range "a < feature <= b" (name is the third token); taking the first
# token unconditionally would return the numeric lower bound for ranges.
reglas_lime = lime_vn_df_mediana["variable"].str.split(" ")
lime_vn_df_mediana["Variable"] = reglas_lime.str[2].where(reglas_lime.str[1] == "<", reglas_lime.str[0])
lime_vn_df_mediana["Signo"] = lime_vn_df_mediana["effect"].apply(evaluar_valor)
lime_vn_df_mediana = lime_vn_df_mediana.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_vn_df_mediana = lime_vn_df_mediana.drop(columns=['variable'])

# Add a ranking column by absolute contribution. method="min" keeps tied
# values on an integer rank; the default average rank would produce x.5
# values that astype(int) silently truncates.
breakdown_vn_df_mediana['Ranking'] = breakdown_vn_df_mediana['contribution'].abs().rank(method="min", ascending=False).astype(int)
breakdown_vn_df_mediana.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
breakdown_vn_df_mediana = breakdown_vn_df_mediana[['Variable', 'Ranking', 'contribution', 'Signo']]

shap_vn_df_mediana['Ranking'] = shap_vn_df_mediana['contribution'].abs().rank(method="min", ascending=False).astype(int)
shap_vn_df_mediana.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_vn_df_mediana = shap_vn_df_mediana[['Variable', 'Ranking', 'contribution', 'Signo']]

lime_vn_df_mediana['Ranking'] = lime_vn_df_mediana['effect'].abs().rank(method="min", ascending=False).astype(int)
lime_vn_df_mediana = lime_vn_df_mediana.head(5)
lime_vn_df_mediana = lime_vn_df_mediana[['Variable', 'Ranking', 'effect', 'Signo']]

# Drop the raw contribution columns; only rank and sign are reported.
breakdown_vn_df_mediana = breakdown_vn_df_mediana.drop(columns=['contribution'])
shap_vn_df_mediana = shap_vn_df_mediana.drop(columns=['contribution'])
lime_vn_df_mediana = lime_vn_df_mediana.drop(columns=['effect'])

# Keep the 5 largest contributions in absolute value (rows are already sorted).
breakdown_vn_df_mediana = breakdown_vn_df_mediana.head(5)
breakdown_vn_df_mediana = breakdown_vn_df_mediana.reset_index(drop=True)

shap_vn_df_mediana = shap_vn_df_mediana.head(5)
shap_vn_df_mediana = shap_vn_df_mediana.reset_index(drop=True)

# Range rules are parsed when "Variable" is built, so no positional
# name corrections are applied to individual rows here.
lime_vn_df_mediana = lime_vn_df_mediana.reset_index(drop=True)

print(breakdown_vn_df_mediana)
print(shap_vn_df_mediana)
print(lime_vn_df_mediana)

In [None]:
#lime_vn_mediana.show_in_notebook()

In [None]:
# Gather the variables that appear in the top-5 table of any technique.
breakdown_features = breakdown_vn_df_mediana['Variable'].tolist()
shapley_features = shap_vn_df_mediana['Variable'].tolist()
lime_features = lime_vn_df_mediana['Variable'].tolist()
all_features = list(set(breakdown_features + shapley_features + lime_features))
all_features.reverse()

# One row per variable; the technique columns come from columns_multi.
df_final_mediana = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_mediana['Variable'] = all_features
df_final_mediana = df_final_mediana[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for lista in (breakdown_features, shapley_features, lime_features, all_features):
    print(lista)

# Top-5 table of each technique, keyed by its column group in the final frame.
tablas_por_tecnica = {
    'Breakdown': breakdown_vn_df_mediana,
    'Shapley': shap_vn_df_mediana,
    'Lime': lime_vn_df_mediana,
}

for feature in df_final_mediana['Variable']:
    fila_destino = df_final_mediana['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # The variable is not in this technique's top 5: mark it with "-".
            valor_ranking, valor_signo = '-', '-'
        else:
            primera = coincidencias.iloc[0]
            valor_ranking, valor_signo = primera['Ranking'], primera['Signo']
        df_final_mediana.loc[fila_destino, (tecnica, 'Ranking')] = valor_ranking
        df_final_mediana.loc[fila_destino, (tecnica, 'Signo')] = valor_signo

df_final_mediana

In [None]:
# Use the variable names as row labels so the rows can be ordered by name.
df_final_mediana.set_index('Variable', inplace=True)
# Preferred presentation order for this instance.
nuevo_orden = ["parallel_changed_file_num", "developer_num", "commit_num", "file_removed", "messages_median", "file_added"]

# Reorder the rows following nuevo_orden without losing information:
# names in the list that are not in the frame are skipped (a plain reindex
# would create all-NaN rows for them), and variables in the frame that are
# not in the list are kept at the end (a plain reindex would drop them).
orden_presente = [v for v in nuevo_orden if v in df_final_mediana.index]
orden_restante = [v for v in df_final_mediana.index if v not in nuevo_orden]
df_final_mediana = df_final_mediana.reindex(orden_presente + orden_restante)

df_final_mediana

### **Instancia VN MIN:**

In [None]:
# Explain the VN MIN instance with Break Down and Shapley (both through
# predict_parts) and with the surrogate explainer, all seeded with 42.
breakdown_vn_min, shap_vn_min = (
    exp.predict_parts(df_instancia_vn_min, type=tipo, random_state=42)
    for tipo in ("break_down", "shap")
)
lime_vn_min = exp.predict_surrogate(df_instancia_vn_min, random_state=42)

# Keep the tabular result of each explanation for later processing.
breakdown_vn_df_min, shap_vn_df_min, lime_vn_df_min = (
    explicacion.result
    for explicacion in (breakdown_vn_min, shap_vn_min, lime_vn_min)
)

In [None]:
# Plot the Break Down explanation of the VN MIN instance.
breakdown_vn_min.plot()

In [None]:
# Plot the Shapley explanation of the VN MIN instance.
shap_vn_min.plot()

In [None]:
# Plot the surrogate (LIME) explanation of the VN MIN instance.
lime_vn_min.plot()

In [None]:
# Reduce the three raw explanation tables of the VN MIN instance to
# top-5 tables with the columns Variable / Ranking / Signo.

# --- Break Down: keep only the columns used below.
breakdown_vn_df_min = breakdown_vn_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): labels 0 and 26 are presumably the intercept and the final
# prediction rows of a 25-feature explanation -- confirm if the number of
# features changes.
breakdown_vn_df_min = breakdown_vn_df_min.drop(index=[0, 26])
breakdown_vn_df_min['sign'] = breakdown_vn_df_min['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
breakdown_vn_df_min = breakdown_vn_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- Shapley: same treatment.
shap_vn_df_min = shap_vn_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): the last 25 rows are presumably the per-feature aggregated
# contributions -- confirm against the explainer output.
shap_vn_df_min = shap_vn_df_min.tail(25)
shap_vn_df_min['sign'] = shap_vn_df_min['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_vn_df_min = shap_vn_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- LIME: extract the feature name from the textual rule.
# A rule may be "feature <= b" / "feature > b" (name is the first token) or
# a range "a < feature <= b" (name is the third token); taking the first
# token unconditionally would return the numeric lower bound for ranges.
reglas_lime = lime_vn_df_min["variable"].str.split(" ")
lime_vn_df_min["Variable"] = reglas_lime.str[2].where(reglas_lime.str[1] == "<", reglas_lime.str[0])
lime_vn_df_min["Signo"] = lime_vn_df_min["effect"].apply(evaluar_valor)
lime_vn_df_min = lime_vn_df_min.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_vn_df_min = lime_vn_df_min.drop(columns=['variable'])

# Add a ranking column by absolute contribution. method="min" keeps tied
# values on an integer rank; the default average rank would produce x.5
# values that astype(int) silently truncates.
breakdown_vn_df_min['Ranking'] = breakdown_vn_df_min['contribution'].abs().rank(method="min", ascending=False).astype(int)
breakdown_vn_df_min.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
breakdown_vn_df_min = breakdown_vn_df_min[['Variable', 'Ranking', 'contribution', 'Signo']]

shap_vn_df_min['Ranking'] = shap_vn_df_min['contribution'].abs().rank(method="min", ascending=False).astype(int)
shap_vn_df_min.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_vn_df_min = shap_vn_df_min[['Variable', 'Ranking', 'contribution', 'Signo']]

lime_vn_df_min['Ranking'] = lime_vn_df_min['effect'].abs().rank(method="min", ascending=False).astype(int)
lime_vn_df_min = lime_vn_df_min.head(5)
lime_vn_df_min = lime_vn_df_min[['Variable', 'Ranking', 'effect', 'Signo']]

# Drop the raw contribution columns; only rank and sign are reported.
breakdown_vn_df_min = breakdown_vn_df_min.drop(columns=['contribution'])
shap_vn_df_min = shap_vn_df_min.drop(columns=['contribution'])
lime_vn_df_min = lime_vn_df_min.drop(columns=['effect'])

# Keep the 5 largest contributions in absolute value (rows are already sorted).
breakdown_vn_df_min = breakdown_vn_df_min.head(5)
breakdown_vn_df_min = breakdown_vn_df_min.reset_index(drop=True)

shap_vn_df_min = shap_vn_df_min.head(5)
shap_vn_df_min = shap_vn_df_min.reset_index(drop=True)

lime_vn_df_min = lime_vn_df_min.reset_index(drop=True)

print(breakdown_vn_df_min)
print(shap_vn_df_min)
print(lime_vn_df_min)

In [None]:
#lime_vn_min.show_in_notebook()

In [None]:
# Build the comparison table for the VN MIN instance: one row per feature that
# appears in the top 5 of at least one technique, holding the Ranking and Signo
# each technique gives it ("-" when the feature is not in that technique's top 5).
top5_by_technique = {
    'Breakdown': breakdown_vn_df_min,
    'Shapley': shap_vn_df_min,
    'Lime': lime_vn_df_min,
}

breakdown_features = list(breakdown_vn_df_min['Variable'])
shapley_features = list(shap_vn_df_min['Variable'])
lime_features = list(lime_vn_df_min['Variable'])
# dict.fromkeys removes duplicates but keeps first-appearance order, so the row
# order is identical on every run (a set would make it depend on string hashing).
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

df_final_min = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_min['Variable'] = list(all_features)
df_final_min= df_final_min[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

for feature in df_final_min['Variable']:
    target_row = df_final_min['Variable'] == feature
    for technique, top5 in top5_by_technique.items():
        matches = top5[top5['Variable'] == feature]
        if matches.empty:
            # Feature is not among this technique's top 5.
            ranking, signo = '-', '-'
        else:
            first_match = matches.iloc[0]
            ranking, signo = first_match['Ranking'], first_match['Signo']
        df_final_min.loc[target_row, (technique, 'Ranking')] = ranking
        df_final_min.loc[target_row, (technique, 'Signo')] = signo

df_final_min

In [None]:
# Index the table by feature name and show the rows in the curated order below.
df_final_min.set_index('Variable', inplace=True)
nuevo_orden = ["parallel_changed_file_num", "developer_num", "messages_min", "line_added", "file_modified", "messages_max", "file_removed", "file_added"]

# Reorder defensively. With a plain reindex, a name listed in `nuevo_orden` but
# absent from the table becomes an all-NaN row (which later breaks the
# int(ranking) conversion of the general summary), and a table feature missing
# from `nuevo_orden` is dropped silently. Listed features that exist keep the
# curated order; unlisted ones are appended at the end.
listed_present = [v for v in nuevo_orden if v in df_final_min.index]
unlisted = [v for v in df_final_min.index if v not in nuevo_orden]
absent = [v for v in nuevo_orden if v not in df_final_min.index]
if unlisted or absent:
    print(f"Aviso: 'nuevo_orden' no coincide con la tabla; sin listar: {unlisted}, ausentes: {absent}")
df_final_min = df_final_min.reindex(listed_present + unlisted)

df_final_min

### **VN General:**

In [None]:
# Aggregate the three VN instance tables (max, median, min) per technique:
#   ranking_valores   -> list of (feature, ranking) pairs
#   ranking_medio     -> mean ranking per feature
#   apariciones_count -> number of top-5 appearances per feature
instance_tables = (df_final_max, df_final_mediana, df_final_min)

ranking_valores = {
    'Breakdown': [],
    'Shapley': [],
    'Lime': []
}

# Collect the (feature, ranking) pairs of every instance table, technique by technique
for tecnica in ranking_valores:
    for table in instance_tables:
        for caracteristica, ranking in table[(tecnica, 'Ranking')].items():
            # "-" marks a feature outside this technique's top 5; NaN can appear
            # when a table was reindexed with a feature it does not contain.
            # Neither can be converted with int(), so both are skipped.
            if ranking != "-" and not pd.isna(ranking):
                ranking_valores[tecnica].append((caracteristica, int(ranking)))

# Mean ranking and appearance count per technique and feature
ranking_medio = {}
apariciones_count = {}
for tecnica, pairs in ranking_valores.items():
    rankings_by_feature = {}
    for caracteristica, ranking in pairs:
        rankings_by_feature.setdefault(caracteristica, []).append(ranking)
    ranking_medio[tecnica] = {c: sum(values) / len(values) for c, values in rankings_by_feature.items()}
    apariciones_count[tecnica] = {c: len(values) for c, values in rankings_by_feature.items()}

# Raw pairs, appearance counts and mean rankings per technique
print(ranking_valores)
print(apariciones_count)
print(ranking_medio)

In [None]:
# Build the VN summary frame: one row per feature and, for every technique, its
# mean ranking and its appearance count ("-" where the technique never ranked it).
techniques = ['Breakdown', 'Shapley', 'Lime']

# All features seen by any technique, in first-appearance order. dict.fromkeys
# keeps the order reproducible between runs, unlike a set.
all_caract = list(dict.fromkeys(c for technique in techniques for c in ranking_medio[technique]))

# Column data keyed by (technique, sub-column); tuple keys become a column MultiIndex
data_dict = {}
for technique in techniques:
    data_dict[(technique, "Ranking Medio")] = [ranking_medio[technique].get(c, "-") for c in all_caract]
    data_dict[(technique, "Conteo")] = [apariciones_count[technique].get(c, "-") for c in all_caract]

# Create the summary DataFrame
df_resumen_rf_vn = pd.DataFrame(data_dict, index=all_caract)

df_resumen_rf_vn

In [None]:
# Add the "General" columns: the mean of the per-technique mean rankings and the
# total number of appearances. The "-" placeholders are converted explicitly
# with pd.to_numeric(errors="coerce") instead of relying on replace() to
# down-cast the object columns implicitly.
ranking_columns = [(tech, "Ranking Medio") for tech in techniques]
conteo_columns = [(tech, "Conteo") for tech in techniques]

# Mean ranking over the techniques that ranked the feature ("-" -> NaN, ignored by mean)
df_resumen_rf_vn[("General", "Ranking")] = df_resumen_rf_vn[ranking_columns].apply(pd.to_numeric, errors="coerce").mean(axis=1)

# Total appearances ("-" counts as 0)
df_resumen_rf_vn[("General", "Conteo Total")] = df_resumen_rf_vn[conteo_columns].apply(pd.to_numeric, errors="coerce").fillna(0).sum(axis=1).astype(int)

# Show the updated summary DataFrame
df_resumen_rf_vn

In [None]:
# Replace the general mean ranking by a final ranking based on a combined score:
# one weight from the position of the mean ranking (lower is better) and one
# from the position of the total count (higher is better).
num_caract = len(df_resumen_rf_vn.index)

ranking_general = df_resumen_rf_vn[("General", "Ranking")]
conteo_total = df_resumen_rf_vn[("General", "Conteo Total")]

# The weights are kept as local Series; only the final ranking is stored in the frame
peso_rango = 1 - ((ranking_general.rank(ascending=True) - 1) / num_caract)
peso_conteo = conteo_total.rank(ascending=True) / num_caract

# Final score is the sum of both weights; the highest score gets rank 1 (ties share the lowest rank)
puntaje = peso_rango + peso_conteo
df_resumen_rf_vn[("General", "Ranking")] = puntaje.rank(ascending=False, method="min")

df_resumen_rf_vn

In [None]:
# Move the two "General" columns to the front, keeping the per-technique
# columns in their current order.
columns = df_resumen_rf_vn.columns

# Boolean mask over the columns: True where the top level is "General"
general_columns = columns.get_level_values(0) == "General"

# Per-technique columns (everything that is not "General")
technique_subcolumns = [col for col, is_general in zip(columns, general_columns) if not is_general]

# "General" columns in the order they should be displayed
general_subcolumns = [("General", sub) for sub in ("Ranking", "Conteo Total")]

new_columns = [*general_subcolumns, *technique_subcolumns]

# New DataFrame with the reordered columns
df_resumen_rf_vn = df_resumen_rf_vn[new_columns]

In [None]:
# Sort by the final general ranking (best first) and print the mean rankings
# with two decimals; non-numeric placeholders are left untouched.
def _format_mean_ranking(value):
    if not isinstance(value, (int, float)):
        return value
    return f"{value:.2f}"

df_resumen_rf_vn.sort_values(by=("General", "Ranking"), ascending=True, inplace=True)

# Format the values
for tech in techniques:
    mean_column = (tech, "Ranking Medio")
    df_resumen_rf_vn[mean_column] = df_resumen_rf_vn[mean_column].apply(_format_mean_ranking)

df_resumen_rf_vn

### **Instancia FP MAX:**

In [None]:
# Explain the FP MAX instance three ways (all with random_state=42) and keep
# each explanation's result table next to the explanation object.

# Break Down attribution
breakdown_fp_max = exp.predict_parts(df_instancia_fp_max, type="break_down",random_state=42)
breakdown_fp_df_max = breakdown_fp_max.result

# Shapley-value attribution
shap_fp_max = exp.predict_parts(df_instancia_fp_max, type="shap",random_state=42)
shap_fp_df_max = shap_fp_max.result

# Surrogate explanation (referred to as LIME throughout this notebook)
lime_fp_max = exp.predict_surrogate(df_instancia_fp_max, random_state=42)
lime_fp_df_max=lime_fp_max.result

In [None]:
# Plot the Break Down explanation of the FP MAX instance.
breakdown_fp_max.plot()

In [None]:
# Plot the Shapley explanation of the FP MAX instance.
shap_fp_max.plot()

In [None]:
# Plot the surrogate (LIME) explanation of the FP MAX instance.
lime_fp_max.plot()

In [None]:
# Reduce the Break Down, Shapley and LIME explanations of the FP MAX instance to
# comparable top-5 tables with the columns Variable / Ranking / Signo.
# Every table is rebuilt from the explanation's own `.result` (working on a
# copy), so the cell can be re-run and the explanation objects stay untouched.
sign_labels = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}
column_names = {'sign': 'Signo', 'variable_name': 'Variable'}

# --- Break Down ---
breakdown_fp_df_max = breakdown_fp_max.result.loc[:, ['variable_name', 'contribution', 'sign']]
# Index labels 0 and 26 are discarded (presumably the intercept and prediction
# rows of a 25-feature explanation - TODO confirm).
breakdown_fp_df_max = breakdown_fp_df_max.drop(index=[0, 26])
breakdown_fp_df_max['sign'] = breakdown_fp_df_max['sign'].replace(sign_labels)
breakdown_fp_df_max = breakdown_fp_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
# method='min' gives whole-number ranks on ties; the default 'average' yields
# fractional ranks that astype(int) would truncate.
breakdown_fp_df_max['Ranking'] = breakdown_fp_df_max['contribution'].abs().rank(ascending=False, method='min').astype(int)
breakdown_fp_df_max = breakdown_fp_df_max.rename(columns=column_names)
# Keep the 5 largest contributions in absolute value, without the contribution column
breakdown_fp_df_max = breakdown_fp_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# --- Shapley ---
# Only the last 25 rows of the result are used.
shap_fp_df_max = shap_fp_max.result.loc[:, ['variable_name', 'contribution', 'sign']].tail(25).copy()
shap_fp_df_max['sign'] = shap_fp_df_max['sign'].replace(sign_labels)
shap_fp_df_max = shap_fp_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
shap_fp_df_max['Ranking'] = shap_fp_df_max['contribution'].abs().rank(ascending=False, method='min').astype(int)
shap_fp_df_max = shap_fp_df_max.rename(columns=column_names)
shap_fp_df_max = shap_fp_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# --- LIME ---
lime_fp_df_max = lime_fp_max.result.copy()
# The description is either "feature <op> bound" or "bound < feature <= bound".
# The optional prefix skips a leading "bound < ", so the captured token is the
# feature name in both forms (taking the first token would return the bound).
lime_fp_df_max["Variable"] = lime_fp_df_max["variable"].str.extract(r"^(?:\S+\s+<\s+)?(\S+)", expand=False)
lime_fp_df_max["Signo"] = lime_fp_df_max["effect"].apply(evaluar_valor)
lime_fp_df_max = lime_fp_df_max.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_fp_df_max['Ranking'] = lime_fp_df_max['effect'].abs().rank(ascending=False, method='min').astype(int)
lime_fp_df_max = lime_fp_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_fp_df_max)
print(shap_fp_df_max)
print(lime_fp_df_max)

In [None]:
#lime_fp_max.show_in_notebook()

In [None]:
# Build the comparison table for the FP MAX instance: one row per feature that
# appears in the top 5 of at least one technique, holding the Ranking and Signo
# each technique gives it ("-" when the feature is not in that technique's top 5).
top5_by_technique = {
    'Breakdown': breakdown_fp_df_max,
    'Shapley': shap_fp_df_max,
    'Lime': lime_fp_df_max,
}

breakdown_features = list(breakdown_fp_df_max['Variable'])
shapley_features = list(shap_fp_df_max['Variable'])
lime_features = list(lime_fp_df_max['Variable'])
# dict.fromkeys removes duplicates but keeps first-appearance order, so the row
# order is identical on every run (a set would make it depend on string hashing).
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

df_final_max = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_max['Variable'] = list(all_features)
df_final_max= df_final_max[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

for feature in df_final_max['Variable']:
    target_row = df_final_max['Variable'] == feature
    for technique, top5 in top5_by_technique.items():
        matches = top5[top5['Variable'] == feature]
        if matches.empty:
            # Feature is not among this technique's top 5.
            ranking, signo = '-', '-'
        else:
            first_match = matches.iloc[0]
            ranking, signo = first_match['Ranking'], first_match['Signo']
        df_final_max.loc[target_row, (technique, 'Ranking')] = ranking
        df_final_max.loc[target_row, (technique, 'Signo')] = signo

df_final_max

In [None]:
# Index the table by feature name and show the rows in the curated order below.
df_final_max.set_index('Variable', inplace=True)
nuevo_orden = ["parallel_changed_file_num", "commit_num", "file_added", "developer_num", "file_modified", "line_removed", "file_removed"]

# Reorder defensively. With a plain reindex, a name listed in `nuevo_orden` but
# absent from the table becomes an all-NaN row (which later breaks the
# int(ranking) conversion of the general summary), and a table feature missing
# from `nuevo_orden` is dropped silently. Listed features that exist keep the
# curated order; unlisted ones are appended at the end.
listed_present = [v for v in nuevo_orden if v in df_final_max.index]
unlisted = [v for v in df_final_max.index if v not in nuevo_orden]
absent = [v for v in nuevo_orden if v not in df_final_max.index]
if unlisted or absent:
    print(f"Aviso: 'nuevo_orden' no coincide con la tabla; sin listar: {unlisted}, ausentes: {absent}")
df_final_max = df_final_max.reindex(listed_present + unlisted)

df_final_max

### **Instancia FP MEDIANA:**

In [None]:
# Explain the FP MEDIANA instance three ways (all with random_state=42) and keep
# each explanation's result table next to the explanation object.

# Break Down attribution
breakdown_fp_mediana = exp.predict_parts(df_instancia_fp_mediana, type="break_down",random_state=42)
breakdown_fp_df_mediana = breakdown_fp_mediana.result

# Shapley-value attribution
shap_fp_mediana = exp.predict_parts(df_instancia_fp_mediana, type="shap",random_state=42)
shap_fp_df_mediana = shap_fp_mediana.result

# Surrogate explanation (referred to as LIME throughout this notebook)
lime_fp_mediana = exp.predict_surrogate(df_instancia_fp_mediana, random_state=42)
lime_fp_df_mediana=lime_fp_mediana.result

In [None]:
# Plot the Break Down explanation of the FP MEDIANA instance.
breakdown_fp_mediana.plot()

In [None]:
# Plot the Shapley explanation of the FP MEDIANA instance.
shap_fp_mediana.plot()

In [None]:
# Plot the surrogate (LIME) explanation of the FP MEDIANA instance.
lime_fp_mediana.plot()

In [None]:
# Reduce the Break Down, Shapley and LIME explanations of the FP MEDIANA instance
# to comparable top-5 tables with the columns Variable / Ranking / Signo.
# Every table is rebuilt from the explanation's own `.result` (working on a
# copy), so the cell can be re-run and the explanation objects stay untouched.
sign_labels = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}
column_names = {'sign': 'Signo', 'variable_name': 'Variable'}

# --- Break Down ---
breakdown_fp_df_mediana = breakdown_fp_mediana.result.loc[:, ['variable_name', 'contribution', 'sign']]
# Index labels 0 and 26 are discarded (presumably the intercept and prediction
# rows of a 25-feature explanation - TODO confirm).
breakdown_fp_df_mediana = breakdown_fp_df_mediana.drop(index=[0, 26])
breakdown_fp_df_mediana['sign'] = breakdown_fp_df_mediana['sign'].replace(sign_labels)
breakdown_fp_df_mediana = breakdown_fp_df_mediana.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
# method='min' gives whole-number ranks on ties; the default 'average' yields
# fractional ranks that astype(int) would truncate.
breakdown_fp_df_mediana['Ranking'] = breakdown_fp_df_mediana['contribution'].abs().rank(ascending=False, method='min').astype(int)
breakdown_fp_df_mediana = breakdown_fp_df_mediana.rename(columns=column_names)
# Keep the 5 largest contributions in absolute value, without the contribution column
breakdown_fp_df_mediana = breakdown_fp_df_mediana[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# --- Shapley ---
# Only the last 25 rows of the result are used.
shap_fp_df_mediana = shap_fp_mediana.result.loc[:, ['variable_name', 'contribution', 'sign']].tail(25).copy()
shap_fp_df_mediana['sign'] = shap_fp_df_mediana['sign'].replace(sign_labels)
shap_fp_df_mediana = shap_fp_df_mediana.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
shap_fp_df_mediana['Ranking'] = shap_fp_df_mediana['contribution'].abs().rank(ascending=False, method='min').astype(int)
shap_fp_df_mediana = shap_fp_df_mediana.rename(columns=column_names)
shap_fp_df_mediana = shap_fp_df_mediana[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# --- LIME ---
lime_fp_df_mediana = lime_fp_mediana.result.copy()
# The description is either "feature <op> bound" or "bound < feature <= bound".
# The optional prefix skips a leading "bound < ", so the captured token is the
# feature name in both forms (taking the first token would return the bound).
lime_fp_df_mediana["Variable"] = lime_fp_df_mediana["variable"].str.extract(r"^(?:\S+\s+<\s+)?(\S+)", expand=False)
lime_fp_df_mediana["Signo"] = lime_fp_df_mediana["effect"].apply(evaluar_valor)
lime_fp_df_mediana = lime_fp_df_mediana.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_fp_df_mediana['Ranking'] = lime_fp_df_mediana['effect'].abs().rank(ascending=False, method='min').astype(int)
lime_fp_df_mediana = lime_fp_df_mediana[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_fp_df_mediana)
print(shap_fp_df_mediana)
print(lime_fp_df_mediana)

In [None]:
#lime_fp_mediana.show_in_notebook()

In [None]:
# Build the comparison table for the FP MEDIANA instance: one row per feature
# that appears in the top 5 of at least one technique, holding the Ranking and
# Signo each technique gives it ("-" when the feature is not in that top 5).
top5_by_technique = {
    'Breakdown': breakdown_fp_df_mediana,
    'Shapley': shap_fp_df_mediana,
    'Lime': lime_fp_df_mediana,
}

breakdown_features = list(breakdown_fp_df_mediana['Variable'])
shapley_features = list(shap_fp_df_mediana['Variable'])
lime_features = list(lime_fp_df_mediana['Variable'])
# dict.fromkeys removes duplicates but keeps first-appearance order, so the row
# order is identical on every run (a set would make it depend on string hashing).
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

df_final_mediana = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_mediana['Variable'] = list(all_features)
df_final_mediana= df_final_mediana[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

for feature in df_final_mediana['Variable']:
    target_row = df_final_mediana['Variable'] == feature
    for technique, top5 in top5_by_technique.items():
        matches = top5[top5['Variable'] == feature]
        if matches.empty:
            # Feature is not among this technique's top 5.
            ranking, signo = '-', '-'
        else:
            first_match = matches.iloc[0]
            ranking, signo = first_match['Ranking'], first_match['Signo']
        df_final_mediana.loc[target_row, (technique, 'Ranking')] = ranking
        df_final_mediana.loc[target_row, (technique, 'Signo')] = signo

df_final_mediana

In [None]:
# Index the table by feature name and show the rows in the curated order below.
df_final_mediana.set_index('Variable', inplace=True)
nuevo_orden = ["parallel_changed_file_num", "commit_num", "developer_num", "file_removed", "line_removed",  "file_added", "bug_frequency"]

# Reorder defensively. With a plain reindex, a name listed in `nuevo_orden` but
# absent from the table becomes an all-NaN row (which later breaks the
# int(ranking) conversion of the general summary), and a table feature missing
# from `nuevo_orden` is dropped silently. Listed features that exist keep the
# curated order; unlisted ones are appended at the end.
listed_present = [v for v in nuevo_orden if v in df_final_mediana.index]
unlisted = [v for v in df_final_mediana.index if v not in nuevo_orden]
absent = [v for v in nuevo_orden if v not in df_final_mediana.index]
if unlisted or absent:
    print(f"Aviso: 'nuevo_orden' no coincide con la tabla; sin listar: {unlisted}, ausentes: {absent}")
df_final_mediana = df_final_mediana.reindex(listed_present + unlisted)

df_final_mediana

### **Instancia FP MIN:**

In [None]:
# Explain the FP MIN instance three ways (all with random_state=42) and keep
# each explanation's result table next to the explanation object.

# Break Down attribution
breakdown_fp_min = exp.predict_parts(df_instancia_fp_min, type="break_down",random_state=42)
breakdown_fp_df_min = breakdown_fp_min.result

# Shapley-value attribution
shap_fp_min = exp.predict_parts(df_instancia_fp_min, type="shap",random_state=42)
shap_fp_df_min = shap_fp_min.result

# Surrogate explanation (referred to as LIME throughout this notebook)
lime_fp_min = exp.predict_surrogate(df_instancia_fp_min, random_state=42)
lime_fp_df_min = lime_fp_min.result

In [None]:
# Plot the Break Down explanation of the FP MIN instance.
breakdown_fp_min.plot()

In [None]:
# Plot the Shapley explanation of the FP MIN instance.
shap_fp_min.plot()

In [None]:
# Plot the surrogate (LIME) explanation of the FP MIN instance.
lime_fp_min.plot()

In [None]:
# Reduce the Break Down, Shapley and LIME explanations of the FP MIN instance to
# comparable top-5 tables with the columns Variable / Ranking / Signo.
# Every table is rebuilt from the explanation's own `.result` (working on a
# copy), so the cell can be re-run and the explanation objects stay untouched.
sign_labels = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}
column_names = {'sign': 'Signo', 'variable_name': 'Variable'}

# --- Break Down ---
breakdown_fp_df_min = breakdown_fp_min.result.loc[:, ['variable_name', 'contribution', 'sign']]
# Index labels 0 and 26 are discarded (presumably the intercept and prediction
# rows of a 25-feature explanation - TODO confirm).
breakdown_fp_df_min = breakdown_fp_df_min.drop(index=[0, 26])
breakdown_fp_df_min['sign'] = breakdown_fp_df_min['sign'].replace(sign_labels)
breakdown_fp_df_min = breakdown_fp_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
# method='min' gives whole-number ranks on ties; the default 'average' yields
# fractional ranks that astype(int) would truncate.
breakdown_fp_df_min['Ranking'] = breakdown_fp_df_min['contribution'].abs().rank(ascending=False, method='min').astype(int)
breakdown_fp_df_min = breakdown_fp_df_min.rename(columns=column_names)
# Keep the 5 largest contributions in absolute value, without the contribution column
breakdown_fp_df_min = breakdown_fp_df_min[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# --- Shapley ---
# Only the last 25 rows of the result are used.
shap_fp_df_min = shap_fp_min.result.loc[:, ['variable_name', 'contribution', 'sign']].tail(25).copy()
shap_fp_df_min['sign'] = shap_fp_df_min['sign'].replace(sign_labels)
shap_fp_df_min = shap_fp_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
shap_fp_df_min['Ranking'] = shap_fp_df_min['contribution'].abs().rank(ascending=False, method='min').astype(int)
shap_fp_df_min = shap_fp_df_min.rename(columns=column_names)
shap_fp_df_min = shap_fp_df_min[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# --- LIME ---
lime_fp_df_min = lime_fp_min.result.copy()
# The description is either "feature <op> bound" or "bound < feature <= bound".
# The optional prefix skips a leading "bound < ", so the captured token is the
# feature name in both forms (taking the first token would return the bound).
lime_fp_df_min["Variable"] = lime_fp_df_min["variable"].str.extract(r"^(?:\S+\s+<\s+)?(\S+)", expand=False)
lime_fp_df_min["Signo"] = lime_fp_df_min["effect"].apply(evaluar_valor)
lime_fp_df_min = lime_fp_df_min.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_fp_df_min['Ranking'] = lime_fp_df_min['effect'].abs().rank(ascending=False, method='min').astype(int)
lime_fp_df_min = lime_fp_df_min[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_fp_df_min)
print(shap_fp_df_min)
print(lime_fp_df_min)

In [None]:
#lime_fp_min.show_in_notebook()

In [None]:
# Build the comparison table for the FP MIN instance: one row per feature that
# appears in the top 5 of at least one technique, holding the Ranking and Signo
# each technique gives it ("-" when the feature is not in that technique's top 5).
top5_by_technique = {
    'Breakdown': breakdown_fp_df_min,
    'Shapley': shap_fp_df_min,
    'Lime': lime_fp_df_min,
}

breakdown_features = list(breakdown_fp_df_min['Variable'])
shapley_features = list(shap_fp_df_min['Variable'])
lime_features = list(lime_fp_df_min['Variable'])
# dict.fromkeys removes duplicates but keeps first-appearance order, so the row
# order is identical on every run (a set would make it depend on string hashing).
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

df_final_min = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_min['Variable'] = list(all_features)
df_final_min= df_final_min[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

for feature in df_final_min['Variable']:
    target_row = df_final_min['Variable'] == feature
    for technique, top5 in top5_by_technique.items():
        matches = top5[top5['Variable'] == feature]
        if matches.empty:
            # Feature is not among this technique's top 5.
            ranking, signo = '-', '-'
        else:
            first_match = matches.iloc[0]
            ranking, signo = first_match['Ranking'], first_match['Signo']
        df_final_min.loc[target_row, (technique, 'Ranking')] = ranking
        df_final_min.loc[target_row, (technique, 'Signo')] = signo

df_final_min

In [None]:
# Index the table by feature name and show the rows in the curated order below.
df_final_min.set_index('Variable', inplace=True)
nuevo_orden = ["parallel_changed_file_num", "developer_num", "commit_num", "file_modified",  "file_removed", "file_added", "line_removed"]

# Reorder defensively. With a plain reindex, a name listed in `nuevo_orden` but
# absent from the table becomes an all-NaN row (which later breaks the
# int(ranking) conversion of the general summary), and a table feature missing
# from `nuevo_orden` is dropped silently. Listed features that exist keep the
# curated order; unlisted ones are appended at the end.
listed_present = [v for v in nuevo_orden if v in df_final_min.index]
unlisted = [v for v in df_final_min.index if v not in nuevo_orden]
absent = [v for v in nuevo_orden if v not in df_final_min.index]
if unlisted or absent:
    print(f"Aviso: 'nuevo_orden' no coincide con la tabla; sin listar: {unlisted}, ausentes: {absent}")
df_final_min = df_final_min.reindex(listed_present + unlisted)

df_final_min

### **FP General:**

In [None]:
# Aggregate the three FP instance tables (max, median, min) per technique:
#   ranking_valores   -> list of (feature, ranking) pairs
#   ranking_medio     -> mean ranking per feature
#   apariciones_count -> number of top-5 appearances per feature
instance_tables = (df_final_max, df_final_mediana, df_final_min)

ranking_valores = {
    'Breakdown': [],
    'Shapley': [],
    'Lime': []
}

# Collect the (feature, ranking) pairs of every instance table, technique by technique
for tecnica in ranking_valores:
    for table in instance_tables:
        for caracteristica, ranking in table[(tecnica, 'Ranking')].items():
            # "-" marks a feature outside this technique's top 5; NaN can appear
            # when a table was reindexed with a feature it does not contain.
            # Neither can be converted with int(), so both are skipped.
            if ranking != "-" and not pd.isna(ranking):
                ranking_valores[tecnica].append((caracteristica, int(ranking)))

# Mean ranking and appearance count per technique and feature
ranking_medio = {}
apariciones_count = {}
for tecnica, pairs in ranking_valores.items():
    rankings_by_feature = {}
    for caracteristica, ranking in pairs:
        rankings_by_feature.setdefault(caracteristica, []).append(ranking)
    ranking_medio[tecnica] = {c: sum(values) / len(values) for c, values in rankings_by_feature.items()}
    apariciones_count[tecnica] = {c: len(values) for c, values in rankings_by_feature.items()}

# Raw pairs, appearance counts and mean rankings per technique
print(ranking_valores)
print(apariciones_count)
print(ranking_medio)

In [None]:
# Build the FP summary frame: one row per feature and, for every technique, its
# mean ranking and its appearance count ("-" where the technique never ranked it).
techniques = ['Breakdown', 'Shapley', 'Lime']

# All features seen by any technique, in first-appearance order. dict.fromkeys
# keeps the order reproducible between runs, unlike a set.
all_caract = list(dict.fromkeys(c for technique in techniques for c in ranking_medio[technique]))

# Column data keyed by (technique, sub-column); tuple keys become a column MultiIndex
data_dict = {}
for technique in techniques:
    data_dict[(technique, "Ranking Medio")] = [ranking_medio[technique].get(c, "-") for c in all_caract]
    data_dict[(technique, "Conteo")] = [apariciones_count[technique].get(c, "-") for c in all_caract]

# Create the summary DataFrame
df_resumen_rf_fp = pd.DataFrame(data_dict, index=all_caract)

df_resumen_rf_fp

In [None]:
# Add the "General" columns: the mean of the per-technique mean rankings and the
# total number of appearances. The "-" placeholders are converted explicitly
# with pd.to_numeric(errors="coerce") instead of relying on replace() to
# down-cast the object columns implicitly.
ranking_columns = [(tech, "Ranking Medio") for tech in techniques]
conteo_columns = [(tech, "Conteo") for tech in techniques]

# Mean ranking over the techniques that ranked the feature ("-" -> NaN, ignored by mean)
df_resumen_rf_fp[("General", "Ranking")] = df_resumen_rf_fp[ranking_columns].apply(pd.to_numeric, errors="coerce").mean(axis=1)

# Total appearances ("-" counts as 0)
df_resumen_rf_fp[("General", "Conteo Total")] = df_resumen_rf_fp[conteo_columns].apply(pd.to_numeric, errors="coerce").fillna(0).sum(axis=1).astype(int)

# Show the updated summary DataFrame
df_resumen_rf_fp

In [None]:
# Replace the general mean ranking by a final ranking based on a combined score:
# one weight from the position of the mean ranking (lower is better) and one
# from the position of the total count (higher is better).
num_caract = len(df_resumen_rf_fp.index)

ranking_general = df_resumen_rf_fp[("General", "Ranking")]
conteo_total = df_resumen_rf_fp[("General", "Conteo Total")]

# The weights are kept as local Series; only the final ranking is stored in the frame
peso_rango = 1 - ((ranking_general.rank(ascending=True) - 1) / num_caract)
peso_conteo = conteo_total.rank(ascending=True) / num_caract

# Final score is the sum of both weights; the highest score gets rank 1 (ties share the lowest rank)
puntaje = peso_rango + peso_conteo
df_resumen_rf_fp[("General", "Ranking")] = puntaje.rank(ascending=False, method="min")

df_resumen_rf_fp

In [None]:
# Move the two "General" columns to the front, keeping the per-technique
# columns in their current order.
columns = df_resumen_rf_fp.columns

# Boolean mask over the columns: True where the top level is "General"
general_columns = columns.get_level_values(0) == "General"

# Per-technique columns (everything that is not "General")
technique_subcolumns = [col for col, is_general in zip(columns, general_columns) if not is_general]

# "General" columns in the order they should be displayed
general_subcolumns = [("General", sub) for sub in ("Ranking", "Conteo Total")]

new_columns = [*general_subcolumns, *technique_subcolumns]

# New DataFrame with the reordered columns
df_resumen_rf_fp = df_resumen_rf_fp[new_columns]

In [None]:
# Sort by the final general ranking (best first) and print the mean rankings
# with two decimals; non-numeric placeholders are left untouched.
def _format_mean_ranking(value):
    if not isinstance(value, (int, float)):
        return value
    return f"{value:.2f}"

df_resumen_rf_fp.sort_values(by=("General", "Ranking"), ascending=True, inplace=True)

# Format the values
for tech in techniques:
    mean_column = (tech, "Ranking Medio")
    df_resumen_rf_fp[mean_column] = df_resumen_rf_fp[mean_column].apply(_format_mean_ranking)

df_resumen_rf_fp

### **Instancia FN MAX:**

In [None]:
# Explain the FN MAX instance three ways (all with random_state=42) and keep
# each explanation's result table next to the explanation object.

# Break Down attribution
breakdown_fn_max = exp.predict_parts(df_instancia_fn_max, type="break_down",random_state=42)
breakdown_fn_df_max = breakdown_fn_max.result

# Shapley-value attribution
shap_fn_max = exp.predict_parts(df_instancia_fn_max, type="shap",random_state=42)
shap_fn_df_max = shap_fn_max.result

# Surrogate explanation (referred to as LIME throughout this notebook)
lime_fn_max = exp.predict_surrogate(df_instancia_fn_max, random_state=42)
lime_fn_df_max=lime_fn_max.result

In [None]:
# Plot the Break Down explanation of the FN MAX instance.
breakdown_fn_max.plot()

In [None]:
# Plot the Shapley explanation of the FN MAX instance.
shap_fn_max.plot()

In [None]:
# Plot the surrogate (LIME) explanation of the FN MAX instance.
lime_fn_max.plot()

In [None]:
# Reduce the Break Down, Shapley and LIME explanations of the FN MAX instance to
# comparable top-5 tables with the columns Variable / Ranking / Signo.
# Every table is rebuilt from the explanation's own `.result` (working on a
# copy), so the cell can be re-run and the explanation objects stay untouched.
sign_labels = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}
column_names = {'sign': 'Signo', 'variable_name': 'Variable'}

# --- Break Down ---
breakdown_fn_df_max = breakdown_fn_max.result.loc[:, ['variable_name', 'contribution', 'sign']]
# Index labels 0 and 26 are discarded (presumably the intercept and prediction
# rows of a 25-feature explanation - TODO confirm).
breakdown_fn_df_max = breakdown_fn_df_max.drop(index=[0, 26])
breakdown_fn_df_max['sign'] = breakdown_fn_df_max['sign'].replace(sign_labels)
breakdown_fn_df_max = breakdown_fn_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
# method='min' gives whole-number ranks on ties; the default 'average' yields
# fractional ranks that astype(int) would truncate.
breakdown_fn_df_max['Ranking'] = breakdown_fn_df_max['contribution'].abs().rank(ascending=False, method='min').astype(int)
breakdown_fn_df_max = breakdown_fn_df_max.rename(columns=column_names)
# Keep the 5 largest contributions in absolute value, without the contribution column
breakdown_fn_df_max = breakdown_fn_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# --- Shapley ---
# Only the last 25 rows of the result are used.
shap_fn_df_max = shap_fn_max.result.loc[:, ['variable_name', 'contribution', 'sign']].tail(25).copy()
shap_fn_df_max['sign'] = shap_fn_df_max['sign'].replace(sign_labels)
shap_fn_df_max = shap_fn_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
shap_fn_df_max['Ranking'] = shap_fn_df_max['contribution'].abs().rank(ascending=False, method='min').astype(int)
shap_fn_df_max = shap_fn_df_max.rename(columns=column_names)
shap_fn_df_max = shap_fn_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# --- LIME ---
lime_fn_df_max = lime_fn_max.result.copy()
# The description is either "feature <op> bound" or "bound < feature <= bound".
# The optional prefix skips a leading "bound < ", so the captured token is the
# feature name in both forms. This makes per-row manual corrections of the
# feature name unnecessary, whichever rows end up in the top 5.
lime_fn_df_max["Variable"] = lime_fn_df_max["variable"].str.extract(r"^(?:\S+\s+<\s+)?(\S+)", expand=False)
lime_fn_df_max["Signo"] = lime_fn_df_max["effect"].apply(evaluar_valor)
lime_fn_df_max = lime_fn_df_max.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_fn_df_max['Ranking'] = lime_fn_df_max['effect'].abs().rank(ascending=False, method='min').astype(int)
lime_fn_df_max = lime_fn_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_fn_df_max)
print(shap_fn_df_max)
print(lime_fn_df_max)

In [None]:
#lime_fn_max.show_in_notebook()

In [None]:
# Feature names reported by each of the three techniques for this instance.
breakdown_features = breakdown_fn_df_max['Variable'].tolist()
shapley_features = shap_fn_df_max['Variable'].tolist()
lime_features = lime_fn_df_max['Variable'].tolist()
# Union without duplicates, taken in reversed set-iteration order.
all_features = list(set(breakdown_features + shapley_features + lime_features))[::-1]

# One row per feature, with the (technique, Ranking/Signo) columns of columns_multi.
df_final_max = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_max['Variable'] = all_features
df_final_max = df_final_max[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for nombres in (breakdown_features, shapley_features, lime_features, all_features):
    print(nombres)

# Source table for each technique column group.
tablas_por_tecnica = {
    'Breakdown': breakdown_fn_df_max,
    'Shapley': shap_fn_df_max,
    'Lime': lime_fn_df_max,
}

for feature in df_final_max['Variable']:
    fila = df_final_max['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencia = tabla[tabla['Variable'] == feature]
        if coincidencia.empty:
            # The technique did not report this feature: fill with "-".
            ranking, signo = '-', '-'
        else:
            ranking = coincidencia.iloc[0]['Ranking']
            signo = coincidencia.iloc[0]['Signo']
        df_final_max.loc[fila, (tecnica, 'Ranking')] = ranking
        df_final_max.loc[fila, (tecnica, 'Signo')] = signo

df_final_max

In [None]:
# Hand-picked row order for the table.
nuevo_orden = ["parallel_changed_file_num", "commit_num", "developer_num", "file_modified", "duration", "file_added"]

# Index the table by feature name and reorder its rows. With reindex, a listed name
# that is absent from the table becomes an all-NaN row and an unlisted name is dropped.
df_final_max = df_final_max.set_index('Variable').reindex(nuevo_orden)

df_final_max

### **Instancia FN MEDIANA:**

In [None]:
# Break-down explanation of df_instancia_fn_mediana and its result table.
breakdown_fn_mediana = exp.predict_parts(df_instancia_fn_mediana, type="break_down", random_state=42)
breakdown_fn_df_mediana = breakdown_fn_mediana.result

# Shapley-value explanation of the same instance.
shap_fn_mediana = exp.predict_parts(df_instancia_fn_mediana, type="shap", random_state=42)
shap_fn_df_mediana = shap_fn_mediana.result

# Surrogate explanation of the same instance (named "lime" throughout the notebook).
lime_fn_mediana = exp.predict_surrogate(df_instancia_fn_mediana, random_state=42)
lime_fn_df_mediana = lime_fn_mediana.result

In [None]:
# Plot the break-down explanation computed for df_instancia_fn_mediana.
breakdown_fn_mediana.plot()

In [None]:
# Plot the Shapley-value explanation computed for df_instancia_fn_mediana.
shap_fn_mediana.plot()

In [None]:
# Plot the surrogate (LIME) explanation computed for df_instancia_fn_mediana.
lime_fn_mediana.plot()

In [None]:
# Labels shown in the tables for the numeric sign of a contribution.
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}

# Break-down: keep the five contributions with the largest absolute value.
breakdown_fn_df_mediana = (
    breakdown_fn_df_mediana[['variable_name', 'contribution', 'sign']]
    # NOTE(review): rows 0 and 26 are removed by label; presumably the intercept and
    # final-prediction rows of a 25-feature explanation. Confirm if the feature set changes.
    .drop(index=[0, 26])
    .assign(sign=lambda d: d['sign'].replace(etiquetas_signo))
    .sort_values(by='contribution', key=abs, ascending=False)
    # Rank by absolute contribution (1 = largest).
    .assign(Ranking=lambda d: d['contribution'].abs().rank(ascending=False).astype(int))
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# Shapley: same treatment.
shap_fn_df_mediana = (
    shap_fn_df_mediana[['variable_name', 'contribution', 'sign']]
    # NOTE(review): only the last 25 rows are kept; presumably the averaged
    # contributions of the 25 features. Confirm.
    .tail(25)
    .assign(sign=lambda d: d['sign'].replace(etiquetas_signo))
    .sort_values(by='contribution', key=abs, ascending=False)
    .assign(Ranking=lambda d: d['contribution'].abs().rank(ascending=False).astype(int))
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# LIME: the feature name is the first space-separated token of the "variable" text, and
# the sign label comes from evaluar_valor (defined elsewhere in the notebook). These two
# columns are added to the explanation's result table itself.
lime_fn_df_mediana["Variable"] = lime_fn_df_mediana["variable"].str.split(" ").str.get(0)
lime_fn_df_mediana["Signo"] = lime_fn_df_mediana["effect"].apply(evaluar_valor)
lime_fn_df_mediana = (
    lime_fn_df_mediana
    .sort_values(by='effect', key=abs, ascending=False)
    .assign(Ranking=lambda d: d['effect'].abs().rank(ascending=False).astype(int))
    .head(5)
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .reset_index(drop=True)
)

for tabla in (breakdown_fn_df_mediana, shap_fn_df_mediana, lime_fn_df_mediana):
    print(tabla)

In [None]:
#lime_fn_mediana.show_in_notebook()

In [None]:
# Feature names reported by each of the three techniques for this instance.
breakdown_features = breakdown_fn_df_mediana['Variable'].tolist()
shapley_features = shap_fn_df_mediana['Variable'].tolist()
lime_features = lime_fn_df_mediana['Variable'].tolist()
# Union without duplicates, taken in reversed set-iteration order.
all_features = list(set(breakdown_features + shapley_features + lime_features))[::-1]

# One row per feature, with the (technique, Ranking/Signo) columns of columns_multi.
df_final_mediana = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_mediana['Variable'] = all_features
df_final_mediana = df_final_mediana[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for nombres in (breakdown_features, shapley_features, lime_features, all_features):
    print(nombres)

# Source table for each technique column group.
tablas_por_tecnica = {
    'Breakdown': breakdown_fn_df_mediana,
    'Shapley': shap_fn_df_mediana,
    'Lime': lime_fn_df_mediana,
}

for feature in df_final_mediana['Variable']:
    fila = df_final_mediana['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencia = tabla[tabla['Variable'] == feature]
        if coincidencia.empty:
            # The technique did not report this feature: fill with "-".
            ranking, signo = '-', '-'
        else:
            ranking = coincidencia.iloc[0]['Ranking']
            signo = coincidencia.iloc[0]['Signo']
        df_final_mediana.loc[fila, (tecnica, 'Ranking')] = ranking
        df_final_mediana.loc[fila, (tecnica, 'Signo')] = signo

df_final_mediana

In [None]:
# Hand-picked row order for the table.
nuevo_orden = ["parallel_changed_file_num", "developer_num", "commit_num", "duration", "line_added", "line_removed", "file_modified",  "file_removed", "file_added"]

# Index the table by feature name and reorder its rows. With reindex, a listed name
# that is absent from the table becomes an all-NaN row and an unlisted name is dropped.
df_final_mediana = df_final_mediana.set_index('Variable').reindex(nuevo_orden)

df_final_mediana

### **Instancia FN MIN:**

In [None]:
# Break-down explanation of df_instancia_fn_min and its result table.
breakdown_fn_min = exp.predict_parts(df_instancia_fn_min, type="break_down", random_state=42)
breakdown_fn_df_min = breakdown_fn_min.result

# Shapley-value explanation of the same instance.
shap_fn_min = exp.predict_parts(df_instancia_fn_min, type="shap", random_state=42)
shap_fn_df_min = shap_fn_min.result

# Surrogate explanation of the same instance (named "lime" throughout the notebook).
lime_fn_min = exp.predict_surrogate(df_instancia_fn_min, random_state=42)
lime_fn_df_min = lime_fn_min.result

In [None]:
# Plot the break-down explanation computed for df_instancia_fn_min.
breakdown_fn_min.plot()

In [None]:
# Plot the Shapley-value explanation computed for df_instancia_fn_min.
shap_fn_min.plot()

In [None]:
# Plot the surrogate (LIME) explanation computed for df_instancia_fn_min.
lime_fn_min.plot()

In [None]:
# Labels shown in the tables for the numeric sign of a contribution.
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}

# Break-down: keep the five contributions with the largest absolute value.
breakdown_fn_df_min = (
    breakdown_fn_df_min[['variable_name', 'contribution', 'sign']]
    # NOTE(review): rows 0 and 26 are removed by label; presumably the intercept and
    # final-prediction rows of a 25-feature explanation. Confirm if the feature set changes.
    .drop(index=[0, 26])
    .assign(sign=lambda d: d['sign'].replace(etiquetas_signo))
    .sort_values(by='contribution', key=abs, ascending=False)
    # Rank by absolute contribution (1 = largest).
    .assign(Ranking=lambda d: d['contribution'].abs().rank(ascending=False).astype(int))
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# Shapley: same treatment.
shap_fn_df_min = (
    shap_fn_df_min[['variable_name', 'contribution', 'sign']]
    # NOTE(review): only the last 25 rows are kept; presumably the averaged
    # contributions of the 25 features. Confirm.
    .tail(25)
    .assign(sign=lambda d: d['sign'].replace(etiquetas_signo))
    .sort_values(by='contribution', key=abs, ascending=False)
    .assign(Ranking=lambda d: d['contribution'].abs().rank(ascending=False).astype(int))
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# LIME: the feature name is the first space-separated token of the "variable" text, and
# the sign label comes from evaluar_valor (defined elsewhere in the notebook). These two
# columns are added to the explanation's result table itself.
lime_fn_df_min["Variable"] = lime_fn_df_min["variable"].str.split(" ").str.get(0)
lime_fn_df_min["Signo"] = lime_fn_df_min["effect"].apply(evaluar_valor)
lime_fn_df_min = (
    lime_fn_df_min
    .sort_values(by='effect', key=abs, ascending=False)
    .assign(Ranking=lambda d: d['effect'].abs().rank(ascending=False).astype(int))
    .head(5)
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .reset_index(drop=True)
)

for tabla in (breakdown_fn_df_min, shap_fn_df_min, lime_fn_df_min):
    print(tabla)

In [None]:
#lime_fn_min.show_in_notebook()

In [None]:
# Feature names reported by each of the three techniques for this instance.
breakdown_features = breakdown_fn_df_min['Variable'].tolist()
shapley_features = shap_fn_df_min['Variable'].tolist()
lime_features = lime_fn_df_min['Variable'].tolist()
# Union without duplicates, taken in reversed set-iteration order.
all_features = list(set(breakdown_features + shapley_features + lime_features))[::-1]

# One row per feature, with the (technique, Ranking/Signo) columns of columns_multi.
df_final_min = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_min['Variable'] = all_features
df_final_min = df_final_min[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for nombres in (breakdown_features, shapley_features, lime_features, all_features):
    print(nombres)

# Source table for each technique column group.
tablas_por_tecnica = {
    'Breakdown': breakdown_fn_df_min,
    'Shapley': shap_fn_df_min,
    'Lime': lime_fn_df_min,
}

for feature in df_final_min['Variable']:
    fila = df_final_min['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencia = tabla[tabla['Variable'] == feature]
        if coincidencia.empty:
            # The technique did not report this feature: fill with "-".
            ranking, signo = '-', '-'
        else:
            ranking = coincidencia.iloc[0]['Ranking']
            signo = coincidencia.iloc[0]['Signo']
        df_final_min.loc[fila, (tecnica, 'Ranking')] = ranking
        df_final_min.loc[fila, (tecnica, 'Signo')] = signo

df_final_min

In [None]:
# Hand-picked row order for the table.
nuevo_orden = ["parallel_changed_file_num", "developer_num", "commit_num", "file_modified",  "file_added", "duration", "file_removed", "bug_frequency"]

# Index the table by feature name and reorder its rows. With reindex, a listed name
# that is absent from the table becomes an all-NaN row and an unlisted name is dropped.
df_final_min = df_final_min.set_index('Variable').reindex(nuevo_orden)

df_final_min

### **FN General:**

In [None]:
# Collect, per technique, every (feature, ranking) pair found in the three per-instance
# tables (max, median, min), then derive the mean ranking and the number of appearances
# of each feature.
ranking_valores = {
    'Breakdown': [],
    'Shapley': [],
    'Lime': []
}

for tecnica in ranking_valores:
    for tabla in (df_final_max, df_final_mediana, df_final_min):
        for caracteristica in tabla.index:
            ranking = tabla[(tecnica, 'Ranking')][caracteristica]
            # "-" marks a feature the technique did not report. NaN appears when the
            # earlier reindex listed a feature that is absent from the table; int(NaN)
            # would raise ValueError, so both cases are skipped.
            if ranking == "-" or pd.isna(ranking):
                continue
            ranking_valores[tecnica].append((caracteristica, int(ranking)))

# Mean ranking and appearance count per technique and feature.
ranking_medio = {}
apariciones_count = {}

for tecnica, pares in ranking_valores.items():
    # Group the rankings of each feature, keeping first-appearance order.
    rankings_por_caracteristica = {}
    for caracteristica, ranking in pares:
        rankings_por_caracteristica.setdefault(caracteristica, []).append(ranking)

    ranking_medio[tecnica] = {
        caracteristica: sum(valores) / len(valores)
        for caracteristica, valores in rankings_por_caracteristica.items()
    }
    apariciones_count[tecnica] = {
        caracteristica: len(valores)
        for caracteristica, valores in rankings_por_caracteristica.items()
    }

# The raw pairs, the appearance counts and the mean rankings per technique.
print(ranking_valores)
print(apariciones_count)
print(ranking_medio)

In [None]:
# Build the summary table: one row per feature and, for each technique, its mean
# ranking and its number of appearances ("-" when the technique never reported it).
techniques = ['Breakdown', 'Shapley', 'Lime']

# Column data keyed by (technique, sub-column).
data_dict = {}

# Every feature seen by at least one technique. Sorted so the row order is reproducible;
# the iteration order of a set of strings can change between kernel sessions.
all_caract = sorted(set().union(*(ranking_medio[technique] for technique in techniques)))

for technique in techniques:
    data_dict[(technique, "Ranking Medio")] = [ranking_medio[technique].get(c, "-") for c in all_caract]
    data_dict[(technique, "Conteo")] = [apariciones_count[technique].get(c, "-") for c in all_caract]

# Summary DataFrame indexed by feature name.
df_resumen_rf_fn = pd.DataFrame(data_dict, index=all_caract)

df_resumen_rf_fn

In [None]:
# Per-technique columns that feed the "General" aggregates.
columnas_ranking = [(tech, "Ranking Medio") for tech in techniques]
columnas_conteo = [(tech, "Conteo") for tech in techniques]

# Overall mean ranking: "-" entries become NaN, which the row-wise mean ignores.
rankings_numericos = df_resumen_rf_fn[columnas_ranking].replace('-', np.nan)
df_resumen_rf_fn[("General", "Ranking")] = rankings_numericos.mean(axis=1)

# Total number of appearances: "-" entries count as 0.
conteos_numericos = df_resumen_rf_fn[columnas_conteo].replace('-', 0)
df_resumen_rf_fn[("General", "Conteo Total")] = conteos_numericos.sum(axis=1)

# Show the updated summary table.
df_resumen_rf_fn

In [None]:
# Number of features (rows) in the summary.
num_caract = df_resumen_rf_fn.shape[0]

# Weight from the mean ranking: the best (lowest) mean ranking gets weight 1.
peso_rango = 1 - ((df_resumen_rf_fn[("General", "Ranking")].rank(ascending=True) - 1) / num_caract)
# Weight from the appearance count: the highest count gets weight 1.
peso_conteo = df_resumen_rf_fn[("General", "Conteo Total")].rank(ascending=True) / num_caract

# The final score is the sum of both weights. The general ranking is replaced by the
# rank of that score (1 = highest score; tied scores share the lowest rank).
puntaje = peso_rango + peso_conteo
df_resumen_rf_fn[("General", "Ranking")] = puntaje.rank(ascending=False, method="min")

df_resumen_rf_fn

In [None]:
# Current (technique, sub-column) column labels of the summary.
columns = df_resumen_rf_fn.columns

# The "General" sub-columns go first, in this order.
general_subcolumns = [("General", "Ranking"), ("General", "Conteo Total")]

# Every remaining column belongs to a technique; keep their current order.
technique_subcolumns = [col for col in columns if col[0] != "General"]

# Rebuild the frame with "General" moved to the front.
new_columns = general_subcolumns + technique_subcolumns
df_resumen_rf_fn = df_resumen_rf_fn[new_columns]

In [None]:
# Best general ranking first.
df_resumen_rf_fn = df_resumen_rf_fn.sort_values(by=("General", "Ranking"), ascending=True)

# Render numeric mean rankings with two decimals; "-" placeholders pass through untouched.
for tech in techniques:
    columna = (tech, "Ranking Medio")
    df_resumen_rf_fn[columna] = df_resumen_rf_fn[columna].apply(
        lambda x: x if not isinstance(x, (int, float)) else f"{x:.2f}"
    )

df_resumen_rf_fn

## **BalancedRandomForest:**

**FEATURE IMPORTANCE**

In [None]:
# Permutation importance of modelo_brf on the test split, scored with F1
# (20 shuffles per feature).
permu = permutation_importance(modelo_brf, x_test, y_test, n_repeats=20, random_state=42, n_jobs=2, scoring='f1')

# Only features whose mean importance exceeds this threshold are plotted.
importance_threshold = 0.01

# Boolean mask of the significant features, applied to both means and standard deviations.
significant_indices = permu.importances_mean > importance_threshold
permu_importances = pd.Series(permu.importances_mean.round(3), index=feature_names)[significant_indices]
permu_std = permu.importances_std[significant_indices]

# Bar chart with the standard deviation over the shuffles as error bars.
fig, ax = plt.subplots()
permu_importances.plot.bar(yerr=permu_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
# The importances are drops in the F1 score (scoring='f1'), not in accuracy.
ax.set_ylabel("Mean F1-score decrease")
fig.tight_layout()
plt.show()

In [None]:
# Metrics to evaluate and the names shown in the subplot titles.
scoring = ['precision', 'recall', 'f1']
metric_names = ['Precision', 'Recall', 'F1-score']

# Only features whose mean importance exceeds this threshold are plotted.
importance_threshold = 0.01

permu_score = permutation_importance(modelo_brf, x_test, y_test, n_repeats=20, random_state=42, scoring=scoring)
fig, axs = plt.subplots(1, 3, figsize=(15, 5))

# One horizontal bar chart per metric.
for i, metric in enumerate(scoring):
    permu = permu_score[metric]

    # Positions of the features above the threshold.
    significant_indices = [j for j, media in enumerate(permu.importances_mean) if media > importance_threshold]
    importances_mean = permu.importances_mean[significant_indices]
    importances_std = permu.importances_std[significant_indices]

    # Ascending order of importance: barh draws the first bar at the bottom, so the
    # most important feature ends up at the top of the chart.
    sorted_indices = np.argsort(importances_mean)
    sorted_feature_names = [feature_names[significant_indices[k]] for k in sorted_indices]
    importances_mean = importances_mean[sorted_indices]
    importances_std = importances_std[sorted_indices]

    # Draw the bars, with the standard deviation as error bars, on this metric's subplot.
    eje = axs[i]
    posiciones = range(len(sorted_feature_names))
    eje.barh(posiciones, importances_mean, xerr=importances_std, align='center')
    eje.set_yticks(posiciones)
    eje.set_yticklabels(sorted_feature_names)
    eje.set_xlabel('Valor Importancia')
    eje.set_title(f'Importancia por Permutación para {metric_names[i]}')

# Adjust the spacing between subplots and show the figure.
plt.tight_layout()
plt.show()



In [None]:
# Metrics to evaluate and the names used in the result keys.
scoring = ['precision', 'recall', 'f1']
metric_names = ['Precision', 'Recall', 'F1-score']

# Only features whose mean importance exceeds this threshold are kept.
importance_threshold = 0.01

# One table per metric, stored under 'df_global_<metric name>'.
results_global_brf = {}

permu_score = permutation_importance(modelo_brf, x_test, y_test, n_repeats=20, random_state=42, scoring=scoring)
for i, metric in enumerate(scoring):
    permu = permu_score[metric]

    # Positions of the features above the threshold.
    significant_indices = [j for j, media in enumerate(permu.importances_mean) if media > importance_threshold]

    # Feature name, mean importance and standard deviation, most important first.
    df_exp_global = pd.DataFrame({
        'Feature': [feature_names[j] for j in significant_indices],
        'Importance_Mean': permu.importances_mean[significant_indices],
        'Importance_Std': permu.importances_std[significant_indices],
    }).sort_values(by='Importance_Mean', ascending=False)

    results_global_brf[f'df_global_{metric_names[i]}'] = df_exp_global

In [None]:
# Show the permutation-importance table stored for the Precision metric.
results_global_brf['df_global_Precision']

In [None]:
# Show the permutation-importance table stored for the Recall metric.
results_global_brf['df_global_Recall']

In [None]:
# Show the permutation-importance table stored for the F1-score metric.
results_global_brf['df_global_F1-score']

**BREAK-DOWN, SHAP Y LIME:**

In [None]:
# First, build the dalex explainer for modelo_brf from the training data; the
# per-instance explanations in the cells below are all computed through `exp`.
exp = dx.Explainer(modelo_brf, x_train, y_train)

### **Instancia VP MAX:**

In [None]:
# Break-down explanation of df_instancia_vp_max and its result table.
breakdown_vp_max = exp.predict_parts(df_instancia_vp_max, type="break_down", random_state=42)
breakdown_vp_df_max = breakdown_vp_max.result

# Shapley-value explanation of the same instance.
shap_vp_max = exp.predict_parts(df_instancia_vp_max, type="shap", random_state=42)
shap_vp_df_max = shap_vp_max.result

# Surrogate explanation of the same instance (named "lime" throughout the notebook).
lime_vp_max = exp.predict_surrogate(df_instancia_vp_max, random_state=42)
lime_vp_df_max = lime_vp_max.result

In [None]:
# Plot the break-down explanation computed for df_instancia_vp_max.
breakdown_vp_max.plot()

In [None]:
# Plot the Shapley-value explanation computed for df_instancia_vp_max.
shap_vp_max.plot()

In [None]:
# Plot the surrogate (LIME) explanation computed for df_instancia_vp_max.
lime_vp_max.plot()

In [None]:
# Labels shown in the tables for the numeric sign of a contribution.
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}

# Break-down: keep the five contributions with the largest absolute value.
breakdown_vp_df_max = (
    breakdown_vp_df_max[['variable_name', 'contribution', 'sign']]
    # NOTE(review): rows 0 and 26 are removed by label; presumably the intercept and
    # final-prediction rows of a 25-feature explanation. Confirm if the feature set changes.
    .drop(index=[0, 26])
    .assign(sign=lambda d: d['sign'].replace(etiquetas_signo))
    .sort_values(by='contribution', key=abs, ascending=False)
    # Rank by absolute contribution (1 = largest).
    .assign(Ranking=lambda d: d['contribution'].abs().rank(ascending=False).astype(int))
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# Shapley: same treatment.
shap_vp_df_max = (
    shap_vp_df_max[['variable_name', 'contribution', 'sign']]
    # NOTE(review): only the last 25 rows are kept; presumably the averaged
    # contributions of the 25 features. Confirm.
    .tail(25)
    .assign(sign=lambda d: d['sign'].replace(etiquetas_signo))
    .sort_values(by='contribution', key=abs, ascending=False)
    .assign(Ranking=lambda d: d['contribution'].abs().rank(ascending=False).astype(int))
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# LIME: the feature name is the first space-separated token of the "variable" text, and
# the sign label comes from evaluar_valor (defined elsewhere in the notebook). These two
# columns are added to the explanation's result table itself.
lime_vp_df_max["Variable"] = lime_vp_df_max["variable"].str.split(" ").str.get(0)
lime_vp_df_max["Signo"] = lime_vp_df_max["effect"].apply(evaluar_valor)
lime_vp_df_max = (
    lime_vp_df_max
    .sort_values(by='effect', key=abs, ascending=False)
    .assign(Ranking=lambda d: d['effect'].abs().rank(ascending=False).astype(int))
    .head(5)
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .reset_index(drop=True)
)

for tabla in (breakdown_vp_df_max, shap_vp_df_max, lime_vp_df_max):
    print(tabla)

In [None]:
#lime_vp_max.show_in_notebook()

In [None]:
# Feature names reported by each of the three techniques for this instance.
breakdown_features = breakdown_vp_df_max['Variable'].tolist()
shapley_features = shap_vp_df_max['Variable'].tolist()
lime_features = lime_vp_df_max['Variable'].tolist()
# Union without duplicates, taken in reversed set-iteration order.
all_features = list(set(breakdown_features + shapley_features + lime_features))[::-1]

# One row per feature, with the (technique, Ranking/Signo) columns of columns_multi.
df_final_max = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_max['Variable'] = all_features
df_final_max = df_final_max[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for nombres in (breakdown_features, shapley_features, lime_features, all_features):
    print(nombres)

# Source table for each technique column group.
tablas_por_tecnica = {
    'Breakdown': breakdown_vp_df_max,
    'Shapley': shap_vp_df_max,
    'Lime': lime_vp_df_max,
}

for feature in df_final_max['Variable']:
    fila = df_final_max['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencia = tabla[tabla['Variable'] == feature]
        if coincidencia.empty:
            # The technique did not report this feature: fill with "-".
            ranking, signo = '-', '-'
        else:
            ranking = coincidencia.iloc[0]['Ranking']
            signo = coincidencia.iloc[0]['Signo']
        df_final_max.loc[fila, (tecnica, 'Ranking')] = ranking
        df_final_max.loc[fila, (tecnica, 'Signo')] = signo

df_final_max

In [None]:
# Hand-picked row order for the table.
nuevo_orden = ["parallel_changed_file_num", "commit_num", "developer_num", "line_removed", "line_added", "file_modified", "duration", "file_removed"]

# Index the table by feature name and reorder its rows. With reindex, a listed name
# that is absent from the table becomes an all-NaN row and an unlisted name is dropped.
df_final_max = df_final_max.set_index('Variable').reindex(nuevo_orden)

df_final_max

### **Instancia VP MEDIANA:**

In [None]:
# Break-down explanation of df_instancia_vp_mediana and its result table.
breakdown_vp_mediana = exp.predict_parts(df_instancia_vp_mediana, type="break_down", random_state=42)
breakdown_vp_df_mediana = breakdown_vp_mediana.result

# Shapley-value explanation of the same instance.
shap_vp_mediana = exp.predict_parts(df_instancia_vp_mediana, type="shap", random_state=42)
shap_vp_df_mediana = shap_vp_mediana.result

# Surrogate explanation of the same instance (named "lime" throughout the notebook).
lime_vp_mediana = exp.predict_surrogate(df_instancia_vp_mediana, random_state=42)
lime_vp_df_mediana = lime_vp_mediana.result

In [None]:
# Plot the break-down explanation computed for df_instancia_vp_mediana.
breakdown_vp_mediana.plot()

In [None]:
# Plot the Shapley-value explanation computed for df_instancia_vp_mediana.
shap_vp_mediana.plot()

In [None]:
# Plot the surrogate (LIME) explanation computed for df_instancia_vp_mediana.
lime_vp_mediana.plot()

In [None]:
# Labels shown in the tables for the numeric sign of a contribution.
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}

# Break-down: keep the five contributions with the largest absolute value.
breakdown_vp_df_mediana = (
    breakdown_vp_df_mediana[['variable_name', 'contribution', 'sign']]
    # NOTE(review): rows 0 and 26 are removed by label; presumably the intercept and
    # final-prediction rows of a 25-feature explanation. Confirm if the feature set changes.
    .drop(index=[0, 26])
    .assign(sign=lambda d: d['sign'].replace(etiquetas_signo))
    .sort_values(by='contribution', key=abs, ascending=False)
    # Rank by absolute contribution (1 = largest).
    .assign(Ranking=lambda d: d['contribution'].abs().rank(ascending=False).astype(int))
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# Shapley: same treatment.
shap_vp_df_mediana = (
    shap_vp_df_mediana[['variable_name', 'contribution', 'sign']]
    # NOTE(review): only the last 25 rows are kept; presumably the averaged
    # contributions of the 25 features. Confirm.
    .tail(25)
    .assign(sign=lambda d: d['sign'].replace(etiquetas_signo))
    .sort_values(by='contribution', key=abs, ascending=False)
    .assign(Ranking=lambda d: d['contribution'].abs().rank(ascending=False).astype(int))
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# LIME: the feature name is the first space-separated token of the "variable" text, and
# the sign label comes from evaluar_valor (defined elsewhere in the notebook). These two
# columns are added to the explanation's result table itself.
lime_vp_df_mediana["Variable"] = lime_vp_df_mediana["variable"].str.split(" ").str.get(0)
lime_vp_df_mediana["Signo"] = lime_vp_df_mediana["effect"].apply(evaluar_valor)
lime_vp_df_mediana = (
    lime_vp_df_mediana
    .sort_values(by='effect', key=abs, ascending=False)
    .assign(Ranking=lambda d: d['effect'].abs().rank(ascending=False).astype(int))
    .head(5)
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .reset_index(drop=True)
)

for tabla in (breakdown_vp_df_mediana, shap_vp_df_mediana, lime_vp_df_mediana):
    print(tabla)

In [None]:
#lime_vp_mediana.show_in_notebook()

In [None]:
# Distinct features that appear in the top 5 of at least one technique.
breakdown_features = list(breakdown_vp_df_mediana['Variable'])
shapley_features = list(shap_vp_df_mediana['Variable'])
lime_features = list(lime_vp_df_mediana['Variable'])
# dict.fromkeys removes duplicates while keeping first-seen order, so the rows
# come out in the same order on every run (the iteration order of a set of
# strings is not stable across interpreter sessions).
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

# One row per feature; columns_multi (defined in an earlier cell) is expected
# to provide 'Variable' plus a (Ranking, Signo) pair for each technique.
df_final_mediana = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_mediana['Variable'] = all_features
df_final_mediana = df_final_mediana[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# Fill Ranking and Signo for each technique; "-" marks a feature that is not
# in that technique's top 5.
top5_por_tecnica = {
    'Breakdown': breakdown_vp_df_mediana,
    'Shapley': shap_vp_df_mediana,
    'Lime': lime_vp_df_mediana,
}
for feature in df_final_mediana['Variable']:
    fila = df_final_mediana['Variable'] == feature
    for tecnica, top5 in top5_por_tecnica.items():
        coincidencia = top5[top5['Variable'] == feature]
        if coincidencia.empty:
            ranking, signo = '-', '-'
        else:
            ranking = coincidencia.iloc[0]['Ranking']
            signo = coincidencia.iloc[0]['Signo']
        df_final_mediana.loc[fila, (tecnica, 'Ranking')] = ranking
        df_final_mediana.loc[fila, (tecnica, 'Signo')] = signo

df_final_mediana

In [None]:
# Presentation order for the rows. reindex keeps only the listed features;
# a listed feature that is absent from the table becomes an all-NaN row.
nuevo_orden = [
    "parallel_changed_file_num",
    "commit_num",
    "file_added",
    "messages_min",
    "delete_frequency",
    "file_removed",
    "file_modified",
    "duration",
    "developer_num",
]

# Index by feature name, then arrange the rows in the order above.
df_final_mediana = df_final_mediana.set_index('Variable').reindex(nuevo_orden)

df_final_mediana

### **Instancia VP MIN:**

In [None]:
# Break Down explanation of the instance and its result table.
breakdown_vp_min = exp.predict_parts(df_instancia_vp_min, type="break_down", random_state=42)
breakdown_vp_df_min = breakdown_vp_min.result

# Shapley-value explanation of the same instance.
shap_vp_min = exp.predict_parts(df_instancia_vp_min, type="shap", random_state=42)
shap_vp_df_min = shap_vp_min.result

# Surrogate explanation of the same instance (plotted and tabulated as LIME
# in the following cells).
lime_vp_min = exp.predict_surrogate(df_instancia_vp_min, random_state=42)
lime_vp_df_min = lime_vp_min.result

In [None]:
# Plot the Break Down explanation computed for df_instancia_vp_min.
breakdown_vp_min.plot()

In [None]:
# Plot the Shapley-value explanation computed for df_instancia_vp_min.
shap_vp_min.plot()

In [None]:
# Plot the surrogate (LIME) explanation computed for df_instancia_vp_min.
lime_vp_min.plot()

In [None]:
# --- Break Down: keep the columns needed for the comparison table ---
breakdown_vp_df_min = breakdown_vp_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): presumably rows 0 and 26 are the intercept and prediction rows
# of a 25-feature explanation -- confirm; these labels are hard-coded.
breakdown_vp_df_min = breakdown_vp_df_min.drop(index=[0, 26])
breakdown_vp_df_min['sign'] = breakdown_vp_df_min['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
breakdown_vp_df_min = breakdown_vp_df_min.sort_values(by='contribution', key=abs, ascending=False)

# --- Shapley ---
shap_vp_df_min = shap_vp_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): assumes the last 25 rows are the per-feature values to report;
# verify against the layout of the dalex result (it may also hold one set of
# rows per sampled ordering). .copy() so the assignment below does not write
# into a slice of the previous frame.
shap_vp_df_min = shap_vp_df_min.tail(25).copy()
shap_vp_df_min['sign'] = shap_vp_df_min['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_vp_df_min = shap_vp_df_min.sort_values(by='contribution', key=abs, ascending=False)

# --- LIME ---
# LIME labels each row with a discretised condition, e.g. "feature <= 1.00" or
# "0.00 < feature <= 1.00". Taking the first space-separated token returns the
# numeric bound for the second form, so the first identifier-like token is
# extracted instead.
lime_vp_df_min["Variable"] = lime_vp_df_min["variable"].str.extract(r"\b([A-Za-z_]\w*)", expand=False)
lime_vp_df_min["Signo"] = lime_vp_df_min["effect"].apply(evaluar_valor)
lime_vp_df_min = (
    lime_vp_df_min
    .sort_values(by='effect', key=abs, ascending=False)
    .drop(columns=['variable'])
)

# Rank by absolute contribution (1 = largest). method='min' gives tied values
# the same integer rank instead of truncating an averaged rank.
breakdown_vp_df_min['Ranking'] = (
    breakdown_vp_df_min['contribution'].abs().rank(ascending=False, method='min').astype(int)
)
shap_vp_df_min['Ranking'] = (
    shap_vp_df_min['contribution'].abs().rank(ascending=False, method='min').astype(int)
)
lime_vp_df_min['Ranking'] = (
    lime_vp_df_min['effect'].abs().rank(ascending=False, method='min').astype(int)
)

# Keep only Variable / Ranking / Signo for the five largest contributions
# (the frames are already sorted by absolute contribution).
breakdown_vp_df_min = (
    breakdown_vp_df_min
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    [['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)
shap_vp_df_min = (
    shap_vp_df_min
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    [['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)
lime_vp_df_min = lime_vp_df_min[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_vp_df_min)
print(shap_vp_df_min)
print(lime_vp_df_min)

In [None]:
#lime_vp_min.show_in_notebook()

In [None]:
# Distinct features that appear in the top 5 of at least one technique.
breakdown_features = list(breakdown_vp_df_min['Variable'])
shapley_features = list(shap_vp_df_min['Variable'])
lime_features = list(lime_vp_df_min['Variable'])
# dict.fromkeys removes duplicates while keeping first-seen order, so the rows
# come out in the same order on every run (the iteration order of a set of
# strings is not stable across interpreter sessions).
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

# One row per feature; columns_multi (defined in an earlier cell) is expected
# to provide 'Variable' plus a (Ranking, Signo) pair for each technique.
df_final_min = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_min['Variable'] = all_features
df_final_min = df_final_min[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# Fill Ranking and Signo for each technique; "-" marks a feature that is not
# in that technique's top 5.
top5_por_tecnica = {
    'Breakdown': breakdown_vp_df_min,
    'Shapley': shap_vp_df_min,
    'Lime': lime_vp_df_min,
}
for feature in df_final_min['Variable']:
    fila = df_final_min['Variable'] == feature
    for tecnica, top5 in top5_por_tecnica.items():
        coincidencia = top5[top5['Variable'] == feature]
        if coincidencia.empty:
            ranking, signo = '-', '-'
        else:
            ranking = coincidencia.iloc[0]['Ranking']
            signo = coincidencia.iloc[0]['Signo']
        df_final_min.loc[fila, (tecnica, 'Ranking')] = ranking
        df_final_min.loc[fila, (tecnica, 'Signo')] = signo

df_final_min

In [None]:
# Presentation order for the rows. reindex keeps only the listed features;
# a listed feature that is absent from the table becomes an all-NaN row.
nuevo_orden = [
    "parallel_changed_file_num",
    "file_modified",
    "commit_num",
    "file_added",
    "add_frequency",
    "developer_num",
    "file_removed",
    "fix_frequency",
]

# Index by feature name, then arrange the rows in the order above.
df_final_min = df_final_min.set_index('Variable').reindex(nuevo_orden)

df_final_min

### **VP General:**

In [None]:
# (feature, ranking) pairs collected per technique across the three
# per-instance tables (max, mediana, min).
ranking_valores = {
    'Breakdown': [],
    'Shapley': [],
    'Lime': []
}

for tecnica in ranking_valores:
    for tabla_instancia in (df_final_max, df_final_mediana, df_final_min):
        for caracteristica, ranking in tabla_instancia[(tecnica, 'Ranking')].items():
            # "-" marks a feature outside this technique's top 5. NaN appears
            # when reindex() added a row for a feature the table did not
            # contain; int(NaN) would raise, so both are skipped.
            if pd.isna(ranking) or ranking == "-":
                continue
            ranking_valores[tecnica].append((caracteristica, int(ranking)))

# Mean ranking and number of appearances per technique and feature.
ranking_medio = {tecnica: {} for tecnica in ranking_valores}
apariciones_count = {tecnica: {} for tecnica in ranking_valores}

for tecnica, pares in ranking_valores.items():
    # Group the rankings of each feature, keeping first-seen order.
    rankings_por_caracteristica = {}
    for caracteristica, ranking in pares:
        rankings_por_caracteristica.setdefault(caracteristica, []).append(ranking)

    for caracteristica, rankings in rankings_por_caracteristica.items():
        ranking_medio[tecnica][caracteristica] = sum(rankings) / len(rankings)
        apariciones_count[tecnica][caracteristica] = len(rankings)

# Raw pairs, appearance counts and mean rankings per technique.
print(ranking_valores)
print(apariciones_count)
print(ranking_medio)

In [None]:
# Techniques being compared.
techniques = ['Breakdown', 'Shapley', 'Lime']

# Every feature ranked by at least one technique. dict.fromkeys de-duplicates
# while keeping first-seen order, so the row order is the same on every run.
all_caract = list(dict.fromkeys(c for technique in techniques for c in ranking_medio[technique]))

# Column data keyed by (technique, sub-column); "-" marks a feature that the
# technique never ranked.
data_dict = {}
for technique in techniques:
    data_dict[(technique, "Ranking Medio")] = [ranking_medio[technique].get(c, "-") for c in all_caract]
    data_dict[(technique, "Conteo")] = [apariciones_count[technique].get(c, "-") for c in all_caract]

# Summary table: one row per feature, two columns per technique.
df_resumen_brf_vp = pd.DataFrame(data_dict, index=all_caract)

df_resumen_brf_vp

In [None]:
# The per-technique columns mix numbers with the "-" placeholder, so they are
# converted explicitly with pd.to_numeric ("-" becomes NaN) rather than relying
# on replace() to re-infer a numeric dtype.
columnas_ranking = [(tech, "Ranking Medio") for tech in techniques]
columnas_conteo = [(tech, "Conteo") for tech in techniques]

# Overall mean ranking across the techniques that ranked the feature
# (mean skips NaN).
df_resumen_brf_vp[("General", "Ranking")] = (
    df_resumen_brf_vp[columnas_ranking].apply(pd.to_numeric, errors='coerce').mean(axis=1)
)

# Total number of appearances; a missing count contributes 0.
df_resumen_brf_vp[("General", "Conteo Total")] = (
    df_resumen_brf_vp[columnas_conteo]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
    .sum(axis=1)
    .astype(int)
)

# Show the updated summary table.
df_resumen_brf_vp

In [None]:
# Number of features (rows) in the summary.
num_caract = len(df_resumen_brf_vp)

# Weight from the mean ranking: the best (lowest) mean ranking gets 1.
peso_rango = 1 - (df_resumen_brf_vp[("General", "Ranking")].rank(ascending=True) - 1) / num_caract
# Weight from the appearance count: the highest count gets 1.
peso_conteo = df_resumen_brf_vp[("General", "Conteo Total")].rank(ascending=True) / num_caract

# Final score is the sum of both weights; the general ranking is the position
# of that score (1 = highest score, ties share the lowest position).
puntaje = peso_rango + peso_conteo
df_resumen_brf_vp[("General", "Ranking")] = puntaje.rank(ascending=False, method="min")

df_resumen_brf_vp

In [None]:
# Current columns and a boolean mask of those under the "General" group.
columns = df_resumen_brf_vp.columns
general_columns = columns.get_level_values(0) == "General"

# "General" sub-columns, in the order they should be shown.
general_subcolumns = [("General", sub) for sub in ("Ranking", "Conteo Total")]

# All remaining (per-technique) columns, in their current order.
technique_subcolumns = [col for col, es_general in zip(columns, general_columns) if not es_general]

# Put the "General" group first, followed by the technique columns.
new_columns = [*general_subcolumns, *technique_subcolumns]
df_resumen_brf_vp = df_resumen_brf_vp[new_columns]

In [None]:
# Order the rows by the general ranking, best first.
df_resumen_brf_vp.sort_values(by=("General", "Ranking"), inplace=True)


def _formato_ranking(valor):
    """Return numeric values as text with two decimals; pass others through."""
    if isinstance(valor, (int, float)):
        return f"{valor:.2f}"
    return valor


# Format each technique's mean ranking; the "-" placeholder is left as is.
for tech in techniques:
    columna = (tech, "Ranking Medio")
    df_resumen_brf_vp[columna] = df_resumen_brf_vp[columna].map(_formato_ranking)

df_resumen_brf_vp

### **Instancia VN MAX:**

In [None]:
# Break Down explanation of the instance and its result table.
breakdown_vn_max = exp.predict_parts(df_instancia_vn_max, type="break_down", random_state=42)
breakdown_vn_df_max = breakdown_vn_max.result

# Shapley-value explanation of the same instance.
shap_vn_max = exp.predict_parts(df_instancia_vn_max, type="shap", random_state=42)
shap_vn_df_max = shap_vn_max.result

# Surrogate explanation of the same instance (plotted and tabulated as LIME
# in the following cells).
lime_vn_max = exp.predict_surrogate(df_instancia_vn_max, random_state=42)
lime_vn_df_max = lime_vn_max.result

In [None]:
# Plot the Break Down explanation computed for df_instancia_vn_max.
breakdown_vn_max.plot()

In [None]:
# Plot the Shapley-value explanation computed for df_instancia_vn_max.
shap_vn_max.plot()

In [None]:
# Plot the surrogate (LIME) explanation computed for df_instancia_vn_max.
lime_vn_max.plot()

In [None]:
# --- Break Down: keep the columns needed for the comparison table ---
breakdown_vn_df_max = breakdown_vn_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): presumably rows 0 and 26 are the intercept and prediction rows
# of a 25-feature explanation -- confirm; these labels are hard-coded.
breakdown_vn_df_max = breakdown_vn_df_max.drop(index=[0, 26])
breakdown_vn_df_max['sign'] = breakdown_vn_df_max['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
breakdown_vn_df_max = breakdown_vn_df_max.sort_values(by='contribution', key=abs, ascending=False)

# --- Shapley ---
shap_vn_df_max = shap_vn_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): assumes the last 25 rows are the per-feature values to report;
# verify against the layout of the dalex result (it may also hold one set of
# rows per sampled ordering). .copy() so the assignment below does not write
# into a slice of the previous frame.
shap_vn_df_max = shap_vn_df_max.tail(25).copy()
shap_vn_df_max['sign'] = shap_vn_df_max['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_vn_df_max = shap_vn_df_max.sort_values(by='contribution', key=abs, ascending=False)

# --- LIME ---
# LIME labels each row with a discretised condition, e.g. "feature <= 1.00" or
# "0.00 < feature <= 1.00". Taking the first space-separated token returns the
# numeric bound for the second form, so the first identifier-like token is
# extracted instead. This makes a positional override of a mislabelled row
# (previously row 2 was set to 'commit_num' by hand) unnecessary.
lime_vn_df_max["Variable"] = lime_vn_df_max["variable"].str.extract(r"\b([A-Za-z_]\w*)", expand=False)
lime_vn_df_max["Signo"] = lime_vn_df_max["effect"].apply(evaluar_valor)
lime_vn_df_max = (
    lime_vn_df_max
    .sort_values(by='effect', key=abs, ascending=False)
    .drop(columns=['variable'])
)

# Rank by absolute contribution (1 = largest). method='min' gives tied values
# the same integer rank instead of truncating an averaged rank.
breakdown_vn_df_max['Ranking'] = (
    breakdown_vn_df_max['contribution'].abs().rank(ascending=False, method='min').astype(int)
)
shap_vn_df_max['Ranking'] = (
    shap_vn_df_max['contribution'].abs().rank(ascending=False, method='min').astype(int)
)
lime_vn_df_max['Ranking'] = (
    lime_vn_df_max['effect'].abs().rank(ascending=False, method='min').astype(int)
)

# Keep only Variable / Ranking / Signo for the five largest contributions
# (the frames are already sorted by absolute contribution).
breakdown_vn_df_max = (
    breakdown_vn_df_max
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    [['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)
shap_vn_df_max = (
    shap_vn_df_max
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    [['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)
lime_vn_df_max = lime_vn_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_vn_df_max)
print(shap_vn_df_max)
print(lime_vn_df_max)

In [None]:
#lime_vn_max.show_in_notebook()

In [None]:
# Distinct features that appear in the top 5 of at least one technique.
breakdown_features = list(breakdown_vn_df_max['Variable'])
shapley_features = list(shap_vn_df_max['Variable'])
lime_features = list(lime_vn_df_max['Variable'])
# dict.fromkeys removes duplicates while keeping first-seen order, so the rows
# come out in the same order on every run (the iteration order of a set of
# strings is not stable across interpreter sessions).
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

# One row per feature; columns_multi (defined in an earlier cell) is expected
# to provide 'Variable' plus a (Ranking, Signo) pair for each technique.
df_final_max = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_max['Variable'] = all_features
df_final_max = df_final_max[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# Fill Ranking and Signo for each technique; "-" marks a feature that is not
# in that technique's top 5.
top5_por_tecnica = {
    'Breakdown': breakdown_vn_df_max,
    'Shapley': shap_vn_df_max,
    'Lime': lime_vn_df_max,
}
for feature in df_final_max['Variable']:
    fila = df_final_max['Variable'] == feature
    for tecnica, top5 in top5_por_tecnica.items():
        coincidencia = top5[top5['Variable'] == feature]
        if coincidencia.empty:
            ranking, signo = '-', '-'
        else:
            ranking = coincidencia.iloc[0]['Ranking']
            signo = coincidencia.iloc[0]['Signo']
        df_final_max.loc[fila, (tecnica, 'Ranking')] = ranking
        df_final_max.loc[fila, (tecnica, 'Signo')] = signo

df_final_max

In [None]:
# Presentation order for the rows. reindex keeps only the listed features;
# a listed feature that is absent from the table becomes an all-NaN row.
nuevo_orden = [
    "commit_num",
    "parallel_changed_file_num",
    "improve_frequency",
    "refactor_frequency",
    "file_removed",
    "developer_num",
    "file_added",
    "duration",
    "bug_frequency",
]

# Index by feature name, then arrange the rows in the order above.
df_final_max = df_final_max.set_index('Variable').reindex(nuevo_orden)

df_final_max

### **Instancia VN MEDIANA:**

In [None]:
# Break Down explanation of the instance and its result table.
breakdown_vn_mediana = exp.predict_parts(df_instancia_vn_mediana, type="break_down", random_state=42)
breakdown_vn_df_mediana = breakdown_vn_mediana.result

# Shapley-value explanation of the same instance.
shap_vn_mediana = exp.predict_parts(df_instancia_vn_mediana, type="shap", random_state=42)
shap_vn_df_mediana = shap_vn_mediana.result

# Surrogate explanation of the same instance (plotted and tabulated as LIME
# in the following cells).
lime_vn_mediana = exp.predict_surrogate(df_instancia_vn_mediana, random_state=42)
lime_vn_df_mediana = lime_vn_mediana.result

In [None]:
# Plot the Break Down explanation computed for df_instancia_vn_mediana.
breakdown_vn_mediana.plot()

In [None]:
# Plot the Shapley-value explanation computed for df_instancia_vn_mediana.
shap_vn_mediana.plot()

In [None]:
# Plot the surrogate (LIME) explanation computed for df_instancia_vn_mediana.
lime_vn_mediana.plot()

In [None]:
# --- Break Down: keep the columns needed for the comparison table ---
breakdown_vn_df_mediana = breakdown_vn_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): presumably rows 0 and 26 are the intercept and prediction rows
# of a 25-feature explanation -- confirm; these labels are hard-coded.
breakdown_vn_df_mediana = breakdown_vn_df_mediana.drop(index=[0, 26])
breakdown_vn_df_mediana['sign'] = breakdown_vn_df_mediana['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
breakdown_vn_df_mediana = breakdown_vn_df_mediana.sort_values(by='contribution', key=abs, ascending=False)

# --- Shapley ---
shap_vn_df_mediana = shap_vn_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): assumes the last 25 rows are the per-feature values to report;
# verify against the layout of the dalex result (it may also hold one set of
# rows per sampled ordering). .copy() so the assignment below does not write
# into a slice of the previous frame.
shap_vn_df_mediana = shap_vn_df_mediana.tail(25).copy()
shap_vn_df_mediana['sign'] = shap_vn_df_mediana['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_vn_df_mediana = shap_vn_df_mediana.sort_values(by='contribution', key=abs, ascending=False)

# --- LIME ---
# LIME labels each row with a discretised condition, e.g. "feature <= 1.00" or
# "0.00 < feature <= 1.00". Taking the first space-separated token returns the
# numeric bound for the second form, so the first identifier-like token is
# extracted instead. This makes a positional override of a mislabelled row
# (previously row 3 was set to 'developer_num' by hand) unnecessary.
lime_vn_df_mediana["Variable"] = lime_vn_df_mediana["variable"].str.extract(r"\b([A-Za-z_]\w*)", expand=False)
lime_vn_df_mediana["Signo"] = lime_vn_df_mediana["effect"].apply(evaluar_valor)
lime_vn_df_mediana = (
    lime_vn_df_mediana
    .sort_values(by='effect', key=abs, ascending=False)
    .drop(columns=['variable'])
)

# Rank by absolute contribution (1 = largest). method='min' gives tied values
# the same integer rank instead of truncating an averaged rank.
breakdown_vn_df_mediana['Ranking'] = (
    breakdown_vn_df_mediana['contribution'].abs().rank(ascending=False, method='min').astype(int)
)
shap_vn_df_mediana['Ranking'] = (
    shap_vn_df_mediana['contribution'].abs().rank(ascending=False, method='min').astype(int)
)
lime_vn_df_mediana['Ranking'] = (
    lime_vn_df_mediana['effect'].abs().rank(ascending=False, method='min').astype(int)
)

# Keep only Variable / Ranking / Signo for the five largest contributions
# (the frames are already sorted by absolute contribution).
breakdown_vn_df_mediana = (
    breakdown_vn_df_mediana
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    [['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)
shap_vn_df_mediana = (
    shap_vn_df_mediana
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    [['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)
lime_vn_df_mediana = lime_vn_df_mediana[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_vn_df_mediana)
print(shap_vn_df_mediana)
print(lime_vn_df_mediana)

In [None]:
#lime_vn_mediana.show_in_notebook()

In [None]:
# Distinct features that appear in the top 5 of at least one technique.
breakdown_features = list(breakdown_vn_df_mediana['Variable'])
shapley_features = list(shap_vn_df_mediana['Variable'])
lime_features = list(lime_vn_df_mediana['Variable'])
# dict.fromkeys removes duplicates while keeping first-seen order, so the rows
# come out in the same order on every run (the iteration order of a set of
# strings is not stable across interpreter sessions).
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

# One row per feature; columns_multi (defined in an earlier cell) is expected
# to provide 'Variable' plus a (Ranking, Signo) pair for each technique.
df_final_mediana = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_mediana['Variable'] = all_features
df_final_mediana = df_final_mediana[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# Fill Ranking and Signo for each technique; "-" marks a feature that is not
# in that technique's top 5.
top5_por_tecnica = {
    'Breakdown': breakdown_vn_df_mediana,
    'Shapley': shap_vn_df_mediana,
    'Lime': lime_vn_df_mediana,
}
for feature in df_final_mediana['Variable']:
    fila = df_final_mediana['Variable'] == feature
    for tecnica, top5 in top5_por_tecnica.items():
        coincidencia = top5[top5['Variable'] == feature]
        if coincidencia.empty:
            ranking, signo = '-', '-'
        else:
            ranking = coincidencia.iloc[0]['Ranking']
            signo = coincidencia.iloc[0]['Signo']
        df_final_mediana.loc[fila, (tecnica, 'Ranking')] = ranking
        df_final_mediana.loc[fila, (tecnica, 'Signo')] = signo

df_final_mediana

In [None]:
# Presentation order for the rows. reindex keeps only the listed features;
# a listed feature that is absent from the table becomes an all-NaN row.
nuevo_orden = [
    "commit_num",
    "parallel_changed_file_num",
    "developer_num",
    "line_removed",
    "file_modified",
    "line_added",
    "file_added",
    "file_removed",
    "bug_frequency",
]

# Index by feature name, then arrange the rows in the order above.
df_final_mediana = df_final_mediana.set_index('Variable').reindex(nuevo_orden)

df_final_mediana

### **Instancia VN MIN:**

In [None]:
# Break Down explanation of the instance and its result table.
breakdown_vn_min = exp.predict_parts(df_instancia_vn_min, type="break_down", random_state=42)
breakdown_vn_df_min = breakdown_vn_min.result

# Shapley-value explanation of the same instance.
shap_vn_min = exp.predict_parts(df_instancia_vn_min, type="shap", random_state=42)
shap_vn_df_min = shap_vn_min.result

# Surrogate explanation of the same instance (plotted and tabulated as LIME
# in the following cells).
lime_vn_min = exp.predict_surrogate(df_instancia_vn_min, random_state=42)
lime_vn_df_min = lime_vn_min.result

In [None]:
# Plot the Break Down explanation computed for df_instancia_vn_min.
breakdown_vn_min.plot()

In [None]:
# Plot the Shapley-value explanation computed for df_instancia_vn_min.
shap_vn_min.plot()

In [None]:
# Plot the surrogate (LIME) explanation computed for df_instancia_vn_min.
lime_vn_min.plot()

In [None]:
# --- Break Down: keep the columns needed for the comparison table ---
breakdown_vn_df_min = breakdown_vn_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): presumably rows 0 and 26 are the intercept and prediction rows
# of a 25-feature explanation -- confirm; these labels are hard-coded.
breakdown_vn_df_min = breakdown_vn_df_min.drop(index=[0, 26])
breakdown_vn_df_min['sign'] = breakdown_vn_df_min['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
breakdown_vn_df_min = breakdown_vn_df_min.sort_values(by='contribution', key=abs, ascending=False)

# --- Shapley ---
shap_vn_df_min = shap_vn_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): assumes the last 25 rows are the per-feature values to report;
# verify against the layout of the dalex result (it may also hold one set of
# rows per sampled ordering). .copy() so the assignment below does not write
# into a slice of the previous frame.
shap_vn_df_min = shap_vn_df_min.tail(25).copy()
shap_vn_df_min['sign'] = shap_vn_df_min['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_vn_df_min = shap_vn_df_min.sort_values(by='contribution', key=abs, ascending=False)

# --- LIME ---
# LIME labels each row with a discretised condition, e.g. "feature <= 1.00" or
# "0.00 < feature <= 1.00". Taking the first space-separated token returns the
# numeric bound for the second form, so the first identifier-like token is
# extracted instead.
lime_vn_df_min["Variable"] = lime_vn_df_min["variable"].str.extract(r"\b([A-Za-z_]\w*)", expand=False)
lime_vn_df_min["Signo"] = lime_vn_df_min["effect"].apply(evaluar_valor)
lime_vn_df_min = (
    lime_vn_df_min
    .sort_values(by='effect', key=abs, ascending=False)
    .drop(columns=['variable'])
)

# Rank by absolute contribution (1 = largest). method='min' gives tied values
# the same integer rank instead of truncating an averaged rank.
breakdown_vn_df_min['Ranking'] = (
    breakdown_vn_df_min['contribution'].abs().rank(ascending=False, method='min').astype(int)
)
shap_vn_df_min['Ranking'] = (
    shap_vn_df_min['contribution'].abs().rank(ascending=False, method='min').astype(int)
)
lime_vn_df_min['Ranking'] = (
    lime_vn_df_min['effect'].abs().rank(ascending=False, method='min').astype(int)
)

# Keep only Variable / Ranking / Signo for the five largest contributions
# (the frames are already sorted by absolute contribution).
breakdown_vn_df_min = (
    breakdown_vn_df_min
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    [['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)
shap_vn_df_min = (
    shap_vn_df_min
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    [['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)
lime_vn_df_min = lime_vn_df_min[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_vn_df_min)
print(shap_vn_df_min)
print(lime_vn_df_min)

In [None]:
#lime_vn_min.show_in_notebook()

In [None]:
# Top-five features reported by each technique for this instance.
breakdown_features = list(breakdown_vn_df_min['Variable'])
shapley_features = list(shap_vn_df_min['Variable'])
lime_features = list(lime_vn_df_min['Variable'])
# De-duplicate keeping first-seen order. A plain set() iterates strings in an
# order that changes between interpreter sessions, which made the row order of
# the table built below irreproducible.
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

# One row per feature; the column layout comes from `columns_multi`, which is
# defined outside this cell.
df_final_min = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_min['Variable'] = list(all_features)
df_final_min = df_final_min[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# Technique label -> its top-five table (read through the columns
# 'Variable', 'Ranking' and 'Signo').
top5_by_technique = {
    'Breakdown': breakdown_vn_df_min,
    'Shapley': shap_vn_df_min,
    'Lime': lime_vn_df_min,
}

for feature in df_final_min['Variable']:
    target_rows = df_final_min['Variable'] == feature
    for technique_name, top5 in top5_by_technique.items():
        hit = top5[top5['Variable'] == feature]
        if hit.empty:
            # The technique did not place this feature in its top five.
            rank_value, sign_value = '-', '-'
        else:
            rank_value = hit.iloc[0]['Ranking']
            sign_value = hit.iloc[0]['Signo']
        df_final_min.loc[target_rows, (technique_name, 'Ranking')] = rank_value
        df_final_min.loc[target_rows, (technique_name, 'Signo')] = sign_value

df_final_min

In [None]:
# Index the comparison table by feature name.
df_final_min.set_index('Variable', inplace=True)
# Hand-picked display order for this instance.
nuevo_orden = ["parallel_changed_file_num", "messages_max", "commit_num", "developer_num", "line_removed", "messages_median", "file_removed", "bug_frequency", "fix_frequency"]

# Apply the hand-picked order, but tolerate drift between the list and the
# table: a bare reindex(nuevo_orden) would silently drop features that are not
# listed and add all-NaN rows for listed names that are not in the table.
orden_presente = [nombre for nombre in nuevo_orden if nombre in df_final_min.index]
orden_restante = [nombre for nombre in df_final_min.index if nombre not in nuevo_orden]
df_final_min = df_final_min.reindex(orden_presente + orden_restante)

df_final_min

### **VN General:**

In [None]:
# (feature, rank) pairs collected per technique across the three instance
# tables (max, median, min).
ranking_valores = {tecnica: [] for tecnica in ('Breakdown', 'Shapley', 'Lime')}

for tecnica in ranking_valores:
    for df_instancia in (df_final_max, df_final_mediana, df_final_min):
        columna_ranking = df_instancia[(tecnica, 'Ranking')]
        for caracteristica in df_instancia.index:
            ranking = columna_ranking[caracteristica]
            # "-" marks a feature outside the technique's top five. NaN appears
            # when a table was reindexed with a name it did not contain;
            # int(NaN) would raise, so both are skipped.
            if pd.isna(ranking) or ranking == "-":
                continue
            ranking_valores[tecnica].append((caracteristica, int(ranking)))

# Mean rank and number of appearances per technique and feature, computed in
# one pass over the collected pairs (plain dicts keep the printed output as is).
ranking_medio = {tecnica: {} for tecnica in ranking_valores}
apariciones_count = {tecnica: {} for tecnica in ranking_valores}

for tecnica, pares in ranking_valores.items():
    rankings_por_caracteristica = {}
    for caracteristica, ranking in pares:
        rankings_por_caracteristica.setdefault(caracteristica, []).append(ranking)
    for caracteristica, rankings in rankings_por_caracteristica.items():
        ranking_medio[tecnica][caracteristica] = sum(rankings) / len(rankings)
        apariciones_count[tecnica][caracteristica] = len(rankings)

# Raw pairs, appearance counts and mean ranks per technique.
print(ranking_valores)
print(apariciones_count)
print(ranking_medio)

In [None]:
# Techniques being compared.
techniques = ['Breakdown', 'Shapley', 'Lime']

# Column key -> list of values, one entry per feature.
data_dict = {}

# Every feature ranked by at least one technique. Sorted so the row order does
# not depend on set iteration order, which varies between sessions.
all_caract = sorted(set().union(*(ranking_medio[technique] for technique in techniques)))

for technique in techniques:
    # "-" marks a feature the technique never placed in a top five.
    data_dict[(technique, "Ranking Medio")] = [ranking_medio[technique].get(c, "-") for c in all_caract]
    data_dict[(technique, "Conteo")] = [apariciones_count[technique].get(c, "-") for c in all_caract]

# Summary table: one row per feature, (technique, metric) columns.
df_resumen_brf_vn = pd.DataFrame(data_dict, index=all_caract)

df_resumen_brf_vn

In [None]:
columnas_ranking = [(tech, "Ranking Medio") for tech in techniques]
columnas_conteo = [(tech, "Conteo") for tech in techniques]

# Overall mean rank, ignoring the "-" placeholders. The columns are object
# dtype (numbers mixed with "-"), so convert them explicitly instead of relying
# on replace() silently downcasting the result, which pandas has deprecated.
df_resumen_brf_vn[("General", "Ranking")] = (
    df_resumen_brf_vn[columnas_ranking].apply(pd.to_numeric, errors="coerce").mean(axis=1)
)

# Total number of appearances; a "-" placeholder counts as zero.
df_resumen_brf_vn[("General", "Conteo Total")] = (
    df_resumen_brf_vn[columnas_conteo]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
    .astype(int)
    .sum(axis=1)
)

# Show the updated summary table.
df_resumen_brf_vn

In [None]:
# Number of features (rows) in the summary table.
num_caract = len(df_resumen_brf_vn)

col_ranking = ("General", "Ranking")
col_conteo = ("General", "Conteo Total")
col_peso_rango = ("General", "Peso Rango")
col_peso_conteo = ("General", "Peso Conteo")
col_puntaje = ("General", "Puntaje")

# Rank-based weights: a lower mean rank and a higher appearance count both
# yield a larger weight.
posicion_ranking = df_resumen_brf_vn[col_ranking].rank(ascending=True)
df_resumen_brf_vn[col_peso_rango] = 1 - ((posicion_ranking - 1) / num_caract)
posicion_conteo = df_resumen_brf_vn[col_conteo].rank(ascending=True)
df_resumen_brf_vn[col_peso_conteo] = posicion_conteo / num_caract

# The score is the sum of both weights; the general ranking is then replaced
# by the position of that score (highest score first, ties share the minimum).
df_resumen_brf_vn[col_puntaje] = df_resumen_brf_vn[col_peso_rango] + df_resumen_brf_vn[col_peso_conteo]
df_resumen_brf_vn[col_ranking] = df_resumen_brf_vn[col_puntaje].rank(ascending=False, method="min")

# Remove the helper sub-columns in a single call.
df_resumen_brf_vn.drop(["Peso Rango", "Peso Conteo", "Puntaje"], axis=1, level=1, inplace=True)

df_resumen_brf_vn

In [None]:
# Current (technique, metric) column keys of the summary table.
columns = df_resumen_brf_vn.columns

# Boolean mask of the columns that belong to the "General" group.
general_columns = columns.get_level_values(0) == "General"

# "General" columns go first, in this fixed order.
general_subcolumns = [("General", "Ranking"), ("General", "Conteo Total")]

# Every remaining column keeps its current relative order.
technique_subcolumns = [
    columna for columna, es_general in zip(columns, general_columns) if not es_general
]

new_columns = [*general_subcolumns, *technique_subcolumns]

# Rebuild the table with the reordered columns.
df_resumen_brf_vn = df_resumen_brf_vn[new_columns]

In [None]:
def _formatear_ranking_medio(valor):
    """Render numeric mean ranks with two decimals; leave other values untouched."""
    if isinstance(valor, (int, float)):
        return f"{valor:.2f}"
    return valor


# Best general ranking first.
df_resumen_brf_vn.sort_values(by=("General", "Ranking"), ascending=True, inplace=True)

# Format the per-technique mean ranks for display.
for tech in techniques:
    columna = (tech, "Ranking Medio")
    df_resumen_brf_vn[columna] = df_resumen_brf_vn[columna].apply(_formatear_ranking_medio)

df_resumen_brf_vn

### **Instancia FP MAX:**

In [None]:
# Break-down attribution for the FP max instance and its result table.
breakdown_fp_max = exp.predict_parts(df_instancia_fp_max, type="break_down", random_state=42)
breakdown_fp_df_max = breakdown_fp_max.result

# SHAP attribution for the same instance.
shap_fp_max = exp.predict_parts(df_instancia_fp_max, type="shap", random_state=42)
shap_fp_df_max = shap_fp_max.result

# Surrogate explanation (stored under the lime_* names) for the same instance.
lime_fp_max = exp.predict_surrogate(df_instancia_fp_max, random_state=42)
lime_fp_df_max = lime_fp_max.result

In [None]:
# Plot the break-down explanation of the FP max instance.
breakdown_fp_max.plot()

In [None]:
# Plot the SHAP explanation of the FP max instance.
shap_fp_max.plot()

In [None]:
# Plot the surrogate explanation of the FP max instance.
lime_fp_max.plot()

In [None]:
# --- Break-down: keep feature name, contribution and sign -------------------
breakdown_fp_df_max = breakdown_fp_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# Rows labelled 0 and 26 are removed; presumably the intercept and the final
# prediction that bracket the feature rows -- TODO confirm.
breakdown_fp_df_max = breakdown_fp_df_max.drop(index=[0, 26])
breakdown_fp_df_max['sign'] = breakdown_fp_df_max['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
breakdown_fp_df_max = breakdown_fp_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- SHAP: same columns ------------------------------------------------------
shap_fp_df_max = shap_fp_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# Only the last 25 rows are kept; presumably the aggregated per-feature
# rows -- TODO confirm.
shap_fp_df_max = shap_fp_df_max.tail(25)
shap_fp_df_max['sign'] = shap_fp_df_max['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_fp_df_max = shap_fp_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- LIME --------------------------------------------------------------------
# LIME labels an effect with a rule such as "x <= 1.00" or "0.00 < x <= 1.00".
# For the second form the first whitespace-separated token is the numeric
# bound, not the feature, so strip a leading "<number> <" / "<number> <="
# before taking the first token.
regla_sin_cota = lime_fp_df_max["variable"].str.replace(r"^\s*[-+]?\d+(?:\.\d+)?\s*<=?\s*", "", regex=True)
lime_fp_df_max["Variable"] = regla_sin_cota.str.split(" ").str[0]
lime_fp_df_max["Signo"] = lime_fp_df_max["effect"].apply(evaluar_valor)
lime_fp_df_max = lime_fp_df_max.sort_values(by='effect', key=lambda x: abs(x), ascending=False)

# --- Rank by absolute contribution and keep the five strongest --------------
breakdown_fp_df_max['Ranking'] = breakdown_fp_df_max['contribution'].abs().rank(ascending=False).astype(int)
breakdown_fp_df_max = breakdown_fp_df_max.rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
breakdown_fp_df_max = breakdown_fp_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

shap_fp_df_max['Ranking'] = shap_fp_df_max['contribution'].abs().rank(ascending=False).astype(int)
shap_fp_df_max = shap_fp_df_max.rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
shap_fp_df_max = shap_fp_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

lime_fp_df_max['Ranking'] = lime_fp_df_max['effect'].abs().rank(ascending=False).astype(int)
lime_fp_df_max = lime_fp_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_fp_df_max)
print(shap_fp_df_max)
print(lime_fp_df_max)

In [None]:
#lime_fp_max.show_in_notebook()

In [None]:
# Top-five features reported by each technique for this instance.
breakdown_features = list(breakdown_fp_df_max['Variable'])
shapley_features = list(shap_fp_df_max['Variable'])
lime_features = list(lime_fp_df_max['Variable'])
# De-duplicate keeping first-seen order. A plain set() iterates strings in an
# order that changes between interpreter sessions, which made the row order of
# the table built below irreproducible.
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

# One row per feature; the column layout comes from `columns_multi`, which is
# defined outside this cell.
df_final_max = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_max['Variable'] = list(all_features)
df_final_max = df_final_max[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# Technique label -> its top-five table (read through the columns
# 'Variable', 'Ranking' and 'Signo').
top5_by_technique = {
    'Breakdown': breakdown_fp_df_max,
    'Shapley': shap_fp_df_max,
    'Lime': lime_fp_df_max,
}

for feature in df_final_max['Variable']:
    target_rows = df_final_max['Variable'] == feature
    for technique_name, top5 in top5_by_technique.items():
        hit = top5[top5['Variable'] == feature]
        if hit.empty:
            # The technique did not place this feature in its top five.
            rank_value, sign_value = '-', '-'
        else:
            rank_value = hit.iloc[0]['Ranking']
            sign_value = hit.iloc[0]['Signo']
        df_final_max.loc[target_rows, (technique_name, 'Ranking')] = rank_value
        df_final_max.loc[target_rows, (technique_name, 'Signo')] = sign_value

df_final_max

In [None]:
# Index the comparison table by feature name.
df_final_max.set_index('Variable', inplace=True)
# Hand-picked display order for this instance.
nuevo_orden = ["parallel_changed_file_num", "commit_num", "file_removed", "developer_num", "file_added", "duration"]

# Apply the hand-picked order, but tolerate drift between the list and the
# table: a bare reindex(nuevo_orden) would silently drop features that are not
# listed and add all-NaN rows for listed names that are not in the table.
orden_presente = [nombre for nombre in nuevo_orden if nombre in df_final_max.index]
orden_restante = [nombre for nombre in df_final_max.index if nombre not in nuevo_orden]
df_final_max = df_final_max.reindex(orden_presente + orden_restante)

df_final_max

### **Instancia FP MEDIANA:**

In [None]:
# Break-down attribution for the FP median instance and its result table.
breakdown_fp_mediana = exp.predict_parts(df_instancia_fp_mediana, type="break_down", random_state=42)
breakdown_fp_df_mediana = breakdown_fp_mediana.result

# SHAP attribution for the same instance.
shap_fp_mediana = exp.predict_parts(df_instancia_fp_mediana, type="shap", random_state=42)
shap_fp_df_mediana = shap_fp_mediana.result

# Surrogate explanation (stored under the lime_* names) for the same instance.
lime_fp_mediana = exp.predict_surrogate(df_instancia_fp_mediana, random_state=42)
lime_fp_df_mediana = lime_fp_mediana.result

In [None]:
# Plot the break-down explanation of the FP median instance.
breakdown_fp_mediana.plot()

In [None]:
# Plot the SHAP explanation of the FP median instance.
shap_fp_mediana.plot()

In [None]:
# Plot the surrogate explanation of the FP median instance.
lime_fp_mediana.plot()

In [None]:
# --- Break-down: keep feature name, contribution and sign -------------------
breakdown_fp_df_mediana = breakdown_fp_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
# Rows labelled 0 and 26 are removed; presumably the intercept and the final
# prediction that bracket the feature rows -- TODO confirm.
breakdown_fp_df_mediana = breakdown_fp_df_mediana.drop(index=[0, 26])
breakdown_fp_df_mediana['sign'] = breakdown_fp_df_mediana['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
breakdown_fp_df_mediana = breakdown_fp_df_mediana.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- SHAP: same columns ------------------------------------------------------
shap_fp_df_mediana = shap_fp_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
# Only the last 25 rows are kept; presumably the aggregated per-feature
# rows -- TODO confirm.
shap_fp_df_mediana = shap_fp_df_mediana.tail(25)
shap_fp_df_mediana['sign'] = shap_fp_df_mediana['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_fp_df_mediana = shap_fp_df_mediana.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- LIME --------------------------------------------------------------------
# LIME labels an effect with a rule such as "x <= 1.00" or "0.00 < x <= 1.00".
# For the second form the first whitespace-separated token is the numeric
# bound, not the feature, so strip a leading "<number> <" / "<number> <="
# before taking the first token.
regla_sin_cota = lime_fp_df_mediana["variable"].str.replace(r"^\s*[-+]?\d+(?:\.\d+)?\s*<=?\s*", "", regex=True)
lime_fp_df_mediana["Variable"] = regla_sin_cota.str.split(" ").str[0]
lime_fp_df_mediana["Signo"] = lime_fp_df_mediana["effect"].apply(evaluar_valor)
lime_fp_df_mediana = lime_fp_df_mediana.sort_values(by='effect', key=lambda x: abs(x), ascending=False)

# --- Rank by absolute contribution and keep the five strongest --------------
breakdown_fp_df_mediana['Ranking'] = breakdown_fp_df_mediana['contribution'].abs().rank(ascending=False).astype(int)
breakdown_fp_df_mediana = breakdown_fp_df_mediana.rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
breakdown_fp_df_mediana = breakdown_fp_df_mediana[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

shap_fp_df_mediana['Ranking'] = shap_fp_df_mediana['contribution'].abs().rank(ascending=False).astype(int)
shap_fp_df_mediana = shap_fp_df_mediana.rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
shap_fp_df_mediana = shap_fp_df_mediana[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

lime_fp_df_mediana['Ranking'] = lime_fp_df_mediana['effect'].abs().rank(ascending=False).astype(int)
lime_fp_df_mediana = lime_fp_df_mediana[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_fp_df_mediana)
print(shap_fp_df_mediana)
print(lime_fp_df_mediana)

In [None]:
#lime_fp_mediana.show_in_notebook()

In [None]:
# Top-five features reported by each technique for this instance.
breakdown_features = list(breakdown_fp_df_mediana['Variable'])
shapley_features = list(shap_fp_df_mediana['Variable'])
lime_features = list(lime_fp_df_mediana['Variable'])
# De-duplicate keeping first-seen order. A plain set() iterates strings in an
# order that changes between interpreter sessions, which made the row order of
# the table built below irreproducible.
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

# One row per feature; the column layout comes from `columns_multi`, which is
# defined outside this cell.
df_final_mediana = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_mediana['Variable'] = list(all_features)
df_final_mediana = df_final_mediana[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# Technique label -> its top-five table (read through the columns
# 'Variable', 'Ranking' and 'Signo').
top5_by_technique = {
    'Breakdown': breakdown_fp_df_mediana,
    'Shapley': shap_fp_df_mediana,
    'Lime': lime_fp_df_mediana,
}

for feature in df_final_mediana['Variable']:
    target_rows = df_final_mediana['Variable'] == feature
    for technique_name, top5 in top5_by_technique.items():
        hit = top5[top5['Variable'] == feature]
        if hit.empty:
            # The technique did not place this feature in its top five.
            rank_value, sign_value = '-', '-'
        else:
            rank_value = hit.iloc[0]['Ranking']
            sign_value = hit.iloc[0]['Signo']
        df_final_mediana.loc[target_rows, (technique_name, 'Ranking')] = rank_value
        df_final_mediana.loc[target_rows, (technique_name, 'Signo')] = sign_value

df_final_mediana

In [None]:
# Index the comparison table by feature name.
df_final_mediana.set_index('Variable', inplace=True)
# Hand-picked display order for this instance.
nuevo_orden = ["parallel_changed_file_num", "commit_num", "messages_max", "file_removed", "line_removed", "file_added", "file_modified", "fix_frequency"]

# Apply the hand-picked order, but tolerate drift between the list and the
# table: a bare reindex(nuevo_orden) would silently drop features that are not
# listed and add all-NaN rows for listed names that are not in the table.
orden_presente = [nombre for nombre in nuevo_orden if nombre in df_final_mediana.index]
orden_restante = [nombre for nombre in df_final_mediana.index if nombre not in nuevo_orden]
df_final_mediana = df_final_mediana.reindex(orden_presente + orden_restante)

df_final_mediana

### **Instancia FP MIN:**

In [None]:
# Break-down attribution for the FP min instance and its result table.
breakdown_fp_min = exp.predict_parts(df_instancia_fp_min, type="break_down", random_state=42)
breakdown_fp_df_min = breakdown_fp_min.result

# SHAP attribution for the same instance.
shap_fp_min = exp.predict_parts(df_instancia_fp_min, type="shap", random_state=42)
shap_fp_df_min = shap_fp_min.result

# Surrogate explanation (stored under the lime_* names) for the same instance.
lime_fp_min = exp.predict_surrogate(df_instancia_fp_min, random_state=42)
lime_fp_df_min = lime_fp_min.result

In [None]:
# Plot the break-down explanation of the FP min instance.
breakdown_fp_min.plot()

In [None]:
# Plot the SHAP explanation of the FP min instance.
shap_fp_min.plot()

In [None]:
# Plot the surrogate explanation of the FP min instance.
lime_fp_min.plot()

In [None]:
# --- Break-down: keep feature name, contribution and sign -------------------
breakdown_fp_df_min = breakdown_fp_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# Rows labelled 0 and 26 are removed; presumably the intercept and the final
# prediction that bracket the feature rows -- TODO confirm.
breakdown_fp_df_min = breakdown_fp_df_min.drop(index=[0, 26])
breakdown_fp_df_min['sign'] = breakdown_fp_df_min['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
breakdown_fp_df_min = breakdown_fp_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- SHAP: same columns ------------------------------------------------------
shap_fp_df_min = shap_fp_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# Only the last 25 rows are kept; presumably the aggregated per-feature
# rows -- TODO confirm.
shap_fp_df_min = shap_fp_df_min.tail(25)
shap_fp_df_min['sign'] = shap_fp_df_min['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_fp_df_min = shap_fp_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- LIME --------------------------------------------------------------------
# LIME labels an effect with a rule such as "x <= 1.00" or "0.00 < x <= 1.00".
# For the second form the first whitespace-separated token is the numeric
# bound, not the feature, so strip a leading "<number> <" / "<number> <="
# before taking the first token.
regla_sin_cota = lime_fp_df_min["variable"].str.replace(r"^\s*[-+]?\d+(?:\.\d+)?\s*<=?\s*", "", regex=True)
lime_fp_df_min["Variable"] = regla_sin_cota.str.split(" ").str[0]
lime_fp_df_min["Signo"] = lime_fp_df_min["effect"].apply(evaluar_valor)
lime_fp_df_min = lime_fp_df_min.sort_values(by='effect', key=lambda x: abs(x), ascending=False)

# --- Rank by absolute contribution and keep the five strongest --------------
breakdown_fp_df_min['Ranking'] = breakdown_fp_df_min['contribution'].abs().rank(ascending=False).astype(int)
breakdown_fp_df_min = breakdown_fp_df_min.rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
breakdown_fp_df_min = breakdown_fp_df_min[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

shap_fp_df_min['Ranking'] = shap_fp_df_min['contribution'].abs().rank(ascending=False).astype(int)
shap_fp_df_min = shap_fp_df_min.rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
shap_fp_df_min = shap_fp_df_min[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

lime_fp_df_min['Ranking'] = lime_fp_df_min['effect'].abs().rank(ascending=False).astype(int)
lime_fp_df_min = lime_fp_df_min[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_fp_df_min)
print(shap_fp_df_min)
print(lime_fp_df_min)

In [None]:
#lime_fp_min.show_in_notebook()

In [None]:
# Top-five features reported by each technique for this instance.
breakdown_features = list(breakdown_fp_df_min['Variable'])
shapley_features = list(shap_fp_df_min['Variable'])
lime_features = list(lime_fp_df_min['Variable'])
# De-duplicate keeping first-seen order. A plain set() iterates strings in an
# order that changes between interpreter sessions, which made the row order of
# the table built below irreproducible.
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

# One row per feature; the column layout comes from `columns_multi`, which is
# defined outside this cell.
df_final_min = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_min['Variable'] = list(all_features)
df_final_min = df_final_min[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# Technique label -> its top-five table (read through the columns
# 'Variable', 'Ranking' and 'Signo').
top5_by_technique = {
    'Breakdown': breakdown_fp_df_min,
    'Shapley': shap_fp_df_min,
    'Lime': lime_fp_df_min,
}

for feature in df_final_min['Variable']:
    target_rows = df_final_min['Variable'] == feature
    for technique_name, top5 in top5_by_technique.items():
        hit = top5[top5['Variable'] == feature]
        if hit.empty:
            # The technique did not place this feature in its top five.
            rank_value, sign_value = '-', '-'
        else:
            rank_value = hit.iloc[0]['Ranking']
            sign_value = hit.iloc[0]['Signo']
        df_final_min.loc[target_rows, (technique_name, 'Ranking')] = rank_value
        df_final_min.loc[target_rows, (technique_name, 'Signo')] = sign_value

df_final_min

In [None]:
# Index the comparison table by feature name.
df_final_min.set_index('Variable', inplace=True)
# Hand-picked display order for this instance.
nuevo_orden = ["parallel_changed_file_num", "commit_num", "commit_density", "line_added", "line_removed", "developer_num", "file_added", "messages_min", "file_removed", "file_modified"]

# Apply the hand-picked order, but tolerate drift between the list and the
# table: a bare reindex(nuevo_orden) would silently drop features that are not
# listed and add all-NaN rows for listed names that are not in the table.
orden_presente = [nombre for nombre in nuevo_orden if nombre in df_final_min.index]
orden_restante = [nombre for nombre in df_final_min.index if nombre not in nuevo_orden]
df_final_min = df_final_min.reindex(orden_presente + orden_restante)

df_final_min

### **FP General:**

In [None]:
# (feature, rank) pairs collected per technique across the three instance
# tables (max, median, min).
ranking_valores = {tecnica: [] for tecnica in ('Breakdown', 'Shapley', 'Lime')}

for tecnica in ranking_valores:
    for df_instancia in (df_final_max, df_final_mediana, df_final_min):
        columna_ranking = df_instancia[(tecnica, 'Ranking')]
        for caracteristica in df_instancia.index:
            ranking = columna_ranking[caracteristica]
            # "-" marks a feature outside the technique's top five. NaN appears
            # when a table was reindexed with a name it did not contain;
            # int(NaN) would raise, so both are skipped.
            if pd.isna(ranking) or ranking == "-":
                continue
            ranking_valores[tecnica].append((caracteristica, int(ranking)))

# Mean rank and number of appearances per technique and feature, computed in
# one pass over the collected pairs (plain dicts keep the printed output as is).
ranking_medio = {tecnica: {} for tecnica in ranking_valores}
apariciones_count = {tecnica: {} for tecnica in ranking_valores}

for tecnica, pares in ranking_valores.items():
    rankings_por_caracteristica = {}
    for caracteristica, ranking in pares:
        rankings_por_caracteristica.setdefault(caracteristica, []).append(ranking)
    for caracteristica, rankings in rankings_por_caracteristica.items():
        ranking_medio[tecnica][caracteristica] = sum(rankings) / len(rankings)
        apariciones_count[tecnica][caracteristica] = len(rankings)

# Raw pairs, appearance counts and mean ranks per technique.
print(ranking_valores)
print(apariciones_count)
print(ranking_medio)

In [None]:
# Techniques being compared.
techniques = ['Breakdown', 'Shapley', 'Lime']

# Column key -> list of values, one entry per feature.
data_dict = {}

# Every feature ranked by at least one technique. Sorted so the row order does
# not depend on set iteration order, which varies between sessions.
all_caract = sorted(set().union(*(ranking_medio[technique] for technique in techniques)))

for technique in techniques:
    # "-" marks a feature the technique never placed in a top five.
    data_dict[(technique, "Ranking Medio")] = [ranking_medio[technique].get(c, "-") for c in all_caract]
    data_dict[(technique, "Conteo")] = [apariciones_count[technique].get(c, "-") for c in all_caract]

# Summary table: one row per feature, (technique, metric) columns.
df_resumen_brf_fp = pd.DataFrame(data_dict, index=all_caract)

df_resumen_brf_fp

In [None]:
columnas_ranking = [(tech, "Ranking Medio") for tech in techniques]
columnas_conteo = [(tech, "Conteo") for tech in techniques]

# Overall mean rank, ignoring the "-" placeholders. The columns are object
# dtype (numbers mixed with "-"), so convert them explicitly instead of relying
# on replace() silently downcasting the result, which pandas has deprecated.
df_resumen_brf_fp[("General", "Ranking")] = (
    df_resumen_brf_fp[columnas_ranking].apply(pd.to_numeric, errors="coerce").mean(axis=1)
)

# Total number of appearances; a "-" placeholder counts as zero.
df_resumen_brf_fp[("General", "Conteo Total")] = (
    df_resumen_brf_fp[columnas_conteo]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
    .astype(int)
    .sum(axis=1)
)

# Show the updated summary table.
df_resumen_brf_fp

In [None]:
# Number of features (rows) in the summary table.
num_caract = len(df_resumen_brf_fp)

col_ranking = ("General", "Ranking")
col_conteo = ("General", "Conteo Total")
col_peso_rango = ("General", "Peso Rango")
col_peso_conteo = ("General", "Peso Conteo")
col_puntaje = ("General", "Puntaje")

# Rank-based weights: a lower mean rank and a higher appearance count both
# yield a larger weight.
posicion_ranking = df_resumen_brf_fp[col_ranking].rank(ascending=True)
df_resumen_brf_fp[col_peso_rango] = 1 - ((posicion_ranking - 1) / num_caract)
posicion_conteo = df_resumen_brf_fp[col_conteo].rank(ascending=True)
df_resumen_brf_fp[col_peso_conteo] = posicion_conteo / num_caract

# The score is the sum of both weights; the general ranking is then replaced
# by the position of that score (highest score first, ties share the minimum).
df_resumen_brf_fp[col_puntaje] = df_resumen_brf_fp[col_peso_rango] + df_resumen_brf_fp[col_peso_conteo]
df_resumen_brf_fp[col_ranking] = df_resumen_brf_fp[col_puntaje].rank(ascending=False, method="min")

# Remove the helper sub-columns in a single call.
df_resumen_brf_fp.drop(["Peso Rango", "Peso Conteo", "Puntaje"], axis=1, level=1, inplace=True)

df_resumen_brf_fp

In [None]:
# Current (technique, metric) column keys of the summary table.
columns = df_resumen_brf_fp.columns

# Boolean mask of the columns that belong to the "General" group.
general_columns = columns.get_level_values(0) == "General"

# "General" columns go first, in this fixed order.
general_subcolumns = [("General", "Ranking"), ("General", "Conteo Total")]

# Every remaining column keeps its current relative order.
technique_subcolumns = [
    columna for columna, es_general in zip(columns, general_columns) if not es_general
]

new_columns = [*general_subcolumns, *technique_subcolumns]

# Rebuild the table with the reordered columns.
df_resumen_brf_fp = df_resumen_brf_fp[new_columns]

In [None]:
def _formatear_ranking_medio(valor):
    """Render numeric mean ranks with two decimals; leave other values untouched."""
    if isinstance(valor, (int, float)):
        return f"{valor:.2f}"
    return valor


# Best general ranking first.
df_resumen_brf_fp.sort_values(by=("General", "Ranking"), ascending=True, inplace=True)

# Format the per-technique mean ranks for display.
for tech in techniques:
    columna = (tech, "Ranking Medio")
    df_resumen_brf_fp[columna] = df_resumen_brf_fp[columna].apply(_formatear_ranking_medio)

df_resumen_brf_fp

### **Instancia FN MAX:**

In [None]:
# Break-down attribution for the FN max instance and its result table.
breakdown_fn_max = exp.predict_parts(df_instancia_fn_max, type="break_down", random_state=42)
breakdown_fn_df_max = breakdown_fn_max.result

# SHAP attribution for the same instance.
shap_fn_max = exp.predict_parts(df_instancia_fn_max, type="shap", random_state=42)
shap_fn_df_max = shap_fn_max.result

# Surrogate explanation (stored under the lime_* names) for the same instance.
lime_fn_max = exp.predict_surrogate(df_instancia_fn_max, random_state=42)
lime_fn_df_max = lime_fn_max.result

In [None]:
# Plot the break-down explanation of the FN max instance.
breakdown_fn_max.plot()

In [None]:
# Plot the SHAP explanation of the FN max instance.
shap_fn_max.plot()

In [None]:
# Plot the surrogate explanation of the FN max instance.
lime_fn_max.plot()

In [None]:
# --- Break-down: keep feature name, contribution and sign -------------------
breakdown_fn_df_max = breakdown_fn_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# Rows labelled 0 and 26 are removed; presumably the intercept and the final
# prediction that bracket the feature rows -- TODO confirm.
breakdown_fn_df_max = breakdown_fn_df_max.drop(index=[0, 26])
breakdown_fn_df_max['sign'] = breakdown_fn_df_max['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
breakdown_fn_df_max = breakdown_fn_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- SHAP: same columns ------------------------------------------------------
shap_fn_df_max = shap_fn_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# Only the last 25 rows are kept; presumably the aggregated per-feature
# rows -- TODO confirm.
shap_fn_df_max = shap_fn_df_max.tail(25)
shap_fn_df_max['sign'] = shap_fn_df_max['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_fn_df_max = shap_fn_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- LIME --------------------------------------------------------------------
# LIME labels an effect with a rule such as "x <= 1.00" or "0.00 < x <= 1.00".
# For the second form the first whitespace-separated token is the numeric
# bound, not the feature, so strip a leading "<number> <" / "<number> <="
# before taking the first token.
regla_sin_cota = lime_fn_df_max["variable"].str.replace(r"^\s*[-+]?\d+(?:\.\d+)?\s*<=?\s*", "", regex=True)
lime_fn_df_max["Variable"] = regla_sin_cota.str.split(" ").str[0]
lime_fn_df_max["Signo"] = lime_fn_df_max["effect"].apply(evaluar_valor)
lime_fn_df_max = lime_fn_df_max.sort_values(by='effect', key=lambda x: abs(x), ascending=False)

# --- Rank by absolute contribution and keep the five strongest --------------
breakdown_fn_df_max['Ranking'] = breakdown_fn_df_max['contribution'].abs().rank(ascending=False).astype(int)
breakdown_fn_df_max = breakdown_fn_df_max.rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
breakdown_fn_df_max = breakdown_fn_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

shap_fn_df_max['Ranking'] = shap_fn_df_max['contribution'].abs().rank(ascending=False).astype(int)
shap_fn_df_max = shap_fn_df_max.rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
shap_fn_df_max = shap_fn_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

lime_fn_df_max['Ranking'] = lime_fn_df_max['effect'].abs().rank(ascending=False).astype(int)
lime_fn_df_max = lime_fn_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)
# Manual override kept from the original cell.
# NOTE(review): it presumably compensated for the first-token parsing problem
# handled above; confirm row 3 is already 'developer_num' and then remove it.
lime_fn_df_max.at[3, 'Variable'] = 'developer_num'

print(breakdown_fn_df_max)
print(shap_fn_df_max)
print(lime_fn_df_max)

In [None]:
#lime_fn_max.show_in_notebook()

In [None]:
# Gather the top-5 feature names reported by each explanation technique.
breakdown_features = breakdown_fn_df_max['Variable'].tolist()
shapley_features = shap_fn_df_max['Variable'].tolist()
lime_features = lime_fn_df_max['Variable'].tolist()

# De-duplicate across the three techniques, then reverse the resulting list.
all_features = list(set(breakdown_features + shapley_features + lime_features))
all_features.reverse()

# One row per distinct feature; the column layout comes from `columns_multi`.
df_final_max = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_max['Variable'] = list(all_features)
df_final_max = df_final_max[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for nombres in (breakdown_features, shapley_features, lime_features, all_features):
    print(nombres)

# Top-5 table of each technique, keyed by its column group in the final table.
tablas_top5 = {
    'Breakdown': breakdown_fn_df_max,
    'Shapley': shap_fn_df_max,
    'Lime': lime_fn_df_max,
}

for feature in df_final_max['Variable']:
    fila_destino = df_final_max['Variable'] == feature
    for tecnica, tabla in tablas_top5.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # Feature is not in this technique's top 5: mark both cells "-".
            ranking, signo = '-', '-'
        else:
            primera = coincidencias.iloc[0]
            ranking, signo = primera['Ranking'], primera['Signo']
        df_final_max.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_max.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_max

In [None]:
# Hard-coded row order for this instance's table.
nuevo_orden = ["commit_num", "parallel_changed_file_num", "line_removed", "developer_num", "use_frequency", "line_added", "file_removed", "duration"]

# Index the table by feature name and arrange its rows in that order.
df_final_max = df_final_max.set_index('Variable').reindex(nuevo_orden)

df_final_max

### **Instancia FN MEDIANA:**

In [None]:
# Explain the FN MEDIANA instance with break-down, SHAP and the LIME surrogate.
breakdown_fn_mediana, shap_fn_mediana = [
    exp.predict_parts(df_instancia_fn_mediana, type=tipo, random_state=42)
    for tipo in ("break_down", "shap")
]
lime_fn_mediana = exp.predict_surrogate(df_instancia_fn_mediana, random_state=42)

# Keep handles on the raw result tables for the post-processing cells.
breakdown_fn_df_mediana, shap_fn_df_mediana, lime_fn_df_mediana = (
    breakdown_fn_mediana.result,
    shap_fn_mediana.result,
    lime_fn_mediana.result,
)

In [None]:
# Plot the break-down explanation of the FN MEDIANA instance.
breakdown_fn_mediana.plot()

In [None]:
# Plot the SHAP explanation of the FN MEDIANA instance.
shap_fn_mediana.plot()

In [None]:
# Plot the LIME surrogate explanation of the FN MEDIANA instance.
lime_fn_mediana.plot()

In [None]:
# Build the top-5 summary tables (feature, rank by |contribution|, sign) for
# the break-down, SHAP and LIME explanations of the FN MEDIANA instance.
# Every table is rebuilt from a copy of the explanation's `.result`, so the
# cell can be re-run and the explanation objects are left unmodified.
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}
columnas_contribucion = ['variable_name', 'contribution', 'sign']
columnas_finales = ['Variable', 'Ranking', 'Signo']

# Break-down: drop the rows labelled 0 and 26 (presumably the intercept and
# prediction rows of a 25-feature explanation -- TODO confirm).
breakdown_fn_df_mediana = breakdown_fn_mediana.result.loc[:, columnas_contribucion].drop(index=[0, 26]).copy()
breakdown_fn_df_mediana['sign'] = breakdown_fn_df_mediana['sign'].replace(etiquetas_signo)
breakdown_fn_df_mediana = breakdown_fn_df_mediana.sort_values(by='contribution', key=lambda s: s.abs(), ascending=False)

# SHAP: keep only the last 25 rows of the result (presumably the aggregated
# per-feature contributions -- TODO confirm).
shap_fn_df_mediana = shap_fn_mediana.result.loc[:, columnas_contribucion].tail(25).copy()
shap_fn_df_mediana['sign'] = shap_fn_df_mediana['sign'].replace(etiquetas_signo)
shap_fn_df_mediana = shap_fn_df_mediana.sort_values(by='contribution', key=lambda s: s.abs(), ascending=False)

# LIME: the `variable` column holds bin conditions, not bare names. Take the
# first identifier-like token instead of the first whitespace token, so that
# a two-sided bin such as "0.00 < feat <= 1.00" yields "feat" and not "0.00".
lime_fn_df_mediana = lime_fn_mediana.result.copy()
lime_fn_df_mediana["Variable"] = lime_fn_df_mediana["variable"].str.extract(r'\b([A-Za-z_]\w*)', expand=False)
lime_fn_df_mediana["Signo"] = lime_fn_df_mediana["effect"].apply(evaluar_valor)
lime_fn_df_mediana = lime_fn_df_mediana.sort_values(by='effect', key=lambda s: s.abs(), ascending=False)

# Rank by absolute contribution. method='min' gives tied values the same
# integer rank instead of truncating an averaged (fractional) rank.
breakdown_fn_df_mediana['Ranking'] = breakdown_fn_df_mediana['contribution'].abs().rank(ascending=False, method='min').astype(int)
shap_fn_df_mediana['Ranking'] = shap_fn_df_mediana['contribution'].abs().rank(ascending=False, method='min').astype(int)
lime_fn_df_mediana['Ranking'] = lime_fn_df_mediana['effect'].abs().rank(ascending=False, method='min').astype(int)

# Keep the five largest contributions and only the reporting columns
# (the raw contribution/effect values are dropped).
nombres_finales = {'sign': 'Signo', 'variable_name': 'Variable'}
breakdown_fn_df_mediana = breakdown_fn_df_mediana.rename(columns=nombres_finales)[columnas_finales].head(5).reset_index(drop=True)
shap_fn_df_mediana = shap_fn_df_mediana.rename(columns=nombres_finales)[columnas_finales].head(5).reset_index(drop=True)
lime_fn_df_mediana = lime_fn_df_mediana[columnas_finales].head(5).reset_index(drop=True)

print(breakdown_fn_df_mediana)
print(shap_fn_df_mediana)
print(lime_fn_df_mediana)

In [None]:
#lime_fn_mediana.show_in_notebook()

In [None]:
# Gather the top-5 feature names reported by each explanation technique.
breakdown_features = breakdown_fn_df_mediana['Variable'].tolist()
shapley_features = shap_fn_df_mediana['Variable'].tolist()
lime_features = lime_fn_df_mediana['Variable'].tolist()

# De-duplicate across the three techniques, then reverse the resulting list.
all_features = list(set(breakdown_features + shapley_features + lime_features))
all_features.reverse()

# One row per distinct feature; the column layout comes from `columns_multi`.
df_final_mediana = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_mediana['Variable'] = list(all_features)
df_final_mediana = df_final_mediana[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for nombres in (breakdown_features, shapley_features, lime_features, all_features):
    print(nombres)

# Top-5 table of each technique, keyed by its column group in the final table.
tablas_top5 = {
    'Breakdown': breakdown_fn_df_mediana,
    'Shapley': shap_fn_df_mediana,
    'Lime': lime_fn_df_mediana,
}

for feature in df_final_mediana['Variable']:
    fila_destino = df_final_mediana['Variable'] == feature
    for tecnica, tabla in tablas_top5.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # Feature is not in this technique's top 5: mark both cells "-".
            ranking, signo = '-', '-'
        else:
            primera = coincidencias.iloc[0]
            ranking, signo = primera['Ranking'], primera['Signo']
        df_final_mediana.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_mediana.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_mediana

In [None]:
# Hard-coded row order for this instance's table.
nuevo_orden = ["parallel_changed_file_num", "file_modified", "developer_num", "line_added", "commit_num", "file_added", "file_removed"]

# Index the table by feature name and arrange its rows in that order.
df_final_mediana = df_final_mediana.set_index('Variable').reindex(nuevo_orden)

df_final_mediana

### **Instancia FN MIN:**

In [None]:
# Explain the FN MIN instance with break-down, SHAP and the LIME surrogate.
breakdown_fn_min, shap_fn_min = [
    exp.predict_parts(df_instancia_fn_min, type=tipo, random_state=42)
    for tipo in ("break_down", "shap")
]
lime_fn_min = exp.predict_surrogate(df_instancia_fn_min, random_state=42)

# Keep handles on the raw result tables for the post-processing cells.
breakdown_fn_df_min, shap_fn_df_min, lime_fn_df_min = (
    breakdown_fn_min.result,
    shap_fn_min.result,
    lime_fn_min.result,
)

In [None]:
# Plot the break-down explanation of the FN MIN instance.
breakdown_fn_min.plot()

In [None]:
# Plot the SHAP explanation of the FN MIN instance.
shap_fn_min.plot()

In [None]:
# Plot the LIME surrogate explanation of the FN MIN instance.
lime_fn_min.plot()

In [None]:
# Build the top-5 summary tables (feature, rank by |contribution|, sign) for
# the break-down, SHAP and LIME explanations of the FN MIN instance.
# Every table is rebuilt from a copy of the explanation's `.result`, so the
# cell can be re-run and the explanation objects are left unmodified.
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}
columnas_contribucion = ['variable_name', 'contribution', 'sign']
columnas_finales = ['Variable', 'Ranking', 'Signo']

# Break-down: drop the rows labelled 0 and 26 (presumably the intercept and
# prediction rows of a 25-feature explanation -- TODO confirm).
breakdown_fn_df_min = breakdown_fn_min.result.loc[:, columnas_contribucion].drop(index=[0, 26]).copy()
breakdown_fn_df_min['sign'] = breakdown_fn_df_min['sign'].replace(etiquetas_signo)
breakdown_fn_df_min = breakdown_fn_df_min.sort_values(by='contribution', key=lambda s: s.abs(), ascending=False)

# SHAP: keep only the last 25 rows of the result (presumably the aggregated
# per-feature contributions -- TODO confirm).
shap_fn_df_min = shap_fn_min.result.loc[:, columnas_contribucion].tail(25).copy()
shap_fn_df_min['sign'] = shap_fn_df_min['sign'].replace(etiquetas_signo)
shap_fn_df_min = shap_fn_df_min.sort_values(by='contribution', key=lambda s: s.abs(), ascending=False)

# LIME: the `variable` column holds bin conditions, not bare names. Take the
# first identifier-like token instead of the first whitespace token, so that
# a two-sided bin such as "0.00 < feat <= 1.00" yields "feat" and not "0.00".
lime_fn_df_min = lime_fn_min.result.copy()
lime_fn_df_min["Variable"] = lime_fn_df_min["variable"].str.extract(r'\b([A-Za-z_]\w*)', expand=False)
lime_fn_df_min["Signo"] = lime_fn_df_min["effect"].apply(evaluar_valor)
lime_fn_df_min = lime_fn_df_min.sort_values(by='effect', key=lambda s: s.abs(), ascending=False)

# Rank by absolute contribution. method='min' gives tied values the same
# integer rank instead of truncating an averaged (fractional) rank.
breakdown_fn_df_min['Ranking'] = breakdown_fn_df_min['contribution'].abs().rank(ascending=False, method='min').astype(int)
shap_fn_df_min['Ranking'] = shap_fn_df_min['contribution'].abs().rank(ascending=False, method='min').astype(int)
lime_fn_df_min['Ranking'] = lime_fn_df_min['effect'].abs().rank(ascending=False, method='min').astype(int)

# Keep the five largest contributions and only the reporting columns
# (the raw contribution/effect values are dropped).
nombres_finales = {'sign': 'Signo', 'variable_name': 'Variable'}
breakdown_fn_df_min = breakdown_fn_df_min.rename(columns=nombres_finales)[columnas_finales].head(5).reset_index(drop=True)
shap_fn_df_min = shap_fn_df_min.rename(columns=nombres_finales)[columnas_finales].head(5).reset_index(drop=True)
lime_fn_df_min = lime_fn_df_min[columnas_finales].head(5).reset_index(drop=True)

print(breakdown_fn_df_min)
print(shap_fn_df_min)
print(lime_fn_df_min)

In [None]:
#lime_fn_min.show_in_notebook()

In [None]:
# Gather the top-5 feature names reported by each explanation technique.
breakdown_features = breakdown_fn_df_min['Variable'].tolist()
shapley_features = shap_fn_df_min['Variable'].tolist()
lime_features = lime_fn_df_min['Variable'].tolist()

# De-duplicate across the three techniques, then reverse the resulting list.
all_features = list(set(breakdown_features + shapley_features + lime_features))
all_features.reverse()

# One row per distinct feature; the column layout comes from `columns_multi`.
df_final_min = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_min['Variable'] = list(all_features)
df_final_min = df_final_min[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for nombres in (breakdown_features, shapley_features, lime_features, all_features):
    print(nombres)

# Top-5 table of each technique, keyed by its column group in the final table.
tablas_top5 = {
    'Breakdown': breakdown_fn_df_min,
    'Shapley': shap_fn_df_min,
    'Lime': lime_fn_df_min,
}

for feature in df_final_min['Variable']:
    fila_destino = df_final_min['Variable'] == feature
    for tecnica, tabla in tablas_top5.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # Feature is not in this technique's top 5: mark both cells "-".
            ranking, signo = '-', '-'
        else:
            primera = coincidencias.iloc[0]
            ranking, signo = primera['Ranking'], primera['Signo']
        df_final_min.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_min.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_min

In [None]:
# Hard-coded row order for this instance's table.
nuevo_orden = ["parallel_changed_file_num", "developer_num", "commit_num", "line_added", "duration", "commit_density", "file_removed", "file_modified"]

# Index the table by feature name and arrange its rows in that order.
df_final_min = df_final_min.set_index('Variable').reindex(nuevo_orden)

df_final_min

### **FN General:**

In [None]:
# Collect, per technique, the (feature, rank) pairs found in the three
# per-instance tables (max, median, min).
ranking_valores = {
    'Breakdown': [],
    'Shapley': [],
    'Lime': []
}

for tecnica in ranking_valores:
    for df_instancia in (df_final_max, df_final_mediana, df_final_min):
        for caracteristica, ranking in df_instancia[(tecnica, 'Ranking')].items():
            # "-" marks a feature outside that technique's top 5. NaN shows up
            # when a table was reindexed with a feature name it did not
            # contain. Neither carries a rank, and int(NaN) would raise.
            if pd.isna(ranking) or ranking == "-":
                continue
            ranking_valores[tecnica].append((caracteristica, int(ranking)))

# Mean rank and number of appearances per technique and feature, both derived
# from the same grouping of the collected pairs.
ranking_medio = {}
apariciones_count = {}

for tecnica, pares in ranking_valores.items():
    rankings_por_caracteristica = {}
    for caracteristica, ranking in pares:
        rankings_por_caracteristica.setdefault(caracteristica, []).append(ranking)

    ranking_medio[tecnica] = {
        caracteristica: sum(valores) / len(valores)
        for caracteristica, valores in rankings_por_caracteristica.items()
    }
    apariciones_count[tecnica] = {
        caracteristica: len(valores)
        for caracteristica, valores in rankings_por_caracteristica.items()
    }

# Raw pairs, appearance counts and mean ranks, each keyed by technique.
print(ranking_valores)
print(apariciones_count)
print(ranking_medio)

In [None]:
# Techniques summarised in the table.
techniques = ['Breakdown', 'Shapley', 'Lime']

# Column data for the summary frame, keyed by (technique, sub-column).
data_dict = {}

# Every feature seen by any technique, in first-appearance order. A set would
# make the row order depend on string hashing, which changes between kernel
# sessions and therefore changes how later sorts break ties.
all_caract = list(dict.fromkeys(
    c for technique in techniques for c in ranking_medio[technique]
))

for technique in techniques:
    # "-" marks a feature that this technique never placed in a top 5.
    rank = [ranking_medio[technique].get(c, "-") for c in all_caract]
    apar = [apariciones_count[technique].get(c, "-") for c in all_caract]

    data_dict[(technique, "Ranking Medio")] = rank
    data_dict[(technique, "Conteo")] = apar

# Summary frame: one row per feature, one column pair per technique.
df_resumen_brf_fn = pd.DataFrame(data_dict, index=all_caract)

df_resumen_brf_fn

In [None]:
# The per-technique columns mix numbers with the "-" placeholder (object
# dtype). Convert them explicitly with pd.to_numeric instead of relying on
# replace() to downcast the result, so the aggregation is always numeric.
columnas_ranking = [(tech, "Ranking Medio") for tech in techniques]
columnas_conteo = [(tech, "Conteo") for tech in techniques]

# Overall mean rank, ignoring techniques where the feature is marked "-".
df_resumen_brf_fn[("General", "Ranking")] = (
    df_resumen_brf_fn[columnas_ranking]
    .apply(pd.to_numeric, errors='coerce')
    .mean(axis=1)
)

# Total number of appearances, counting "-" as zero.
df_resumen_brf_fn[("General", "Conteo Total")] = (
    df_resumen_brf_fn[columnas_conteo]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
    .sum(axis=1)
    .astype(int)
)

# Show the updated summary frame.
df_resumen_brf_fn

In [None]:
# Number of features (rows) in the summary.
num_caract = len(df_resumen_brf_fn)

# Weight from the mean rank: the best (lowest) mean rank gets weight 1.
peso_rango = 1 - ((df_resumen_brf_fn[("General", "Ranking")].rank(ascending=True) - 1) / num_caract)
# Weight from the appearance count: the highest count gets weight 1.
peso_conteo = df_resumen_brf_fn[("General", "Conteo Total")].rank(ascending=True) / num_caract

# Final score is the sum of both weights; the general ranking column is then
# overwritten with the position of each feature by descending score. The
# intermediate weights and score are not kept in the frame.
puntaje = peso_rango + peso_conteo
df_resumen_brf_fn[("General", "Ranking")] = puntaje.rank(ascending=False, method="min")

df_resumen_brf_fn

In [None]:
# Current columns of the summary frame.
columns = df_resumen_brf_fn.columns

# Boolean mask of the columns whose top level is "General".
general_columns = columns.get_level_values(0) == "General"

# The two "General" sub-columns, in the order they should appear.
general_subcolumns = [("General", "Ranking"), ("General", "Conteo Total")]

# Every remaining (technique, sub-column) pair, in its existing order.
technique_subcolumns = [
    columna
    for columna, es_general in zip(columns, general_columns)
    if not es_general
]

# "General" goes first, followed by the per-technique columns.
new_columns = [*general_subcolumns, *technique_subcolumns]

# Rebuild the frame with the reordered columns.
df_resumen_brf_fn = df_resumen_brf_fn[new_columns]

In [None]:
# Order the summary by the general ranking, best first.
df_resumen_brf_fn.sort_values(by=("General", "Ranking"), ascending=True, inplace=True)


def _formatear_ranking(valor):
    """Render numeric mean ranks with two decimals; leave other values as-is."""
    if isinstance(valor, (int, float)):
        return f"{valor:.2f}"
    return valor


# Format the mean-rank column of every technique.
for tech in techniques:
    columna = (tech, "Ranking Medio")
    df_resumen_brf_fn[columna] = df_resumen_brf_fn[columna].apply(_formatear_ranking)

df_resumen_brf_fn

## **GradientBoosting:**

**FEATURE IMPORTANCE**

In [None]:
# Permutation importance of the gradient-boosting model on the test split,
# scored with F1 and averaged over 20 shuffles per feature.
permu = permutation_importance(modelo_gb, x_test, y_test, n_repeats=20, random_state=42, n_jobs=2, scoring='f1')

# Only features whose mean importance exceeds this threshold are plotted.
importance_threshold = 0.01

# Boolean mask of the significant features, applied to means and std devs.
significant_indices = permu.importances_mean > importance_threshold
permu_importances = pd.Series(permu.importances_mean.round(3), index=feature_names)[significant_indices]
permu_std = permu.importances_std[significant_indices]

# Bar chart with the standard deviation across repeats as error bars.
fig, ax = plt.subplots()
permu_importances.plot.bar(yerr=permu_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
# The importances are drops in the F1 score (scoring='f1'), not in accuracy.
ax.set_ylabel("Mean F1-score decrease")
fig.tight_layout()
plt.show()

In [None]:
scoring = ['precision', 'recall','f1']
metric_names = ['Precision', 'Recall', 'F1-score']

# Only features whose mean importance exceeds this threshold are plotted.
importance_threshold = 0.01

# One permutation-importance run scored with the three metrics at once.
permu_score = permutation_importance(modelo_gb, x_test, y_test, n_repeats=20, random_state=42, scoring=scoring)
fig, axs = plt.subplots(1, 3, figsize=(15, 5))

# One horizontal bar chart per metric.
for i, metric in enumerate(scoring):
    permu = permu_score[metric]

    # Positions of the features that pass the threshold.
    significant_indices = [
        j for j, media in enumerate(permu.importances_mean)
        if media > importance_threshold
    ]
    nombres_significativos = [feature_names[j] for j in significant_indices]
    medias_significativas = permu.importances_mean[significant_indices]
    desviaciones_significativas = permu.importances_std[significant_indices]

    # Sort ascending by importance: with horizontal bars drawn bottom-up this
    # leaves the most important feature at the top of the chart.
    sorted_indices = np.argsort(medias_significativas)
    sorted_feature_names = [nombres_significativos[j] for j in sorted_indices]
    importances_mean = medias_significativas[sorted_indices]
    importances_std = desviaciones_significativas[sorted_indices]

    # Draw the chart in the subplot that belongs to this metric.
    eje = axs[i]
    posiciones = range(len(sorted_feature_names))
    eje.barh(posiciones, importances_mean, xerr=importances_std, align='center')
    eje.set_yticks(posiciones)
    eje.set_yticklabels(sorted_feature_names)
    eje.set_xlabel('Valor Importancia')
    eje.set_title(f'Importancia por Permutación para {metric_names[i]}')

# Tighten the spacing between subplots and show the figure.
plt.tight_layout()
plt.show()


In [None]:
scoring = ['precision','recall','f1']
metric_names = ['Precision', 'Recall', 'F1-score']

# Only features whose mean importance exceeds this threshold are kept.
importance_threshold = 0.01

# Result tables, keyed as 'df_global_<metric name>'.
results_global_gb = {}

permu_score = permutation_importance(modelo_gb, x_test, y_test, n_repeats=20, random_state=42, scoring=scoring)
for metric, nombre_metrica in zip(scoring, metric_names):
    permu = permu_score[metric]

    # Positions of the features that pass the threshold.
    significant_indices = [
        j for j, media in enumerate(permu.importances_mean)
        if media > importance_threshold
    ]

    # Table of the significant features, most important first.
    df_exp_global = pd.DataFrame({
        'Feature': [feature_names[j] for j in significant_indices],
        'Importance_Mean': permu.importances_mean[significant_indices],
        'Importance_Std': permu.importances_std[significant_indices],
    }).sort_values(by='Importance_Mean', ascending=False)

    # Store the table under the display name of the metric.
    results_global_gb[f'df_global_{nombre_metrica}'] = df_exp_global

In [None]:
# Show the permutation-importance table computed with the precision metric.
results_global_gb['df_global_Precision']

In [None]:
# Show the permutation-importance table computed with the recall metric.
results_global_gb['df_global_Recall']

In [None]:
# Show the permutation-importance table computed with the F1 metric.
results_global_gb['df_global_F1-score']

**BREAK-DOWN, SHAP Y LIME:**

In [None]:
# First, build the dalex explainer for the gradient-boosting model on the
# training data. This rebinds `exp`, which the explanation cells call.
exp = dx.Explainer(modelo_gb, x_train, y_train)

### **Instancia VP MAX:**

In [None]:
# Explain the VP MAX instance with break-down, SHAP and the LIME surrogate.
breakdown_vp_max, shap_vp_max = [
    exp.predict_parts(df_instancia_vp_max, type=tipo, random_state=42)
    for tipo in ("break_down", "shap")
]
lime_vp_max = exp.predict_surrogate(df_instancia_vp_max, random_state=42)

# Keep handles on the raw result tables for the post-processing cells.
breakdown_vp_df_max, shap_vp_df_max, lime_vp_df_max = (
    breakdown_vp_max.result,
    shap_vp_max.result,
    lime_vp_max.result,
)

In [None]:
# Plot the break-down explanation of the VP MAX instance.
breakdown_vp_max.plot()

In [None]:
# Plot the SHAP explanation of the VP MAX instance.
shap_vp_max.plot()

In [None]:
# Plot the LIME surrogate explanation of the VP MAX instance.
lime_vp_max.plot()

In [None]:
# Build the top-5 summary tables (feature, rank by |contribution|, sign) for
# the break-down, SHAP and LIME explanations of the VP MAX instance.
# Every table is rebuilt from a copy of the explanation's `.result`, so the
# cell can be re-run and the explanation objects are left unmodified.
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}
columnas_contribucion = ['variable_name', 'contribution', 'sign']
columnas_finales = ['Variable', 'Ranking', 'Signo']

# Break-down: drop the rows labelled 0 and 26 (presumably the intercept and
# prediction rows of a 25-feature explanation -- TODO confirm).
breakdown_vp_df_max = breakdown_vp_max.result.loc[:, columnas_contribucion].drop(index=[0, 26]).copy()
breakdown_vp_df_max['sign'] = breakdown_vp_df_max['sign'].replace(etiquetas_signo)
breakdown_vp_df_max = breakdown_vp_df_max.sort_values(by='contribution', key=lambda s: s.abs(), ascending=False)

# SHAP: keep only the last 25 rows of the result (presumably the aggregated
# per-feature contributions -- TODO confirm).
shap_vp_df_max = shap_vp_max.result.loc[:, columnas_contribucion].tail(25).copy()
shap_vp_df_max['sign'] = shap_vp_df_max['sign'].replace(etiquetas_signo)
shap_vp_df_max = shap_vp_df_max.sort_values(by='contribution', key=lambda s: s.abs(), ascending=False)

# LIME: the `variable` column holds bin conditions, not bare names. Take the
# first identifier-like token instead of the first whitespace token, so that
# a two-sided bin such as "0.00 < feat <= 1.00" yields "feat" and not "0.00".
lime_vp_df_max = lime_vp_max.result.copy()
lime_vp_df_max["Variable"] = lime_vp_df_max["variable"].str.extract(r'\b([A-Za-z_]\w*)', expand=False)
lime_vp_df_max["Signo"] = lime_vp_df_max["effect"].apply(evaluar_valor)
lime_vp_df_max = lime_vp_df_max.sort_values(by='effect', key=lambda s: s.abs(), ascending=False)

# Rank by absolute contribution. method='min' gives tied values the same
# integer rank instead of truncating an averaged (fractional) rank.
breakdown_vp_df_max['Ranking'] = breakdown_vp_df_max['contribution'].abs().rank(ascending=False, method='min').astype(int)
shap_vp_df_max['Ranking'] = shap_vp_df_max['contribution'].abs().rank(ascending=False, method='min').astype(int)
lime_vp_df_max['Ranking'] = lime_vp_df_max['effect'].abs().rank(ascending=False, method='min').astype(int)

# Keep the five largest contributions and only the reporting columns
# (the raw contribution/effect values are dropped).
nombres_finales = {'sign': 'Signo', 'variable_name': 'Variable'}
breakdown_vp_df_max = breakdown_vp_df_max.rename(columns=nombres_finales)[columnas_finales].head(5).reset_index(drop=True)
shap_vp_df_max = shap_vp_df_max.rename(columns=nombres_finales)[columnas_finales].head(5).reset_index(drop=True)
lime_vp_df_max = lime_vp_df_max[columnas_finales].head(5).reset_index(drop=True)

print(breakdown_vp_df_max)
print(shap_vp_df_max)
print(lime_vp_df_max)

In [None]:
#lime_vp_max.show_in_notebook()

In [None]:
# Gather the top-5 feature names reported by each explanation technique.
breakdown_features = breakdown_vp_df_max['Variable'].tolist()
shapley_features = shap_vp_df_max['Variable'].tolist()
lime_features = lime_vp_df_max['Variable'].tolist()

# De-duplicate across the three techniques, then reverse the resulting list.
all_features = list(set(breakdown_features + shapley_features + lime_features))
all_features.reverse()

# One row per distinct feature; the column layout comes from `columns_multi`.
df_final_max = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_max['Variable'] = list(all_features)
df_final_max = df_final_max[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for nombres in (breakdown_features, shapley_features, lime_features, all_features):
    print(nombres)

# Top-5 table of each technique, keyed by its column group in the final table.
tablas_top5 = {
    'Breakdown': breakdown_vp_df_max,
    'Shapley': shap_vp_df_max,
    'Lime': lime_vp_df_max,
}

for feature in df_final_max['Variable']:
    fila_destino = df_final_max['Variable'] == feature
    for tecnica, tabla in tablas_top5.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # Feature is not in this technique's top 5: mark both cells "-".
            ranking, signo = '-', '-'
        else:
            primera = coincidencias.iloc[0]
            ranking, signo = primera['Ranking'], primera['Signo']
        df_final_max.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_max.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_max

In [None]:
# Hard-coded row order for this instance's table.
nuevo_orden = ["parallel_changed_file_num", "developer_num", "line_removed","file_removed", "duration", "commit_num", "file_added"]

# Index the table by feature name and arrange its rows in that order.
df_final_max = df_final_max.set_index('Variable').reindex(nuevo_orden)

df_final_max

### **Instancia VP MEDIANA:**

In [None]:
# Explain the VP MEDIANA instance with break-down, SHAP and the LIME surrogate.
breakdown_vp_mediana, shap_vp_mediana = [
    exp.predict_parts(df_instancia_vp_mediana, type=tipo, random_state=42)
    for tipo in ("break_down", "shap")
]
lime_vp_mediana = exp.predict_surrogate(df_instancia_vp_mediana, random_state=42)

# Keep handles on the raw result tables for the post-processing cells.
breakdown_vp_df_mediana, shap_vp_df_mediana, lime_vp_df_mediana = (
    breakdown_vp_mediana.result,
    shap_vp_mediana.result,
    lime_vp_mediana.result,
)

In [None]:
# Plot the break-down explanation of the VP MEDIANA instance.
breakdown_vp_mediana.plot()

In [None]:
# Plot the SHAP explanation of the VP MEDIANA instance.
shap_vp_mediana.plot()

In [None]:
# Plot the LIME surrogate explanation of the VP MEDIANA instance.
lime_vp_mediana.plot()

In [None]:
# Build the top-5 summary tables (feature, rank by |contribution|, sign) for
# the break-down, SHAP and LIME explanations of the VP MEDIANA instance.
# Every table is rebuilt from a copy of the explanation's `.result`, so the
# cell can be re-run and the explanation objects are left unmodified.
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}
columnas_contribucion = ['variable_name', 'contribution', 'sign']
columnas_finales = ['Variable', 'Ranking', 'Signo']

# Break-down: drop the rows labelled 0 and 26 (presumably the intercept and
# prediction rows of a 25-feature explanation -- TODO confirm).
breakdown_vp_df_mediana = breakdown_vp_mediana.result.loc[:, columnas_contribucion].drop(index=[0, 26]).copy()
breakdown_vp_df_mediana['sign'] = breakdown_vp_df_mediana['sign'].replace(etiquetas_signo)
breakdown_vp_df_mediana = breakdown_vp_df_mediana.sort_values(by='contribution', key=lambda s: s.abs(), ascending=False)

# SHAP: keep only the last 25 rows of the result (presumably the aggregated
# per-feature contributions -- TODO confirm).
shap_vp_df_mediana = shap_vp_mediana.result.loc[:, columnas_contribucion].tail(25).copy()
shap_vp_df_mediana['sign'] = shap_vp_df_mediana['sign'].replace(etiquetas_signo)
shap_vp_df_mediana = shap_vp_df_mediana.sort_values(by='contribution', key=lambda s: s.abs(), ascending=False)

# LIME: the `variable` column holds bin conditions, not bare names. Take the
# first identifier-like token instead of the first whitespace token, so that
# a two-sided bin such as "0.00 < feat <= 1.00" yields "feat" and not "0.00".
lime_vp_df_mediana = lime_vp_mediana.result.copy()
lime_vp_df_mediana["Variable"] = lime_vp_df_mediana["variable"].str.extract(r'\b([A-Za-z_]\w*)', expand=False)
lime_vp_df_mediana["Signo"] = lime_vp_df_mediana["effect"].apply(evaluar_valor)
lime_vp_df_mediana = lime_vp_df_mediana.sort_values(by='effect', key=lambda s: s.abs(), ascending=False)

# Rank by absolute contribution. method='min' gives tied values the same
# integer rank instead of truncating an averaged (fractional) rank.
breakdown_vp_df_mediana['Ranking'] = breakdown_vp_df_mediana['contribution'].abs().rank(ascending=False, method='min').astype(int)
shap_vp_df_mediana['Ranking'] = shap_vp_df_mediana['contribution'].abs().rank(ascending=False, method='min').astype(int)
lime_vp_df_mediana['Ranking'] = lime_vp_df_mediana['effect'].abs().rank(ascending=False, method='min').astype(int)

# Keep the five largest contributions and only the reporting columns
# (the raw contribution/effect values are dropped).
nombres_finales = {'sign': 'Signo', 'variable_name': 'Variable'}
breakdown_vp_df_mediana = breakdown_vp_df_mediana.rename(columns=nombres_finales)[columnas_finales].head(5).reset_index(drop=True)
shap_vp_df_mediana = shap_vp_df_mediana.rename(columns=nombres_finales)[columnas_finales].head(5).reset_index(drop=True)
lime_vp_df_mediana = lime_vp_df_mediana[columnas_finales].head(5).reset_index(drop=True)

print(breakdown_vp_df_mediana)
print(shap_vp_df_mediana)
print(lime_vp_df_mediana)

In [None]:
#lime_vp_mediana.show_in_notebook()

In [None]:
# Build the side-by-side comparison table for the VP MEDIANA instance: one row
# per feature that appears in the top list of any technique, and for every
# technique its 'Ranking' and 'Signo' (or '-' when the feature is not listed).
breakdown_features = breakdown_vp_df_mediana['Variable'].tolist()
shapley_features = shap_vp_df_mediana['Variable'].tolist()
lime_features = lime_vp_df_mediana['Variable'].tolist()

# Union of the three lists, taken in reversed set-iteration order.
all_features = list(set(breakdown_features + shapley_features + lime_features))[::-1]

df_final_mediana = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_mediana['Variable'] = all_features
df_final_mediana = df_final_mediana[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for nombre_lista in (breakdown_features, shapley_features, lime_features, all_features):
    print(nombre_lista)

# Source table for each top-level column group of the comparison table.
tablas_por_tecnica = {
    'Breakdown': breakdown_vp_df_mediana,
    'Shapley': shap_vp_df_mediana,
    'Lime': lime_vp_df_mediana,
}

for feature in df_final_mediana['Variable']:
    fila_destino = df_final_mediana['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # Feature not reported by this technique.
            ranking, signo = '-', '-'
        else:
            primera = coincidencias.iloc[0]
            ranking, signo = primera['Ranking'], primera['Signo']
        df_final_mediana.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_mediana.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_mediana

In [None]:
# Use the feature name as the row index. Guarded so that re-running the cell
# does not fail once 'Variable' has already been moved into the index.
if 'Variable' in df_final_mediana.columns:
    df_final_mediana = df_final_mediana.set_index('Variable')

# Preferred presentation order of the rows.
nuevo_orden = ["developer_num", "file_added", "line_removed", "file_removed", "parallel_changed_file_num", "remove_frequency", "commit_num"]

# Reorder the rows following the preferred order. A plain reindex(nuevo_orden)
# would insert all-NaN rows for listed features that are absent and silently
# drop features that are not listed; here only present features are ordered
# and any unlisted ones are kept at the end.
orden_presente = [v for v in nuevo_orden if v in df_final_mediana.index]
orden_restante = [v for v in df_final_mediana.index if v not in nuevo_orden]
df_final_mediana = df_final_mediana.reindex(orden_presente + orden_restante)

df_final_mediana

### **Instancia VP MIN:**

In [None]:
# Local explanations for the VP MIN instance. Every call uses random_state=42;
# the `.result` attribute of each explanation is kept under its own name for
# the tabular post-processing done in a later cell.
breakdown_vp_min = exp.predict_parts(df_instancia_vp_min, type="break_down", random_state=42)
breakdown_vp_df_min = breakdown_vp_min.result

shap_vp_min = exp.predict_parts(df_instancia_vp_min, type="shap", random_state=42)
shap_vp_df_min = shap_vp_min.result

lime_vp_min = exp.predict_surrogate(df_instancia_vp_min, random_state=42)
lime_vp_df_min = lime_vp_min.result

In [None]:
# Plot the break_down explanation computed for the VP MIN instance.
breakdown_vp_min.plot()

In [None]:
# Plot the shap explanation computed for the VP MIN instance.
shap_vp_min.plot()

In [None]:
# Plot the surrogate explanation (predict_surrogate) for the VP MIN instance.
lime_vp_min.plot()

In [None]:
# Post-process the three local explanations of the VP MIN instance into
# comparable tables with columns Variable / Ranking / Signo (top 5 rows each,
# ordered by absolute contribution).

# Textual labels for the numeric sign column.
_etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}

# ---- Break Down ----
breakdown_vp_df_min = breakdown_vp_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): presumably rows 0 and 26 are the intercept and the final
# prediction rows of the result for a 25-feature model - confirm.
breakdown_vp_df_min = breakdown_vp_df_min.drop(index=[0, 26])
breakdown_vp_df_min['sign'] = breakdown_vp_df_min['sign'].replace(_etiquetas_signo)
breakdown_vp_df_min = breakdown_vp_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
breakdown_vp_df_min['Ranking'] = breakdown_vp_df_min['contribution'].abs().rank(ascending=False).astype(int)
breakdown_vp_df_min = breakdown_vp_df_min.rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
breakdown_vp_df_min = breakdown_vp_df_min[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# ---- Shapley ----
shap_vp_df_min = shap_vp_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): presumably the last 25 rows hold the aggregated contribution
# of each of the 25 features - confirm.
shap_vp_df_min = shap_vp_df_min.tail(25)
shap_vp_df_min['sign'] = shap_vp_df_min['sign'].replace(_etiquetas_signo)
shap_vp_df_min = shap_vp_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
shap_vp_df_min['Ranking'] = shap_vp_df_min['contribution'].abs().rank(ascending=False).astype(int)
shap_vp_df_min = shap_vp_df_min.rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
shap_vp_df_min = shap_vp_df_min[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# ---- LIME ----
# The 'variable' column holds a condition such as "feat <= 1.00", "feat > 3.00"
# or "0.00 < feat <= 3.00". Taking the first space-separated token returns the
# numeric bound for the range form, so the first identifier-like token is
# extracted instead; the old first-token behaviour is kept as a fallback for
# rows where no identifier is found.
_primer_token = lime_vp_df_min["variable"].str.split(" ").str[0]
_nombre_variable = lime_vp_df_min["variable"].str.extract(r'(?<![\w.])([A-Za-z_]\w*)', expand=False)
lime_vp_df_min["Variable"] = _nombre_variable.fillna(_primer_token)
lime_vp_df_min["Signo"] = lime_vp_df_min["effect"].apply(evaluar_valor)
lime_vp_df_min = lime_vp_df_min.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_vp_df_min = lime_vp_df_min.drop(columns=['variable'])
lime_vp_df_min['Ranking'] = lime_vp_df_min['effect'].abs().rank(ascending=False).astype(int)
lime_vp_df_min = lime_vp_df_min[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_vp_df_min)
print(shap_vp_df_min)
print(lime_vp_df_min)

In [None]:
#lime_vp_min.show_in_notebook()

In [None]:
# Build the side-by-side comparison table for the VP MIN instance: one row per
# feature that appears in the top list of any technique, and for every
# technique its 'Ranking' and 'Signo' (or '-' when the feature is not listed).
breakdown_features = breakdown_vp_df_min['Variable'].tolist()
shapley_features = shap_vp_df_min['Variable'].tolist()
lime_features = lime_vp_df_min['Variable'].tolist()

# Union of the three lists, taken in reversed set-iteration order.
all_features = list(set(breakdown_features + shapley_features + lime_features))[::-1]

df_final_min = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_min['Variable'] = all_features
df_final_min = df_final_min[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for nombre_lista in (breakdown_features, shapley_features, lime_features, all_features):
    print(nombre_lista)

# Source table for each top-level column group of the comparison table.
tablas_por_tecnica = {
    'Breakdown': breakdown_vp_df_min,
    'Shapley': shap_vp_df_min,
    'Lime': lime_vp_df_min,
}

for feature in df_final_min['Variable']:
    fila_destino = df_final_min['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # Feature not reported by this technique.
            ranking, signo = '-', '-'
        else:
            primera = coincidencias.iloc[0]
            ranking, signo = primera['Ranking'], primera['Signo']
        df_final_min.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_min.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_min

In [None]:
# Use the feature name as the row index. Guarded so that re-running the cell
# does not fail once 'Variable' has already been moved into the index.
if 'Variable' in df_final_min.columns:
    df_final_min = df_final_min.set_index('Variable')

# Preferred presentation order of the rows.
nuevo_orden = ["commit_density", "commit_num", "parallel_changed_file_num", "line_removed", "developer_num", "file_modified",  "file_removed", "improve_frequency"]

# Reorder the rows following the preferred order. A plain reindex(nuevo_orden)
# would insert all-NaN rows for listed features that are absent and silently
# drop features that are not listed; here only present features are ordered
# and any unlisted ones are kept at the end.
orden_presente = [v for v in nuevo_orden if v in df_final_min.index]
orden_restante = [v for v in df_final_min.index if v not in nuevo_orden]
df_final_min = df_final_min.reindex(orden_presente + orden_restante)

df_final_min

### **VP General:**

In [None]:
# Aggregate the per-instance VP comparison tables (max, mediana, min) into,
# for every technique:
#   ranking_valores   -> list of (feature, ranking) pairs over the three tables
#   ranking_medio     -> mean ranking per feature
#   apariciones_count -> number of tables in which the feature was ranked
#
# NOTE: df_final_max / df_final_mediana / df_final_min are reassigned further
# down in the notebook for the VN instances, so this cell must run while they
# still hold the VP tables.
ranking_valores = {
    'Breakdown': [],
    'Shapley': [],
    'Lime': []
}

# Collect the (feature, ranking) pairs per technique, table by table.
for tecnica in ranking_valores:
    for tabla in (df_final_max, df_final_mediana, df_final_min):
        for caracteristica, ranking in tabla[(tecnica, 'Ranking')].items():
            # Skip features the technique did not rank ("-") and missing values
            # (e.g. all-NaN rows created by a reindex), which int() cannot take.
            if pd.isna(ranking) or ranking == "-":
                continue
            ranking_valores[tecnica].append((caracteristica, int(ranking)))

# Mean ranking and number of appearances per technique and feature.
ranking_medio = {}
apariciones_count = {}

for tecnica, pares in ranking_valores.items():
    rankings_por_caracteristica = {}
    for caracteristica, ranking in pares:
        rankings_por_caracteristica.setdefault(caracteristica, []).append(ranking)

    ranking_medio[tecnica] = {
        caracteristica: sum(valores) / len(valores)
        for caracteristica, valores in rankings_por_caracteristica.items()
    }
    apariciones_count[tecnica] = {
        caracteristica: len(valores)
        for caracteristica, valores in rankings_por_caracteristica.items()
    }

# Raw pairs, appearance counts and mean rankings per technique.
print(ranking_valores)
print(apariciones_count)
print(ranking_medio)

In [None]:
# Techniques compared in the summary table.
techniques = ['Breakdown', 'Shapley', 'Lime']

# Every feature that received a mean ranking from at least one technique.
all_caract = list(set().union(*[set(ranking_medio[technique]) for technique in techniques]))

# One ("technique", "Ranking Medio") and one ("technique", "Conteo") column per
# technique; "-" marks features the technique never ranked.
data_dict = {}
for technique in techniques:
    medias = ranking_medio[technique]
    conteos = apariciones_count[technique]
    data_dict[(technique, "Ranking Medio")] = [medias.get(c, "-") for c in all_caract]
    data_dict[(technique, "Conteo")] = [conteos.get(c, "-") for c in all_caract]

# Summary DataFrame indexed by feature.
df_resumen_gb_vp = pd.DataFrame(data_dict, index=all_caract)

df_resumen_gb_vp

In [None]:
# Add the cross-technique aggregates to the summary table.
# The per-technique columns mix numbers with the "-" placeholder, so they are
# converted explicitly with pd.to_numeric (placeholder -> NaN) instead of
# relying on the implicit dtype downcasting of DataFrame.replace.
columnas_ranking = [(tech, "Ranking Medio") for tech in techniques]
columnas_conteo = [(tech, "Conteo") for tech in techniques]

# Mean of the available mean rankings; techniques without a value are ignored.
df_resumen_gb_vp[("General", "Ranking")] = (
    df_resumen_gb_vp[columnas_ranking]
    .apply(pd.to_numeric, errors="coerce")
    .mean(axis=1)
)

# Total number of appearances; a missing count contributes 0.
df_resumen_gb_vp[("General", "Conteo Total")] = (
    df_resumen_gb_vp[columnas_conteo]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
    .sum(axis=1)
    .astype(int)
)

# Show the updated summary table.
df_resumen_gb_vp

In [None]:
# Number of features (rows) in the summary table.
num_caract = len(df_resumen_gb_vp.index)

# Weight from the general mean ranking: the best (lowest) ranking gets 1.
peso_rango = 1 - ((df_resumen_gb_vp[("General", "Ranking")].rank(ascending=True) - 1) / num_caract)
# Weight from the total count: the highest count gets the largest weight.
peso_conteo = df_resumen_gb_vp[("General", "Conteo Total")].rank(ascending=True) / num_caract

# Final score is the sum of both weights; the general ranking is replaced by
# the rank of that score (highest score -> rank 1, ties share the lowest rank).
puntaje = peso_rango + peso_conteo
df_resumen_gb_vp[("General", "Ranking")] = puntaje.rank(ascending=False, method="min")

# The weights and the score are kept in local Series only, so no helper
# sub-columns are left in the table.
df_resumen_gb_vp

In [None]:
# Move the "General" columns to the front of the summary table, keeping the
# per-technique columns in their current relative order.
columnas_general = [("General", "Ranking"), ("General", "Conteo Total")]
columnas_tecnicas = [col for col in df_resumen_gb_vp.columns if col[0] != "General"]

df_resumen_gb_vp = df_resumen_gb_vp[columnas_general + columnas_tecnicas]

In [None]:
# Order the summary by the general ranking (best first).
df_resumen_gb_vp = df_resumen_gb_vp.sort_values(by=("General", "Ranking"), ascending=True)


def _formatear_ranking_medio(valor):
    """Render numeric mean rankings with two decimals; leave other values as is."""
    if isinstance(valor, (int, float)):
        return f"{valor:.2f}"
    return valor


# Format the mean-ranking column of every technique for display.
for tech in techniques:
    columna = (tech, "Ranking Medio")
    df_resumen_gb_vp[columna] = df_resumen_gb_vp[columna].apply(_formatear_ranking_medio)

df_resumen_gb_vp

### **Instancia VN MAX:**

In [None]:
# Local explanations for the VN MAX instance. Every call uses random_state=42;
# the `.result` attribute of each explanation is kept under its own name for
# the tabular post-processing done in a later cell.
breakdown_vn_max = exp.predict_parts(df_instancia_vn_max, type="break_down", random_state=42)
breakdown_vn_df_max = breakdown_vn_max.result

shap_vn_max = exp.predict_parts(df_instancia_vn_max, type="shap", random_state=42)
shap_vn_df_max = shap_vn_max.result

lime_vn_max = exp.predict_surrogate(df_instancia_vn_max, random_state=42)
lime_vn_df_max = lime_vn_max.result

In [None]:
# Plot the break_down explanation computed for the VN MAX instance.
breakdown_vn_max.plot()

In [None]:
# Plot the shap explanation computed for the VN MAX instance.
shap_vn_max.plot()

In [None]:
# Plot the surrogate explanation (predict_surrogate) for the VN MAX instance.
lime_vn_max.plot()

In [None]:
# Post-process the three local explanations of the VN MAX instance into
# comparable tables with columns Variable / Ranking / Signo (top 5 rows each,
# ordered by absolute contribution).

# Textual labels for the numeric sign column.
_etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}

# ---- Break Down ----
breakdown_vn_df_max = breakdown_vn_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): presumably rows 0 and 26 are the intercept and the final
# prediction rows of the result for a 25-feature model - confirm.
breakdown_vn_df_max = breakdown_vn_df_max.drop(index=[0, 26])
breakdown_vn_df_max['sign'] = breakdown_vn_df_max['sign'].replace(_etiquetas_signo)
breakdown_vn_df_max = breakdown_vn_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
breakdown_vn_df_max['Ranking'] = breakdown_vn_df_max['contribution'].abs().rank(ascending=False).astype(int)
breakdown_vn_df_max = breakdown_vn_df_max.rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
breakdown_vn_df_max = breakdown_vn_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# ---- Shapley ----
shap_vn_df_max = shap_vn_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): presumably the last 25 rows hold the aggregated contribution
# of each of the 25 features - confirm.
shap_vn_df_max = shap_vn_df_max.tail(25)
shap_vn_df_max['sign'] = shap_vn_df_max['sign'].replace(_etiquetas_signo)
shap_vn_df_max = shap_vn_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
shap_vn_df_max['Ranking'] = shap_vn_df_max['contribution'].abs().rank(ascending=False).astype(int)
shap_vn_df_max = shap_vn_df_max.rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
shap_vn_df_max = shap_vn_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# ---- LIME ----
# The 'variable' column holds a condition such as "feat <= 1.00", "feat > 3.00"
# or "0.00 < feat <= 3.00". Taking the first space-separated token returns the
# numeric bound for the range form, which previously had to be patched by hand
# with positional overrides valid for a single run only. The first
# identifier-like token is extracted instead, so the feature name is correct
# for every form; the old first-token behaviour is kept as a fallback for rows
# where no identifier is found.
_primer_token = lime_vn_df_max["variable"].str.split(" ").str[0]
_nombre_variable = lime_vn_df_max["variable"].str.extract(r'(?<![\w.])([A-Za-z_]\w*)', expand=False)
lime_vn_df_max["Variable"] = _nombre_variable.fillna(_primer_token)
lime_vn_df_max["Signo"] = lime_vn_df_max["effect"].apply(evaluar_valor)
lime_vn_df_max = lime_vn_df_max.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_vn_df_max = lime_vn_df_max.drop(columns=['variable'])
lime_vn_df_max['Ranking'] = lime_vn_df_max['effect'].abs().rank(ascending=False).astype(int)
lime_vn_df_max = lime_vn_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_vn_df_max)
print(shap_vn_df_max)
print(lime_vn_df_max)

In [None]:
#lime_vn_max.show_in_notebook()

In [None]:
# Build the side-by-side comparison table for the VN MAX instance: one row per
# feature that appears in the top list of any technique, and for every
# technique its 'Ranking' and 'Signo' (or '-' when the feature is not listed).
breakdown_features = breakdown_vn_df_max['Variable'].tolist()
shapley_features = shap_vn_df_max['Variable'].tolist()
lime_features = lime_vn_df_max['Variable'].tolist()

# Union of the three lists, taken in reversed set-iteration order.
all_features = list(set(breakdown_features + shapley_features + lime_features))[::-1]

df_final_max = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_max['Variable'] = all_features
df_final_max = df_final_max[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for nombre_lista in (breakdown_features, shapley_features, lime_features, all_features):
    print(nombre_lista)

# Source table for each top-level column group of the comparison table.
tablas_por_tecnica = {
    'Breakdown': breakdown_vn_df_max,
    'Shapley': shap_vn_df_max,
    'Lime': lime_vn_df_max,
}

for feature in df_final_max['Variable']:
    fila_destino = df_final_max['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # Feature not reported by this technique.
            ranking, signo = '-', '-'
        else:
            primera = coincidencias.iloc[0]
            ranking, signo = primera['Ranking'], primera['Signo']
        df_final_max.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_max.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_max

In [None]:
# Use the feature name as the row index. Guarded so that re-running the cell
# does not fail once 'Variable' has already been moved into the index.
if 'Variable' in df_final_max.columns:
    df_final_max = df_final_max.set_index('Variable')

# Preferred presentation order of the rows.
nuevo_orden = ["file_added", "commit_num", "improve_frequency", "line_removed", "developer_num", "parallel_changed_file_num", "duration", "file_removed"]

# Reorder the rows following the preferred order. A plain reindex(nuevo_orden)
# would insert all-NaN rows for listed features that are absent and silently
# drop features that are not listed; here only present features are ordered
# and any unlisted ones are kept at the end.
orden_presente = [v for v in nuevo_orden if v in df_final_max.index]
orden_restante = [v for v in df_final_max.index if v not in nuevo_orden]
df_final_max = df_final_max.reindex(orden_presente + orden_restante)

df_final_max

### **Instancia VN MEDIANA:**

In [None]:
# Local explanations for the VN MEDIANA instance. Every call uses
# random_state=42; the `.result` attribute of each explanation is kept under
# its own name for the tabular post-processing done in a later cell.
breakdown_vn_mediana = exp.predict_parts(df_instancia_vn_mediana, type="break_down", random_state=42)
breakdown_vn_df_mediana = breakdown_vn_mediana.result

shap_vn_mediana = exp.predict_parts(df_instancia_vn_mediana, type="shap", random_state=42)
shap_vn_df_mediana = shap_vn_mediana.result

lime_vn_mediana = exp.predict_surrogate(df_instancia_vn_mediana, random_state=42)
lime_vn_df_mediana = lime_vn_mediana.result

In [None]:
# Plot the break_down explanation computed for the VN MEDIANA instance.
breakdown_vn_mediana.plot()

In [None]:
# Plot the shap explanation computed for the VN MEDIANA instance.
shap_vn_mediana.plot()

In [None]:
# Plot the surrogate explanation (predict_surrogate) for the VN MEDIANA instance.
lime_vn_mediana.plot()

In [None]:
# Post-process the three local explanations of the VN MEDIANA instance into
# comparable tables with columns Variable / Ranking / Signo (top 5 rows each,
# ordered by absolute contribution).

# Textual labels for the numeric sign column.
_etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}

# ---- Break Down ----
breakdown_vn_df_mediana = breakdown_vn_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): presumably rows 0 and 26 are the intercept and the final
# prediction rows of the result for a 25-feature model - confirm.
breakdown_vn_df_mediana = breakdown_vn_df_mediana.drop(index=[0, 26])
breakdown_vn_df_mediana['sign'] = breakdown_vn_df_mediana['sign'].replace(_etiquetas_signo)
breakdown_vn_df_mediana = breakdown_vn_df_mediana.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
breakdown_vn_df_mediana['Ranking'] = breakdown_vn_df_mediana['contribution'].abs().rank(ascending=False).astype(int)
breakdown_vn_df_mediana = breakdown_vn_df_mediana.rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
breakdown_vn_df_mediana = breakdown_vn_df_mediana[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# ---- Shapley ----
shap_vn_df_mediana = shap_vn_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): presumably the last 25 rows hold the aggregated contribution
# of each of the 25 features - confirm.
shap_vn_df_mediana = shap_vn_df_mediana.tail(25)
shap_vn_df_mediana['sign'] = shap_vn_df_mediana['sign'].replace(_etiquetas_signo)
shap_vn_df_mediana = shap_vn_df_mediana.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
shap_vn_df_mediana['Ranking'] = shap_vn_df_mediana['contribution'].abs().rank(ascending=False).astype(int)
shap_vn_df_mediana = shap_vn_df_mediana.rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
shap_vn_df_mediana = shap_vn_df_mediana[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# ---- LIME ----
# The 'variable' column holds a condition such as "feat <= 1.00", "feat > 3.00"
# or "0.00 < feat <= 3.00". Taking the first space-separated token returns the
# numeric bound for the range form, which previously had to be patched by hand
# with positional overrides valid for a single run only. The first
# identifier-like token is extracted instead, so the feature name is correct
# for every form; the old first-token behaviour is kept as a fallback for rows
# where no identifier is found.
_primer_token = lime_vn_df_mediana["variable"].str.split(" ").str[0]
_nombre_variable = lime_vn_df_mediana["variable"].str.extract(r'(?<![\w.])([A-Za-z_]\w*)', expand=False)
lime_vn_df_mediana["Variable"] = _nombre_variable.fillna(_primer_token)
lime_vn_df_mediana["Signo"] = lime_vn_df_mediana["effect"].apply(evaluar_valor)
lime_vn_df_mediana = lime_vn_df_mediana.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_vn_df_mediana = lime_vn_df_mediana.drop(columns=['variable'])
lime_vn_df_mediana['Ranking'] = lime_vn_df_mediana['effect'].abs().rank(ascending=False).astype(int)
lime_vn_df_mediana = lime_vn_df_mediana[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_vn_df_mediana)
print(shap_vn_df_mediana)
print(lime_vn_df_mediana)

In [None]:
#lime_vn_mediana.show_in_notebook()

In [None]:
# Build the side-by-side comparison table for the VN MEDIANA instance: one row
# per feature that appears in the top list of any technique, and for every
# technique its 'Ranking' and 'Signo' (or '-' when the feature is not listed).
breakdown_features = breakdown_vn_df_mediana['Variable'].tolist()
shapley_features = shap_vn_df_mediana['Variable'].tolist()
lime_features = lime_vn_df_mediana['Variable'].tolist()

# Union of the three lists, taken in reversed set-iteration order.
all_features = list(set(breakdown_features + shapley_features + lime_features))[::-1]

df_final_mediana = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_mediana['Variable'] = all_features
df_final_mediana = df_final_mediana[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for nombre_lista in (breakdown_features, shapley_features, lime_features, all_features):
    print(nombre_lista)

# Source table for each top-level column group of the comparison table.
tablas_por_tecnica = {
    'Breakdown': breakdown_vn_df_mediana,
    'Shapley': shap_vn_df_mediana,
    'Lime': lime_vn_df_mediana,
}

for feature in df_final_mediana['Variable']:
    fila_destino = df_final_mediana['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # Feature not reported by this technique.
            ranking, signo = '-', '-'
        else:
            primera = coincidencias.iloc[0]
            ranking, signo = primera['Ranking'], primera['Signo']
        df_final_mediana.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_mediana.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_mediana

In [None]:
# Use the feature name as the row index. Guarded so that re-running the cell
# does not fail once 'Variable' has already been moved into the index.
if 'Variable' in df_final_mediana.columns:
    df_final_mediana = df_final_mediana.set_index('Variable')

# Preferred presentation order of the rows.
nuevo_orden = ["parallel_changed_file_num", "developer_num", "file_modified", "messages_max", "file_added", "commit_num", "line_added", "file_removed", "line_removed"]

# Reorder the rows following the preferred order. A plain reindex(nuevo_orden)
# would insert all-NaN rows for listed features that are absent and silently
# drop features that are not listed; here only present features are ordered
# and any unlisted ones are kept at the end.
orden_presente = [v for v in nuevo_orden if v in df_final_mediana.index]
orden_restante = [v for v in df_final_mediana.index if v not in nuevo_orden]
df_final_mediana = df_final_mediana.reindex(orden_presente + orden_restante)

df_final_mediana

### **Instancia VN MIN:**

In [None]:
# Local explanations for the VN MIN instance. Every call uses random_state=42;
# the `.result` attribute of each explanation is kept under its own name for
# the tabular post-processing done in a later cell.
breakdown_vn_min = exp.predict_parts(df_instancia_vn_min, type="break_down", random_state=42)
breakdown_vn_df_min = breakdown_vn_min.result

shap_vn_min = exp.predict_parts(df_instancia_vn_min, type="shap", random_state=42)
shap_vn_df_min = shap_vn_min.result

lime_vn_min = exp.predict_surrogate(df_instancia_vn_min, random_state=42)
lime_vn_df_min = lime_vn_min.result

In [None]:
# Plot the break_down explanation computed for the VN MIN instance.
breakdown_vn_min.plot()

In [None]:
# Plot the shap explanation computed for the VN MIN instance.
shap_vn_min.plot()

In [None]:
# Plot the surrogate explanation (predict_surrogate) for the VN MIN instance.
lime_vn_min.plot()

In [None]:
# Post-process the three local explanations of the VN MIN instance into
# comparable tables with columns Variable / Ranking / Signo (top 5 rows each,
# ordered by absolute contribution).

# Textual labels for the numeric sign column.
_etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}

# ---- Break Down ----
breakdown_vn_df_min = breakdown_vn_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): presumably rows 0 and 26 are the intercept and the final
# prediction rows of the result for a 25-feature model - confirm.
breakdown_vn_df_min = breakdown_vn_df_min.drop(index=[0, 26])
breakdown_vn_df_min['sign'] = breakdown_vn_df_min['sign'].replace(_etiquetas_signo)
breakdown_vn_df_min = breakdown_vn_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
breakdown_vn_df_min['Ranking'] = breakdown_vn_df_min['contribution'].abs().rank(ascending=False).astype(int)
breakdown_vn_df_min = breakdown_vn_df_min.rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
breakdown_vn_df_min = breakdown_vn_df_min[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# ---- Shapley ----
shap_vn_df_min = shap_vn_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): presumably the last 25 rows hold the aggregated contribution
# of each of the 25 features - confirm.
shap_vn_df_min = shap_vn_df_min.tail(25)
shap_vn_df_min['sign'] = shap_vn_df_min['sign'].replace(_etiquetas_signo)
shap_vn_df_min = shap_vn_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
shap_vn_df_min['Ranking'] = shap_vn_df_min['contribution'].abs().rank(ascending=False).astype(int)
shap_vn_df_min = shap_vn_df_min.rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
shap_vn_df_min = shap_vn_df_min[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# ---- LIME ----
# The 'variable' column holds a condition such as "feat <= 1.00", "feat > 3.00"
# or "0.00 < feat <= 3.00". Taking the first space-separated token returns the
# numeric bound for the range form, which previously had to be patched by hand
# with a positional override valid for a single run only. The first
# identifier-like token is extracted instead, so the feature name is correct
# for every form; the old first-token behaviour is kept as a fallback for rows
# where no identifier is found.
_primer_token = lime_vn_df_min["variable"].str.split(" ").str[0]
_nombre_variable = lime_vn_df_min["variable"].str.extract(r'(?<![\w.])([A-Za-z_]\w*)', expand=False)
lime_vn_df_min["Variable"] = _nombre_variable.fillna(_primer_token)
lime_vn_df_min["Signo"] = lime_vn_df_min["effect"].apply(evaluar_valor)
lime_vn_df_min = lime_vn_df_min.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_vn_df_min = lime_vn_df_min.drop(columns=['variable'])
lime_vn_df_min['Ranking'] = lime_vn_df_min['effect'].abs().rank(ascending=False).astype(int)
lime_vn_df_min = lime_vn_df_min[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_vn_df_min)
print(shap_vn_df_min)
print(lime_vn_df_min)

In [None]:
#lime_vn_min.show_in_notebook()

In [None]:
# Features listed by each technique, plus the union of the three lists
breakdown_features = list(breakdown_vn_df_min['Variable'])
shapley_features = list(shap_vn_df_min['Variable'])
lime_features = list(lime_vn_df_min['Variable'])
all_features = list(set(breakdown_features + shapley_features + lime_features))[::-1]

# One row per feature; the column layout comes from columns_multi (defined elsewhere in the notebook)
df_final_min = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_min['Variable'] = list(all_features)
df_final_min = df_final_min[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for lista in (breakdown_features, shapley_features, lime_features, all_features):
    print(lista)

# Technique name -> table holding that technique's Variable/Ranking/Signo rows
tablas_por_tecnica = {
    'Breakdown': breakdown_vn_df_min,
    'Shapley': shap_vn_df_min,
    'Lime': lime_vn_df_min,
}

for feature in df_final_min['Variable']:
    fila_destino = df_final_min['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencia = tabla[tabla['Variable'] == feature]
        if coincidencia.empty:
            # The technique did not list this feature: fill both cells with "-"
            ranking, signo = '-', '-'
        else:
            # Copy the rank and sign of the first matching row
            primera = coincidencia.iloc[0]
            ranking, signo = primera['Ranking'], primera['Signo']
        df_final_min.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_min.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_min

In [None]:
# Row order used to present the features
nuevo_orden = ["commit_num", "messages_max", "parallel_changed_file_num", "line_removed", "messages_min", "developer_num", "file_removed", "file_added", "duration"]

# Index the table by feature name and arrange the rows in that order
df_final_min = df_final_min.set_index('Variable').reindex(nuevo_orden)

df_final_min

### **VN General:**

In [None]:
# Per-instance comparison tables (max, median and min instances) that are aggregated here
tablas_instancia = (df_final_max, df_final_mediana, df_final_min)

# ranking_valores[technique] -> list of (feature, rank) pairs, one pair per appearance
ranking_valores = {
    'Breakdown': [],
    'Shapley': [],
    'Lime': []
}

for tecnica, pares in ranking_valores.items():
    for tabla in tablas_instancia:
        for caracteristica in tabla.index:
            ranking = tabla[(tecnica, 'Ranking')][caracteristica]
            # "-" marks a feature the technique did not list. NaN appears when
            # reindex() created a row for a label that was not in the table;
            # int(NaN) would raise ValueError, so both cases are skipped.
            if pd.isna(ranking) or ranking == "-":
                continue
            pares.append((caracteristica, int(ranking)))

# Group the collected ranks by technique and feature (first-seen order is kept)
rankings_agrupados = {tecnica: {} for tecnica in ranking_valores}
for tecnica, pares in ranking_valores.items():
    for caracteristica, ranking in pares:
        rankings_agrupados[tecnica].setdefault(caracteristica, []).append(ranking)

# Mean rank per technique and feature
ranking_medio = {
    tecnica: {c: sum(valores) / len(valores) for c, valores in grupos.items()}
    for tecnica, grupos in rankings_agrupados.items()
}

# Number of appearances per technique and feature
apariciones_count = {
    tecnica: {c: len(valores) for c, valores in grupos.items()}
    for tecnica, grupos in rankings_agrupados.items()
}

# Show the raw pairs, the appearance counts and the mean ranks
print(ranking_valores)
print(apariciones_count)
print(ranking_medio)

In [None]:
# Techniques being compared
techniques = ['Breakdown', 'Shapley', 'Lime']

# Every feature that appears for at least one technique, in first-seen order.
# dict.fromkeys is used instead of a set so the row order (and therefore the
# order of ties in later sorts) is the same on every run.
all_caract = list(dict.fromkeys(
    c for technique in techniques for c in ranking_medio[technique]
))

# (technique, sub-column) -> values aligned with all_caract; "-" when the
# technique has no entry for the feature
data_dict = {}
for technique in techniques:
    data_dict[(technique, "Ranking Medio")] = [ranking_medio[technique].get(c, "-") for c in all_caract]
    data_dict[(technique, "Conteo")] = [apariciones_count[technique].get(c, "-") for c in all_caract]

# Summary table: one row per feature, one column pair per technique
df_resumen_gb_vn = pd.DataFrame(data_dict, index=all_caract)

df_resumen_gb_vn

In [None]:
# Column keys holding each technique's mean rank and appearance count
columnas_ranking = [(tech, "Ranking Medio") for tech in techniques]
columnas_conteo = [(tech, "Conteo") for tech in techniques]

# Overall mean rank: "-" placeholders are turned into NaN before averaging across techniques
ranking_por_tecnica = df_resumen_gb_vn[columnas_ranking].replace('-', np.nan)
df_resumen_gb_vn[("General", "Ranking")] = ranking_por_tecnica.mean(axis=1)

# Total number of appearances: "-" placeholders count as 0
conteo_por_tecnica = df_resumen_gb_vn[columnas_conteo].replace('-', 0)
df_resumen_gb_vn[("General", "Conteo Total")] = conteo_por_tecnica.sum(axis=1)

# Display the updated summary table
df_resumen_gb_vn

In [None]:
# Number of features (rows) in the summary
num_caract = len(df_resumen_gb_vn)

# Ascending rank positions of the general mean rank and of the total count
posicion_ranking = df_resumen_gb_vn[("General", "Ranking")].rank(ascending=True)
posicion_conteo = df_resumen_gb_vn[("General", "Conteo Total")].rank(ascending=True)

# Weights: a lower mean rank and a higher count both give a larger weight
peso_rango = 1 - ((posicion_ranking - 1) / num_caract)
peso_conteo = posicion_conteo / num_caract

# The final score is the sum of both weights; the general ranking is replaced
# by the descending rank of that score (ties share the lowest position)
puntaje = peso_rango + peso_conteo
df_resumen_gb_vn[("General", "Ranking")] = puntaje.rank(ascending=False, method="min")

df_resumen_gb_vn

In [None]:
# Current column keys of the summary table
columns = df_resumen_gb_vn.columns

# Boolean mask: True where the top-level label is "General"
general_columns = columns.get_level_values(0) == "General"

# "General" sub-columns, in the order they should be shown
general_subcolumns = [("General", "Ranking"), ("General", "Conteo Total")]

# Remaining (per-technique) columns, keeping their current order
technique_subcolumns = [
    columna for columna, es_general in zip(columns, general_columns) if not es_general
]

# Put the "General" columns first and rebuild the table with that layout
new_columns = general_subcolumns + technique_subcolumns
df_resumen_gb_vn = df_resumen_gb_vn[new_columns]

In [None]:
def _dos_decimales(valor):
    """Format int/float values with two decimals; return anything else unchanged."""
    return f"{valor:.2f}" if isinstance(valor, (int, float)) else valor


# Best general ranking first
df_resumen_gb_vn = df_resumen_gb_vn.sort_values(by=("General", "Ranking"), ascending=True)

# Show each technique's mean rank with two decimals ("-" placeholders stay as they are)
for tech in techniques:
    columna = (tech, "Ranking Medio")
    df_resumen_gb_vn[columna] = df_resumen_gb_vn[columna].apply(_dos_decimales)

df_resumen_gb_vn

### **Instancia FP MAX:**

In [None]:
# Explain the FP max instance with the three techniques; every call uses random_state=42.

# Break-down attribution and its result table
breakdown_fp_max = exp.predict_parts(df_instancia_fp_max, type="break_down", random_state=42)
breakdown_fp_df_max = breakdown_fp_max.result

# SHAP attribution and its result table
shap_fp_max = exp.predict_parts(df_instancia_fp_max, type="shap", random_state=42)
shap_fp_df_max = shap_fp_max.result

# Surrogate explanation (kept under the lime_* names) and its result table
lime_fp_max = exp.predict_surrogate(df_instancia_fp_max, random_state=42)
lime_fp_df_max = lime_fp_max.result

In [None]:
# Plot the break-down explanation computed for the FP max instance.
breakdown_fp_max.plot()

In [None]:
# Plot the SHAP explanation computed for the FP max instance.
shap_fp_max.plot()

In [None]:
# Plot the surrogate (lime_*) explanation computed for the FP max instance.
lime_fp_max.plot()

In [None]:
def _nombre_variable_lime(condicion):
    """Return the feature name contained in a LIME condition label.

    The first space-separated token that starts with a letter or underscore
    is returned, so both "feature <= 1.00" and "0.00 < feature <= 1.00"
    yield "feature". Taking the first token blindly returned "0.00" for the
    second form. If no token qualifies, the first token is returned.
    """
    tokens = condicion.split(" ")
    for token in tokens:
        if token[:1].isalpha() or token[:1] == "_":
            return token
    return tokens[0]


def _top_contribuciones(tabla, columna_valor, n=5):
    """Return the n rows of tabla with the largest absolute columna_valor.

    tabla must already have 'Variable' and 'Signo' columns. 'Ranking' is
    computed over all rows before truncating. The result keeps only
    Variable, Ranking and Signo, with a fresh positional index.
    """
    ordenada = tabla.sort_values(by=columna_valor, key=abs, ascending=False)
    ordenada['Ranking'] = ordenada[columna_valor].abs().rank(ascending=False).astype(int)
    return ordenada[['Variable', 'Ranking', 'Signo']].head(n).reset_index(drop=True)


# Numeric sign codes -> labels shown in the comparison tables
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}
renombrar = {'sign': 'Signo', 'variable_name': 'Variable'}
columnas_dalex = ['variable_name', 'contribution', 'sign']

# Break-down: drop the rows labelled 0 and 26
# (presumably the intercept and final-prediction rows of a 25-feature model - TODO confirm)
breakdown_fp_df_max = breakdown_fp_df_max.loc[:, columnas_dalex].drop(index=[0, 26])
breakdown_fp_df_max = breakdown_fp_df_max.rename(columns=renombrar)
breakdown_fp_df_max['Signo'] = breakdown_fp_df_max['Signo'].replace(etiquetas_signo)
breakdown_fp_df_max = _top_contribuciones(breakdown_fp_df_max, 'contribution')

# SHAP: keep the last 25 rows (presumably the per-feature averages - TODO confirm).
# rename() returns a new frame, so the column assignment below does not write into a slice.
shap_fp_df_max = shap_fp_df_max.loc[:, columnas_dalex].tail(25)
shap_fp_df_max = shap_fp_df_max.rename(columns=renombrar)
shap_fp_df_max['Signo'] = shap_fp_df_max['Signo'].replace(etiquetas_signo)
shap_fp_df_max = _top_contribuciones(shap_fp_df_max, 'contribution')

# LIME: derive the feature name from the condition label and the sign label
# from the effect (evaluar_valor is defined elsewhere in the notebook)
lime_fp_df_max["Variable"] = lime_fp_df_max["variable"].apply(_nombre_variable_lime)
lime_fp_df_max["Signo"] = lime_fp_df_max["effect"].apply(evaluar_valor)
lime_fp_df_max = _top_contribuciones(lime_fp_df_max, 'effect')

print(breakdown_fp_df_max)
print(shap_fp_df_max)
print(lime_fp_df_max)

In [None]:
#lime_fp_max.show_in_notebook()

In [None]:
# Features listed by each technique, plus the union of the three lists
breakdown_features = list(breakdown_fp_df_max['Variable'])
shapley_features = list(shap_fp_df_max['Variable'])
lime_features = list(lime_fp_df_max['Variable'])
all_features = list(set(breakdown_features + shapley_features + lime_features))[::-1]

# One row per feature; the column layout comes from columns_multi (defined elsewhere in the notebook)
df_final_max = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_max['Variable'] = list(all_features)
df_final_max = df_final_max[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for lista in (breakdown_features, shapley_features, lime_features, all_features):
    print(lista)

# Technique name -> table holding that technique's Variable/Ranking/Signo rows
tablas_por_tecnica = {
    'Breakdown': breakdown_fp_df_max,
    'Shapley': shap_fp_df_max,
    'Lime': lime_fp_df_max,
}

for feature in df_final_max['Variable']:
    fila_destino = df_final_max['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencia = tabla[tabla['Variable'] == feature]
        if coincidencia.empty:
            # The technique did not list this feature: fill both cells with "-"
            ranking, signo = '-', '-'
        else:
            # Copy the rank and sign of the first matching row
            primera = coincidencia.iloc[0]
            ranking, signo = primera['Ranking'], primera['Signo']
        df_final_max.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_max.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_max

In [None]:
# Row order used to present the features
nuevo_orden = ["parallel_changed_file_num", "line_removed", "commit_num", "developer_num", "duration", "messages_max", "file_removed",  "messages_min"]

# Index the table by feature name and arrange the rows in that order
df_final_max = df_final_max.set_index('Variable').reindex(nuevo_orden)

df_final_max

### **Instancia FP MEDIANA:**

In [None]:
# Explain the FP median instance with the three techniques; every call uses random_state=42.

# Break-down attribution and its result table
breakdown_fp_mediana = exp.predict_parts(df_instancia_fp_mediana, type="break_down", random_state=42)
breakdown_fp_df_mediana = breakdown_fp_mediana.result

# SHAP attribution and its result table
shap_fp_mediana = exp.predict_parts(df_instancia_fp_mediana, type="shap", random_state=42)
shap_fp_df_mediana = shap_fp_mediana.result

# Surrogate explanation (kept under the lime_* names) and its result table
lime_fp_mediana = exp.predict_surrogate(df_instancia_fp_mediana, random_state=42)
lime_fp_df_mediana = lime_fp_mediana.result

In [None]:
# Plot the break-down explanation computed for the FP median instance.
breakdown_fp_mediana.plot()

In [None]:
# Plot the SHAP explanation computed for the FP median instance.
shap_fp_mediana.plot()

In [None]:
# Plot the surrogate (lime_*) explanation computed for the FP median instance.
lime_fp_mediana.plot()

In [None]:
def _nombre_variable_lime(condicion):
    """Return the feature name contained in a LIME condition label.

    The first space-separated token that starts with a letter or underscore
    is returned, so both "feature <= 1.00" and "0.00 < feature <= 1.00"
    yield "feature". Taking the first token blindly returned "0.00" for the
    second form. If no token qualifies, the first token is returned.
    """
    tokens = condicion.split(" ")
    for token in tokens:
        if token[:1].isalpha() or token[:1] == "_":
            return token
    return tokens[0]


def _top_contribuciones(tabla, columna_valor, n=5):
    """Return the n rows of tabla with the largest absolute columna_valor.

    tabla must already have 'Variable' and 'Signo' columns. 'Ranking' is
    computed over all rows before truncating. The result keeps only
    Variable, Ranking and Signo, with a fresh positional index.
    """
    ordenada = tabla.sort_values(by=columna_valor, key=abs, ascending=False)
    ordenada['Ranking'] = ordenada[columna_valor].abs().rank(ascending=False).astype(int)
    return ordenada[['Variable', 'Ranking', 'Signo']].head(n).reset_index(drop=True)


# Numeric sign codes -> labels shown in the comparison tables
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}
renombrar = {'sign': 'Signo', 'variable_name': 'Variable'}
columnas_dalex = ['variable_name', 'contribution', 'sign']

# Break-down: drop the rows labelled 0 and 26
# (presumably the intercept and final-prediction rows of a 25-feature model - TODO confirm)
breakdown_fp_df_mediana = breakdown_fp_df_mediana.loc[:, columnas_dalex].drop(index=[0, 26])
breakdown_fp_df_mediana = breakdown_fp_df_mediana.rename(columns=renombrar)
breakdown_fp_df_mediana['Signo'] = breakdown_fp_df_mediana['Signo'].replace(etiquetas_signo)
breakdown_fp_df_mediana = _top_contribuciones(breakdown_fp_df_mediana, 'contribution')

# SHAP: keep the last 25 rows (presumably the per-feature averages - TODO confirm).
# rename() returns a new frame, so the column assignment below does not write into a slice.
shap_fp_df_mediana = shap_fp_df_mediana.loc[:, columnas_dalex].tail(25)
shap_fp_df_mediana = shap_fp_df_mediana.rename(columns=renombrar)
shap_fp_df_mediana['Signo'] = shap_fp_df_mediana['Signo'].replace(etiquetas_signo)
shap_fp_df_mediana = _top_contribuciones(shap_fp_df_mediana, 'contribution')

# LIME: derive the feature name from the condition label and the sign label
# from the effect (evaluar_valor is defined elsewhere in the notebook)
lime_fp_df_mediana["Variable"] = lime_fp_df_mediana["variable"].apply(_nombre_variable_lime)
lime_fp_df_mediana["Signo"] = lime_fp_df_mediana["effect"].apply(evaluar_valor)
lime_fp_df_mediana = _top_contribuciones(lime_fp_df_mediana, 'effect')

print(breakdown_fp_df_mediana)
print(shap_fp_df_mediana)
print(lime_fp_df_mediana)

In [None]:
#lime_fp_mediana.show_in_notebook()

In [None]:
# Features listed by each technique, plus the union of the three lists
breakdown_features = list(breakdown_fp_df_mediana['Variable'])
shapley_features = list(shap_fp_df_mediana['Variable'])
lime_features = list(lime_fp_df_mediana['Variable'])
all_features = list(set(breakdown_features + shapley_features + lime_features))[::-1]

# One row per feature; the column layout comes from columns_multi (defined elsewhere in the notebook)
df_final_mediana = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_mediana['Variable'] = list(all_features)
df_final_mediana = df_final_mediana[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for lista in (breakdown_features, shapley_features, lime_features, all_features):
    print(lista)

# Technique name -> table holding that technique's Variable/Ranking/Signo rows
tablas_por_tecnica = {
    'Breakdown': breakdown_fp_df_mediana,
    'Shapley': shap_fp_df_mediana,
    'Lime': lime_fp_df_mediana,
}

for feature in df_final_mediana['Variable']:
    fila_destino = df_final_mediana['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencia = tabla[tabla['Variable'] == feature]
        if coincidencia.empty:
            # The technique did not list this feature: fill both cells with "-"
            ranking, signo = '-', '-'
        else:
            # Copy the rank and sign of the first matching row
            primera = coincidencia.iloc[0]
            ranking, signo = primera['Ranking'], primera['Signo']
        df_final_mediana.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_mediana.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_mediana

In [None]:
# Row order used to present the features
nuevo_orden = ["parallel_changed_file_num", "commit_num", "line_removed", "messages_min", "developer_num", "file_added", "file_removed", "file_modified"]

# Index the table by feature name and arrange the rows in that order
df_final_mediana = df_final_mediana.set_index('Variable').reindex(nuevo_orden)

df_final_mediana

### **Instancia FP MIN:**

In [None]:
# Explain the FP min instance with the three techniques; every call uses random_state=42.

# Break-down attribution and its result table
breakdown_fp_min = exp.predict_parts(df_instancia_fp_min, type="break_down", random_state=42)
breakdown_fp_df_min = breakdown_fp_min.result

# SHAP attribution and its result table
shap_fp_min = exp.predict_parts(df_instancia_fp_min, type="shap", random_state=42)
shap_fp_df_min = shap_fp_min.result

# Surrogate explanation (kept under the lime_* names) and its result table
lime_fp_min = exp.predict_surrogate(df_instancia_fp_min, random_state=42)
lime_fp_df_min = lime_fp_min.result

In [None]:
# Plot the break-down explanation computed for the FP min instance.
breakdown_fp_min.plot()

In [None]:
# Plot the SHAP explanation computed for the FP min instance.
shap_fp_min.plot()

In [None]:
# Plot the surrogate (lime_*) explanation computed for the FP min instance.
lime_fp_min.plot()

In [None]:
def _nombre_variable_lime(condicion):
    """Return the feature name contained in a LIME condition label.

    The first space-separated token that starts with a letter or underscore
    is returned, so both "feature <= 1.00" and "0.00 < feature <= 1.00"
    yield "feature". Taking the first token blindly returned "0.00" for the
    second form. If no token qualifies, the first token is returned.
    """
    tokens = condicion.split(" ")
    for token in tokens:
        if token[:1].isalpha() or token[:1] == "_":
            return token
    return tokens[0]


def _top_contribuciones(tabla, columna_valor, n=5):
    """Return the n rows of tabla with the largest absolute columna_valor.

    tabla must already have 'Variable' and 'Signo' columns. 'Ranking' is
    computed over all rows before truncating. The result keeps only
    Variable, Ranking and Signo, with a fresh positional index.
    """
    ordenada = tabla.sort_values(by=columna_valor, key=abs, ascending=False)
    ordenada['Ranking'] = ordenada[columna_valor].abs().rank(ascending=False).astype(int)
    return ordenada[['Variable', 'Ranking', 'Signo']].head(n).reset_index(drop=True)


# Numeric sign codes -> labels shown in the comparison tables
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}
renombrar = {'sign': 'Signo', 'variable_name': 'Variable'}
columnas_dalex = ['variable_name', 'contribution', 'sign']

# Break-down: drop the rows labelled 0 and 26
# (presumably the intercept and final-prediction rows of a 25-feature model - TODO confirm)
breakdown_fp_df_min = breakdown_fp_df_min.loc[:, columnas_dalex].drop(index=[0, 26])
breakdown_fp_df_min = breakdown_fp_df_min.rename(columns=renombrar)
breakdown_fp_df_min['Signo'] = breakdown_fp_df_min['Signo'].replace(etiquetas_signo)
breakdown_fp_df_min = _top_contribuciones(breakdown_fp_df_min, 'contribution')

# SHAP: keep the last 25 rows (presumably the per-feature averages - TODO confirm).
# rename() returns a new frame, so the column assignment below does not write into a slice.
shap_fp_df_min = shap_fp_df_min.loc[:, columnas_dalex].tail(25)
shap_fp_df_min = shap_fp_df_min.rename(columns=renombrar)
shap_fp_df_min['Signo'] = shap_fp_df_min['Signo'].replace(etiquetas_signo)
shap_fp_df_min = _top_contribuciones(shap_fp_df_min, 'contribution')

# LIME: derive the feature name from the condition label and the sign label
# from the effect (evaluar_valor is defined elsewhere in the notebook)
lime_fp_df_min["Variable"] = lime_fp_df_min["variable"].apply(_nombre_variable_lime)
lime_fp_df_min["Signo"] = lime_fp_df_min["effect"].apply(evaluar_valor)
lime_fp_df_min = _top_contribuciones(lime_fp_df_min, 'effect')

print(breakdown_fp_df_min)
print(shap_fp_df_min)
print(lime_fp_df_min)

In [None]:
#lime_fp_min.show_in_notebook()

In [None]:
# Features listed by each technique, plus the union of the three lists
breakdown_features = list(breakdown_fp_df_min['Variable'])
shapley_features = list(shap_fp_df_min['Variable'])
lime_features = list(lime_fp_df_min['Variable'])
all_features = list(set(breakdown_features + shapley_features + lime_features))[::-1]

# One row per feature; the column layout comes from columns_multi (defined elsewhere in the notebook)
df_final_min = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_min['Variable'] = list(all_features)
df_final_min = df_final_min[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for lista in (breakdown_features, shapley_features, lime_features, all_features):
    print(lista)

# Technique name -> table holding that technique's Variable/Ranking/Signo rows
tablas_por_tecnica = {
    'Breakdown': breakdown_fp_df_min,
    'Shapley': shap_fp_df_min,
    'Lime': lime_fp_df_min,
}

for feature in df_final_min['Variable']:
    fila_destino = df_final_min['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencia = tabla[tabla['Variable'] == feature]
        if coincidencia.empty:
            # The technique did not list this feature: fill both cells with "-"
            ranking, signo = '-', '-'
        else:
            # Copy the rank and sign of the first matching row
            primera = coincidencia.iloc[0]
            ranking, signo = primera['Ranking'], primera['Signo']
        df_final_min.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_min.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_min

In [None]:
# Row order used to present the features
nuevo_orden = ["developer_num", "parallel_changed_file_num", "commit_num", "file_added", "file_removed", "duration"]

# Index the table by feature name and arrange the rows in that order
df_final_min = df_final_min.set_index('Variable').reindex(nuevo_orden)

df_final_min

### **FP General:**

In [None]:
# Per-instance comparison tables (max, median and min instances) that are aggregated here
tablas_instancia = (df_final_max, df_final_mediana, df_final_min)

# ranking_valores[technique] -> list of (feature, rank) pairs, one pair per appearance
ranking_valores = {
    'Breakdown': [],
    'Shapley': [],
    'Lime': []
}

for tecnica, pares in ranking_valores.items():
    for tabla in tablas_instancia:
        for caracteristica in tabla.index:
            ranking = tabla[(tecnica, 'Ranking')][caracteristica]
            # "-" marks a feature the technique did not list. NaN appears when
            # reindex() created a row for a label that was not in the table;
            # int(NaN) would raise ValueError, so both cases are skipped.
            if pd.isna(ranking) or ranking == "-":
                continue
            pares.append((caracteristica, int(ranking)))

# Group the collected ranks by technique and feature (first-seen order is kept)
rankings_agrupados = {tecnica: {} for tecnica in ranking_valores}
for tecnica, pares in ranking_valores.items():
    for caracteristica, ranking in pares:
        rankings_agrupados[tecnica].setdefault(caracteristica, []).append(ranking)

# Mean rank per technique and feature
ranking_medio = {
    tecnica: {c: sum(valores) / len(valores) for c, valores in grupos.items()}
    for tecnica, grupos in rankings_agrupados.items()
}

# Number of appearances per technique and feature
apariciones_count = {
    tecnica: {c: len(valores) for c, valores in grupos.items()}
    for tecnica, grupos in rankings_agrupados.items()
}

# Show the raw pairs, the appearance counts and the mean ranks
print(ranking_valores)
print(apariciones_count)
print(ranking_medio)

In [None]:
# Techniques being compared
techniques = ['Breakdown', 'Shapley', 'Lime']

# Every feature that appears for at least one technique, in first-seen order.
# dict.fromkeys is used instead of a set so the row order (and therefore the
# order of ties in later sorts) is the same on every run.
all_caract = list(dict.fromkeys(
    c for technique in techniques for c in ranking_medio[technique]
))

# (technique, sub-column) -> values aligned with all_caract; "-" when the
# technique has no entry for the feature
data_dict = {}
for technique in techniques:
    data_dict[(technique, "Ranking Medio")] = [ranking_medio[technique].get(c, "-") for c in all_caract]
    data_dict[(technique, "Conteo")] = [apariciones_count[technique].get(c, "-") for c in all_caract]

# Summary table: one row per feature, one column pair per technique
df_resumen_gb_fp = pd.DataFrame(data_dict, index=all_caract)

df_resumen_gb_fp

In [None]:
# Column keys holding each technique's mean rank and appearance count
columnas_ranking = [(tech, "Ranking Medio") for tech in techniques]
columnas_conteo = [(tech, "Conteo") for tech in techniques]

# Overall mean rank: "-" placeholders are turned into NaN before averaging across techniques
ranking_por_tecnica = df_resumen_gb_fp[columnas_ranking].replace('-', np.nan)
df_resumen_gb_fp[("General", "Ranking")] = ranking_por_tecnica.mean(axis=1)

# Total number of appearances: "-" placeholders count as 0
conteo_por_tecnica = df_resumen_gb_fp[columnas_conteo].replace('-', 0)
df_resumen_gb_fp[("General", "Conteo Total")] = conteo_por_tecnica.sum(axis=1)

# Display the updated summary table
df_resumen_gb_fp

In [None]:
# Number of features (rows) in the summary
num_caract = len(df_resumen_gb_fp)

# Ascending rank positions of the general mean rank and of the total count
posicion_ranking = df_resumen_gb_fp[("General", "Ranking")].rank(ascending=True)
posicion_conteo = df_resumen_gb_fp[("General", "Conteo Total")].rank(ascending=True)

# Weights: a lower mean rank and a higher count both give a larger weight
peso_rango = 1 - ((posicion_ranking - 1) / num_caract)
peso_conteo = posicion_conteo / num_caract

# The final score is the sum of both weights; the general ranking is replaced
# by the descending rank of that score (ties share the lowest position)
puntaje = peso_rango + peso_conteo
df_resumen_gb_fp[("General", "Ranking")] = puntaje.rank(ascending=False, method="min")

df_resumen_gb_fp

In [None]:
# Current column keys of the summary table
columns = df_resumen_gb_fp.columns

# Boolean mask: True where the top-level label is "General"
general_columns = columns.get_level_values(0) == "General"

# "General" sub-columns, in the order they should be shown
general_subcolumns = [("General", "Ranking"), ("General", "Conteo Total")]

# Remaining (per-technique) columns, keeping their current order
technique_subcolumns = [
    columna for columna, es_general in zip(columns, general_columns) if not es_general
]

# Put the "General" columns first and rebuild the table with that layout
new_columns = general_subcolumns + technique_subcolumns
df_resumen_gb_fp = df_resumen_gb_fp[new_columns]

In [None]:
def _dos_decimales(valor):
    """Format int/float values with two decimals; return anything else unchanged."""
    return f"{valor:.2f}" if isinstance(valor, (int, float)) else valor


# Best general ranking first
df_resumen_gb_fp = df_resumen_gb_fp.sort_values(by=("General", "Ranking"), ascending=True)

# Show each technique's mean rank with two decimals ("-" placeholders stay as they are)
for tech in techniques:
    columna = (tech, "Ranking Medio")
    df_resumen_gb_fp[columna] = df_resumen_gb_fp[columna].apply(_dos_decimales)

df_resumen_gb_fp

### **Instancia FN MAX:**

In [None]:
# Explain the FN max instance with the three techniques; every call uses random_state=42.

# Break-down attribution and its result table
breakdown_fn_max = exp.predict_parts(df_instancia_fn_max, type="break_down", random_state=42)
breakdown_fn_df_max = breakdown_fn_max.result

# SHAP attribution and its result table
shap_fn_max = exp.predict_parts(df_instancia_fn_max, type="shap", random_state=42)
shap_fn_df_max = shap_fn_max.result

# Surrogate explanation (kept under the lime_* names) and its result table
lime_fn_max = exp.predict_surrogate(df_instancia_fn_max, random_state=42)
lime_fn_df_max = lime_fn_max.result

In [None]:
# Plot the break-down explanation computed for the FN max instance.
breakdown_fn_max.plot()

In [None]:
# Plot the SHAP explanation computed for the FN max instance.
shap_fn_max.plot()

In [None]:
# Plot the surrogate (lime_*) explanation computed for the FN max instance.
lime_fn_max.plot()

In [None]:
def _nombre_variable_lime(condicion):
    """Return the feature name contained in a LIME condition label.

    The first space-separated token that starts with a letter or underscore
    is returned, so both "feature <= 1.00" and "0.00 < feature <= 1.00"
    yield "feature". Taking the first token blindly returned "0.00" for the
    second form. If no token qualifies, the first token is returned.
    """
    tokens = condicion.split(" ")
    for token in tokens:
        if token[:1].isalpha() or token[:1] == "_":
            return token
    return tokens[0]


def _top_contribuciones(tabla, columna_valor, n=5):
    """Return the n rows of tabla with the largest absolute columna_valor.

    tabla must already have 'Variable' and 'Signo' columns. 'Ranking' is
    computed over all rows before truncating. The result keeps only
    Variable, Ranking and Signo, with a fresh positional index.
    """
    ordenada = tabla.sort_values(by=columna_valor, key=abs, ascending=False)
    ordenada['Ranking'] = ordenada[columna_valor].abs().rank(ascending=False).astype(int)
    return ordenada[['Variable', 'Ranking', 'Signo']].head(n).reset_index(drop=True)


# Numeric sign codes -> labels shown in the comparison tables
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}
renombrar = {'sign': 'Signo', 'variable_name': 'Variable'}
columnas_dalex = ['variable_name', 'contribution', 'sign']

# Break-down: drop the rows labelled 0 and 26
# (presumably the intercept and final-prediction rows of a 25-feature model - TODO confirm)
breakdown_fn_df_max = breakdown_fn_df_max.loc[:, columnas_dalex].drop(index=[0, 26])
breakdown_fn_df_max = breakdown_fn_df_max.rename(columns=renombrar)
breakdown_fn_df_max['Signo'] = breakdown_fn_df_max['Signo'].replace(etiquetas_signo)
breakdown_fn_df_max = _top_contribuciones(breakdown_fn_df_max, 'contribution')

# SHAP: keep the last 25 rows (presumably the per-feature averages - TODO confirm).
# rename() returns a new frame, so the column assignment below does not write into a slice.
shap_fn_df_max = shap_fn_df_max.loc[:, columnas_dalex].tail(25)
shap_fn_df_max = shap_fn_df_max.rename(columns=renombrar)
shap_fn_df_max['Signo'] = shap_fn_df_max['Signo'].replace(etiquetas_signo)
shap_fn_df_max = _top_contribuciones(shap_fn_df_max, 'contribution')

# LIME: derive the feature name from the condition label and the sign label
# from the effect (evaluar_valor is defined elsewhere in the notebook).
# NOTE(review): the names are now parsed from the label text, replacing the
# former hard-coded assignments to rows 2, 3 and 4 ('developer_num',
# 'line_removed', 'file_added'). Confirm the printed LIME table still shows
# those names in those rows.
lime_fn_df_max["Variable"] = lime_fn_df_max["variable"].apply(_nombre_variable_lime)
lime_fn_df_max["Signo"] = lime_fn_df_max["effect"].apply(evaluar_valor)
lime_fn_df_max = _top_contribuciones(lime_fn_df_max, 'effect')

print(breakdown_fn_df_max)
print(shap_fn_df_max)
print(lime_fn_df_max)

In [None]:
#lime_fn_max.show_in_notebook()

In [None]:
# Build the comparison table for the max-FN instance: one row per feature that
# appears in the top 5 of at least one technique, with the rank and sign that
# each technique assigned to it ('-' when the feature is not in that top 5).
breakdown_features = list(breakdown_fn_df_max['Variable'])
shapley_features = list(shap_fn_df_max['Variable'])
lime_features = list(lime_fn_df_max['Variable'])
# dict.fromkeys de-duplicates while keeping first-seen order, so the rows come
# out in the same order on every run (a set's order depends on the hash seed).
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

df_final_max = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_max['Variable'] = all_features
df_final_max = df_final_max[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# Top-5 table of each technique, keyed by its top-level column name.
top5_por_tecnica = {
    'Breakdown': breakdown_fn_df_max,
    'Shapley': shap_fn_df_max,
    'Lime': lime_fn_df_max,
}

for feature in df_final_max['Variable']:
    fila = df_final_max['Variable'] == feature
    for tecnica, top5 in top5_por_tecnica.items():
        coincidencia = top5[top5['Variable'] == feature]
        if coincidencia.empty:
            # The feature is not in this technique's top 5.
            ranking, signo = '-', '-'
        else:
            ranking = coincidencia.iloc[0]['Ranking']
            signo = coincidencia.iloc[0]['Signo']
        df_final_max.loc[fila, (tecnica, 'Ranking')] = ranking
        df_final_max.loc[fila, (tecnica, 'Signo')] = signo

df_final_max

In [None]:
# Hand-picked row order for the max-FN comparison table.
nuevo_orden = [
    "commit_num",
    "parallel_changed_file_num",
    "developer_num",
    "file_added",
    "duration",
    "line_added",
    "file_removed",
    "line_removed",
]

# Index the table by feature name and arrange its rows in that order.
df_final_max = df_final_max.set_index('Variable').reindex(nuevo_orden)

df_final_max

### **Instancia FN MEDIANA:**

In [None]:
# Explain the median FN instance with the explainer `exp` using three local
# techniques: break-down, SHAP and a surrogate model. random_state is fixed.
breakdown_fn_mediana = exp.predict_parts(df_instancia_fn_mediana, type="break_down",random_state=42)
shap_fn_mediana = exp.predict_parts(df_instancia_fn_mediana, type="shap",random_state=42)
lime_fn_mediana = exp.predict_surrogate(df_instancia_fn_mediana, random_state=42)

# Keep the tabular result of each explanation for the ranking tables built later.
breakdown_fn_df_mediana = breakdown_fn_mediana.result
shap_fn_df_mediana = shap_fn_mediana.result
lime_fn_df_mediana=lime_fn_mediana.result

In [None]:
# Draw the break-down explanation of the median FN instance.
breakdown_fn_mediana.plot()

In [None]:
# Draw the SHAP explanation of the median FN instance.
shap_fn_mediana.plot()

In [None]:
# Draw the surrogate (LIME) explanation of the median FN instance.
lime_fn_mediana.plot()

In [None]:
# Reduce the three explanations of the median FN instance to top-5 tables with
# the columns Variable / Ranking / Signo.

# --- Break-down ---
breakdown_fn_df_mediana = breakdown_fn_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
# Discard the rows labelled 0 and 26.
# NOTE(review): presumably the intercept and final-prediction rows of a
# 25-feature break-down -- confirm; the labels are hard-coded.
breakdown_fn_df_mediana = breakdown_fn_df_mediana.drop(index=[0, 26])
breakdown_fn_df_mediana['sign'] = breakdown_fn_df_mediana['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
# Largest absolute contribution first.
breakdown_fn_df_mediana = breakdown_fn_df_mediana.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- SHAP ---
shap_fn_df_mediana = shap_fn_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): assumes the last 25 rows hold the averaged per-feature
# contributions rather than a single sampled ordering -- confirm against the
# layout of the dalex SHAP result.
shap_fn_df_mediana = shap_fn_df_mediana.tail(25)
shap_fn_df_mediana['sign'] = shap_fn_df_mediana['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_fn_df_mediana = shap_fn_df_mediana.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- LIME ---
# Take the first identifier-like token of the rule text as the feature name.
# Taking the first space-separated token instead yields a number for interval
# rules such as "1.00 < developer_num <= 3.00".
lime_fn_df_mediana["Variable"] = lime_fn_df_mediana["variable"].str.extract(r"\b([A-Za-z_]\w*)", expand=False)
lime_fn_df_mediana["Signo"] = lime_fn_df_mediana["effect"].apply(evaluar_valor)
lime_fn_df_mediana = lime_fn_df_mediana.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_fn_df_mediana = lime_fn_df_mediana.drop(columns=['variable'])

# Add a ranking column. method='min' keeps tied values on an integer rank
# instead of truncating the fractional average rank.
breakdown_fn_df_mediana['Ranking'] = breakdown_fn_df_mediana['contribution'].abs().rank(ascending=False, method='min').astype(int)
breakdown_fn_df_mediana.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
breakdown_fn_df_mediana = breakdown_fn_df_mediana[['Variable', 'Ranking', 'contribution', 'Signo']]

shap_fn_df_mediana['Ranking'] = shap_fn_df_mediana['contribution'].abs().rank(ascending=False, method='min').astype(int)
shap_fn_df_mediana.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_fn_df_mediana = shap_fn_df_mediana[['Variable', 'Ranking', 'contribution', 'Signo']]

lime_fn_df_mediana['Ranking'] = lime_fn_df_mediana['effect'].abs().rank(ascending=False, method='min').astype(int)
lime_fn_df_mediana = lime_fn_df_mediana.head(5)
lime_fn_df_mediana = lime_fn_df_mediana[['Variable', 'Ranking', 'effect', 'Signo']]

# Drop the contribution column; only rank and sign are compared.
breakdown_fn_df_mediana = breakdown_fn_df_mediana.drop(columns=['contribution'])
shap_fn_df_mediana = shap_fn_df_mediana.drop(columns=['contribution'])
lime_fn_df_mediana = lime_fn_df_mediana.drop(columns=['effect'])

# Keep the 5 largest contributions in absolute value.
breakdown_fn_df_mediana = breakdown_fn_df_mediana.head(5)
breakdown_fn_df_mediana = breakdown_fn_df_mediana.reset_index(drop=True)

shap_fn_df_mediana = shap_fn_df_mediana.head(5)
shap_fn_df_mediana = shap_fn_df_mediana.reset_index(drop=True)

lime_fn_df_mediana = lime_fn_df_mediana.reset_index(drop=True)

print(breakdown_fn_df_mediana)
print(shap_fn_df_mediana)
print(lime_fn_df_mediana)

In [None]:
#lime_fn_mediana.show_in_notebook()

In [None]:
# Build the comparison table for the median FN instance: one row per feature
# that appears in the top 5 of at least one technique, with the rank and sign
# that each technique assigned to it ('-' when the feature is not in that top 5).
breakdown_features = list(breakdown_fn_df_mediana['Variable'])
shapley_features = list(shap_fn_df_mediana['Variable'])
lime_features = list(lime_fn_df_mediana['Variable'])
# dict.fromkeys de-duplicates while keeping first-seen order, so the rows come
# out in the same order on every run (a set's order depends on the hash seed).
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

df_final_mediana = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_mediana['Variable'] = all_features
df_final_mediana = df_final_mediana[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# Top-5 table of each technique, keyed by its top-level column name.
top5_por_tecnica = {
    'Breakdown': breakdown_fn_df_mediana,
    'Shapley': shap_fn_df_mediana,
    'Lime': lime_fn_df_mediana,
}

for feature in df_final_mediana['Variable']:
    fila = df_final_mediana['Variable'] == feature
    for tecnica, top5 in top5_por_tecnica.items():
        coincidencia = top5[top5['Variable'] == feature]
        if coincidencia.empty:
            # The feature is not in this technique's top 5.
            ranking, signo = '-', '-'
        else:
            ranking = coincidencia.iloc[0]['Ranking']
            signo = coincidencia.iloc[0]['Signo']
        df_final_mediana.loc[fila, (tecnica, 'Ranking')] = ranking
        df_final_mediana.loc[fila, (tecnica, 'Signo')] = signo

df_final_mediana

In [None]:
# Hand-picked row order for the median-FN comparison table.
nuevo_orden = [
    "line_added",
    "parallel_changed_file_num",
    "file_added",
    "file_modified",
    "file_removed",
    "developer_num",
    "commit_num",
]

# Index the table by feature name and arrange its rows in that order.
df_final_mediana = df_final_mediana.set_index('Variable').reindex(nuevo_orden)

df_final_mediana

### **Instancia FN MIN:**

In [None]:
# Explain the min FN instance with the explainer `exp` using three local
# techniques: break-down, SHAP and a surrogate model. random_state is fixed.
breakdown_fn_min = exp.predict_parts(df_instancia_fn_min, type="break_down",random_state=42)
shap_fn_min = exp.predict_parts(df_instancia_fn_min, type="shap",random_state=42)
lime_fn_min = exp.predict_surrogate(df_instancia_fn_min, random_state=42)

# Keep the tabular result of each explanation for the ranking tables built later.
breakdown_fn_df_min = breakdown_fn_min.result
shap_fn_df_min = shap_fn_min.result
lime_fn_df_min = lime_fn_min.result

In [None]:
# Draw the break-down explanation of the min FN instance.
breakdown_fn_min.plot()

In [None]:
# Draw the SHAP explanation of the min FN instance.
shap_fn_min.plot()

In [None]:
# Draw the surrogate (LIME) explanation of the min FN instance.
lime_fn_min.plot()

In [None]:
# Reduce the three explanations of the min FN instance to top-5 tables with
# the columns Variable / Ranking / Signo.

# --- Break-down ---
breakdown_fn_df_min = breakdown_fn_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# Discard the rows labelled 0 and 26.
# NOTE(review): presumably the intercept and final-prediction rows of a
# 25-feature break-down -- confirm; the labels are hard-coded.
breakdown_fn_df_min = breakdown_fn_df_min.drop(index=[0, 26])
breakdown_fn_df_min['sign'] = breakdown_fn_df_min['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
# Largest absolute contribution first.
breakdown_fn_df_min = breakdown_fn_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- SHAP ---
shap_fn_df_min = shap_fn_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): assumes the last 25 rows hold the averaged per-feature
# contributions rather than a single sampled ordering -- confirm against the
# layout of the dalex SHAP result.
shap_fn_df_min = shap_fn_df_min.tail(25)
shap_fn_df_min['sign'] = shap_fn_df_min['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_fn_df_min = shap_fn_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- LIME ---
# Take the first identifier-like token of the rule text as the feature name.
# Taking the first space-separated token instead yields a number for interval
# rules such as "1.00 < developer_num <= 3.00".
lime_fn_df_min["Variable"] = lime_fn_df_min["variable"].str.extract(r"\b([A-Za-z_]\w*)", expand=False)
lime_fn_df_min["Signo"] = lime_fn_df_min["effect"].apply(evaluar_valor)
lime_fn_df_min = lime_fn_df_min.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_fn_df_min = lime_fn_df_min.drop(columns=['variable'])

# Add a ranking column. method='min' keeps tied values on an integer rank
# instead of truncating the fractional average rank.
breakdown_fn_df_min['Ranking'] = breakdown_fn_df_min['contribution'].abs().rank(ascending=False, method='min').astype(int)
breakdown_fn_df_min.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
breakdown_fn_df_min = breakdown_fn_df_min[['Variable', 'Ranking', 'contribution', 'Signo']]

shap_fn_df_min['Ranking'] = shap_fn_df_min['contribution'].abs().rank(ascending=False, method='min').astype(int)
shap_fn_df_min.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_fn_df_min = shap_fn_df_min[['Variable', 'Ranking', 'contribution', 'Signo']]

lime_fn_df_min['Ranking'] = lime_fn_df_min['effect'].abs().rank(ascending=False, method='min').astype(int)
lime_fn_df_min = lime_fn_df_min.head(5)
lime_fn_df_min = lime_fn_df_min[['Variable', 'Ranking', 'effect', 'Signo']]

# Drop the contribution column; only rank and sign are compared.
breakdown_fn_df_min = breakdown_fn_df_min.drop(columns=['contribution'])
shap_fn_df_min = shap_fn_df_min.drop(columns=['contribution'])
lime_fn_df_min = lime_fn_df_min.drop(columns=['effect'])

# Keep the 5 largest contributions in absolute value.
breakdown_fn_df_min = breakdown_fn_df_min.head(5)
breakdown_fn_df_min = breakdown_fn_df_min.reset_index(drop=True)

shap_fn_df_min = shap_fn_df_min.head(5)
shap_fn_df_min = shap_fn_df_min.reset_index(drop=True)

lime_fn_df_min = lime_fn_df_min.reset_index(drop=True)

print(breakdown_fn_df_min)
print(shap_fn_df_min)
print(lime_fn_df_min)

In [None]:
#lime_fn_min.show_in_notebook()

In [None]:
# Build the comparison table for the min FN instance: one row per feature that
# appears in the top 5 of at least one technique, with the rank and sign that
# each technique assigned to it ('-' when the feature is not in that top 5).
breakdown_features = list(breakdown_fn_df_min['Variable'])
shapley_features = list(shap_fn_df_min['Variable'])
lime_features = list(lime_fn_df_min['Variable'])
# dict.fromkeys de-duplicates while keeping first-seen order, so the rows come
# out in the same order on every run (a set's order depends on the hash seed).
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

df_final_min = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_min['Variable'] = all_features
df_final_min = df_final_min[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# Top-5 table of each technique, keyed by its top-level column name.
top5_por_tecnica = {
    'Breakdown': breakdown_fn_df_min,
    'Shapley': shap_fn_df_min,
    'Lime': lime_fn_df_min,
}

for feature in df_final_min['Variable']:
    fila = df_final_min['Variable'] == feature
    for tecnica, top5 in top5_por_tecnica.items():
        coincidencia = top5[top5['Variable'] == feature]
        if coincidencia.empty:
            # The feature is not in this technique's top 5.
            ranking, signo = '-', '-'
        else:
            ranking = coincidencia.iloc[0]['Ranking']
            signo = coincidencia.iloc[0]['Signo']
        df_final_min.loc[fila, (tecnica, 'Ranking')] = ranking
        df_final_min.loc[fila, (tecnica, 'Signo')] = signo

df_final_min

In [None]:
# Hand-picked row order for the min-FN comparison table.
nuevo_orden = [
    "duration",
    "parallel_changed_file_num",
    "file_added",
    "file_modified",
    "developer_num",
    "line_added",
    "file_removed",
]

# Index the table by feature name and arrange its rows in that order.
df_final_min = df_final_min.set_index('Variable').reindex(nuevo_orden)

df_final_min

### **FN General:**

In [None]:
# Aggregate the three per-instance FN tables (max, median, min) per technique:
#   ranking_valores   -> list of (feature, rank) pairs seen across the tables
#   ranking_medio     -> mean rank of each feature
#   apariciones_count -> number of tables in which each feature was ranked
tablas_fn = (df_final_max, df_final_mediana, df_final_min)

ranking_valores = {
    'Breakdown': [],
    'Shapley': [],
    'Lime': []
}

# Walk the tables and store the (feature, rank) pairs of every technique.
for tecnica, pares in ranking_valores.items():
    for tabla in tablas_fn:
        for caracteristica, ranking in tabla[(tecnica, 'Ranking')].items():
            # '-' marks "not in this technique's top 5". NaN appears when the
            # hand-written reindex order names a feature absent from the table;
            # int(NaN) would raise, so both are skipped.
            if pd.isna(ranking) or ranking == "-":
                continue
            pares.append((caracteristica, int(ranking)))

# Mean rank and number of appearances per technique and feature.
ranking_medio = {tecnica: {} for tecnica in ranking_valores}
apariciones_count = {tecnica: {} for tecnica in ranking_valores}

for tecnica, pares in ranking_valores.items():
    # Group the ranks by feature, keeping first-appearance order.
    rankings_por_caracteristica = {}
    for caracteristica, ranking in pares:
        rankings_por_caracteristica.setdefault(caracteristica, []).append(ranking)

    for caracteristica, rankings in rankings_por_caracteristica.items():
        ranking_medio[tecnica][caracteristica] = sum(rankings) / len(rankings)
        apariciones_count[tecnica][caracteristica] = len(rankings)

# The pairs, appearance counts and mean ranks per technique are now available.
print(ranking_valores)
print(apariciones_count)
print(ranking_medio)

In [None]:
# Build the FN summary table: one row per feature, and for each technique its
# mean rank and its number of appearances ('-' when the technique never ranked
# the feature).
techniques = ['Breakdown', 'Shapley', 'Lime']

# Every feature ranked by at least one technique. dict.fromkeys de-duplicates
# in first-seen order, so the row order does not depend on the hash seed.
all_caract = list(dict.fromkeys(
    caracteristica
    for technique in techniques
    for caracteristica in ranking_medio[technique]
))

# Column data keyed by (technique, sub-column).
data_dict = {}
for technique in techniques:
    data_dict[(technique, "Ranking Medio")] = [ranking_medio[technique].get(c, "-") for c in all_caract]
    data_dict[(technique, "Conteo")] = [apariciones_count[technique].get(c, "-") for c in all_caract]

# Create the summary DataFrame.
df_resumen_gb_fn = pd.DataFrame(data_dict, index=all_caract)

df_resumen_gb_fn

In [None]:
# The per-technique columns mix numbers with the '-' placeholder. Convert them
# explicitly with pd.to_numeric ('-' becomes NaN) rather than relying on
# replace() to downcast object columns, which pandas has deprecated.
columnas_ranking = [(tech, "Ranking Medio") for tech in techniques]
columnas_conteo = [(tech, "Conteo") for tech in techniques]

# Overall mean rank, ignoring the techniques that did not rank the feature.
df_resumen_gb_fn[("General", "Ranking")] = (
    df_resumen_gb_fn[columnas_ranking]
    .apply(pd.to_numeric, errors="coerce")
    .mean(axis=1)
)

# Total number of appearances; a missing count contributes 0.
df_resumen_gb_fn[("General", "Conteo Total")] = (
    df_resumen_gb_fn[columnas_conteo]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
    .sum(axis=1)
    .astype(int)
)

# Show the updated summary DataFrame.
df_resumen_gb_fn

In [None]:
# Number of features in the summary table.
num_caract = df_resumen_gb_fn.shape[0]

# Rank weight: 1 for the best (lowest) mean rank, decreasing by 1/num_caract
# per position.
posicion_ranking = df_resumen_gb_fn[("General", "Ranking")].rank(ascending=True)
peso_rango = 1 - ((posicion_ranking - 1) / num_caract)

# Count weight: grows with the total number of appearances.
posicion_conteo = df_resumen_gb_fn[("General", "Conteo Total")].rank(ascending=True)
peso_conteo = posicion_conteo / num_caract

# The final score is the sum of both weights. The overall ranking is the
# position of that score, highest first, with ties sharing the lowest rank.
# The weights and the score are kept as local Series, so no helper columns
# have to be added to the table and dropped again.
puntaje = peso_rango + peso_conteo
df_resumen_gb_fn[("General", "Ranking")] = puntaje.rank(ascending=False, method="min")

df_resumen_gb_fn

In [None]:
# Current columns of the summary table.
columns = df_resumen_gb_fn.columns

# The two "General" sub-columns, in the order they should be shown.
general_subcolumns = [("General", "Ranking"), ("General", "Conteo Total")]

# Every column belonging to a technique, in its existing order.
technique_subcolumns = [col for col in columns if col[0] != "General"]

# Put "General" first, followed by the technique columns.
new_columns = general_subcolumns + technique_subcolumns
df_resumen_gb_fn = df_resumen_gb_fn[new_columns]

In [None]:
def _formatear_ranking_medio(valor):
    """Return numeric values as a two-decimal string; leave anything else untouched."""
    if isinstance(valor, (int, float)):
        return f"{valor:.2f}"
    return valor


# Best overall ranking first.
df_resumen_gb_fn.sort_values(by=("General", "Ranking"), ascending=True, inplace=True)

# Format the mean-rank column of every technique.
for tech in techniques:
    columna = (tech, "Ranking Medio")
    df_resumen_gb_fn[columna] = df_resumen_gb_fn[columna].apply(_formatear_ranking_medio)

df_resumen_gb_fn

## **AdaBoosting:**

**FEATURE IMPORTANCE**

In [None]:
# Permutation importance of the AdaBoost model on the test split, scored with F1.
permu = permutation_importance(modelo_ada, x_test, y_test, n_repeats=20, random_state=42, n_jobs=2, scoring='f1')

# Threshold above which a mean importance is considered significant.
importance_threshold = 0.01

# Keep only the features above the threshold (boolean mask over all features).
significant_indices = permu.importances_mean > importance_threshold
permu_importances = pd.Series(permu.importances_mean.round(3), index=feature_names)[significant_indices]
permu_std = permu.importances_std[significant_indices]

# Bar chart of the mean importances, with the standard deviation as error bars.
fig, ax = plt.subplots()
permu_importances.plot.bar(yerr=permu_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
# The scorer is F1, so the bars show the drop in F1, not in accuracy.
ax.set_ylabel("Mean F1 decrease")
fig.tight_layout()
plt.show()

In [None]:
scoring = ['precision', 'recall', 'f1']
metric_names = ['Precision', 'Recall', 'F1-score']

# Only features whose mean importance exceeds this threshold are drawn.
importance_threshold = 0.01

permu_score = permutation_importance(modelo_ada, x_test, y_test, n_repeats=20, random_state=42, scoring=scoring)
fig, axs = plt.subplots(1, 3, figsize=(15, 5))

# One horizontal bar chart per metric.
for ax, metric, metric_label in zip(axs, scoring, metric_names):
    permu = permu_score[metric]

    # Positions of the features above the threshold.
    keep = np.flatnonzero(permu.importances_mean > importance_threshold)

    # Sort ascending by mean importance; barh draws the first bar at the
    # bottom, so the most important feature ends up at the top.
    order = keep[np.argsort(permu.importances_mean[keep])]
    sorted_feature_names = [feature_names[j] for j in order]
    importances_mean = permu.importances_mean[order]
    importances_std = permu.importances_std[order]

    # Draw the chart in the subplot of this metric.
    positions = range(len(sorted_feature_names))
    ax.barh(positions, importances_mean, xerr=importances_std, align='center')
    ax.set_yticks(positions)
    ax.set_yticklabels(sorted_feature_names)
    ax.set_xlabel('Valor Importancia')
    ax.set_title(f'Importancia por Permutación para {metric_label}')

# Adjust the spacing between subplots and show the figure.
plt.tight_layout()
plt.show()


In [None]:
scoring = ['precision', 'recall', 'f1']
metric_names = ['Precision', 'Recall', 'F1-score']

# Only features whose mean importance exceeds this threshold are kept.
importance_threshold = 0.01

# One DataFrame per metric, stored under the key 'df_global_<metric name>'.
results_global_ada = {}

permu_score = permutation_importance(modelo_ada, x_test, y_test, n_repeats=20, random_state=42, scoring=scoring)
for metric, metric_label in zip(scoring, metric_names):
    permu = permu_score[metric]

    # Positions of the features above the threshold.
    keep = np.flatnonzero(permu.importances_mean > importance_threshold)

    # Table of the kept features, most important first.
    df_exp_global = pd.DataFrame(
        {
            'Feature': [feature_names[j] for j in keep],
            'Importance_Mean': permu.importances_mean[keep],
            'Importance_Std': permu.importances_std[keep],
        }
    ).sort_values(by='Importance_Mean', ascending=False)

    results_global_ada[f'df_global_{metric_label}'] = df_exp_global

In [None]:
# Show the permutation-importance table stored for precision.
results_global_ada['df_global_Precision']

In [None]:
# Show the permutation-importance table stored for recall.
results_global_ada['df_global_Recall']

In [None]:
# Show the permutation-importance table stored for the F1 score.
results_global_ada['df_global_F1-score']

**BREAK-DOWN, SHAP Y LIME:**

In [None]:
# First define the dalex explainer for modelo_ada, built on the training split.
# It replaces whatever `exp` held before; the cells below use it for all local
# explanations.
exp = dx.Explainer(modelo_ada, x_train, y_train)

### **Instancia VP MAX:**

In [None]:
# Explain the max VP instance with the explainer `exp` using three local
# techniques: break-down, SHAP and a surrogate model. random_state is fixed.
breakdown_vp_max = exp.predict_parts(df_instancia_vp_max, type="break_down",random_state=42)
shap_vp_max = exp.predict_parts(df_instancia_vp_max, type="shap",random_state=42)
lime_vp_max = exp.predict_surrogate(df_instancia_vp_max, random_state=42)

# Keep the tabular result of each explanation for the ranking tables built later.
breakdown_vp_df_max = breakdown_vp_max.result
shap_vp_df_max = shap_vp_max.result
lime_vp_df_max=lime_vp_max.result

In [None]:
# Draw the break-down explanation of the max VP instance.
breakdown_vp_max.plot()

In [None]:
# Draw the SHAP explanation of the max VP instance.
shap_vp_max.plot()

In [None]:
# Draw the surrogate (LIME) explanation of the max VP instance.
lime_vp_max.plot()

In [None]:
# Reduce the three explanations of the max VP instance to top-5 tables with
# the columns Variable / Ranking / Signo.

# --- Break-down ---
breakdown_vp_df_max = breakdown_vp_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# Discard the rows labelled 0 and 26.
# NOTE(review): presumably the intercept and final-prediction rows of a
# 25-feature break-down -- confirm; the labels are hard-coded.
breakdown_vp_df_max = breakdown_vp_df_max.drop(index=[0, 26])
breakdown_vp_df_max['sign'] = breakdown_vp_df_max['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
# Largest absolute contribution first.
breakdown_vp_df_max = breakdown_vp_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- SHAP ---
shap_vp_df_max = shap_vp_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): assumes the last 25 rows hold the averaged per-feature
# contributions rather than a single sampled ordering -- confirm against the
# layout of the dalex SHAP result.
shap_vp_df_max = shap_vp_df_max.tail(25)
shap_vp_df_max['sign'] = shap_vp_df_max['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_vp_df_max = shap_vp_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- LIME ---
# Take the first identifier-like token of the rule text as the feature name.
# Taking the first space-separated token instead yields a number for interval
# rules such as "1.00 < developer_num <= 3.00".
lime_vp_df_max["Variable"] = lime_vp_df_max["variable"].str.extract(r"\b([A-Za-z_]\w*)", expand=False)
lime_vp_df_max["Signo"] = lime_vp_df_max["effect"].apply(evaluar_valor)
lime_vp_df_max = lime_vp_df_max.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_vp_df_max = lime_vp_df_max.drop(columns=['variable'])

# Add a ranking column. method='min' keeps tied values on an integer rank
# instead of truncating the fractional average rank.
breakdown_vp_df_max['Ranking'] = breakdown_vp_df_max['contribution'].abs().rank(ascending=False, method='min').astype(int)
breakdown_vp_df_max.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
breakdown_vp_df_max = breakdown_vp_df_max[['Variable', 'Ranking', 'contribution', 'Signo']]

shap_vp_df_max['Ranking'] = shap_vp_df_max['contribution'].abs().rank(ascending=False, method='min').astype(int)
shap_vp_df_max.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_vp_df_max = shap_vp_df_max[['Variable', 'Ranking', 'contribution', 'Signo']]

lime_vp_df_max['Ranking'] = lime_vp_df_max['effect'].abs().rank(ascending=False, method='min').astype(int)
lime_vp_df_max = lime_vp_df_max.head(5)
lime_vp_df_max = lime_vp_df_max[['Variable', 'Ranking', 'effect', 'Signo']]

# Drop the contribution column; only rank and sign are compared.
breakdown_vp_df_max = breakdown_vp_df_max.drop(columns=['contribution'])
shap_vp_df_max = shap_vp_df_max.drop(columns=['contribution'])
lime_vp_df_max = lime_vp_df_max.drop(columns=['effect'])

# Keep the 5 largest contributions in absolute value.
breakdown_vp_df_max = breakdown_vp_df_max.head(5)
breakdown_vp_df_max = breakdown_vp_df_max.reset_index(drop=True)

shap_vp_df_max = shap_vp_df_max.head(5)
shap_vp_df_max = shap_vp_df_max.reset_index(drop=True)

lime_vp_df_max = lime_vp_df_max.reset_index(drop=True)

print(breakdown_vp_df_max)
print(shap_vp_df_max)
print(lime_vp_df_max)

In [None]:
#lime_vp_max.show_in_notebook()

In [None]:
# Build the comparison table for the max VP instance: one row per feature that
# appears in the top 5 of at least one technique, with the rank and sign that
# each technique assigned to it ('-' when the feature is not in that top 5).
breakdown_features = list(breakdown_vp_df_max['Variable'])
shapley_features = list(shap_vp_df_max['Variable'])
lime_features = list(lime_vp_df_max['Variable'])
# dict.fromkeys de-duplicates while keeping first-seen order, so the rows come
# out in the same order on every run (a set's order depends on the hash seed).
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

df_final_max = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_max['Variable'] = all_features
df_final_max = df_final_max[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# Top-5 table of each technique, keyed by its top-level column name.
top5_por_tecnica = {
    'Breakdown': breakdown_vp_df_max,
    'Shapley': shap_vp_df_max,
    'Lime': lime_vp_df_max,
}

for feature in df_final_max['Variable']:
    fila = df_final_max['Variable'] == feature
    for tecnica, top5 in top5_por_tecnica.items():
        coincidencia = top5[top5['Variable'] == feature]
        if coincidencia.empty:
            # The feature is not in this technique's top 5.
            ranking, signo = '-', '-'
        else:
            ranking = coincidencia.iloc[0]['Ranking']
            signo = coincidencia.iloc[0]['Signo']
        df_final_max.loc[fila, (tecnica, 'Ranking')] = ranking
        df_final_max.loc[fila, (tecnica, 'Signo')] = signo

df_final_max

In [None]:
# Hand-picked row order for the max-VP comparison table.
nuevo_orden = [
    "parallel_changed_file_num",
    "commit_num",
    "line_removed",
    "file_modified",
    "developer_num",
    "file_removed",
    "file_added",
    "line_added",
]

# Index the table by feature name and arrange its rows in that order.
df_final_max = df_final_max.set_index('Variable').reindex(nuevo_orden)

df_final_max

### **Instancia VP MEDIANA:**

In [None]:
# Explain the median VP instance with the explainer `exp` using three local
# techniques: break-down, SHAP and a surrogate model. random_state is fixed.
breakdown_vp_mediana = exp.predict_parts(df_instancia_vp_mediana, type="break_down",random_state=42)
shap_vp_mediana = exp.predict_parts(df_instancia_vp_mediana, type="shap",random_state=42)
lime_vp_mediana = exp.predict_surrogate(df_instancia_vp_mediana, random_state=42)

# Keep the tabular result of each explanation for the ranking tables built later.
breakdown_vp_df_mediana = breakdown_vp_mediana.result
shap_vp_df_mediana = shap_vp_mediana.result
lime_vp_df_mediana=lime_vp_mediana.result

In [None]:
# Draw the break-down explanation of the median VP instance.
breakdown_vp_mediana.plot()

In [None]:
# Draw the SHAP explanation of the median VP instance.
shap_vp_mediana.plot()

In [None]:
# Draw the surrogate (LIME) explanation of the median VP instance.
lime_vp_mediana.plot()

In [None]:
# Reduce the three explanations of the median VP instance to top-5 tables with
# the columns Variable / Ranking / Signo.

# --- Break-down ---
breakdown_vp_df_mediana = breakdown_vp_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
# Discard the rows labelled 0 and 26.
# NOTE(review): presumably the intercept and final-prediction rows of a
# 25-feature break-down -- confirm; the labels are hard-coded.
breakdown_vp_df_mediana = breakdown_vp_df_mediana.drop(index=[0, 26])
breakdown_vp_df_mediana['sign'] = breakdown_vp_df_mediana['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
# Largest absolute contribution first.
breakdown_vp_df_mediana = breakdown_vp_df_mediana.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- SHAP ---
shap_vp_df_mediana = shap_vp_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): assumes the last 25 rows hold the averaged per-feature
# contributions rather than a single sampled ordering -- confirm against the
# layout of the dalex SHAP result.
shap_vp_df_mediana = shap_vp_df_mediana.tail(25)
shap_vp_df_mediana['sign'] = shap_vp_df_mediana['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_vp_df_mediana = shap_vp_df_mediana.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- LIME ---
# Take the first identifier-like token of the rule text as the feature name.
# Taking the first space-separated token instead yields a number for interval
# rules such as "1.00 < developer_num <= 3.00".
lime_vp_df_mediana["Variable"] = lime_vp_df_mediana["variable"].str.extract(r"\b([A-Za-z_]\w*)", expand=False)
lime_vp_df_mediana["Signo"] = lime_vp_df_mediana["effect"].apply(evaluar_valor)
lime_vp_df_mediana = lime_vp_df_mediana.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_vp_df_mediana = lime_vp_df_mediana.drop(columns=['variable'])

# Add a ranking column. method='min' keeps tied values on an integer rank
# instead of truncating the fractional average rank.
breakdown_vp_df_mediana['Ranking'] = breakdown_vp_df_mediana['contribution'].abs().rank(ascending=False, method='min').astype(int)
breakdown_vp_df_mediana.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
breakdown_vp_df_mediana = breakdown_vp_df_mediana[['Variable', 'Ranking', 'contribution', 'Signo']]

shap_vp_df_mediana['Ranking'] = shap_vp_df_mediana['contribution'].abs().rank(ascending=False, method='min').astype(int)
shap_vp_df_mediana.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_vp_df_mediana = shap_vp_df_mediana[['Variable', 'Ranking', 'contribution', 'Signo']]

lime_vp_df_mediana['Ranking'] = lime_vp_df_mediana['effect'].abs().rank(ascending=False, method='min').astype(int)
lime_vp_df_mediana = lime_vp_df_mediana.head(5)
lime_vp_df_mediana = lime_vp_df_mediana[['Variable', 'Ranking', 'effect', 'Signo']]

# Drop the contribution column; only rank and sign are compared.
breakdown_vp_df_mediana = breakdown_vp_df_mediana.drop(columns=['contribution'])
shap_vp_df_mediana = shap_vp_df_mediana.drop(columns=['contribution'])
lime_vp_df_mediana = lime_vp_df_mediana.drop(columns=['effect'])

# Keep the 5 largest contributions in absolute value.
breakdown_vp_df_mediana = breakdown_vp_df_mediana.head(5)
breakdown_vp_df_mediana = breakdown_vp_df_mediana.reset_index(drop=True)

shap_vp_df_mediana = shap_vp_df_mediana.head(5)
shap_vp_df_mediana = shap_vp_df_mediana.reset_index(drop=True)

lime_vp_df_mediana = lime_vp_df_mediana.reset_index(drop=True)

print(breakdown_vp_df_mediana)
print(shap_vp_df_mediana)
print(lime_vp_df_mediana)

In [None]:
#lime_vp_mediana.show_in_notebook()

In [None]:
# Features reported by each technique's top-5 table for the VP-median instance.
breakdown_features = list(breakdown_vp_df_mediana['Variable'])
shapley_features = list(shap_vp_df_mediana['Variable'])
lime_features = list(lime_vp_df_mediana['Variable'])
# De-duplicate while keeping first-seen order. A plain set() orders strings
# differently from one interpreter session to the next, which made the row
# order of the table (and of the printed list) non-reproducible.
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

# Empty comparison table: one row per feature, columns taken from columns_multi.
df_final_mediana = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_mediana['Variable'] = list(all_features)
df_final_mediana = df_final_mediana[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# For every feature and technique, copy the ranking and sign found in that
# technique's table, or write "-" when the feature is not listed there.
tablas_por_tecnica = {
    'Breakdown': breakdown_vp_df_mediana,
    'Shapley': shap_vp_df_mediana,
    'Lime': lime_vp_df_mediana,
}
for feature in df_final_mediana['Variable']:
    fila_destino = df_final_mediana['Variable'] == feature
    for nombre_tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        for campo in ('Ranking', 'Signo'):
            valor = '-' if coincidencias.empty else coincidencias.iloc[0][campo]
            df_final_mediana.loc[fila_destino, (nombre_tecnica, campo)] = valor

df_final_mediana

In [None]:
# Use the feature name as the row index.
df_final_mediana.set_index('Variable', inplace=True)

# Preferred presentation order of the rows.
nuevo_orden = ["parallel_changed_file_num", "file_removed", "commit_num", "file_modified", "line_added", "file_added"]

# Reorder without losing or inventing rows: a bare reindex(nuevo_orden) would
# silently drop features missing from the list and create all-NaN rows for
# listed features that are not in the table (which later breaks int(ranking)).
presentes = [variable for variable in nuevo_orden if variable in df_final_mediana.index]
no_listadas = [variable for variable in df_final_mediana.index if variable not in nuevo_orden]
df_final_mediana = df_final_mediana.reindex(presentes + no_listadas)

df_final_mediana

### **Instancia VP MIN:**

In [None]:
# Explain the VP-min instance with break-down, SHAP and the surrogate (LIME)
# explainer, always with random_state fixed to 42.
breakdown_vp_min, shap_vp_min = (
    exp.predict_parts(df_instancia_vp_min, type=tipo, random_state=42)
    for tipo in ("break_down", "shap")
)
lime_vp_min = exp.predict_surrogate(df_instancia_vp_min, random_state=42)

# Result tables of each explanation; they are post-processed in a later cell.
breakdown_vp_df_min, shap_vp_df_min, lime_vp_df_min = (
    explicacion.result for explicacion in (breakdown_vp_min, shap_vp_min, lime_vp_min)
)

In [None]:
# Draw the plot of the break-down explanation for the VP-min instance.
breakdown_vp_min.plot()

In [None]:
# Draw the plot of the SHAP explanation for the VP-min instance.
shap_vp_min.plot()

In [None]:
# Draw the plot of the surrogate (LIME) explanation for the VP-min instance.
lime_vp_min.plot()

In [None]:
# Text labels for the numeric sign codes stored in the 'sign' column.
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}

# --- Break-down ---
# Keep the attribution columns, drop the rows labelled 0 and 26 (presumably the
# intercept and final-prediction rows of a 25-feature model -- TODO confirm)
# and order by absolute contribution, largest first.
breakdown_vp_df_min = (
    breakdown_vp_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
    .drop(index=[0, 26])
    .sort_values(by='contribution', key=abs, ascending=False)
)
# method='min' gives tied contributions the same integer rank; the default
# 'average' produces fractional ranks that astype(int) silently truncates.
breakdown_vp_df_min['Ranking'] = (
    breakdown_vp_df_min['contribution'].abs().rank(ascending=False, method='min').astype(int)
)
breakdown_vp_df_min['Signo'] = breakdown_vp_df_min['sign'].replace(etiquetas_signo)
# Keep the five largest contributions, without the raw contribution value.
breakdown_vp_df_min = (
    breakdown_vp_df_min.rename(columns={'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# --- SHAP ---
# Same treatment, keeping only the last 25 rows of the result (presumably the
# per-feature aggregated values -- TODO confirm).
shap_vp_df_min = (
    shap_vp_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
    .tail(25)
    .sort_values(by='contribution', key=abs, ascending=False)
)
shap_vp_df_min['Ranking'] = (
    shap_vp_df_min['contribution'].abs().rank(ascending=False, method='min').astype(int)
)
shap_vp_df_min['Signo'] = shap_vp_df_min['sign'].replace(etiquetas_signo)
shap_vp_df_min = (
    shap_vp_df_min.rename(columns={'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# --- LIME ---
# sort_values returns a new frame, so the columns added below are not written
# into the explanation object's own result table.
lime_vp_df_min = lime_vp_df_min.sort_values(by='effect', key=abs, ascending=False)
# LIME describes a feature as a condition such as "x <= 3.00" or
# "1.00 < x <= 3.00". Take the first identifier-like token instead of the first
# space-separated token, which is the lower bound for two-sided conditions.
lime_vp_df_min['Variable'] = lime_vp_df_min['variable'].str.extract(r'([A-Za-z_]\w*)', expand=False)
lime_vp_df_min['Signo'] = lime_vp_df_min['effect'].apply(evaluar_valor)
lime_vp_df_min['Ranking'] = (
    lime_vp_df_min['effect'].abs().rank(ascending=False, method='min').astype(int)
)
lime_vp_df_min = (
    lime_vp_df_min.loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

print(breakdown_vp_df_min)
print(shap_vp_df_min)
print(lime_vp_df_min)

In [None]:
#lime_vp_min.show_in_notebook()

In [None]:
# Features reported by each technique's top-5 table for the VP-min instance.
breakdown_features = list(breakdown_vp_df_min['Variable'])
shapley_features = list(shap_vp_df_min['Variable'])
lime_features = list(lime_vp_df_min['Variable'])
# De-duplicate while keeping first-seen order. A plain set() orders strings
# differently from one interpreter session to the next, which made the row
# order of the table (and of the printed list) non-reproducible.
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

# Empty comparison table: one row per feature, columns taken from columns_multi.
df_final_min = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_min['Variable'] = list(all_features)
df_final_min = df_final_min[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# For every feature and technique, copy the ranking and sign found in that
# technique's table, or write "-" when the feature is not listed there.
tablas_por_tecnica = {
    'Breakdown': breakdown_vp_df_min,
    'Shapley': shap_vp_df_min,
    'Lime': lime_vp_df_min,
}
for feature in df_final_min['Variable']:
    fila_destino = df_final_min['Variable'] == feature
    for nombre_tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        for campo in ('Ranking', 'Signo'):
            valor = '-' if coincidencias.empty else coincidencias.iloc[0][campo]
            df_final_min.loc[fila_destino, (nombre_tecnica, campo)] = valor

df_final_min

In [None]:
# Use the feature name as the row index.
df_final_min.set_index('Variable', inplace=True)

# Preferred presentation order of the rows.
nuevo_orden = ["parallel_changed_file_num", "commit_num", "developer_num", "line_removed", "messages_median", "file_removed", "file_added"]

# Reorder without losing or inventing rows: a bare reindex(nuevo_orden) would
# silently drop features missing from the list and create all-NaN rows for
# listed features that are not in the table (which later breaks int(ranking)).
presentes = [variable for variable in nuevo_orden if variable in df_final_min.index]
no_listadas = [variable for variable in df_final_min.index if variable not in nuevo_orden]
df_final_min = df_final_min.reindex(presentes + no_listadas)

df_final_min

### **VP General:**

In [None]:
# Techniques compared and the per-instance tables they are read from.
tecnicas_xai = ('Breakdown', 'Shapley', 'Lime')
tablas_instancias = (df_final_max, df_final_mediana, df_final_min)

# Collect, per technique, every (feature, ranking) pair found in the three
# instance tables (max, median, min -- in that order).
ranking_valores = {tecnica: [] for tecnica in tecnicas_xai}
for tecnica in ranking_valores:
    for tabla in tablas_instancias:
        columna_ranking = tabla[(tecnica, 'Ranking')]
        for caracteristica in tabla.index:
            ranking = columna_ranking[caracteristica]
            # "-" marks a feature the technique did not list. NaN can appear
            # when a table was reindexed with a feature it did not contain;
            # int(NaN) would raise, so both are skipped.
            if pd.isna(ranking) or ranking == "-":
                continue
            ranking_valores[tecnica].append((caracteristica, int(ranking)))

# Group the rankings of each feature per technique.
rankings_agrupados = {tecnica: {} for tecnica in tecnicas_xai}
for tecnica, pares in ranking_valores.items():
    for caracteristica, ranking in pares:
        rankings_agrupados[tecnica].setdefault(caracteristica, []).append(ranking)

# Mean ranking per technique and feature.
ranking_medio = {
    tecnica: {caracteristica: sum(valores) / len(valores) for caracteristica, valores in grupos.items()}
    for tecnica, grupos in rankings_agrupados.items()
}

# Number of appearances per technique and feature.
apariciones_count = {
    tecnica: {caracteristica: len(valores) for caracteristica, valores in grupos.items()}
    for tecnica, grupos in rankings_agrupados.items()
}

# Raw pairs, appearance counts and mean rankings per technique.
print(ranking_valores)
print(apariciones_count)
print(ranking_medio)

In [None]:
# Techniques to summarise.
techniques = ['Breakdown', 'Shapley', 'Lime']

# Column data for the summary frame, keyed by (technique, measure).
data_dict = {}

# Every feature seen by at least one technique. Sorted so the row order is
# reproducible (set iteration order of strings changes between sessions).
all_caract = sorted(set().union(*(ranking_medio[technique] for technique in techniques)))

# For each technique, the mean ranking and appearance count of every feature,
# with "-" where the technique never listed the feature.
for technique in techniques:
    data_dict[(technique, "Ranking Medio")] = [ranking_medio[technique].get(c, "-") for c in all_caract]
    data_dict[(technique, "Conteo")] = [apariciones_count[technique].get(c, "-") for c in all_caract]

# Summary frame: one row per feature.
df_resumen_ada_vp = pd.DataFrame(data_dict, index=all_caract)

df_resumen_ada_vp

In [None]:
columnas_ranking = [(tech, "Ranking Medio") for tech in techniques]
columnas_conteo = [(tech, "Conteo") for tech in techniques]

# Overall mean ranking, ignoring the "-" placeholders. The columns hold mixed
# objects, so they are converted explicitly ("-" becomes NaN) instead of
# relying on replace() to downcast them, which pandas has deprecated.
df_resumen_ada_vp[("General", "Ranking")] = (
    df_resumen_ada_vp[columnas_ranking].apply(pd.to_numeric, errors="coerce").mean(axis=1)
)

# Total number of appearances, counting "-" as zero.
df_resumen_ada_vp[("General", "Conteo Total")] = (
    df_resumen_ada_vp[columnas_conteo]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
    .astype(int)
    .sum(axis=1)
)

# Show the updated summary frame.
df_resumen_ada_vp

In [None]:
# Number of features (rows) in the summary.
num_caract = len(df_resumen_ada_vp)

# Weight from the mean ranking: the best (lowest) mean ranking gets the largest weight.
peso_rango = 1 - ((df_resumen_ada_vp[("General", "Ranking")].rank(ascending=True) - 1) / num_caract)
# Weight from the appearance count: more appearances give a larger weight.
peso_conteo = df_resumen_ada_vp[("General", "Conteo Total")].rank(ascending=True) / num_caract

# Final score is the sum of both weights; the general ranking is the rank of
# that score (highest score first, ties share the lowest rank). The weights and
# the score are kept in local variables, so no helper columns are left behind.
puntaje = peso_rango + peso_conteo
df_resumen_ada_vp[("General", "Ranking")] = puntaje.rank(ascending=False, method="min")

df_resumen_ada_vp

In [None]:
# Current columns of the summary frame.
columns = df_resumen_ada_vp.columns

# "General" sub-columns, in the order they should be shown.
general_subcolumns = [("General", "Ranking"), ("General", "Conteo Total")]

# Every column whose top-level label is not "General" (the per-technique ones).
technique_subcolumns = [columna for columna in columns if columna[0] != "General"]

# Move the "General" group to the front.
new_columns = general_subcolumns + technique_subcolumns
df_resumen_ada_vp = df_resumen_ada_vp[new_columns]

In [None]:
# Order the features by their general ranking, best first.
df_resumen_ada_vp = df_resumen_ada_vp.sort_values(by=("General", "Ranking"), ascending=True)

# Show numeric mean rankings with two decimals; placeholders such as "-" are kept as they are.
for tech in techniques:
    columna = (tech, "Ranking Medio")
    df_resumen_ada_vp[columna] = [
        f"{valor:.2f}" if isinstance(valor, (int, float)) else valor
        for valor in df_resumen_ada_vp[columna]
    ]

df_resumen_ada_vp

### **Instancia VN MAX:**

In [None]:
# Explain the VN-max instance with break-down, SHAP and the surrogate (LIME)
# explainer, always with random_state fixed to 42.
breakdown_vn_max, shap_vn_max = (
    exp.predict_parts(df_instancia_vn_max, type=tipo, random_state=42)
    for tipo in ("break_down", "shap")
)
lime_vn_max = exp.predict_surrogate(df_instancia_vn_max, random_state=42)

# Result tables of each explanation; they are post-processed in a later cell.
breakdown_vn_df_max, shap_vn_df_max, lime_vn_df_max = (
    explicacion.result for explicacion in (breakdown_vn_max, shap_vn_max, lime_vn_max)
)

In [None]:
# Draw the plot of the break-down explanation for the VN-max instance.
breakdown_vn_max.plot()

In [None]:
# Draw the plot of the SHAP explanation for the VN-max instance.
shap_vn_max.plot()

In [None]:
# Draw the plot of the surrogate (LIME) explanation for the VN-max instance.
lime_vn_max.plot()

In [None]:
# Text labels for the numeric sign codes stored in the 'sign' column.
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}

# --- Break-down ---
# Keep the attribution columns, drop the rows labelled 0 and 26 (presumably the
# intercept and final-prediction rows of a 25-feature model -- TODO confirm)
# and order by absolute contribution, largest first.
breakdown_vn_df_max = (
    breakdown_vn_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
    .drop(index=[0, 26])
    .sort_values(by='contribution', key=abs, ascending=False)
)
# method='min' gives tied contributions the same integer rank; the default
# 'average' produces fractional ranks that astype(int) silently truncates.
breakdown_vn_df_max['Ranking'] = (
    breakdown_vn_df_max['contribution'].abs().rank(ascending=False, method='min').astype(int)
)
breakdown_vn_df_max['Signo'] = breakdown_vn_df_max['sign'].replace(etiquetas_signo)
# Keep the five largest contributions, without the raw contribution value.
breakdown_vn_df_max = (
    breakdown_vn_df_max.rename(columns={'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# --- SHAP ---
# Same treatment, keeping only the last 25 rows of the result (presumably the
# per-feature aggregated values -- TODO confirm).
shap_vn_df_max = (
    shap_vn_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
    .tail(25)
    .sort_values(by='contribution', key=abs, ascending=False)
)
shap_vn_df_max['Ranking'] = (
    shap_vn_df_max['contribution'].abs().rank(ascending=False, method='min').astype(int)
)
shap_vn_df_max['Signo'] = shap_vn_df_max['sign'].replace(etiquetas_signo)
shap_vn_df_max = (
    shap_vn_df_max.rename(columns={'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# --- LIME ---
# sort_values returns a new frame, so the columns added below are not written
# into the explanation object's own result table.
lime_vn_df_max = lime_vn_df_max.sort_values(by='effect', key=abs, ascending=False)
# LIME describes a feature as a condition such as "x <= 3.00" or
# "1.00 < x <= 3.00". Take the first identifier-like token instead of the first
# space-separated token, which is the lower bound for two-sided conditions.
# This replaces the earlier position-based manual renames of rows 2 and 3,
# which only held for one specific row order.
lime_vn_df_max['Variable'] = lime_vn_df_max['variable'].str.extract(r'([A-Za-z_]\w*)', expand=False)
lime_vn_df_max['Signo'] = lime_vn_df_max['effect'].apply(evaluar_valor)
lime_vn_df_max['Ranking'] = (
    lime_vn_df_max['effect'].abs().rank(ascending=False, method='min').astype(int)
)
lime_vn_df_max = (
    lime_vn_df_max.loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

print(breakdown_vn_df_max)
print(shap_vn_df_max)
print(lime_vn_df_max)

In [None]:
#lime_vn_max.show_in_notebook()

In [None]:
# Features reported by each technique's top-5 table for the VN-max instance.
breakdown_features = list(breakdown_vn_df_max['Variable'])
shapley_features = list(shap_vn_df_max['Variable'])
lime_features = list(lime_vn_df_max['Variable'])
# De-duplicate while keeping first-seen order. A plain set() orders strings
# differently from one interpreter session to the next, which made the row
# order of the table (and of the printed list) non-reproducible.
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

# Empty comparison table: one row per feature, columns taken from columns_multi.
df_final_max = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_max['Variable'] = list(all_features)
df_final_max = df_final_max[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# For every feature and technique, copy the ranking and sign found in that
# technique's table, or write "-" when the feature is not listed there.
tablas_por_tecnica = {
    'Breakdown': breakdown_vn_df_max,
    'Shapley': shap_vn_df_max,
    'Lime': lime_vn_df_max,
}
for feature in df_final_max['Variable']:
    fila_destino = df_final_max['Variable'] == feature
    for nombre_tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        for campo in ('Ranking', 'Signo'):
            valor = '-' if coincidencias.empty else coincidencias.iloc[0][campo]
            df_final_max.loc[fila_destino, (nombre_tecnica, campo)] = valor

df_final_max

In [None]:
# Use the feature name as the row index.
df_final_max.set_index('Variable', inplace=True)

# Preferred presentation order of the rows.
nuevo_orden = ["parallel_changed_file_num", "improve_frequency", "commit_num", "developer_num", "duration", "file_removed", "file_modified", "commit_density"]

# Reorder without losing or inventing rows: a bare reindex(nuevo_orden) would
# silently drop features missing from the list and create all-NaN rows for
# listed features that are not in the table (which later breaks int(ranking)).
presentes = [variable for variable in nuevo_orden if variable in df_final_max.index]
no_listadas = [variable for variable in df_final_max.index if variable not in nuevo_orden]
df_final_max = df_final_max.reindex(presentes + no_listadas)

df_final_max

### **Instancia VN MEDIANA:**

In [None]:
# Explain the VN-median instance with break-down, SHAP and the surrogate (LIME)
# explainer, always with random_state fixed to 42.
breakdown_vn_mediana, shap_vn_mediana = (
    exp.predict_parts(df_instancia_vn_mediana, type=tipo, random_state=42)
    for tipo in ("break_down", "shap")
)
lime_vn_mediana = exp.predict_surrogate(df_instancia_vn_mediana, random_state=42)

# Result tables of each explanation; they are post-processed in a later cell.
breakdown_vn_df_mediana, shap_vn_df_mediana, lime_vn_df_mediana = (
    explicacion.result for explicacion in (breakdown_vn_mediana, shap_vn_mediana, lime_vn_mediana)
)

In [None]:
# Draw the plot of the break-down explanation for the VN-median instance.
breakdown_vn_mediana.plot()

In [None]:
# Draw the plot of the SHAP explanation for the VN-median instance.
shap_vn_mediana.plot()

In [None]:
# Draw the plot of the surrogate (LIME) explanation for the VN-median instance.
lime_vn_mediana.plot()

In [None]:
# Text labels for the numeric sign codes stored in the 'sign' column.
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}

# --- Break-down ---
# Keep the attribution columns, drop the rows labelled 0 and 26 (presumably the
# intercept and final-prediction rows of a 25-feature model -- TODO confirm)
# and order by absolute contribution, largest first.
breakdown_vn_df_mediana = (
    breakdown_vn_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
    .drop(index=[0, 26])
    .sort_values(by='contribution', key=abs, ascending=False)
)
# method='min' gives tied contributions the same integer rank; the default
# 'average' produces fractional ranks that astype(int) silently truncates.
breakdown_vn_df_mediana['Ranking'] = (
    breakdown_vn_df_mediana['contribution'].abs().rank(ascending=False, method='min').astype(int)
)
breakdown_vn_df_mediana['Signo'] = breakdown_vn_df_mediana['sign'].replace(etiquetas_signo)
# Keep the five largest contributions, without the raw contribution value.
breakdown_vn_df_mediana = (
    breakdown_vn_df_mediana.rename(columns={'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# --- SHAP ---
# Same treatment, keeping only the last 25 rows of the result (presumably the
# per-feature aggregated values -- TODO confirm).
shap_vn_df_mediana = (
    shap_vn_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
    .tail(25)
    .sort_values(by='contribution', key=abs, ascending=False)
)
shap_vn_df_mediana['Ranking'] = (
    shap_vn_df_mediana['contribution'].abs().rank(ascending=False, method='min').astype(int)
)
shap_vn_df_mediana['Signo'] = shap_vn_df_mediana['sign'].replace(etiquetas_signo)
shap_vn_df_mediana = (
    shap_vn_df_mediana.rename(columns={'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# --- LIME ---
# sort_values returns a new frame, so the columns added below are not written
# into the explanation object's own result table.
lime_vn_df_mediana = lime_vn_df_mediana.sort_values(by='effect', key=abs, ascending=False)
# LIME describes a feature as a condition such as "x <= 3.00" or
# "1.00 < x <= 3.00". Take the first identifier-like token instead of the first
# space-separated token, which is the lower bound for two-sided conditions.
# This replaces the earlier position-based manual renames of rows 3 and 4,
# which only held for one specific row order.
lime_vn_df_mediana['Variable'] = lime_vn_df_mediana['variable'].str.extract(r'([A-Za-z_]\w*)', expand=False)
lime_vn_df_mediana['Signo'] = lime_vn_df_mediana['effect'].apply(evaluar_valor)
lime_vn_df_mediana['Ranking'] = (
    lime_vn_df_mediana['effect'].abs().rank(ascending=False, method='min').astype(int)
)
lime_vn_df_mediana = (
    lime_vn_df_mediana.loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

print(breakdown_vn_df_mediana)
print(shap_vn_df_mediana)
print(lime_vn_df_mediana)

In [None]:
#lime_vn_mediana.show_in_notebook()

In [None]:
# Features reported by each technique's top-5 table for the VN-median instance.
breakdown_features = list(breakdown_vn_df_mediana['Variable'])
shapley_features = list(shap_vn_df_mediana['Variable'])
lime_features = list(lime_vn_df_mediana['Variable'])
# De-duplicate while keeping first-seen order. A plain set() orders strings
# differently from one interpreter session to the next, which made the row
# order of the table (and of the printed list) non-reproducible.
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

# Empty comparison table: one row per feature, columns taken from columns_multi.
df_final_mediana = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_mediana['Variable'] = list(all_features)
df_final_mediana = df_final_mediana[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# For every feature and technique, copy the ranking and sign found in that
# technique's table, or write "-" when the feature is not listed there.
tablas_por_tecnica = {
    'Breakdown': breakdown_vn_df_mediana,
    'Shapley': shap_vn_df_mediana,
    'Lime': lime_vn_df_mediana,
}
for feature in df_final_mediana['Variable']:
    fila_destino = df_final_mediana['Variable'] == feature
    for nombre_tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        for campo in ('Ranking', 'Signo'):
            valor = '-' if coincidencias.empty else coincidencias.iloc[0][campo]
            df_final_mediana.loc[fila_destino, (nombre_tecnica, campo)] = valor

df_final_mediana

In [None]:
# Use the feature name as the row index.
df_final_mediana.set_index('Variable', inplace=True)

# Preferred presentation order of the rows.
nuevo_orden = ["parallel_changed_file_num", "commit_num", "developer_num", "line_removed", "file_added", "file_removed", "file_modified"]

# Reorder without losing or inventing rows: a bare reindex(nuevo_orden) would
# silently drop features missing from the list and create all-NaN rows for
# listed features that are not in the table (which later breaks int(ranking)).
presentes = [variable for variable in nuevo_orden if variable in df_final_mediana.index]
no_listadas = [variable for variable in df_final_mediana.index if variable not in nuevo_orden]
df_final_mediana = df_final_mediana.reindex(presentes + no_listadas)

df_final_mediana

### **Instancia VN MIN:**

In [None]:
# Explain the VN-min instance with break-down, SHAP and the surrogate (LIME)
# explainer, always with random_state fixed to 42.
breakdown_vn_min, shap_vn_min = (
    exp.predict_parts(df_instancia_vn_min, type=tipo, random_state=42)
    for tipo in ("break_down", "shap")
)
lime_vn_min = exp.predict_surrogate(df_instancia_vn_min, random_state=42)

# Result tables of each explanation; they are post-processed in a later cell.
breakdown_vn_df_min, shap_vn_df_min, lime_vn_df_min = (
    explicacion.result for explicacion in (breakdown_vn_min, shap_vn_min, lime_vn_min)
)

In [None]:
# Draw the plot of the break-down explanation for the VN-min instance.
breakdown_vn_min.plot()

In [None]:
# Draw the plot of the SHAP explanation for the VN-min instance.
shap_vn_min.plot()

In [None]:
# Draw the plot of the surrogate (LIME) explanation for the VN-min instance.
lime_vn_min.plot()

In [None]:
# Text labels for the numeric sign codes stored in the 'sign' column.
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}

# --- Break-down ---
# Keep the attribution columns, drop the rows labelled 0 and 26 (presumably the
# intercept and final-prediction rows of a 25-feature model -- TODO confirm)
# and order by absolute contribution, largest first.
breakdown_vn_df_min = (
    breakdown_vn_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
    .drop(index=[0, 26])
    .sort_values(by='contribution', key=abs, ascending=False)
)
# method='min' gives tied contributions the same integer rank; the default
# 'average' produces fractional ranks that astype(int) silently truncates.
breakdown_vn_df_min['Ranking'] = (
    breakdown_vn_df_min['contribution'].abs().rank(ascending=False, method='min').astype(int)
)
breakdown_vn_df_min['Signo'] = breakdown_vn_df_min['sign'].replace(etiquetas_signo)
# Keep the five largest contributions, without the raw contribution value.
breakdown_vn_df_min = (
    breakdown_vn_df_min.rename(columns={'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# --- SHAP ---
# Same treatment, keeping only the last 25 rows of the result (presumably the
# per-feature aggregated values -- TODO confirm).
shap_vn_df_min = (
    shap_vn_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
    .tail(25)
    .sort_values(by='contribution', key=abs, ascending=False)
)
shap_vn_df_min['Ranking'] = (
    shap_vn_df_min['contribution'].abs().rank(ascending=False, method='min').astype(int)
)
shap_vn_df_min['Signo'] = shap_vn_df_min['sign'].replace(etiquetas_signo)
shap_vn_df_min = (
    shap_vn_df_min.rename(columns={'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# --- LIME ---
# sort_values returns a new frame, so the columns added below are not written
# into the explanation object's own result table.
lime_vn_df_min = lime_vn_df_min.sort_values(by='effect', key=abs, ascending=False)
# LIME describes a feature as a condition such as "x <= 3.00" or
# "1.00 < x <= 3.00". Take the first identifier-like token instead of the first
# space-separated token, which is the lower bound for two-sided conditions.
# This replaces the earlier position-based manual rename of row 4, which only
# held for one specific row order.
lime_vn_df_min['Variable'] = lime_vn_df_min['variable'].str.extract(r'([A-Za-z_]\w*)', expand=False)
lime_vn_df_min['Signo'] = lime_vn_df_min['effect'].apply(evaluar_valor)
lime_vn_df_min['Ranking'] = (
    lime_vn_df_min['effect'].abs().rank(ascending=False, method='min').astype(int)
)
lime_vn_df_min = (
    lime_vn_df_min.loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

print(breakdown_vn_df_min)
print(shap_vn_df_min)
print(lime_vn_df_min)

In [None]:
#lime_vn_min.show_in_notebook()

In [None]:
# Build the comparison table for this instance: one row per feature that
# appears in the top-5 list of at least one technique, holding that technique's
# rank and sign, or "-" when the technique did not list the feature.
tablas_por_tecnica = {
    'Breakdown': breakdown_vn_df_min,
    'Shapley': shap_vn_df_min,
    'Lime': lime_vn_df_min,
}

breakdown_features = list(breakdown_vn_df_min['Variable'])
shapley_features = list(shap_vn_df_min['Variable'])
lime_features = list(lime_vn_df_min['Variable'])
# dict.fromkeys de-duplicates while keeping first-seen order; going through a
# set made the row/print order depend on string hashing (it could change
# between sessions).
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

# columns_multi is defined earlier in the notebook (not visible here);
# presumably a column MultiIndex with 'Variable' plus a (Ranking, Signo) pair
# per technique -- confirm.
df_final_min = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_min['Variable'] = all_features
df_final_min = df_final_min[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

for feature in df_final_min['Variable']:
    fila_destino = df_final_min['Variable'] == feature
    # Same lookup for every technique: copy rank and sign when listed, else "-".
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            ranking, signo = '-', '-'
        else:
            ranking = coincidencias.iloc[0]['Ranking']
            signo = coincidencias.iloc[0]['Signo']
        df_final_min.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_min.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_min

In [None]:
# Use the feature name as the row label.
df_final_min.set_index('Variable', inplace=True)
# Preferred display order, chosen by hand.
nuevo_orden = ["parallel_changed_file_num", "messages_median", "messages_min", "developer_num", "commit_num", "file_removed", "file_added", "file_modified"]

# Apply the preferred order only to features that are actually present and
# append any feature missing from the list. A plain reindex(nuevo_orden) would
# silently drop unlisted features and insert all-NaN rows for absent ones.
presentes = [v for v in nuevo_orden if v in df_final_min.index]
no_listadas = [v for v in df_final_min.index if v not in nuevo_orden]
df_final_min = df_final_min.reindex(presentes + no_listadas)

df_final_min

### **VN General:**

In [None]:
# Collect, per technique, every (feature, rank) pair found in the three
# per-instance tables (df_final_max, df_final_mediana, df_final_min).
ranking_valores = {
    'Breakdown': [],
    'Shapley': [],
    'Lime': []
}

for tecnica in ranking_valores:
    for tabla_instancia in (df_final_max, df_final_mediana, df_final_min):
        for caracteristica, ranking in tabla_instancia[(tecnica, 'Ranking')].items():
            # "-" means the technique did not list the feature. NaN marks a row
            # that only exists because of a reindex; int(NaN) would raise.
            if ranking == "-" or pd.isna(ranking):
                continue
            ranking_valores[tecnica].append((caracteristica, int(ranking)))

# Group the ranks by technique and feature, keeping first-seen feature order.
rankings_agrupados = {tecnica: {} for tecnica in ranking_valores}
for tecnica, pares in ranking_valores.items():
    for caracteristica, ranking in pares:
        rankings_agrupados[tecnica].setdefault(caracteristica, []).append(ranking)

# Mean rank per technique and feature.
ranking_medio = {
    tecnica: {c: sum(valores) / len(valores) for c, valores in grupos.items()}
    for tecnica, grupos in rankings_agrupados.items()
}

# Number of instances in which each technique listed each feature.
apariciones_count = {
    tecnica: {c: len(valores) for c, valores in grupos.items()}
    for tecnica, grupos in rankings_agrupados.items()
}

# Raw pairs, appearance counts and mean ranks, per technique.
print(ranking_valores)
print(apariciones_count)
print(ranking_medio)

In [None]:
# Techniques, in the column order used by the summary table.
techniques = ['Breakdown', 'Shapley', 'Lime']

# Every feature ranked by at least one technique, in first-seen order
# (deterministic, unlike iterating a set of strings).
all_caract = list(dict.fromkeys(
    c for technique in techniques for c in ranking_medio[technique]
))

# One (technique, "Ranking Medio") and one (technique, "Conteo") column per
# technique; "-" marks features the technique never listed.
data_dict = {}
for technique in techniques:
    data_dict[(technique, "Ranking Medio")] = [ranking_medio[technique].get(c, "-") for c in all_caract]
    data_dict[(technique, "Conteo")] = [apariciones_count[technique].get(c, "-") for c in all_caract]

# Summary table indexed by feature name.
df_resumen_ada_vn = pd.DataFrame(data_dict, index=all_caract)

df_resumen_ada_vn

In [None]:
# Columns holding each technique's mean rank and appearance count.
cols_ranking = [(tech, "Ranking Medio") for tech in techniques]
cols_conteo = [(tech, "Conteo") for tech in techniques]

# "-" marks "not listed by this technique". Convert explicitly with
# pd.to_numeric (non-numeric -> NaN) instead of relying on replace() to
# downcast object columns, which pandas has deprecated.
ranking_numerico = df_resumen_ada_vn[cols_ranking].apply(pd.to_numeric, errors='coerce')
# Row-wise mean over the techniques that did rank the feature (NaN skipped).
df_resumen_ada_vn[("General", "Ranking")] = ranking_numerico.mean(axis=1)

# Total appearances; a missing technique contributes 0. Cast back to int so
# the count is not shown as a float.
conteo_numerico = df_resumen_ada_vn[cols_conteo].apply(pd.to_numeric, errors='coerce').fillna(0)
df_resumen_ada_vn[("General", "Conteo Total")] = conteo_numerico.sum(axis=1).astype(int)

# Show the updated summary table.
df_resumen_ada_vn

In [None]:
# Number of features (rows) in the summary.
num_caract = len(df_resumen_ada_vn)

# Rank-based weights, kept as standalone Series so no helper column has to be
# added to the frame and dropped again afterwards.
posicion_ranking = df_resumen_ada_vn[("General", "Ranking")].rank(ascending=True)
peso_rango = 1 - ((posicion_ranking - 1) / num_caract)
peso_conteo = df_resumen_ada_vn[("General", "Conteo Total")].rank(ascending=True) / num_caract

# Final score is the sum of both weights; the general ranking becomes the
# position of that score (highest score -> rank 1, ties share the lowest rank).
puntaje = peso_rango + peso_conteo
df_resumen_ada_vn[("General", "Ranking")] = puntaje.rank(ascending=False, method="min")

df_resumen_ada_vn

In [None]:
# Current columns of the summary table.
columns = df_resumen_ada_vn.columns

# Boolean mask of the columns under the top-level "General" group.
general_columns = columns.get_level_values(0) == "General"

# "General" sub-columns, in the order they should be displayed.
general_subcolumns = [("General", "Ranking"), ("General", "Conteo Total")]

# Every per-technique column, in its existing order.
technique_subcolumns = list(columns[~general_columns])

# "General" first, then the techniques.
new_columns = general_subcolumns + technique_subcolumns

# .copy() makes the reordered frame independent of the original: it is later
# sorted in place and assigned into, which on a column subset can raise
# pandas' SettingWithCopyWarning.
df_resumen_ada_vn = df_resumen_ada_vn[new_columns].copy()

In [None]:
# Best (lowest) general ranking first.
df_resumen_ada_vn.sort_values(by=("General", "Ranking"), ascending=True, inplace=True)


def _formatear_ranking_medio(valor):
    """Return int/float values as two-decimal strings; pass anything else through."""
    if isinstance(valor, (int, float)):
        return f"{valor:.2f}"
    return valor


# Format each technique's mean rank for display.
for tech in techniques:
    columna = (tech, "Ranking Medio")
    df_resumen_ada_vn[columna] = df_resumen_ada_vn[columna].map(_formatear_ranking_medio)

df_resumen_ada_vn

### **Instancia FP MAX:**

In [None]:
# Explain the FP "max" instance with three local techniques through `exp`
# (presumably the dalex Explainer created earlier in the notebook -- confirm).
# Every call receives random_state=42.
breakdown_fp_max = exp.predict_parts(df_instancia_fp_max, type="break_down",random_state=42)
shap_fp_max = exp.predict_parts(df_instancia_fp_max, type="shap",random_state=42)
lime_fp_max = exp.predict_surrogate(df_instancia_fp_max, random_state=42)

# Keep each explanation's `.result` table for the post-processing cell.
breakdown_fp_df_max = breakdown_fp_max.result
shap_fp_df_max = shap_fp_max.result
lime_fp_df_max=lime_fp_max.result

In [None]:
# Plot the break_down explanation of the FP "max" instance.
breakdown_fp_max.plot()

In [None]:
# Plot the shap explanation of the FP "max" instance.
shap_fp_max.plot()

In [None]:
# Plot the surrogate (LIME) explanation of the FP "max" instance.
lime_fp_max.plot()

In [None]:
# Reduce each explanation table to its five strongest features with a rank
# (1 = largest absolute effect) and a sign label.

# --- Break Down ---
breakdown_fp_df_max = breakdown_fp_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): index labels 0 and 26 are hard-coded; presumably the intercept
# and prediction rows of a 25-feature break_down result -- confirm.
breakdown_fp_df_max = breakdown_fp_df_max.drop(index=[0, 26])
breakdown_fp_df_max['sign'] = breakdown_fp_df_max['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
breakdown_fp_df_max = breakdown_fp_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
# method='min' gives tied contributions the same integer rank; the default
# 'average' produces fractional ranks that astype(int) would truncate.
breakdown_fp_df_max['Ranking'] = breakdown_fp_df_max['contribution'].abs().rank(ascending=False, method='min').astype(int)
breakdown_fp_df_max.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
# Keep the five largest absolute contributions; the raw value is not shown.
breakdown_fp_df_max = breakdown_fp_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# --- Shapley ---
shap_fp_df_max = shap_fp_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): only the last 25 rows are kept; presumably the per-feature
# aggregated contributions -- confirm against the shap result layout.
shap_fp_df_max = shap_fp_df_max.tail(25)
shap_fp_df_max['sign'] = shap_fp_df_max['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_fp_df_max = shap_fp_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
shap_fp_df_max['Ranking'] = shap_fp_df_max['contribution'].abs().rank(ascending=False, method='min').astype(int)
shap_fp_df_max.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_fp_df_max = shap_fp_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# --- LIME ---
# The 'variable' column holds a condition on a feature, not a bare name. When
# the condition starts with a numeric bound (two-sided range) the first
# space-separated token is a number, so take the first identifier-like token
# instead and fall back to the first token if none is found.
nombre_lime = lime_fp_df_max["variable"].str.extract(r'\b([A-Za-z_]\w*)', expand=False)
lime_fp_df_max["Variable"] = nombre_lime.fillna(lime_fp_df_max["variable"].str.split(" ").str[0])
# evaluar_valor is defined elsewhere in the notebook; presumably it maps an
# effect to its sign label -- confirm.
lime_fp_df_max["Signo"] = lime_fp_df_max["effect"].apply(evaluar_valor)
lime_fp_df_max = lime_fp_df_max.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_fp_df_max = lime_fp_df_max.drop(columns=['variable'])
lime_fp_df_max['Ranking'] = lime_fp_df_max['effect'].abs().rank(ascending=False, method='min').astype(int)
lime_fp_df_max = lime_fp_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_fp_df_max)
print(shap_fp_df_max)
print(lime_fp_df_max)

In [None]:
#lime_fp_max.show_in_notebook()

In [None]:
# Build the comparison table for this instance: one row per feature that
# appears in the top-5 list of at least one technique, holding that technique's
# rank and sign, or "-" when the technique did not list the feature.
tablas_por_tecnica = {
    'Breakdown': breakdown_fp_df_max,
    'Shapley': shap_fp_df_max,
    'Lime': lime_fp_df_max,
}

breakdown_features = list(breakdown_fp_df_max['Variable'])
shapley_features = list(shap_fp_df_max['Variable'])
lime_features = list(lime_fp_df_max['Variable'])
# dict.fromkeys de-duplicates while keeping first-seen order; going through a
# set made the row/print order depend on string hashing (it could change
# between sessions).
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

# columns_multi is defined earlier in the notebook (not visible here);
# presumably a column MultiIndex with 'Variable' plus a (Ranking, Signo) pair
# per technique -- confirm.
df_final_max = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_max['Variable'] = all_features
df_final_max = df_final_max[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

for feature in df_final_max['Variable']:
    fila_destino = df_final_max['Variable'] == feature
    # Same lookup for every technique: copy rank and sign when listed, else "-".
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            ranking, signo = '-', '-'
        else:
            ranking = coincidencias.iloc[0]['Ranking']
            signo = coincidencias.iloc[0]['Signo']
        df_final_max.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_max.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_max

In [None]:
# Use the feature name as the row label.
df_final_max.set_index('Variable', inplace=True)
# Preferred display order, chosen by hand.
nuevo_orden = ["parallel_changed_file_num", "commit_num", "line_removed", "line_added", "file_modified", "file_removed", "file_added"]

# Apply the preferred order only to features that are actually present and
# append any feature missing from the list. A plain reindex(nuevo_orden) would
# silently drop unlisted features and insert all-NaN rows for absent ones.
presentes = [v for v in nuevo_orden if v in df_final_max.index]
no_listadas = [v for v in df_final_max.index if v not in nuevo_orden]
df_final_max = df_final_max.reindex(presentes + no_listadas)

df_final_max

### **Instancia FP MEDIANA:**

In [None]:
# Explain the FP "mediana" instance with three local techniques through `exp`
# (presumably the dalex Explainer created earlier in the notebook -- confirm).
# Every call receives random_state=42.
breakdown_fp_mediana = exp.predict_parts(df_instancia_fp_mediana, type="break_down",random_state=42)
shap_fp_mediana = exp.predict_parts(df_instancia_fp_mediana, type="shap",random_state=42)
lime_fp_mediana = exp.predict_surrogate(df_instancia_fp_mediana, random_state=42)

# Keep each explanation's `.result` table for the post-processing cell.
breakdown_fp_df_mediana = breakdown_fp_mediana.result
shap_fp_df_mediana = shap_fp_mediana.result
lime_fp_df_mediana=lime_fp_mediana.result

In [None]:
# Plot the break_down explanation of the FP "mediana" instance.
breakdown_fp_mediana.plot()

In [None]:
# Plot the shap explanation of the FP "mediana" instance.
shap_fp_mediana.plot()

In [None]:
# Plot the surrogate (LIME) explanation of the FP "mediana" instance.
lime_fp_mediana.plot()

In [None]:
# Reduce each explanation table to its five strongest features with a rank
# (1 = largest absolute effect) and a sign label.

# --- Break Down ---
breakdown_fp_df_mediana = breakdown_fp_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): index labels 0 and 26 are hard-coded; presumably the intercept
# and prediction rows of a 25-feature break_down result -- confirm.
breakdown_fp_df_mediana = breakdown_fp_df_mediana.drop(index=[0, 26])
breakdown_fp_df_mediana['sign'] = breakdown_fp_df_mediana['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
breakdown_fp_df_mediana = breakdown_fp_df_mediana.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
# method='min' gives tied contributions the same integer rank; the default
# 'average' produces fractional ranks that astype(int) would truncate.
breakdown_fp_df_mediana['Ranking'] = breakdown_fp_df_mediana['contribution'].abs().rank(ascending=False, method='min').astype(int)
breakdown_fp_df_mediana.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
# Keep the five largest absolute contributions; the raw value is not shown.
breakdown_fp_df_mediana = breakdown_fp_df_mediana[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# --- Shapley ---
shap_fp_df_mediana = shap_fp_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): only the last 25 rows are kept; presumably the per-feature
# aggregated contributions -- confirm against the shap result layout.
shap_fp_df_mediana = shap_fp_df_mediana.tail(25)
shap_fp_df_mediana['sign'] = shap_fp_df_mediana['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_fp_df_mediana = shap_fp_df_mediana.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
shap_fp_df_mediana['Ranking'] = shap_fp_df_mediana['contribution'].abs().rank(ascending=False, method='min').astype(int)
shap_fp_df_mediana.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_fp_df_mediana = shap_fp_df_mediana[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# --- LIME ---
# The 'variable' column holds a condition on a feature, not a bare name. When
# the condition starts with a numeric bound (two-sided range) the first
# space-separated token is a number, so take the first identifier-like token
# instead and fall back to the first token if none is found.
nombre_lime = lime_fp_df_mediana["variable"].str.extract(r'\b([A-Za-z_]\w*)', expand=False)
lime_fp_df_mediana["Variable"] = nombre_lime.fillna(lime_fp_df_mediana["variable"].str.split(" ").str[0])
# evaluar_valor is defined elsewhere in the notebook; presumably it maps an
# effect to its sign label -- confirm.
lime_fp_df_mediana["Signo"] = lime_fp_df_mediana["effect"].apply(evaluar_valor)
lime_fp_df_mediana = lime_fp_df_mediana.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_fp_df_mediana = lime_fp_df_mediana.drop(columns=['variable'])
lime_fp_df_mediana['Ranking'] = lime_fp_df_mediana['effect'].abs().rank(ascending=False, method='min').astype(int)
lime_fp_df_mediana = lime_fp_df_mediana[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_fp_df_mediana)
print(shap_fp_df_mediana)
print(lime_fp_df_mediana)

In [None]:
#lime_fp_mediana.show_in_notebook()

In [None]:
# Build the comparison table for this instance: one row per feature that
# appears in the top-5 list of at least one technique, holding that technique's
# rank and sign, or "-" when the technique did not list the feature.
tablas_por_tecnica = {
    'Breakdown': breakdown_fp_df_mediana,
    'Shapley': shap_fp_df_mediana,
    'Lime': lime_fp_df_mediana,
}

breakdown_features = list(breakdown_fp_df_mediana['Variable'])
shapley_features = list(shap_fp_df_mediana['Variable'])
lime_features = list(lime_fp_df_mediana['Variable'])
# dict.fromkeys de-duplicates while keeping first-seen order; going through a
# set made the row/print order depend on string hashing (it could change
# between sessions).
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

# columns_multi is defined earlier in the notebook (not visible here);
# presumably a column MultiIndex with 'Variable' plus a (Ranking, Signo) pair
# per technique -- confirm.
df_final_mediana = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_mediana['Variable'] = all_features
df_final_mediana = df_final_mediana[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

for feature in df_final_mediana['Variable']:
    fila_destino = df_final_mediana['Variable'] == feature
    # Same lookup for every technique: copy rank and sign when listed, else "-".
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            ranking, signo = '-', '-'
        else:
            ranking = coincidencias.iloc[0]['Ranking']
            signo = coincidencias.iloc[0]['Signo']
        df_final_mediana.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_mediana.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_mediana

In [None]:
# Use the feature name as the row label.
df_final_mediana.set_index('Variable', inplace=True)
# Preferred display order, chosen by hand.
nuevo_orden = ["parallel_changed_file_num", "commit_num", "file_added", "developer_num", "line_added", "file_removed"]

# Apply the preferred order only to features that are actually present and
# append any feature missing from the list. A plain reindex(nuevo_orden) would
# silently drop unlisted features and insert all-NaN rows for absent ones.
presentes = [v for v in nuevo_orden if v in df_final_mediana.index]
no_listadas = [v for v in df_final_mediana.index if v not in nuevo_orden]
df_final_mediana = df_final_mediana.reindex(presentes + no_listadas)

df_final_mediana

### **Instancia FP MIN:**

In [None]:
# Explain the FP "min" instance with three local techniques through `exp`
# (presumably the dalex Explainer created earlier in the notebook -- confirm).
# Every call receives random_state=42.
breakdown_fp_min = exp.predict_parts(df_instancia_fp_min, type="break_down",random_state=42)
shap_fp_min = exp.predict_parts(df_instancia_fp_min, type="shap",random_state=42)
lime_fp_min = exp.predict_surrogate(df_instancia_fp_min, random_state=42)

# Keep each explanation's `.result` table for the post-processing cell.
breakdown_fp_df_min = breakdown_fp_min.result
shap_fp_df_min = shap_fp_min.result
lime_fp_df_min = lime_fp_min.result

In [None]:
# Plot the break_down explanation of the FP "min" instance.
breakdown_fp_min.plot()

In [None]:
# Plot the shap explanation of the FP "min" instance.
shap_fp_min.plot()

In [None]:
# Plot the surrogate (LIME) explanation of the FP "min" instance.
lime_fp_min.plot()

In [None]:
# Reduce each explanation table to its five strongest features with a rank
# (1 = largest absolute effect) and a sign label.

# --- Break Down ---
breakdown_fp_df_min = breakdown_fp_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): index labels 0 and 26 are hard-coded; presumably the intercept
# and prediction rows of a 25-feature break_down result -- confirm.
breakdown_fp_df_min = breakdown_fp_df_min.drop(index=[0, 26])
breakdown_fp_df_min['sign'] = breakdown_fp_df_min['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
breakdown_fp_df_min = breakdown_fp_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
# method='min' gives tied contributions the same integer rank; the default
# 'average' produces fractional ranks that astype(int) would truncate.
breakdown_fp_df_min['Ranking'] = breakdown_fp_df_min['contribution'].abs().rank(ascending=False, method='min').astype(int)
breakdown_fp_df_min.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
# Keep the five largest absolute contributions; the raw value is not shown.
breakdown_fp_df_min = breakdown_fp_df_min[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# --- Shapley ---
shap_fp_df_min = shap_fp_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): only the last 25 rows are kept; presumably the per-feature
# aggregated contributions -- confirm against the shap result layout.
shap_fp_df_min = shap_fp_df_min.tail(25)
shap_fp_df_min['sign'] = shap_fp_df_min['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_fp_df_min = shap_fp_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
shap_fp_df_min['Ranking'] = shap_fp_df_min['contribution'].abs().rank(ascending=False, method='min').astype(int)
shap_fp_df_min.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_fp_df_min = shap_fp_df_min[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# --- LIME ---
# The 'variable' column holds a condition on a feature, not a bare name. When
# the condition starts with a numeric bound (two-sided range) the first
# space-separated token is a number, so take the first identifier-like token
# instead and fall back to the first token if none is found.
nombre_lime = lime_fp_df_min["variable"].str.extract(r'\b([A-Za-z_]\w*)', expand=False)
lime_fp_df_min["Variable"] = nombre_lime.fillna(lime_fp_df_min["variable"].str.split(" ").str[0])
# evaluar_valor is defined elsewhere in the notebook; presumably it maps an
# effect to its sign label -- confirm.
lime_fp_df_min["Signo"] = lime_fp_df_min["effect"].apply(evaluar_valor)
lime_fp_df_min = lime_fp_df_min.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_fp_df_min = lime_fp_df_min.drop(columns=['variable'])
lime_fp_df_min['Ranking'] = lime_fp_df_min['effect'].abs().rank(ascending=False, method='min').astype(int)
lime_fp_df_min = lime_fp_df_min[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_fp_df_min)
print(shap_fp_df_min)
print(lime_fp_df_min)

In [None]:
#lime_fp_min.show_in_notebook()

In [None]:
# Build the comparison table for this instance: one row per feature that
# appears in the top-5 list of at least one technique, holding that technique's
# rank and sign, or "-" when the technique did not list the feature.
tablas_por_tecnica = {
    'Breakdown': breakdown_fp_df_min,
    'Shapley': shap_fp_df_min,
    'Lime': lime_fp_df_min,
}

breakdown_features = list(breakdown_fp_df_min['Variable'])
shapley_features = list(shap_fp_df_min['Variable'])
lime_features = list(lime_fp_df_min['Variable'])
# dict.fromkeys de-duplicates while keeping first-seen order; going through a
# set made the row/print order depend on string hashing (it could change
# between sessions).
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

# columns_multi is defined earlier in the notebook (not visible here);
# presumably a column MultiIndex with 'Variable' plus a (Ranking, Signo) pair
# per technique -- confirm.
df_final_min = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_min['Variable'] = all_features
df_final_min = df_final_min[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

for feature in df_final_min['Variable']:
    fila_destino = df_final_min['Variable'] == feature
    # Same lookup for every technique: copy rank and sign when listed, else "-".
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            ranking, signo = '-', '-'
        else:
            ranking = coincidencias.iloc[0]['Ranking']
            signo = coincidencias.iloc[0]['Signo']
        df_final_min.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_min.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_min

In [None]:
# Use the feature name as the row label.
df_final_min.set_index('Variable', inplace=True)
# Preferred display order, chosen by hand.
nuevo_orden = ["parallel_changed_file_num", "commit_num", "developer_num", "file_added", "line_added", "file_removed"]

# Apply the preferred order only to features that are actually present and
# append any feature missing from the list. A plain reindex(nuevo_orden) would
# silently drop unlisted features and insert all-NaN rows for absent ones.
presentes = [v for v in nuevo_orden if v in df_final_min.index]
no_listadas = [v for v in df_final_min.index if v not in nuevo_orden]
df_final_min = df_final_min.reindex(presentes + no_listadas)

df_final_min

### **FP General:**

In [None]:
# Collect, per technique, every (feature, rank) pair found in the three
# per-instance tables (df_final_max, df_final_mediana, df_final_min).
ranking_valores = {
    'Breakdown': [],
    'Shapley': [],
    'Lime': []
}

for tecnica in ranking_valores:
    for tabla_instancia in (df_final_max, df_final_mediana, df_final_min):
        for caracteristica, ranking in tabla_instancia[(tecnica, 'Ranking')].items():
            # "-" means the technique did not list the feature. NaN marks a row
            # that only exists because of a reindex; int(NaN) would raise.
            if ranking == "-" or pd.isna(ranking):
                continue
            ranking_valores[tecnica].append((caracteristica, int(ranking)))

# Group the ranks by technique and feature, keeping first-seen feature order.
rankings_agrupados = {tecnica: {} for tecnica in ranking_valores}
for tecnica, pares in ranking_valores.items():
    for caracteristica, ranking in pares:
        rankings_agrupados[tecnica].setdefault(caracteristica, []).append(ranking)

# Mean rank per technique and feature.
ranking_medio = {
    tecnica: {c: sum(valores) / len(valores) for c, valores in grupos.items()}
    for tecnica, grupos in rankings_agrupados.items()
}

# Number of instances in which each technique listed each feature.
apariciones_count = {
    tecnica: {c: len(valores) for c, valores in grupos.items()}
    for tecnica, grupos in rankings_agrupados.items()
}

# Raw pairs, appearance counts and mean ranks, per technique.
print(ranking_valores)
print(apariciones_count)
print(ranking_medio)

In [None]:
# Techniques, in the column order used by the summary table.
techniques = ['Breakdown', 'Shapley', 'Lime']

# Every feature ranked by at least one technique, in first-seen order
# (deterministic, unlike iterating a set of strings).
all_caract = list(dict.fromkeys(
    c for technique in techniques for c in ranking_medio[technique]
))

# One (technique, "Ranking Medio") and one (technique, "Conteo") column per
# technique; "-" marks features the technique never listed.
data_dict = {}
for technique in techniques:
    data_dict[(technique, "Ranking Medio")] = [ranking_medio[technique].get(c, "-") for c in all_caract]
    data_dict[(technique, "Conteo")] = [apariciones_count[technique].get(c, "-") for c in all_caract]

# Summary table indexed by feature name.
df_resumen_ada_fp = pd.DataFrame(data_dict, index=all_caract)

df_resumen_ada_fp

In [None]:
# Columns holding each technique's mean rank and appearance count.
cols_ranking = [(tech, "Ranking Medio") for tech in techniques]
cols_conteo = [(tech, "Conteo") for tech in techniques]

# "-" marks "not listed by this technique". Convert explicitly with
# pd.to_numeric (non-numeric -> NaN) instead of relying on replace() to
# downcast object columns, which pandas has deprecated.
ranking_numerico = df_resumen_ada_fp[cols_ranking].apply(pd.to_numeric, errors='coerce')
# Row-wise mean over the techniques that did rank the feature (NaN skipped).
df_resumen_ada_fp[("General", "Ranking")] = ranking_numerico.mean(axis=1)

# Total appearances; a missing technique contributes 0. Cast back to int so
# the count is not shown as a float.
conteo_numerico = df_resumen_ada_fp[cols_conteo].apply(pd.to_numeric, errors='coerce').fillna(0)
df_resumen_ada_fp[("General", "Conteo Total")] = conteo_numerico.sum(axis=1).astype(int)

# Show the updated summary table.
df_resumen_ada_fp

In [None]:
# Number of features (rows) in the summary.
num_caract = len(df_resumen_ada_fp)

# Rank-based weights, kept as standalone Series so no helper column has to be
# added to the frame and dropped again afterwards.
posicion_ranking = df_resumen_ada_fp[("General", "Ranking")].rank(ascending=True)
peso_rango = 1 - ((posicion_ranking - 1) / num_caract)
peso_conteo = df_resumen_ada_fp[("General", "Conteo Total")].rank(ascending=True) / num_caract

# Final score is the sum of both weights; the general ranking becomes the
# position of that score (highest score -> rank 1, ties share the lowest rank).
puntaje = peso_rango + peso_conteo
df_resumen_ada_fp[("General", "Ranking")] = puntaje.rank(ascending=False, method="min")

df_resumen_ada_fp

In [None]:
# Current columns of the summary table.
columns = df_resumen_ada_fp.columns

# Boolean mask of the columns under the top-level "General" group.
general_columns = columns.get_level_values(0) == "General"

# "General" sub-columns, in the order they should be displayed.
general_subcolumns = [("General", "Ranking"), ("General", "Conteo Total")]

# Every per-technique column, in its existing order.
technique_subcolumns = list(columns[~general_columns])

# "General" first, then the techniques.
new_columns = general_subcolumns + technique_subcolumns

# .copy() makes the reordered frame independent of the original: it is later
# sorted in place and assigned into, which on a column subset can raise
# pandas' SettingWithCopyWarning.
df_resumen_ada_fp = df_resumen_ada_fp[new_columns].copy()

In [None]:
# Best (lowest) general ranking first.
df_resumen_ada_fp.sort_values(by=("General", "Ranking"), ascending=True, inplace=True)


def _formatear_ranking_medio(valor):
    """Return int/float values as two-decimal strings; pass anything else through."""
    if isinstance(valor, (int, float)):
        return f"{valor:.2f}"
    return valor


# Format each technique's mean rank for display.
for tech in techniques:
    columna = (tech, "Ranking Medio")
    df_resumen_ada_fp[columna] = df_resumen_ada_fp[columna].map(_formatear_ranking_medio)

df_resumen_ada_fp

### **Instancia FN MAX:**

In [None]:
# Explain the FN "max" instance with three local techniques through `exp`
# (presumably the dalex Explainer created earlier in the notebook -- confirm).
# Every call receives random_state=42.
breakdown_fn_max = exp.predict_parts(df_instancia_fn_max, type="break_down",random_state=42)
shap_fn_max = exp.predict_parts(df_instancia_fn_max, type="shap",random_state=42)
lime_fn_max = exp.predict_surrogate(df_instancia_fn_max, random_state=42)

# Keep each explanation's `.result` table for the post-processing cell.
breakdown_fn_df_max = breakdown_fn_max.result
shap_fn_df_max = shap_fn_max.result
lime_fn_df_max=lime_fn_max.result

In [None]:
# Plot the break_down explanation of the FN "max" instance.
breakdown_fn_max.plot()

In [None]:
# Plot the shap explanation of the FN "max" instance.
shap_fn_max.plot()

In [None]:
# Plot the surrogate (LIME) explanation of the FN "max" instance.
lime_fn_max.plot()

In [None]:
# Reduce each explanation table to its five strongest features with a rank
# (1 = largest absolute effect) and a sign label.

# --- Break Down ---
breakdown_fn_df_max = breakdown_fn_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): index labels 0 and 26 are hard-coded; presumably the intercept
# and prediction rows of a 25-feature break_down result -- confirm.
breakdown_fn_df_max = breakdown_fn_df_max.drop(index=[0, 26])
breakdown_fn_df_max['sign'] = breakdown_fn_df_max['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
breakdown_fn_df_max = breakdown_fn_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
# method='min' gives tied contributions the same integer rank; the default
# 'average' produces fractional ranks that astype(int) would truncate.
breakdown_fn_df_max['Ranking'] = breakdown_fn_df_max['contribution'].abs().rank(ascending=False, method='min').astype(int)
breakdown_fn_df_max.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
# Keep the five largest absolute contributions; the raw value is not shown.
breakdown_fn_df_max = breakdown_fn_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# --- Shapley ---
shap_fn_df_max = shap_fn_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): only the last 25 rows are kept; presumably the per-feature
# aggregated contributions -- confirm against the shap result layout.
shap_fn_df_max = shap_fn_df_max.tail(25)
shap_fn_df_max['sign'] = shap_fn_df_max['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_fn_df_max = shap_fn_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)
shap_fn_df_max['Ranking'] = shap_fn_df_max['contribution'].abs().rank(ascending=False, method='min').astype(int)
shap_fn_df_max.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_fn_df_max = shap_fn_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

# --- LIME ---
# The 'variable' column holds a condition on a feature, not a bare name. When
# the condition starts with a numeric bound (two-sided range) the first
# space-separated token is a number, so take the first identifier-like token
# instead and fall back to the first token if none is found. This replaces the
# previous hand-written corrections of rows 3 and 4, which were tied to one
# particular output.
nombre_lime = lime_fn_df_max["variable"].str.extract(r'\b([A-Za-z_]\w*)', expand=False)
lime_fn_df_max["Variable"] = nombre_lime.fillna(lime_fn_df_max["variable"].str.split(" ").str[0])
# evaluar_valor is defined elsewhere in the notebook; presumably it maps an
# effect to its sign label -- confirm.
lime_fn_df_max["Signo"] = lime_fn_df_max["effect"].apply(evaluar_valor)
lime_fn_df_max = lime_fn_df_max.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_fn_df_max = lime_fn_df_max.drop(columns=['variable'])
lime_fn_df_max['Ranking'] = lime_fn_df_max['effect'].abs().rank(ascending=False, method='min').astype(int)
lime_fn_df_max = lime_fn_df_max[['Variable', 'Ranking', 'Signo']].head(5).reset_index(drop=True)

print(breakdown_fn_df_max)
print(shap_fn_df_max)
print(lime_fn_df_max)

In [None]:
#lime_fn_max.show_in_notebook()

In [None]:
# Collect the feature names reported in the top-5 table of each technique
breakdown_features = list(breakdown_fn_df_max['Variable'])
shapley_features = list(shap_fn_df_max['Variable'])
lime_features = list(lime_fn_df_max['Variable'])
# De-duplicate through a set (arbitrary order), then reverse the resulting list
all_features = list(set(breakdown_features + shapley_features + lime_features))
all_features.reverse()

# One row per distinct feature; the column layout comes from columns_multi
df_final_max = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_max['Variable'] = all_features
df_final_max = df_final_max[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for nombres in (breakdown_features, shapley_features, lime_features, all_features):
    print(nombres)

# Top-5 table of each technique, keyed by its column group in df_final_max
tablas_por_tecnica = {
    'Breakdown': breakdown_fn_df_max,
    'Shapley': shap_fn_df_max,
    'Lime': lime_fn_df_max,
}

for feature in df_final_max['Variable']:
    fila_destino = df_final_max['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # The technique did not rank this feature: fill with "-"
            ranking, signo = '-', '-'
        else:
            # Take "Ranking" and "Signo" from the first matching row
            primera = coincidencias.iloc[0]
            ranking, signo = primera['Ranking'], primera['Signo']
        df_final_max.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_max.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_max

In [None]:
# Hand-picked display order of the features for this instance.
# NOTE(review): reindex() drops any feature missing from this list and creates an
# all-NaN row for any listed name that is absent from the table -- keep it in sync.
nuevo_orden = ["parallel_changed_file_num", "commit_num", "developer_num", "use_frequency", "commit_density", "file_removed", "file_modified"]

# Use the feature name as the row label and apply the order above
df_final_max = df_final_max.set_index('Variable').reindex(nuevo_orden)

df_final_max

### **Instancia FN MEDIANA:**

In [None]:
# Local explanations for the FN "mediana" instance (df_instancia_fn_mediana).
# Each explanation object is kept for plotting; its `.result` table is kept for ranking.
breakdown_fn_mediana = exp.predict_parts(df_instancia_fn_mediana, type="break_down", random_state=42)
breakdown_fn_df_mediana = breakdown_fn_mediana.result

shap_fn_mediana = exp.predict_parts(df_instancia_fn_mediana, type="shap", random_state=42)
shap_fn_df_mediana = shap_fn_mediana.result

lime_fn_mediana = exp.predict_surrogate(df_instancia_fn_mediana, random_state=42)
lime_fn_df_mediana = lime_fn_mediana.result

In [None]:
breakdown_fn_mediana.plot()

In [None]:
shap_fn_mediana.plot()

In [None]:
lime_fn_mediana.plot()

In [None]:
# Text label for each value of the numeric 'sign' column
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}

# --- Break-down: rank by |contribution| and keep the five strongest variables ---
# Index labels 0 and 26 are removed before ranking
# (presumably the intercept and prediction rows -- TODO confirm).
breakdown_fn_df_mediana = (
    breakdown_fn_df_mediana[['variable_name', 'contribution', 'sign']]
    .drop(index=[0, 26])
    .assign(sign=lambda tabla: tabla['sign'].replace(etiquetas_signo))
    .sort_values(by='contribution', key=abs, ascending=False)
    .assign(Ranking=lambda tabla: tabla['contribution'].abs().rank(ascending=False).astype(int))
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# --- SHAP: same pipeline, applied to the last 25 rows of the result ---
# NOTE(review): tail(25) assumes the rows of interest are the final 25 of the SHAP
# table -- confirm they are the aggregated contributions and not a single sampled path.
shap_fn_df_mediana = (
    shap_fn_df_mediana[['variable_name', 'contribution', 'sign']]
    .tail(25)
    .assign(sign=lambda tabla: tabla['sign'].replace(etiquetas_signo))
    .sort_values(by='contribution', key=abs, ascending=False)
    .assign(Ranking=lambda tabla: tabla['contribution'].abs().rank(ascending=False).astype(int))
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# --- LIME ---
# The feature name is the first identifier-like token of the condition text.
# Taking the first space-separated token (the previous approach) is wrong when the
# condition starts with a numeric bound (presumably e.g. "0.00 < feature <= 1.00");
# that token is still used as a fallback when no identifier is found.
descripcion_lime = lime_fn_df_mediana["variable"]
lime_fn_df_mediana["Variable"] = (
    descripcion_lime.str.extract(r'\b([A-Za-z_]\w*)', expand=False)
    .fillna(descripcion_lime.str.split(" ").str[0])
)
lime_fn_df_mediana["Signo"] = lime_fn_df_mediana["effect"].apply(evaluar_valor)
# Rank over every LIME term by |effect|, then keep the five strongest
lime_fn_df_mediana = (
    lime_fn_df_mediana.sort_values(by='effect', key=abs, ascending=False)
    .drop(columns=['variable'])
    .assign(Ranking=lambda tabla: tabla['effect'].abs().rank(ascending=False).astype(int))
    .head(5)
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .reset_index(drop=True)
)

print(breakdown_fn_df_mediana)
print(shap_fn_df_mediana)
print(lime_fn_df_mediana)

In [None]:
#lime_fn_mediana.show_in_notebook()

In [None]:
# Collect the feature names reported in the top-5 table of each technique
breakdown_features = list(breakdown_fn_df_mediana['Variable'])
shapley_features = list(shap_fn_df_mediana['Variable'])
lime_features = list(lime_fn_df_mediana['Variable'])
# De-duplicate through a set (arbitrary order), then reverse the resulting list
all_features = list(set(breakdown_features + shapley_features + lime_features))
all_features.reverse()

# One row per distinct feature; the column layout comes from columns_multi
df_final_mediana = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_mediana['Variable'] = all_features
df_final_mediana = df_final_mediana[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for nombres in (breakdown_features, shapley_features, lime_features, all_features):
    print(nombres)

# Top-5 table of each technique, keyed by its column group in df_final_mediana
tablas_por_tecnica = {
    'Breakdown': breakdown_fn_df_mediana,
    'Shapley': shap_fn_df_mediana,
    'Lime': lime_fn_df_mediana,
}

for feature in df_final_mediana['Variable']:
    fila_destino = df_final_mediana['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # The technique did not rank this feature: fill with "-"
            ranking, signo = '-', '-'
        else:
            # Take "Ranking" and "Signo" from the first matching row
            primera = coincidencias.iloc[0]
            ranking, signo = primera['Ranking'], primera['Signo']
        df_final_mediana.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_mediana.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_mediana

In [None]:
# Hand-picked display order of the features for this instance.
# NOTE(review): reindex() drops any feature missing from this list and creates an
# all-NaN row for any listed name that is absent from the table -- keep it in sync.
nuevo_orden = ["parallel_changed_file_num", "developer_num", "commit_num", "line_added", "file_modified", "file_removed", "file_added"]

# Use the feature name as the row label and apply the order above
df_final_mediana = df_final_mediana.set_index('Variable').reindex(nuevo_orden)

df_final_mediana

### **Instancia FN MIN:**

In [None]:
# Local explanations for the FN "min" instance (df_instancia_fn_min).
# Each explanation object is kept for plotting; its `.result` table is kept for ranking.
breakdown_fn_min = exp.predict_parts(df_instancia_fn_min, type="break_down", random_state=42)
breakdown_fn_df_min = breakdown_fn_min.result

shap_fn_min = exp.predict_parts(df_instancia_fn_min, type="shap", random_state=42)
shap_fn_df_min = shap_fn_min.result

lime_fn_min = exp.predict_surrogate(df_instancia_fn_min, random_state=42)
lime_fn_df_min = lime_fn_min.result

In [None]:
breakdown_fn_min.plot()

In [None]:
shap_fn_min.plot()

In [None]:
lime_fn_min.plot()

In [None]:
# Text label for each value of the numeric 'sign' column
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}

# --- Break-down: rank by |contribution| and keep the five strongest variables ---
# Index labels 0 and 26 are removed before ranking
# (presumably the intercept and prediction rows -- TODO confirm).
breakdown_fn_df_min = (
    breakdown_fn_df_min[['variable_name', 'contribution', 'sign']]
    .drop(index=[0, 26])
    .assign(sign=lambda tabla: tabla['sign'].replace(etiquetas_signo))
    .sort_values(by='contribution', key=abs, ascending=False)
    .assign(Ranking=lambda tabla: tabla['contribution'].abs().rank(ascending=False).astype(int))
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# --- SHAP: same pipeline, applied to the last 25 rows of the result ---
# NOTE(review): tail(25) assumes the rows of interest are the final 25 of the SHAP
# table -- confirm they are the aggregated contributions and not a single sampled path.
shap_fn_df_min = (
    shap_fn_df_min[['variable_name', 'contribution', 'sign']]
    .tail(25)
    .assign(sign=lambda tabla: tabla['sign'].replace(etiquetas_signo))
    .sort_values(by='contribution', key=abs, ascending=False)
    .assign(Ranking=lambda tabla: tabla['contribution'].abs().rank(ascending=False).astype(int))
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# --- LIME ---
# The feature name is the first identifier-like token of the condition text.
# Taking the first space-separated token (the previous approach) is wrong when the
# condition starts with a numeric bound (presumably e.g. "0.00 < feature <= 1.00");
# that token is still used as a fallback when no identifier is found.
descripcion_lime = lime_fn_df_min["variable"]
lime_fn_df_min["Variable"] = (
    descripcion_lime.str.extract(r'\b([A-Za-z_]\w*)', expand=False)
    .fillna(descripcion_lime.str.split(" ").str[0])
)
lime_fn_df_min["Signo"] = lime_fn_df_min["effect"].apply(evaluar_valor)
# Rank over every LIME term by |effect|, then keep the five strongest
lime_fn_df_min = (
    lime_fn_df_min.sort_values(by='effect', key=abs, ascending=False)
    .drop(columns=['variable'])
    .assign(Ranking=lambda tabla: tabla['effect'].abs().rank(ascending=False).astype(int))
    .head(5)
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .reset_index(drop=True)
)

print(breakdown_fn_df_min)
print(shap_fn_df_min)
print(lime_fn_df_min)

In [None]:
#lime_fn_min.show_in_notebook()

In [None]:
# Collect the feature names reported in the top-5 table of each technique
breakdown_features = list(breakdown_fn_df_min['Variable'])
shapley_features = list(shap_fn_df_min['Variable'])
lime_features = list(lime_fn_df_min['Variable'])
# De-duplicate through a set (arbitrary order), then reverse the resulting list
all_features = list(set(breakdown_features + shapley_features + lime_features))
all_features.reverse()

# One row per distinct feature; the column layout comes from columns_multi
df_final_min = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_min['Variable'] = all_features
df_final_min = df_final_min[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for nombres in (breakdown_features, shapley_features, lime_features, all_features):
    print(nombres)

# Top-5 table of each technique, keyed by its column group in df_final_min
tablas_por_tecnica = {
    'Breakdown': breakdown_fn_df_min,
    'Shapley': shap_fn_df_min,
    'Lime': lime_fn_df_min,
}

for feature in df_final_min['Variable']:
    fila_destino = df_final_min['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # The technique did not rank this feature: fill with "-"
            ranking, signo = '-', '-'
        else:
            # Take "Ranking" and "Signo" from the first matching row
            primera = coincidencias.iloc[0]
            ranking, signo = primera['Ranking'], primera['Signo']
        df_final_min.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_min.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_min

In [None]:
# Hand-picked display order of the features for this instance.
# NOTE(review): reindex() drops any feature missing from this list and creates an
# all-NaN row for any listed name that is absent from the table -- keep it in sync.
nuevo_orden = ["parallel_changed_file_num", "developer_num", "duration", "commit_num", "line_removed", "file_removed", "file_added", "commit_density"]

# Use the feature name as the row label and apply the order above
df_final_min = df_final_min.set_index('Variable').reindex(nuevo_orden)

df_final_min

### **FN General:**

In [None]:
# For every technique, gather (feature, ranking) pairs from the three per-instance
# tables (max, mediana, min); cells holding "-" are skipped.
ranking_valores = {nombre: [] for nombre in ('Breakdown', 'Shapley', 'Lime')}

for tecnica, pares in ranking_valores.items():
    for tabla in (df_final_max, df_final_mediana, df_final_min):
        for caracteristica, ranking in tabla[(tecnica, 'Ranking')].items():
            if ranking != "-":
                pares.append((caracteristica, int(ranking)))

# Per technique: mean ranking and number of appearances of each feature.
# Both dictionaries keep features in order of first appearance.
ranking_medio = {}
apariciones_count = {}

for tecnica, pares in ranking_valores.items():
    rankings_por_caracteristica = {}
    for caracteristica, ranking in pares:
        rankings_por_caracteristica.setdefault(caracteristica, []).append(ranking)

    ranking_medio[tecnica] = {
        caracteristica: sum(valores) / len(valores)
        for caracteristica, valores in rankings_por_caracteristica.items()
    }
    apariciones_count[tecnica] = {
        caracteristica: len(valores)
        for caracteristica, valores in rankings_por_caracteristica.items()
    }

# Raw pairs, appearance counts and mean rankings, one dictionary each
print(ranking_valores)
print(apariciones_count)
print(ranking_medio)

In [None]:
# Techniques, in the order their column groups appear in the summary
techniques = ['Breakdown', 'Shapley', 'Lime']

# Column data keyed by (technique, sub-column)
data_dict = {}

# Every feature ranked by at least one technique, in first-seen order.
# dict.fromkeys() de-duplicates while keeping insertion order, so the row order is
# the same on every run (a set of strings iterates in a session-dependent order).
all_caract = list(dict.fromkeys(
    caracteristica
    for technique in techniques
    for caracteristica in ranking_medio[technique]
))

# Fill mean ranking and appearance count per technique; "-" marks a feature
# the technique never ranked
for technique in techniques:
    rank = [ranking_medio[technique].get(c, "-") for c in all_caract]
    apar = [apariciones_count[technique].get(c, "-") for c in all_caract]

    data_dict[(technique, "Ranking Medio")] = rank
    data_dict[(technique, "Conteo")] = apar

# Summary table: one row per feature
df_resumen_ada_fn = pd.DataFrame(data_dict, index=all_caract)

df_resumen_ada_fn

In [None]:
columnas_ranking = [(tech, "Ranking Medio") for tech in techniques]
columnas_conteo = [(tech, "Conteo") for tech in techniques]

# Overall mean ranking: "-" becomes NaN so it is left out of the row mean
rankings_sin_guion = df_resumen_ada_fn[columnas_ranking].replace('-', np.nan)
df_resumen_ada_fn[("General", "Ranking")] = rankings_sin_guion.mean(axis=1)

# Total number of appearances: "-" counts as zero
conteos_sin_guion = df_resumen_ada_fn[columnas_conteo].replace('-', 0)
df_resumen_ada_fn[("General", "Conteo Total")] = conteos_sin_guion.sum(axis=1)

# Show the updated summary
df_resumen_ada_fn

In [None]:
# Number of features (rows) in the summary
num_caract = len(df_resumen_ada_fn)

# Weight from the mean ranking (lower ranking -> larger weight) and weight from the
# appearance count (more appearances -> larger weight), both scaled by num_caract.
# They are kept as standalone Series, so no temporary columns are added to the table.
peso_rango = 1 - ((df_resumen_ada_fn[("General", "Ranking")].rank(ascending=True) - 1) / num_caract)
peso_conteo = df_resumen_ada_fn[("General", "Conteo Total")].rank(ascending=True) / num_caract

# Final score is the sum of both weights; the overall ranking is replaced by the
# position of that score (highest score -> 1, ties share the lowest position)
puntaje = peso_rango + peso_conteo
df_resumen_ada_fn[("General", "Ranking")] = puntaje.rank(ascending=False, method="min")

df_resumen_ada_fn

In [None]:
# Current (technique, sub-column) pairs of the summary
columns = df_resumen_ada_fn.columns

# "General" sub-columns, in the order they should be shown
general_subcolumns = [("General", "Ranking"), ("General", "Conteo Total")]

# Every other column keeps its current relative order
technique_subcolumns = [columna for columna in columns if columna[0] != "General"]

# "General" first, then the per-technique columns
new_columns = general_subcolumns + technique_subcolumns
df_resumen_ada_fn = df_resumen_ada_fn[new_columns]

In [None]:
# Order the summary by overall ranking, lowest value first
df_resumen_ada_fn = df_resumen_ada_fn.sort_values(by=("General", "Ranking"), ascending=True)


def _formatear_ranking_medio(valor):
    """Return int/float values as text with two decimals; return anything else unchanged."""
    if isinstance(valor, (int, float)):
        return f"{valor:.2f}"
    return valor


# Format the mean-ranking column of every technique
for tech in techniques:
    columna = (tech, "Ranking Medio")
    df_resumen_ada_fn[columna] = df_resumen_ada_fn[columna].apply(_formatear_ranking_medio)

df_resumen_ada_fn

## **RUSBoost:**

**FEATURE IMPORTANCE**

In [None]:
# Permutation importance of every feature for modelo_rus on (x_test, y_test),
# scored with F1 over 20 repeats
permu = permutation_importance(modelo_rus, x_test, y_test, n_repeats=20, random_state=42, n_jobs=2, scoring='f1')

# Only features whose mean importance exceeds this threshold are plotted
importance_threshold = 0.01

# Boolean mask of the significant features, applied to both means and deviations
significant_indices = permu.importances_mean > importance_threshold
permu_importances = pd.Series(permu.importances_mean.round(3), index=feature_names)[significant_indices]
permu_std = permu.importances_std[significant_indices]

# Bar chart with the standard deviation as error bars
fig, ax = plt.subplots()
permu_importances.plot.bar(yerr=permu_std, ax=ax)
ax.set_title("Feature importances using permutation on full model")
# The importances are computed with scoring='f1', so the axis shows a drop in F1
# (the label previously said "accuracy")
ax.set_ylabel("Mean F1 decrease")
fig.tight_layout()
plt.show()

In [None]:
scoring = ['precision', 'recall', 'f1']
metric_names = ['Precision', 'Recall', 'F1-score']

# Only features whose mean importance exceeds this threshold are plotted
importance_threshold = 0.01

permu_score = permutation_importance(modelo_rus, x_test, y_test, n_repeats=20, random_state=42, scoring=scoring)
fig, axs = plt.subplots(1, 3, figsize=(15, 5))

# One horizontal bar chart per metric
for eje, metric, nombre_metrica in zip(axs, scoring, metric_names):
    permu = permu_score[metric]

    # Positions of the features above the threshold
    significant_indices = [j for j, media in enumerate(permu.importances_mean) if media > importance_threshold]
    nombres_significativos = [feature_names[j] for j in significant_indices]
    importances_mean = permu.importances_mean[significant_indices]
    importances_std = permu.importances_std[significant_indices]

    # Sort ascending by mean importance; bars are drawn at positions 0..k-1 in this order
    sorted_indices = np.argsort(importances_mean)
    sorted_feature_names = [nombres_significativos[j] for j in sorted_indices]
    importances_mean = importances_mean[sorted_indices]
    importances_std = importances_std[sorted_indices]

    # Draw the bars with the standard deviation as error bars
    posiciones = range(len(sorted_feature_names))
    eje.barh(posiciones, importances_mean, xerr=importances_std, align='center')
    eje.set_yticks(posiciones)
    eje.set_yticklabels(sorted_feature_names)
    eje.set_xlabel('Valor Importancia')
    eje.set_title(f'Importancia por Permutación para {nombre_metrica}')

# Adjust the spacing between subplots and show the figure
plt.tight_layout()
plt.show()


In [None]:
scoring = ['precision', 'recall', 'f1']
metric_names = ['Precision', 'Recall', 'F1-score']

# Only features whose mean importance exceeds this threshold are reported
importance_threshold = 0.01

# One result table per metric, stored under the key 'df_global_<metric name>'
results_global_rus = {}

permu_score = permutation_importance(modelo_rus, x_test, y_test, n_repeats=20, random_state=42, scoring=scoring)
for metric, nombre_metrica in zip(scoring, metric_names):
    permu = permu_score[metric]

    # Positions of the features above the threshold
    significant_indices = [j for j, media in enumerate(permu.importances_mean) if media > importance_threshold]
    sorted_feature_names = [feature_names[j] for j in significant_indices]
    importances_mean = permu.importances_mean[significant_indices]
    importances_std = permu.importances_std[significant_indices]

    # Table of the significant features, most important first
    df_exp_global = pd.DataFrame(
        {
            'Feature': sorted_feature_names,
            'Importance_Mean': importances_mean,
            'Importance_Std': importances_std,
        }
    ).sort_values(by='Importance_Mean', ascending=False)

    results_global_rus[f'df_global_{nombre_metrica}'] = df_exp_global

In [None]:
results_global_rus['df_global_Precision']

In [None]:
results_global_rus['df_global_Recall']

In [None]:
results_global_rus['df_global_F1-score']

**BREAK-DOWN, SHAP Y LIME:**

In [None]:
# First build the dalex explainer for modelo_rus from x_train / y_train;
# the cells below call `exp` for every local explanation.
exp = dx.Explainer(modelo_rus, x_train, y_train)

### **Instancia VP MAX:**

In [None]:
# Local explanations for the VP "max" instance (df_instancia_vp_max).
# Each explanation object is kept for plotting; its `.result` table is kept for ranking.
breakdown_vp_max = exp.predict_parts(df_instancia_vp_max, type="break_down", random_state=42)
breakdown_vp_df_max = breakdown_vp_max.result

shap_vp_max = exp.predict_parts(df_instancia_vp_max, type="shap", random_state=42)
shap_vp_df_max = shap_vp_max.result

lime_vp_max = exp.predict_surrogate(df_instancia_vp_max, random_state=42)
lime_vp_df_max = lime_vp_max.result

In [None]:
breakdown_vp_max.plot()

In [None]:
shap_vp_max.plot()

In [None]:
lime_vp_max.plot()

In [None]:
# Text label for each value of the numeric 'sign' column
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}

# --- Break-down: rank by |contribution| and keep the five strongest variables ---
# Index labels 0 and 26 are removed before ranking
# (presumably the intercept and prediction rows -- TODO confirm).
breakdown_vp_df_max = (
    breakdown_vp_df_max[['variable_name', 'contribution', 'sign']]
    .drop(index=[0, 26])
    .assign(sign=lambda tabla: tabla['sign'].replace(etiquetas_signo))
    .sort_values(by='contribution', key=abs, ascending=False)
    .assign(Ranking=lambda tabla: tabla['contribution'].abs().rank(ascending=False).astype(int))
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# --- SHAP: same pipeline, applied to the last 25 rows of the result ---
# NOTE(review): tail(25) assumes the rows of interest are the final 25 of the SHAP
# table -- confirm they are the aggregated contributions and not a single sampled path.
shap_vp_df_max = (
    shap_vp_df_max[['variable_name', 'contribution', 'sign']]
    .tail(25)
    .assign(sign=lambda tabla: tabla['sign'].replace(etiquetas_signo))
    .sort_values(by='contribution', key=abs, ascending=False)
    .assign(Ranking=lambda tabla: tabla['contribution'].abs().rank(ascending=False).astype(int))
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# --- LIME ---
# The feature name is the first identifier-like token of the condition text.
# Taking the first space-separated token (the previous approach) is wrong when the
# condition starts with a numeric bound (presumably e.g. "0.00 < feature <= 1.00");
# that token is still used as a fallback when no identifier is found.
descripcion_lime = lime_vp_df_max["variable"]
lime_vp_df_max["Variable"] = (
    descripcion_lime.str.extract(r'\b([A-Za-z_]\w*)', expand=False)
    .fillna(descripcion_lime.str.split(" ").str[0])
)
lime_vp_df_max["Signo"] = lime_vp_df_max["effect"].apply(evaluar_valor)
# Rank over every LIME term by |effect|, then keep the five strongest
lime_vp_df_max = (
    lime_vp_df_max.sort_values(by='effect', key=abs, ascending=False)
    .drop(columns=['variable'])
    .assign(Ranking=lambda tabla: tabla['effect'].abs().rank(ascending=False).astype(int))
    .head(5)
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .reset_index(drop=True)
)
# Manual override of the fifth LIME row, kept from the original analysis
lime_vp_df_max.at[4, 'Variable'] = 'messages_min'

print(breakdown_vp_df_max)
print(shap_vp_df_max)
print(lime_vp_df_max)

In [None]:
#lime_vp_max.show_in_notebook()

In [None]:
# Collect the feature names reported in the top-5 table of each technique
breakdown_features = list(breakdown_vp_df_max['Variable'])
shapley_features = list(shap_vp_df_max['Variable'])
lime_features = list(lime_vp_df_max['Variable'])
# De-duplicate through a set (arbitrary order), then reverse the resulting list
all_features = list(set(breakdown_features + shapley_features + lime_features))
all_features.reverse()

# One row per distinct feature; the column layout comes from columns_multi
df_final_max = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_max['Variable'] = all_features
df_final_max = df_final_max[['Variable', 'Breakdown', 'Shapley', 'Lime']]

for nombres in (breakdown_features, shapley_features, lime_features, all_features):
    print(nombres)

# Top-5 table of each technique, keyed by its column group in df_final_max
tablas_por_tecnica = {
    'Breakdown': breakdown_vp_df_max,
    'Shapley': shap_vp_df_max,
    'Lime': lime_vp_df_max,
}

for feature in df_final_max['Variable']:
    fila_destino = df_final_max['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # The technique did not rank this feature: fill with "-"
            ranking, signo = '-', '-'
        else:
            # Take "Ranking" and "Signo" from the first matching row
            primera = coincidencias.iloc[0]
            ranking, signo = primera['Ranking'], primera['Signo']
        df_final_max.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_max.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_max

In [None]:
# Hand-picked display order of the features for this instance.
# NOTE(review): reindex() drops any feature missing from this list and creates an
# all-NaN row for any listed name that is absent from the table -- keep it in sync.
nuevo_orden = ["parallel_changed_file_num", "line_added", "commit_num", "developer_num", "messages_min"]

# Use the feature name as the row label and apply the order above
df_final_max = df_final_max.set_index('Variable').reindex(nuevo_orden)

df_final_max

### **Instancia VP MEDIANA:**

In [None]:
# Local explanations for the VP "mediana" instance (df_instancia_vp_mediana).
# Each explanation object is kept for plotting; its `.result` table is kept for ranking.
breakdown_vp_mediana = exp.predict_parts(df_instancia_vp_mediana, type="break_down", random_state=42)
breakdown_vp_df_mediana = breakdown_vp_mediana.result

shap_vp_mediana = exp.predict_parts(df_instancia_vp_mediana, type="shap", random_state=42)
shap_vp_df_mediana = shap_vp_mediana.result

lime_vp_mediana = exp.predict_surrogate(df_instancia_vp_mediana, random_state=42)
lime_vp_df_mediana = lime_vp_mediana.result

In [None]:
breakdown_vp_mediana.plot()

In [None]:
shap_vp_mediana.plot()

In [None]:
lime_vp_mediana.plot()

In [None]:
# Text label for each value of the numeric 'sign' column
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}

# --- Break-down: rank by |contribution| and keep the five strongest variables ---
# Index labels 0 and 26 are removed before ranking
# (presumably the intercept and prediction rows -- TODO confirm).
breakdown_vp_df_mediana = (
    breakdown_vp_df_mediana[['variable_name', 'contribution', 'sign']]
    .drop(index=[0, 26])
    .assign(sign=lambda tabla: tabla['sign'].replace(etiquetas_signo))
    .sort_values(by='contribution', key=abs, ascending=False)
    .assign(Ranking=lambda tabla: tabla['contribution'].abs().rank(ascending=False).astype(int))
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# --- SHAP: same pipeline, applied to the last 25 rows of the result ---
# NOTE(review): tail(25) assumes the rows of interest are the final 25 of the SHAP
# table -- confirm they are the aggregated contributions and not a single sampled path.
shap_vp_df_mediana = (
    shap_vp_df_mediana[['variable_name', 'contribution', 'sign']]
    .tail(25)
    .assign(sign=lambda tabla: tabla['sign'].replace(etiquetas_signo))
    .sort_values(by='contribution', key=abs, ascending=False)
    .assign(Ranking=lambda tabla: tabla['contribution'].abs().rank(ascending=False).astype(int))
    .rename(columns={'sign': 'Signo', 'variable_name': 'Variable'})
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .head(5)
    .reset_index(drop=True)
)

# --- LIME ---
# The feature name is the first identifier-like token of the condition text.
# Taking the first space-separated token (the previous approach) is wrong when the
# condition starts with a numeric bound (presumably e.g. "0.00 < feature <= 1.00");
# that token is still used as a fallback when no identifier is found.
descripcion_lime = lime_vp_df_mediana["variable"]
lime_vp_df_mediana["Variable"] = (
    descripcion_lime.str.extract(r'\b([A-Za-z_]\w*)', expand=False)
    .fillna(descripcion_lime.str.split(" ").str[0])
)
lime_vp_df_mediana["Signo"] = lime_vp_df_mediana["effect"].apply(evaluar_valor)
# Rank over every LIME term by |effect|, then keep the five strongest
lime_vp_df_mediana = (
    lime_vp_df_mediana.sort_values(by='effect', key=abs, ascending=False)
    .drop(columns=['variable'])
    .assign(Ranking=lambda tabla: tabla['effect'].abs().rank(ascending=False).astype(int))
    .head(5)
    .loc[:, ['Variable', 'Ranking', 'Signo']]
    .reset_index(drop=True)
)

print(breakdown_vp_df_mediana)
print(shap_vp_df_mediana)
print(lime_vp_df_mediana)

In [None]:
#lime_vp_mediana.show_in_notebook()

In [None]:
# Build the comparison table for the VP-median instance: one row per feature
# that appears in the top list of at least one technique, holding the rank and
# sign each technique assigned to it ("-" when the technique did not list it).
breakdown_features = list(breakdown_vp_df_mediana['Variable'])
shapley_features = list(shap_vp_df_mediana['Variable'])
lime_features = list(lime_vp_df_mediana['Variable'])
# De-duplicate while keeping first-seen order. Iterating a set of strings can
# yield a different order in every kernel session (hash randomisation), which
# made the printed list and the intermediate table non-reproducible.
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

# columns_multi is defined earlier in the notebook - presumably the
# (technique, Ranking/Signo) column MultiIndex that is filled in below.
df_final_mediana = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_mediana['Variable'] = all_features
df_final_mediana = df_final_mediana[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# The same lookup is applied to every technique, so drive it from one mapping
# instead of three copy-pasted branches.
tablas_por_tecnica = {
    'Breakdown': breakdown_vp_df_mediana,
    'Shapley': shap_vp_df_mediana,
    'Lime': lime_vp_df_mediana,
}

for feature in all_features:
    fila_destino = df_final_mediana['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # The technique did not rank this feature among its top entries.
            ranking, signo = '-', '-'
        else:
            # Only the first matching row is used.
            ranking = coincidencias.iloc[0]['Ranking']
            signo = coincidencias.iloc[0]['Signo']
        df_final_mediana.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_mediana.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_mediana

In [None]:
# Fixed presentation order for the feature rows of the VP-median table.
nuevo_orden = ["parallel_changed_file_num", "line_added", "commit_num", "developer_num", "messages_min", "delete_frequency"]

# Index the table by feature name, then arrange the rows in that order.
df_final_mediana = df_final_mediana.set_index('Variable').reindex(nuevo_orden)

df_final_mediana

### **Instancia VP MIN:**

In [None]:
# Explain the VP-min instance with the three local techniques; every call
# pins random_state=42. The `.result` frame of each explanation is kept
# next to the explanation object for the post-processing cell below.
breakdown_vp_min = exp.predict_parts(
    df_instancia_vp_min, type="break_down", random_state=42
)
breakdown_vp_df_min = breakdown_vp_min.result

shap_vp_min = exp.predict_parts(
    df_instancia_vp_min, type="shap", random_state=42
)
shap_vp_df_min = shap_vp_min.result

lime_vp_min = exp.predict_surrogate(df_instancia_vp_min, random_state=42)
lime_vp_df_min = lime_vp_min.result

In [None]:
# Plot the break-down explanation of the VP-min instance.
breakdown_vp_min.plot()

In [None]:
# Plot the Shapley explanation of the VP-min instance.
shap_vp_min.plot()

In [None]:
# Plot the surrogate (LIME) explanation of the VP-min instance.
lime_vp_min.plot()

In [None]:
# Reduce the three explanations of the VP-min instance to comparable
# "Variable / Ranking / Signo" tables holding the 5 strongest contributions.

# Break-down: keep name, contribution and sign of each row.
breakdown_vp_df_min = breakdown_vp_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): rows 0 and 26 are presumably the intercept and final-prediction
# rows of a 25-feature explanation - confirm if the feature count changes.
breakdown_vp_df_min = breakdown_vp_df_min.drop(index=[0, 26])
breakdown_vp_df_min['sign'] = breakdown_vp_df_min['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
breakdown_vp_df_min = breakdown_vp_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# Shapley: same treatment.
shap_vp_df_min = shap_vp_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): tail(25) presumably keeps the aggregated per-feature rows - confirm.
shap_vp_df_min = shap_vp_df_min.tail(25)
shap_vp_df_min['sign'] = shap_vp_df_min['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_vp_df_min = shap_vp_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# LIME: the "variable" text is a condition such as "commit_num > 5.00" or
# "0.00 < line_added <= 3.00". Taking the first space-separated token returns
# a number for the two-sided form, which previously had to be patched by hand
# at fixed row positions. Extract the first identifier-like token instead and
# fall back to the old first-token rule when nothing matches.
lime_vp_df_min["Variable"] = (
    lime_vp_df_min["variable"]
    .str.extract(r"\b([A-Za-z_]\w*)", expand=False)
    .fillna(lime_vp_df_min["variable"].str.split(" ").str[0])
)
lime_vp_df_min["Signo"] = lime_vp_df_min["effect"].apply(evaluar_valor)
lime_vp_df_min = lime_vp_df_min.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_vp_df_min = lime_vp_df_min.drop(columns=['variable'])

# Rank by absolute contribution (1 = strongest). rank() uses its default
# tie handling (average rank), which astype(int) then truncates.
breakdown_vp_df_min['Ranking'] = breakdown_vp_df_min['contribution'].abs().rank(ascending=False).astype(int)
breakdown_vp_df_min.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
breakdown_vp_df_min = breakdown_vp_df_min[['Variable', 'Ranking', 'contribution', 'Signo']]

shap_vp_df_min['Ranking'] = shap_vp_df_min['contribution'].abs().rank(ascending=False).astype(int)
shap_vp_df_min.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_vp_df_min = shap_vp_df_min[['Variable', 'Ranking', 'contribution', 'Signo']]

lime_vp_df_min['Ranking'] = lime_vp_df_min['effect'].abs().rank(ascending=False).astype(int)
lime_vp_df_min = lime_vp_df_min.head(5)
lime_vp_df_min = lime_vp_df_min[['Variable', 'Ranking', 'effect', 'Signo']]

# Drop the raw contribution/effect columns; only rank and sign are compared.
breakdown_vp_df_min = breakdown_vp_df_min.drop(columns=['contribution'])
shap_vp_df_min = shap_vp_df_min.drop(columns=['contribution'])
lime_vp_df_min = lime_vp_df_min.drop(columns=['effect'])

# Keep the 5 largest contributions in absolute value (tables are already sorted).
breakdown_vp_df_min = breakdown_vp_df_min.head(5)
breakdown_vp_df_min = breakdown_vp_df_min.reset_index(drop=True)

shap_vp_df_min = shap_vp_df_min.head(5)
shap_vp_df_min = shap_vp_df_min.reset_index(drop=True)

lime_vp_df_min = lime_vp_df_min.reset_index(drop=True)

print(breakdown_vp_df_min)
print(shap_vp_df_min)
print(lime_vp_df_min)

In [None]:
#lime_vp_min.show_in_notebook()

In [None]:
# Build the comparison table for the VP-min instance: one row per feature
# that appears in the top list of at least one technique, holding the rank and
# sign each technique assigned to it ("-" when the technique did not list it).
breakdown_features = list(breakdown_vp_df_min['Variable'])
shapley_features = list(shap_vp_df_min['Variable'])
lime_features = list(lime_vp_df_min['Variable'])
# De-duplicate while keeping first-seen order. Iterating a set of strings can
# yield a different order in every kernel session (hash randomisation), which
# made the printed list and the intermediate table non-reproducible.
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

# columns_multi is defined earlier in the notebook - presumably the
# (technique, Ranking/Signo) column MultiIndex that is filled in below.
df_final_min = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_min['Variable'] = all_features
df_final_min = df_final_min[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# The same lookup is applied to every technique, so drive it from one mapping
# instead of three copy-pasted branches.
tablas_por_tecnica = {
    'Breakdown': breakdown_vp_df_min,
    'Shapley': shap_vp_df_min,
    'Lime': lime_vp_df_min,
}

for feature in all_features:
    fila_destino = df_final_min['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # The technique did not rank this feature among its top entries.
            ranking, signo = '-', '-'
        else:
            # Only the first matching row is used.
            ranking = coincidencias.iloc[0]['Ranking']
            signo = coincidencias.iloc[0]['Signo']
        df_final_min.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_min.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_min

In [None]:
# Fixed presentation order for the feature rows of the VP-min table.
nuevo_orden = ["parallel_changed_file_num", "commit_num", "developer_num", "line_added", "messages_min", "delete_frequency"]

# Index the table by feature name, then arrange the rows in that order.
df_final_min = df_final_min.set_index('Variable').reindex(nuevo_orden)

df_final_min

### **VP General:**

In [None]:
# Aggregate the per-instance comparison tables (max, median, min) into, for
# each technique: the list of (feature, rank) pairs, the mean rank per feature
# and the number of times each feature was ranked.
tablas_instancias = (df_final_max, df_final_mediana, df_final_min)

ranking_valores = {
    'Breakdown': [],
    'Shapley': [],
    'Lime': []
}

# Walk the tables in max -> median -> min order and collect the ranked entries.
for tecnica in ranking_valores:
    for tabla in tablas_instancias:
        for caracteristica in tabla.index:
            ranking = tabla[(tecnica, 'Ranking')][caracteristica]
            # "-" marks a feature the technique did not rank. NaN appears when
            # reindex() added a row for a feature that was absent from the
            # table; int(NaN) would raise ValueError, so skip those as well.
            if pd.isna(ranking) or ranking == "-":
                continue
            ranking_valores[tecnica].append((caracteristica, int(ranking)))

# Mean rank and number of appearances per technique and feature.
ranking_medio = {tecnica: {} for tecnica in ranking_valores}
apariciones_count = {tecnica: {} for tecnica in ranking_valores}

for tecnica, pares in ranking_valores.items():
    # Group the ranks by feature, preserving first-appearance order.
    rankings_por_caracteristica = {}
    for caracteristica, ranking in pares:
        rankings_por_caracteristica.setdefault(caracteristica, []).append(ranking)

    for caracteristica, rankings in rankings_por_caracteristica.items():
        ranking_medio[tecnica][caracteristica] = sum(rankings) / len(rankings)
        apariciones_count[tecnica][caracteristica] = len(rankings)

# Raw pairs, appearance counts and mean ranks per technique.
print(ranking_valores)
print(apariciones_count)
print(ranking_medio)

In [None]:
# Build the VP summary table: one row per feature, and for every technique its
# mean rank and appearance count ("-" when the technique never ranked it).
techniques = ['Breakdown', 'Shapley', 'Lime']

# Column data keyed by (technique, sub-column).
data_dict = {}

# Union of the features ranked by any technique. Sorted so the row order does
# not depend on set iteration order, which can vary between kernel sessions.
all_caract = sorted(set().union(*[set(ranking_medio[technique]) for technique in techniques]))

for technique in techniques:
    rank = [ranking_medio[technique].get(c, "-") for c in all_caract]
    apar = [apariciones_count[technique].get(c, "-") for c in all_caract]

    data_dict[(technique, "Ranking Medio")] = rank
    data_dict[(technique, "Conteo")] = apar

# Tuple keys become a two-level column index.
df_resumen_rus_vp = pd.DataFrame(data_dict, index=all_caract)

df_resumen_rus_vp

In [None]:
# Add the cross-technique columns: mean of the per-technique mean ranks and
# total number of appearances. The per-technique columns mix numbers with the
# "-" placeholder, so convert them explicitly instead of relying on
# replace() to downcast object columns (deprecated pandas behaviour).
columnas_ranking = [(tech, "Ranking Medio") for tech in techniques]
columnas_conteo = [(tech, "Conteo") for tech in techniques]

# "-" becomes NaN and is therefore ignored by mean().
rankings_numericos = df_resumen_rus_vp[columnas_ranking].apply(pd.to_numeric, errors="coerce")
df_resumen_rus_vp[("General", "Ranking")] = rankings_numericos.mean(axis=1)

# "-" counts as zero appearances; cast back to int so the total stays integral.
conteos_numericos = (
    df_resumen_rus_vp[columnas_conteo]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
    .astype(int)
)
df_resumen_rus_vp[("General", "Conteo Total")] = conteos_numericos.sum(axis=1)

# Show the updated summary table.
df_resumen_rus_vp

In [None]:
# Replace the general mean rank by a final rank that combines how well a
# feature is ranked with how often it appears.
num_caract = len(df_resumen_rus_vp)

# Rank weight: the best (lowest) mean rank gets the largest weight.
peso_rango = 1 - (df_resumen_rus_vp[("General", "Ranking")].rank(ascending=True) - 1) / num_caract
# Count weight: the most frequent feature gets the largest weight.
peso_conteo = df_resumen_rus_vp[("General", "Conteo Total")].rank(ascending=True) / num_caract

# Score = sum of both weights; the highest score becomes rank 1, ties share
# the lowest rank of their group (method="min"). The weights and the score are
# intermediate values only, so they are never stored in the table.
puntaje = peso_rango + peso_conteo
df_resumen_rus_vp[("General", "Ranking")] = puntaje.rank(ascending=False, method="min")

df_resumen_rus_vp

In [None]:
# Move the two "General" columns to the front, followed by every
# per-technique column in its current order.
columns = df_resumen_rus_vp.columns

# Boolean mask: True where the top-level label is "General".
general_columns = columns.get_level_values(0) == "General"

general_subcolumns = [("General", "Ranking"), ("General", "Conteo Total")]

# Every column that is not under "General".
technique_subcolumns = [
    columna for columna, es_general in zip(columns, general_columns) if not es_general
]

new_columns = general_subcolumns + technique_subcolumns

# Rebuild the table with the reordered columns.
df_resumen_rus_vp = df_resumen_rus_vp[new_columns]

In [None]:
# Best general rank first.
df_resumen_rus_vp = df_resumen_rus_vp.sort_values(by=("General", "Ranking"), ascending=True)

# Show numeric mean ranks with two decimals; non-numeric cells (the "-"
# placeholder) are left untouched.
for tech in techniques:
    columna = (tech, "Ranking Medio")
    df_resumen_rus_vp[columna] = df_resumen_rus_vp[columna].apply(
        lambda x: x if not isinstance(x, (int, float)) else f"{x:.2f}"
    )

df_resumen_rus_vp

### **Instancia VN MAX:**

In [None]:
# Explain the VN-max instance with the three local techniques; every call
# pins random_state=42. The `.result` frame of each explanation is kept
# next to the explanation object for the post-processing cell below.
breakdown_vn_max = exp.predict_parts(
    df_instancia_vn_max, type="break_down", random_state=42
)
breakdown_vn_df_max = breakdown_vn_max.result

shap_vn_max = exp.predict_parts(
    df_instancia_vn_max, type="shap", random_state=42
)
shap_vn_df_max = shap_vn_max.result

lime_vn_max = exp.predict_surrogate(df_instancia_vn_max, random_state=42)
lime_vn_df_max = lime_vn_max.result

In [None]:
# Plot the break-down explanation of the VN-max instance.
breakdown_vn_max.plot()

In [None]:
# Plot the Shapley explanation of the VN-max instance.
shap_vn_max.plot()

In [None]:
# Plot the surrogate (LIME) explanation of the VN-max instance.
lime_vn_max.plot()

In [None]:
# Reduce the three explanations of the VN-max instance to comparable
# "Variable / Ranking / Signo" tables holding the 5 strongest contributions.

# Break-down: keep name, contribution and sign of each row.
breakdown_vn_df_max = breakdown_vn_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): rows 0 and 26 are presumably the intercept and final-prediction
# rows of a 25-feature explanation - confirm if the feature count changes.
breakdown_vn_df_max = breakdown_vn_df_max.drop(index=[0, 26])
breakdown_vn_df_max['sign'] = breakdown_vn_df_max['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
breakdown_vn_df_max = breakdown_vn_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# Shapley: same treatment.
shap_vn_df_max = shap_vn_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): tail(25) presumably keeps the aggregated per-feature rows - confirm.
shap_vn_df_max = shap_vn_df_max.tail(25)
shap_vn_df_max['sign'] = shap_vn_df_max['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_vn_df_max = shap_vn_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# LIME: the "variable" text is a condition such as "commit_num > 5.00" or
# "0.00 < line_added <= 3.00". Taking the first space-separated token returns
# a number for the two-sided form, which previously had to be patched by hand
# at fixed row positions. Extract the first identifier-like token instead and
# fall back to the old first-token rule when nothing matches.
lime_vn_df_max["Variable"] = (
    lime_vn_df_max["variable"]
    .str.extract(r"\b([A-Za-z_]\w*)", expand=False)
    .fillna(lime_vn_df_max["variable"].str.split(" ").str[0])
)
lime_vn_df_max["Signo"] = lime_vn_df_max["effect"].apply(evaluar_valor)
lime_vn_df_max = lime_vn_df_max.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_vn_df_max = lime_vn_df_max.drop(columns=['variable'])

# Rank by absolute contribution (1 = strongest). rank() uses its default
# tie handling (average rank), which astype(int) then truncates.
breakdown_vn_df_max['Ranking'] = breakdown_vn_df_max['contribution'].abs().rank(ascending=False).astype(int)
breakdown_vn_df_max.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
breakdown_vn_df_max = breakdown_vn_df_max[['Variable', 'Ranking', 'contribution', 'Signo']]

shap_vn_df_max['Ranking'] = shap_vn_df_max['contribution'].abs().rank(ascending=False).astype(int)
shap_vn_df_max.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_vn_df_max = shap_vn_df_max[['Variable', 'Ranking', 'contribution', 'Signo']]

lime_vn_df_max['Ranking'] = lime_vn_df_max['effect'].abs().rank(ascending=False).astype(int)
lime_vn_df_max = lime_vn_df_max.head(5)
lime_vn_df_max = lime_vn_df_max[['Variable', 'Ranking', 'effect', 'Signo']]

# Drop the raw contribution/effect columns; only rank and sign are compared.
breakdown_vn_df_max = breakdown_vn_df_max.drop(columns=['contribution'])
shap_vn_df_max = shap_vn_df_max.drop(columns=['contribution'])
lime_vn_df_max = lime_vn_df_max.drop(columns=['effect'])

# Keep the 5 largest contributions in absolute value (tables are already sorted).
breakdown_vn_df_max = breakdown_vn_df_max.head(5)
breakdown_vn_df_max = breakdown_vn_df_max.reset_index(drop=True)

shap_vn_df_max = shap_vn_df_max.head(5)
shap_vn_df_max = shap_vn_df_max.reset_index(drop=True)

lime_vn_df_max = lime_vn_df_max.reset_index(drop=True)

print(breakdown_vn_df_max)
print(shap_vn_df_max)
print(lime_vn_df_max)

In [None]:
#lime_vn_max.show_in_notebook()

In [None]:
# Build the comparison table for the VN-max instance: one row per feature
# that appears in the top list of at least one technique, holding the rank and
# sign each technique assigned to it ("-" when the technique did not list it).
breakdown_features = list(breakdown_vn_df_max['Variable'])
shapley_features = list(shap_vn_df_max['Variable'])
lime_features = list(lime_vn_df_max['Variable'])
# De-duplicate while keeping first-seen order. Iterating a set of strings can
# yield a different order in every kernel session (hash randomisation), which
# made the printed list and the intermediate table non-reproducible.
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

# columns_multi is defined earlier in the notebook - presumably the
# (technique, Ranking/Signo) column MultiIndex that is filled in below.
df_final_max = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_max['Variable'] = all_features
df_final_max = df_final_max[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# The same lookup is applied to every technique, so drive it from one mapping
# instead of three copy-pasted branches.
tablas_por_tecnica = {
    'Breakdown': breakdown_vn_df_max,
    'Shapley': shap_vn_df_max,
    'Lime': lime_vn_df_max,
}

for feature in all_features:
    fila_destino = df_final_max['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # The technique did not rank this feature among its top entries.
            ranking, signo = '-', '-'
        else:
            # Only the first matching row is used.
            ranking = coincidencias.iloc[0]['Ranking']
            signo = coincidencias.iloc[0]['Signo']
        df_final_max.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_max.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_max

In [None]:
# Fixed presentation order for the feature rows of the VN-max table.
nuevo_orden = ["parallel_changed_file_num", "developer_num", "commit_num", "messages_min", "line_added", "delete_frequency"]

# Index the table by feature name, then arrange the rows in that order.
df_final_max = df_final_max.set_index('Variable').reindex(nuevo_orden)

df_final_max

### **Instancia VN MEDIANA:**

In [None]:
# Explain the VN-median instance with the three local techniques; every call
# pins random_state=42. The `.result` frame of each explanation is kept
# next to the explanation object for the post-processing cell below.
breakdown_vn_mediana = exp.predict_parts(
    df_instancia_vn_mediana, type="break_down", random_state=42
)
breakdown_vn_df_mediana = breakdown_vn_mediana.result

shap_vn_mediana = exp.predict_parts(
    df_instancia_vn_mediana, type="shap", random_state=42
)
shap_vn_df_mediana = shap_vn_mediana.result

lime_vn_mediana = exp.predict_surrogate(df_instancia_vn_mediana, random_state=42)
lime_vn_df_mediana = lime_vn_mediana.result

In [None]:
# Plot the break-down explanation of the VN-median instance.
breakdown_vn_mediana.plot()

In [None]:
# Plot the Shapley explanation of the VN-median instance.
shap_vn_mediana.plot()

In [None]:
# Plot the surrogate (LIME) explanation of the VN-median instance.
lime_vn_mediana.plot()

In [None]:
# Reduce the three explanations of the VN-median instance to comparable
# "Variable / Ranking / Signo" tables holding the 5 strongest contributions.

# Break-down: keep name, contribution and sign of each row.
breakdown_vn_df_mediana = breakdown_vn_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): rows 0 and 26 are presumably the intercept and final-prediction
# rows of a 25-feature explanation - confirm if the feature count changes.
breakdown_vn_df_mediana = breakdown_vn_df_mediana.drop(index=[0, 26])
breakdown_vn_df_mediana['sign'] = breakdown_vn_df_mediana['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
breakdown_vn_df_mediana = breakdown_vn_df_mediana.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# Shapley: same treatment.
shap_vn_df_mediana = shap_vn_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): tail(25) presumably keeps the aggregated per-feature rows - confirm.
shap_vn_df_mediana = shap_vn_df_mediana.tail(25)
shap_vn_df_mediana['sign'] = shap_vn_df_mediana['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_vn_df_mediana = shap_vn_df_mediana.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# LIME: the "variable" text is a condition such as "commit_num > 5.00" or
# "0.00 < line_added <= 3.00". Taking the first space-separated token returns
# a number for the two-sided form, which previously had to be patched by hand
# at fixed row positions. Extract the first identifier-like token instead and
# fall back to the old first-token rule when nothing matches.
lime_vn_df_mediana["Variable"] = (
    lime_vn_df_mediana["variable"]
    .str.extract(r"\b([A-Za-z_]\w*)", expand=False)
    .fillna(lime_vn_df_mediana["variable"].str.split(" ").str[0])
)
lime_vn_df_mediana["Signo"] = lime_vn_df_mediana["effect"].apply(evaluar_valor)
lime_vn_df_mediana = lime_vn_df_mediana.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_vn_df_mediana = lime_vn_df_mediana.drop(columns=['variable'])

# Rank by absolute contribution (1 = strongest). rank() uses its default
# tie handling (average rank), which astype(int) then truncates.
breakdown_vn_df_mediana['Ranking'] = breakdown_vn_df_mediana['contribution'].abs().rank(ascending=False).astype(int)
breakdown_vn_df_mediana.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
breakdown_vn_df_mediana = breakdown_vn_df_mediana[['Variable', 'Ranking', 'contribution', 'Signo']]

shap_vn_df_mediana['Ranking'] = shap_vn_df_mediana['contribution'].abs().rank(ascending=False).astype(int)
shap_vn_df_mediana.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_vn_df_mediana = shap_vn_df_mediana[['Variable', 'Ranking', 'contribution', 'Signo']]

lime_vn_df_mediana['Ranking'] = lime_vn_df_mediana['effect'].abs().rank(ascending=False).astype(int)
lime_vn_df_mediana = lime_vn_df_mediana.head(5)
lime_vn_df_mediana = lime_vn_df_mediana[['Variable', 'Ranking', 'effect', 'Signo']]

# Drop the raw contribution/effect columns; only rank and sign are compared.
breakdown_vn_df_mediana = breakdown_vn_df_mediana.drop(columns=['contribution'])
shap_vn_df_mediana = shap_vn_df_mediana.drop(columns=['contribution'])
lime_vn_df_mediana = lime_vn_df_mediana.drop(columns=['effect'])

# Keep the 5 largest contributions in absolute value (tables are already sorted).
breakdown_vn_df_mediana = breakdown_vn_df_mediana.head(5)
breakdown_vn_df_mediana = breakdown_vn_df_mediana.reset_index(drop=True)

shap_vn_df_mediana = shap_vn_df_mediana.head(5)
shap_vn_df_mediana = shap_vn_df_mediana.reset_index(drop=True)

lime_vn_df_mediana = lime_vn_df_mediana.reset_index(drop=True)

print(breakdown_vn_df_mediana)
print(shap_vn_df_mediana)
print(lime_vn_df_mediana)

In [None]:
#lime_vn_mediana.show_in_notebook()

In [None]:
# Build the comparison table for the VN-median instance: one row per feature
# that appears in the top list of at least one technique, holding the rank and
# sign each technique assigned to it ("-" when the technique did not list it).
breakdown_features = list(breakdown_vn_df_mediana['Variable'])
shapley_features = list(shap_vn_df_mediana['Variable'])
lime_features = list(lime_vn_df_mediana['Variable'])
# De-duplicate while keeping first-seen order. Iterating a set of strings can
# yield a different order in every kernel session (hash randomisation), which
# made the printed list and the intermediate table non-reproducible.
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

# columns_multi is defined earlier in the notebook - presumably the
# (technique, Ranking/Signo) column MultiIndex that is filled in below.
df_final_mediana = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_mediana['Variable'] = all_features
df_final_mediana = df_final_mediana[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# The same lookup is applied to every technique, so drive it from one mapping
# instead of three copy-pasted branches.
tablas_por_tecnica = {
    'Breakdown': breakdown_vn_df_mediana,
    'Shapley': shap_vn_df_mediana,
    'Lime': lime_vn_df_mediana,
}

for feature in all_features:
    fila_destino = df_final_mediana['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # The technique did not rank this feature among its top entries.
            ranking, signo = '-', '-'
        else:
            # Only the first matching row is used.
            ranking = coincidencias.iloc[0]['Ranking']
            signo = coincidencias.iloc[0]['Signo']
        df_final_mediana.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_mediana.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_mediana

In [None]:
# Fixed presentation order for the feature rows of the VN-median table.
nuevo_orden = ["parallel_changed_file_num", "developer_num", "commit_num", "line_added", "messages_min", "delete_frequency"]

# Index the table by feature name, then arrange the rows in that order.
df_final_mediana = df_final_mediana.set_index('Variable').reindex(nuevo_orden)

df_final_mediana

### **Instancia VN MIN:**

In [None]:
# Explain the VN-min instance with the three local techniques; every call
# pins random_state=42. The `.result` frame of each explanation is kept
# next to the explanation object for the post-processing cell below.
breakdown_vn_min = exp.predict_parts(
    df_instancia_vn_min, type="break_down", random_state=42
)
breakdown_vn_df_min = breakdown_vn_min.result

shap_vn_min = exp.predict_parts(
    df_instancia_vn_min, type="shap", random_state=42
)
shap_vn_df_min = shap_vn_min.result

lime_vn_min = exp.predict_surrogate(df_instancia_vn_min, random_state=42)
lime_vn_df_min = lime_vn_min.result

In [None]:
# Plot the break-down explanation of the VN-min instance.
breakdown_vn_min.plot()

In [None]:
# Plot the Shapley explanation of the VN-min instance.
shap_vn_min.plot()

In [None]:
# Plot the surrogate (LIME) explanation of the VN-min instance.
lime_vn_min.plot()

In [None]:
# Reduce the three explanations of the VN-min instance to comparable
# "Variable / Ranking / Signo" tables holding the 5 strongest contributions.

# Break-down: keep name, contribution and sign of each row.
breakdown_vn_df_min = breakdown_vn_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): rows 0 and 26 are presumably the intercept and final-prediction
# rows of a 25-feature explanation - confirm if the feature count changes.
breakdown_vn_df_min = breakdown_vn_df_min.drop(index=[0, 26])
breakdown_vn_df_min['sign'] = breakdown_vn_df_min['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
breakdown_vn_df_min = breakdown_vn_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# Shapley: same treatment.
shap_vn_df_min = shap_vn_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): tail(25) presumably keeps the aggregated per-feature rows - confirm.
shap_vn_df_min = shap_vn_df_min.tail(25)
shap_vn_df_min['sign'] = shap_vn_df_min['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_vn_df_min = shap_vn_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# LIME: the "variable" text is a condition such as "commit_num > 5.00" or
# "0.00 < line_added <= 3.00". Taking the first space-separated token returns
# a number for the two-sided form, which previously had to be patched by hand
# at fixed row positions. Extract the first identifier-like token instead and
# fall back to the old first-token rule when nothing matches.
lime_vn_df_min["Variable"] = (
    lime_vn_df_min["variable"]
    .str.extract(r"\b([A-Za-z_]\w*)", expand=False)
    .fillna(lime_vn_df_min["variable"].str.split(" ").str[0])
)
lime_vn_df_min["Signo"] = lime_vn_df_min["effect"].apply(evaluar_valor)
lime_vn_df_min = lime_vn_df_min.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_vn_df_min = lime_vn_df_min.drop(columns=['variable'])

# Rank by absolute contribution (1 = strongest). rank() uses its default
# tie handling (average rank), which astype(int) then truncates.
breakdown_vn_df_min['Ranking'] = breakdown_vn_df_min['contribution'].abs().rank(ascending=False).astype(int)
breakdown_vn_df_min.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
breakdown_vn_df_min = breakdown_vn_df_min[['Variable', 'Ranking', 'contribution', 'Signo']]

shap_vn_df_min['Ranking'] = shap_vn_df_min['contribution'].abs().rank(ascending=False).astype(int)
shap_vn_df_min.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_vn_df_min = shap_vn_df_min[['Variable', 'Ranking', 'contribution', 'Signo']]

lime_vn_df_min['Ranking'] = lime_vn_df_min['effect'].abs().rank(ascending=False).astype(int)
lime_vn_df_min = lime_vn_df_min.head(5)
lime_vn_df_min = lime_vn_df_min[['Variable', 'Ranking', 'effect', 'Signo']]

# Drop the raw contribution/effect columns; only rank and sign are compared.
breakdown_vn_df_min = breakdown_vn_df_min.drop(columns=['contribution'])
shap_vn_df_min = shap_vn_df_min.drop(columns=['contribution'])
lime_vn_df_min = lime_vn_df_min.drop(columns=['effect'])

# Keep the 5 largest contributions in absolute value (tables are already sorted).
breakdown_vn_df_min = breakdown_vn_df_min.head(5)
breakdown_vn_df_min = breakdown_vn_df_min.reset_index(drop=True)

shap_vn_df_min = shap_vn_df_min.head(5)
shap_vn_df_min = shap_vn_df_min.reset_index(drop=True)

lime_vn_df_min = lime_vn_df_min.reset_index(drop=True)

print(breakdown_vn_df_min)
print(shap_vn_df_min)
print(lime_vn_df_min)

In [None]:
#lime_vn_min.show_in_notebook()

In [None]:
# Gather the features each technique reports for this instance and take their
# union (set order is arbitrary; the resulting list is then reversed).
breakdown_features = breakdown_vn_df_min['Variable'].tolist()
shapley_features = shap_vn_df_min['Variable'].tolist()
lime_features = lime_vn_df_min['Variable'].tolist()
all_features = list(set(breakdown_features + shapley_features + lime_features))[::-1]

# One row per feature, with the multi-level columns defined in columns_multi.
df_final_min = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_min['Variable'] = list(all_features)
df_final_min = df_final_min[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features, shapley_features, lime_features, all_features, sep="\n")

# Technique label paired with the table that holds its top features.
tablas_por_tecnica = (
    ('Breakdown', breakdown_vn_df_min),
    ('Shapley', shap_vn_df_min),
    ('Lime', lime_vn_df_min),
)

for feature in df_final_min['Variable']:
    fila_destino = df_final_min['Variable'] == feature
    for nombre_tecnica, tabla in tablas_por_tecnica:
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # The feature is not in this technique's table: mark it with "-".
            valor_ranking, valor_signo = '-', '-'
        else:
            primera_fila = coincidencias.iloc[0]
            valor_ranking, valor_signo = primera_fila['Ranking'], primera_fila['Signo']
        df_final_min.loc[fila_destino, (nombre_tecnica, 'Ranking')] = valor_ranking
        df_final_min.loc[fila_destino, (nombre_tecnica, 'Signo')] = valor_signo

df_final_min

In [None]:
# Use the feature name as the row index so rows can be ordered by label.
df_final_min.set_index('Variable', inplace=True)

# Preferred presentation order for this instance.
orden_preferido = ["parallel_changed_file_num", "developer_num", "commit_num", "messages_min", "line_added", "delete_frequency"]

# reindex() creates an all-NaN row for every label that is not in the index and
# drops every row whose label is not listed. Keep only the preferred labels that
# are present and append any remaining features in their current order.
presentes = [v for v in orden_preferido if v in df_final_min.index]
restantes = [v for v in df_final_min.index if v not in orden_preferido]
nuevo_orden = presentes + restantes

# Reorder the DataFrame rows.
df_final_min = df_final_min.reindex(nuevo_orden)

df_final_min

### **VN General:**

In [None]:
# Aggregate the per-instance tables (max, median, min) into, for each technique:
#   ranking_valores   -> list of (feature, ranking) pairs
#   ranking_medio     -> mean ranking per feature
#   apariciones_count -> number of appearances per feature
tablas_instancias = (df_final_max, df_final_mediana, df_final_min)
tecnicas = ('Breakdown', 'Shapley', 'Lime')

ranking_valores = {tecnica: [] for tecnica in tecnicas}

# Walk the tables and store the (feature, ranking) pairs per technique.
for tecnica in ranking_valores:
    for tabla in tablas_instancias:
        columna_ranking = tabla[(tecnica, 'Ranking')]
        for caracteristica in tabla.index:
            ranking = columna_ranking[caracteristica]
            # "-" marks a feature absent from this technique's table; NaN appears
            # when a row was created by reindexing with a label that did not
            # exist. Neither is a ranking, and int(NaN) would raise.
            if pd.isna(ranking) or ranking == "-":
                continue
            ranking_valores[tecnica].append((caracteristica, int(ranking)))

# Mean ranking and appearance count per technique and feature. Keys are
# inserted in order of first appearance.
ranking_medio = {tecnica: {} for tecnica in tecnicas}
apariciones_count = {tecnica: {} for tecnica in tecnicas}

for tecnica, pares in ranking_valores.items():
    rankings_por_caracteristica = {}
    for caracteristica, ranking in pares:
        rankings_por_caracteristica.setdefault(caracteristica, []).append(ranking)
    for caracteristica, valores in rankings_por_caracteristica.items():
        ranking_medio[tecnica][caracteristica] = sum(valores) / len(valores)
        apariciones_count[tecnica][caracteristica] = len(valores)

# Show the raw pairs, the appearance counts and the mean rankings.
print(ranking_valores)
print(apariciones_count)
print(ranking_medio)

In [None]:
# Techniques summarised in the table.
techniques = ['Breakdown', 'Shapley', 'Lime']

# Column data keyed by (technique, sub-column).
data_dict = {}

# Union of the features seen by any technique, in order of first appearance.
# dict.fromkeys keeps the row order reproducible between sessions, which a set
# does not.
all_caract = list(dict.fromkeys(
    c for technique in techniques for c in ranking_medio[technique]
))

# For every technique store the mean ranking and the appearance count of each
# feature; "-" marks a feature the technique never reported.
for technique in techniques:
    data_dict[(technique, "Ranking Medio")] = [ranking_medio[technique].get(c, "-") for c in all_caract]
    data_dict[(technique, "Conteo")] = [apariciones_count[technique].get(c, "-") for c in all_caract]

# Build the summary DataFrame (one row per feature).
df_resumen_rus_vn = pd.DataFrame(data_dict, index=all_caract)

df_resumen_rus_vn

In [None]:
columnas_ranking = [(tech, "Ranking Medio") for tech in techniques]
columnas_conteo = [(tech, "Conteo") for tech in techniques]

# Overall mean ranking, ignoring the "-" placeholders. The columns mix numbers
# and "-" (object dtype), so convert them to numeric explicitly instead of
# relying on replace() to downcast the result.
df_resumen_rus_vn[("General", "Ranking")] = (
    df_resumen_rus_vn[columnas_ranking].apply(pd.to_numeric, errors='coerce').mean(axis=1)
)

# Total number of appearances, counting "-" as zero.
df_resumen_rus_vn[("General", "Conteo Total")] = (
    df_resumen_rus_vn[columnas_conteo]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
    .sum(axis=1)
    .astype(int)
)

# Show the updated summary DataFrame.
df_resumen_rus_vn

In [None]:
# Number of features (rows) in the summary.
num_caract = len(df_resumen_rus_vn)

# Position of every feature by mean ranking and by total count.
posicion_ranking = df_resumen_rus_vn[("General", "Ranking")].rank(ascending=True)
posicion_conteo = df_resumen_rus_vn[("General", "Conteo Total")].rank(ascending=True)

# Weights: a lower mean ranking and a higher count both give a larger weight.
peso_rango = 1 - ((posicion_ranking - 1) / num_caract)
peso_conteo = posicion_conteo / num_caract

# The final score is the sum of both weights; the overall ranking orders the
# features by that score (highest first, ties share the lowest position).
puntaje = peso_rango + peso_conteo
df_resumen_rus_vn[("General", "Ranking")] = puntaje.rank(ascending=False, method="min")

df_resumen_rus_vn

In [None]:
# Current columns of the summary DataFrame.
columns = df_resumen_rus_vn.columns

# The two "General" sub-columns go first...
general_subcolumns = [("General", subcolumna) for subcolumna in ("Ranking", "Conteo Total")]

# ...followed by every per-technique column in its current order.
technique_subcolumns = [columna for columna in columns if columna[0] != "General"]

new_columns = [*general_subcolumns, *technique_subcolumns]

# Rebuild the DataFrame with the reordered columns.
df_resumen_rus_vn = df_resumen_rus_vn[new_columns]

In [None]:
# Order the rows by the overall ranking, lowest value first.
df_resumen_rus_vn.sort_values(by=("General", "Ranking"), ascending=True, inplace=True)

# Show each technique's mean ranking with two decimals; values that are not
# int/float (such as the "-" placeholder) are left untouched.
for tech in techniques:
    columna = (tech, "Ranking Medio")
    df_resumen_rus_vn[columna] = [
        f"{valor:.2f}" if isinstance(valor, (int, float)) else valor
        for valor in df_resumen_rus_vn[columna]
    ]

df_resumen_rus_vn

### **Instancia FP MAX:**

In [None]:
# Explain the FP MAX instance with Break Down, Shapley and the surrogate
# (LIME) explanation, all with the same fixed seed.
breakdown_fp_max, shap_fp_max = (
    exp.predict_parts(df_instancia_fp_max, type=tipo, random_state=42)
    for tipo in ("break_down", "shap")
)
lime_fp_max = exp.predict_surrogate(df_instancia_fp_max, random_state=42)

# Result tables of the three explanations.
breakdown_fp_df_max, shap_fp_df_max, lime_fp_df_max = (
    explicacion.result for explicacion in (breakdown_fp_max, shap_fp_max, lime_fp_max)
)

In [None]:
# Plot the Break Down explanation of the FP MAX instance.
breakdown_fp_max.plot()

In [None]:
# Plot the Shapley explanation of the FP MAX instance.
shap_fp_max.plot()

In [None]:
# Plot the surrogate (LIME) explanation of the FP MAX instance.
lime_fp_max.plot()

In [None]:
# Turn the raw Break Down, Shapley and LIME tables of the FP MAX instance into
# top-5 tables with the columns Variable, Ranking and Signo.
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}

# --- Break Down: keep name, contribution and sign ---
breakdown_fp_df_max = breakdown_fp_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): index labels 0 and 26 are presumably the intercept and final
# prediction rows of a 25-feature explanation -- confirm if the feature set changes.
breakdown_fp_df_max = breakdown_fp_df_max.drop(index=[0, 26])
breakdown_fp_df_max['sign'] = breakdown_fp_df_max['sign'].replace(etiquetas_signo)
breakdown_fp_df_max = breakdown_fp_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- Shapley ---
shap_fp_df_max = shap_fp_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): assumes the last 25 rows are the per-feature values to report -- confirm.
# .copy() gives an independent frame for the column assignment below.
shap_fp_df_max = shap_fp_df_max.tail(25).copy()
shap_fp_df_max['sign'] = shap_fp_df_max['sign'].replace(etiquetas_signo)
shap_fp_df_max = shap_fp_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- LIME ---
# Copy first so the new columns are not added to the frame this one came from.
lime_fp_df_max = lime_fp_df_max.copy()
# LIME describes each row with a condition such as "x <= 1.00" or
# "0.00 < x <= 1.00". Taking the first identifier-like token finds the feature
# name in both forms; splitting on spaces returns the number in the second form.
lime_fp_df_max["Variable"] = lime_fp_df_max["variable"].str.extract(r'(?<![\w.])([A-Za-z_]\w*)', expand=False)
lime_fp_df_max["Signo"] = lime_fp_df_max["effect"].apply(evaluar_valor)
lime_fp_df_max = lime_fp_df_max.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_fp_df_max = lime_fp_df_max.drop(columns=['variable'])

# Add a ranking column (1 = largest absolute contribution). method='min' gives
# whole-number ranks even with ties, so the int cast never truncates an
# averaged rank.
breakdown_fp_df_max['Ranking'] = breakdown_fp_df_max['contribution'].abs().rank(ascending=False, method='min').astype(int)
breakdown_fp_df_max.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
breakdown_fp_df_max = breakdown_fp_df_max[['Variable', 'Ranking', 'contribution', 'Signo']]

shap_fp_df_max['Ranking'] = shap_fp_df_max['contribution'].abs().rank(ascending=False, method='min').astype(int)
shap_fp_df_max.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_fp_df_max = shap_fp_df_max[['Variable', 'Ranking', 'contribution', 'Signo']]

lime_fp_df_max['Ranking'] = lime_fp_df_max['effect'].abs().rank(ascending=False, method='min').astype(int)
lime_fp_df_max = lime_fp_df_max.head(5)
lime_fp_df_max = lime_fp_df_max[['Variable', 'Ranking', 'effect', 'Signo']]

# Drop the contribution columns; only the ranking and the sign are reported.
breakdown_fp_df_max = breakdown_fp_df_max.drop(columns=['contribution'])
shap_fp_df_max = shap_fp_df_max.drop(columns=['contribution'])
lime_fp_df_max = lime_fp_df_max.drop(columns=['effect'])

# Keep the 5 largest contributions in absolute value.
breakdown_fp_df_max = breakdown_fp_df_max.head(5)
breakdown_fp_df_max = breakdown_fp_df_max.reset_index(drop=True)

shap_fp_df_max = shap_fp_df_max.head(5)
shap_fp_df_max = shap_fp_df_max.reset_index(drop=True)

lime_fp_df_max = lime_fp_df_max.reset_index(drop=True)

print(breakdown_fp_df_max)
print(shap_fp_df_max)
print(lime_fp_df_max)

In [None]:
#lime_fp_max.show_in_notebook()

In [None]:
# Gather the features each technique reports for this instance and take their
# union (set order is arbitrary; the resulting list is then reversed).
breakdown_features = breakdown_fp_df_max['Variable'].tolist()
shapley_features = shap_fp_df_max['Variable'].tolist()
lime_features = lime_fp_df_max['Variable'].tolist()
all_features = list(set(breakdown_features + shapley_features + lime_features))[::-1]

# One row per feature, with the multi-level columns defined in columns_multi.
df_final_max = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_max['Variable'] = list(all_features)
df_final_max = df_final_max[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features, shapley_features, lime_features, all_features, sep="\n")

# Technique label paired with the table that holds its top features.
tablas_por_tecnica = (
    ('Breakdown', breakdown_fp_df_max),
    ('Shapley', shap_fp_df_max),
    ('Lime', lime_fp_df_max),
)

for feature in df_final_max['Variable']:
    fila_destino = df_final_max['Variable'] == feature
    for nombre_tecnica, tabla in tablas_por_tecnica:
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # The feature is not in this technique's table: mark it with "-".
            valor_ranking, valor_signo = '-', '-'
        else:
            primera_fila = coincidencias.iloc[0]
            valor_ranking, valor_signo = primera_fila['Ranking'], primera_fila['Signo']
        df_final_max.loc[fila_destino, (nombre_tecnica, 'Ranking')] = valor_ranking
        df_final_max.loc[fila_destino, (nombre_tecnica, 'Signo')] = valor_signo

df_final_max

In [None]:
# Use the feature name as the row index so rows can be ordered by label.
df_final_max.set_index('Variable', inplace=True)

# Preferred presentation order for this instance.
orden_preferido = ["parallel_changed_file_num", "line_added", "commit_num", "developer_num", "messages_min"]

# reindex() creates an all-NaN row for every label that is not in the index and
# drops every row whose label is not listed. Keep only the preferred labels that
# are present and append any remaining features in their current order.
presentes = [v for v in orden_preferido if v in df_final_max.index]
restantes = [v for v in df_final_max.index if v not in orden_preferido]
nuevo_orden = presentes + restantes

# Reorder the DataFrame rows.
df_final_max = df_final_max.reindex(nuevo_orden)

df_final_max

### **Instancia FP MEDIANA:**

In [None]:
# Explain the FP MEDIANA instance with Break Down, Shapley and the surrogate
# (LIME) explanation, all with the same fixed seed.
breakdown_fp_mediana, shap_fp_mediana = (
    exp.predict_parts(df_instancia_fp_mediana, type=tipo, random_state=42)
    for tipo in ("break_down", "shap")
)
lime_fp_mediana = exp.predict_surrogate(df_instancia_fp_mediana, random_state=42)

# Result tables of the three explanations.
breakdown_fp_df_mediana, shap_fp_df_mediana, lime_fp_df_mediana = (
    explicacion.result for explicacion in (breakdown_fp_mediana, shap_fp_mediana, lime_fp_mediana)
)

In [None]:
# Plot the Break Down explanation of the FP MEDIANA instance.
breakdown_fp_mediana.plot()

In [None]:
# Plot the Shapley explanation of the FP MEDIANA instance.
shap_fp_mediana.plot()

In [None]:
# Plot the surrogate (LIME) explanation of the FP MEDIANA instance.
lime_fp_mediana.plot()

In [None]:
# Turn the raw Break Down, Shapley and LIME tables of the FP MEDIANA instance
# into top-5 tables with the columns Variable, Ranking and Signo.
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}

# --- Break Down: keep name, contribution and sign ---
breakdown_fp_df_mediana = breakdown_fp_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): index labels 0 and 26 are presumably the intercept and final
# prediction rows of a 25-feature explanation -- confirm if the feature set changes.
breakdown_fp_df_mediana = breakdown_fp_df_mediana.drop(index=[0, 26])
breakdown_fp_df_mediana['sign'] = breakdown_fp_df_mediana['sign'].replace(etiquetas_signo)
breakdown_fp_df_mediana = breakdown_fp_df_mediana.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- Shapley ---
shap_fp_df_mediana = shap_fp_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): assumes the last 25 rows are the per-feature values to report -- confirm.
# .copy() gives an independent frame for the column assignment below.
shap_fp_df_mediana = shap_fp_df_mediana.tail(25).copy()
shap_fp_df_mediana['sign'] = shap_fp_df_mediana['sign'].replace(etiquetas_signo)
shap_fp_df_mediana = shap_fp_df_mediana.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- LIME ---
# Copy first so the new columns are not added to the frame this one came from.
lime_fp_df_mediana = lime_fp_df_mediana.copy()
# LIME describes each row with a condition such as "x <= 1.00" or
# "0.00 < x <= 1.00". Taking the first identifier-like token finds the feature
# name in both forms; splitting on spaces returns the number in the second form.
lime_fp_df_mediana["Variable"] = lime_fp_df_mediana["variable"].str.extract(r'(?<![\w.])([A-Za-z_]\w*)', expand=False)
lime_fp_df_mediana["Signo"] = lime_fp_df_mediana["effect"].apply(evaluar_valor)
lime_fp_df_mediana = lime_fp_df_mediana.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_fp_df_mediana = lime_fp_df_mediana.drop(columns=['variable'])

# Add a ranking column (1 = largest absolute contribution). method='min' gives
# whole-number ranks even with ties, so the int cast never truncates an
# averaged rank.
breakdown_fp_df_mediana['Ranking'] = breakdown_fp_df_mediana['contribution'].abs().rank(ascending=False, method='min').astype(int)
breakdown_fp_df_mediana.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
breakdown_fp_df_mediana = breakdown_fp_df_mediana[['Variable', 'Ranking', 'contribution', 'Signo']]

shap_fp_df_mediana['Ranking'] = shap_fp_df_mediana['contribution'].abs().rank(ascending=False, method='min').astype(int)
shap_fp_df_mediana.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_fp_df_mediana = shap_fp_df_mediana[['Variable', 'Ranking', 'contribution', 'Signo']]

lime_fp_df_mediana['Ranking'] = lime_fp_df_mediana['effect'].abs().rank(ascending=False, method='min').astype(int)
lime_fp_df_mediana = lime_fp_df_mediana.head(5)
lime_fp_df_mediana = lime_fp_df_mediana[['Variable', 'Ranking', 'effect', 'Signo']]

# Drop the contribution columns; only the ranking and the sign are reported.
breakdown_fp_df_mediana = breakdown_fp_df_mediana.drop(columns=['contribution'])
shap_fp_df_mediana = shap_fp_df_mediana.drop(columns=['contribution'])
lime_fp_df_mediana = lime_fp_df_mediana.drop(columns=['effect'])

# Keep the 5 largest contributions in absolute value.
breakdown_fp_df_mediana = breakdown_fp_df_mediana.head(5)
breakdown_fp_df_mediana = breakdown_fp_df_mediana.reset_index(drop=True)

shap_fp_df_mediana = shap_fp_df_mediana.head(5)
shap_fp_df_mediana = shap_fp_df_mediana.reset_index(drop=True)

# Feature names are already parsed from interval conditions above, so no
# position-based correction of the Variable column is applied here.
lime_fp_df_mediana = lime_fp_df_mediana.reset_index(drop=True)

print(breakdown_fp_df_mediana)
print(shap_fp_df_mediana)
print(lime_fp_df_mediana)

In [None]:
#lime_fp_mediana.show_in_notebook()

In [None]:
# Gather the features each technique reports for this instance and take their
# union (set order is arbitrary; the resulting list is then reversed).
breakdown_features = breakdown_fp_df_mediana['Variable'].tolist()
shapley_features = shap_fp_df_mediana['Variable'].tolist()
lime_features = lime_fp_df_mediana['Variable'].tolist()
all_features = list(set(breakdown_features + shapley_features + lime_features))[::-1]

# One row per feature, with the multi-level columns defined in columns_multi.
df_final_mediana = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_mediana['Variable'] = list(all_features)
df_final_mediana = df_final_mediana[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features, shapley_features, lime_features, all_features, sep="\n")

# Technique label paired with the table that holds its top features.
tablas_por_tecnica = (
    ('Breakdown', breakdown_fp_df_mediana),
    ('Shapley', shap_fp_df_mediana),
    ('Lime', lime_fp_df_mediana),
)

for feature in df_final_mediana['Variable']:
    fila_destino = df_final_mediana['Variable'] == feature
    for nombre_tecnica, tabla in tablas_por_tecnica:
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # The feature is not in this technique's table: mark it with "-".
            valor_ranking, valor_signo = '-', '-'
        else:
            primera_fila = coincidencias.iloc[0]
            valor_ranking, valor_signo = primera_fila['Ranking'], primera_fila['Signo']
        df_final_mediana.loc[fila_destino, (nombre_tecnica, 'Ranking')] = valor_ranking
        df_final_mediana.loc[fila_destino, (nombre_tecnica, 'Signo')] = valor_signo

df_final_mediana

In [None]:
# Use the feature name as the row index so rows can be ordered by label.
df_final_mediana.set_index('Variable', inplace=True)

# Preferred presentation order for this instance.
orden_preferido = ["parallel_changed_file_num", "commit_num", "developer_num", "line_added", "messages_min", "delete_frequency"]

# reindex() creates an all-NaN row for every label that is not in the index and
# drops every row whose label is not listed. Keep only the preferred labels that
# are present and append any remaining features in their current order.
presentes = [v for v in orden_preferido if v in df_final_mediana.index]
restantes = [v for v in df_final_mediana.index if v not in orden_preferido]
nuevo_orden = presentes + restantes

# Reorder the DataFrame rows.
df_final_mediana = df_final_mediana.reindex(nuevo_orden)

df_final_mediana

### **Instancia FP MIN:**

In [None]:
# Explain the FP MIN instance with Break Down, Shapley and the surrogate
# (LIME) explanation, all with the same fixed seed.
breakdown_fp_min, shap_fp_min = (
    exp.predict_parts(df_instancia_fp_min, type=tipo, random_state=42)
    for tipo in ("break_down", "shap")
)
lime_fp_min = exp.predict_surrogate(df_instancia_fp_min, random_state=42)

# Result tables of the three explanations.
breakdown_fp_df_min, shap_fp_df_min, lime_fp_df_min = (
    explicacion.result for explicacion in (breakdown_fp_min, shap_fp_min, lime_fp_min)
)

In [None]:
# Plot the Break Down explanation of the FP MIN instance.
breakdown_fp_min.plot()

In [None]:
# Plot the Shapley explanation of the FP MIN instance.
shap_fp_min.plot()

In [None]:
# Plot the surrogate (LIME) explanation of the FP MIN instance.
lime_fp_min.plot()

In [None]:
# Turn the raw Break Down, Shapley and LIME tables of the FP MIN instance into
# top-5 tables with the columns Variable, Ranking and Signo.
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}

# --- Break Down: keep name, contribution and sign ---
breakdown_fp_df_min = breakdown_fp_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): index labels 0 and 26 are presumably the intercept and final
# prediction rows of a 25-feature explanation -- confirm if the feature set changes.
breakdown_fp_df_min = breakdown_fp_df_min.drop(index=[0, 26])
breakdown_fp_df_min['sign'] = breakdown_fp_df_min['sign'].replace(etiquetas_signo)
breakdown_fp_df_min = breakdown_fp_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- Shapley ---
shap_fp_df_min = shap_fp_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): assumes the last 25 rows are the per-feature values to report -- confirm.
# .copy() gives an independent frame for the column assignment below.
shap_fp_df_min = shap_fp_df_min.tail(25).copy()
shap_fp_df_min['sign'] = shap_fp_df_min['sign'].replace(etiquetas_signo)
shap_fp_df_min = shap_fp_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- LIME ---
# Copy first so the new columns are not added to the frame this one came from.
lime_fp_df_min = lime_fp_df_min.copy()
# LIME describes each row with a condition such as "x <= 1.00" or
# "0.00 < x <= 1.00". Taking the first identifier-like token finds the feature
# name in both forms; splitting on spaces returns the number in the second form.
lime_fp_df_min["Variable"] = lime_fp_df_min["variable"].str.extract(r'(?<![\w.])([A-Za-z_]\w*)', expand=False)
lime_fp_df_min["Signo"] = lime_fp_df_min["effect"].apply(evaluar_valor)
lime_fp_df_min = lime_fp_df_min.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_fp_df_min = lime_fp_df_min.drop(columns=['variable'])

# Add a ranking column (1 = largest absolute contribution). method='min' gives
# whole-number ranks even with ties, so the int cast never truncates an
# averaged rank.
breakdown_fp_df_min['Ranking'] = breakdown_fp_df_min['contribution'].abs().rank(ascending=False, method='min').astype(int)
breakdown_fp_df_min.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
breakdown_fp_df_min = breakdown_fp_df_min[['Variable', 'Ranking', 'contribution', 'Signo']]

shap_fp_df_min['Ranking'] = shap_fp_df_min['contribution'].abs().rank(ascending=False, method='min').astype(int)
shap_fp_df_min.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_fp_df_min = shap_fp_df_min[['Variable', 'Ranking', 'contribution', 'Signo']]

lime_fp_df_min['Ranking'] = lime_fp_df_min['effect'].abs().rank(ascending=False, method='min').astype(int)
lime_fp_df_min = lime_fp_df_min.head(5)
lime_fp_df_min = lime_fp_df_min[['Variable', 'Ranking', 'effect', 'Signo']]

# Drop the contribution columns; only the ranking and the sign are reported.
breakdown_fp_df_min = breakdown_fp_df_min.drop(columns=['contribution'])
shap_fp_df_min = shap_fp_df_min.drop(columns=['contribution'])
lime_fp_df_min = lime_fp_df_min.drop(columns=['effect'])

# Keep the 5 largest contributions in absolute value.
breakdown_fp_df_min = breakdown_fp_df_min.head(5)
breakdown_fp_df_min = breakdown_fp_df_min.reset_index(drop=True)

shap_fp_df_min = shap_fp_df_min.head(5)
shap_fp_df_min = shap_fp_df_min.reset_index(drop=True)

lime_fp_df_min = lime_fp_df_min.reset_index(drop=True)

print(breakdown_fp_df_min)
print(shap_fp_df_min)
print(lime_fp_df_min)

In [None]:
#lime_fp_min.show_in_notebook()

In [None]:
# Gather the features each technique reports for this instance and take their
# union (set order is arbitrary; the resulting list is then reversed).
breakdown_features = breakdown_fp_df_min['Variable'].tolist()
shapley_features = shap_fp_df_min['Variable'].tolist()
lime_features = lime_fp_df_min['Variable'].tolist()
all_features = list(set(breakdown_features + shapley_features + lime_features))[::-1]

# One row per feature, with the multi-level columns defined in columns_multi.
df_final_min = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_min['Variable'] = list(all_features)
df_final_min = df_final_min[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features, shapley_features, lime_features, all_features, sep="\n")

# Technique label paired with the table that holds its top features.
tablas_por_tecnica = (
    ('Breakdown', breakdown_fp_df_min),
    ('Shapley', shap_fp_df_min),
    ('Lime', lime_fp_df_min),
)

for feature in df_final_min['Variable']:
    fila_destino = df_final_min['Variable'] == feature
    for nombre_tecnica, tabla in tablas_por_tecnica:
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # The feature is not in this technique's table: mark it with "-".
            valor_ranking, valor_signo = '-', '-'
        else:
            primera_fila = coincidencias.iloc[0]
            valor_ranking, valor_signo = primera_fila['Ranking'], primera_fila['Signo']
        df_final_min.loc[fila_destino, (nombre_tecnica, 'Ranking')] = valor_ranking
        df_final_min.loc[fila_destino, (nombre_tecnica, 'Signo')] = valor_signo

df_final_min

In [None]:
# Use the feature name as the row index so rows can be ordered by label.
df_final_min.set_index('Variable', inplace=True)

# Preferred presentation order for this instance.
orden_preferido = ["parallel_changed_file_num", "commit_num", "developer_num", "line_added", "delete_frequency"]

# reindex() creates an all-NaN row for every label that is not in the index and
# drops every row whose label is not listed. Keep only the preferred labels that
# are present and append any remaining features in their current order.
presentes = [v for v in orden_preferido if v in df_final_min.index]
restantes = [v for v in df_final_min.index if v not in orden_preferido]
nuevo_orden = presentes + restantes

# Reorder the DataFrame rows.
df_final_min = df_final_min.reindex(nuevo_orden)

df_final_min

### **FP General:**

In [None]:
# Aggregate the per-instance tables (max, median, min) into, for each technique:
#   ranking_valores   -> list of (feature, ranking) pairs
#   ranking_medio     -> mean ranking per feature
#   apariciones_count -> number of appearances per feature
tablas_instancias = (df_final_max, df_final_mediana, df_final_min)
tecnicas = ('Breakdown', 'Shapley', 'Lime')

ranking_valores = {tecnica: [] for tecnica in tecnicas}

# Walk the tables and store the (feature, ranking) pairs per technique.
for tecnica in ranking_valores:
    for tabla in tablas_instancias:
        columna_ranking = tabla[(tecnica, 'Ranking')]
        for caracteristica in tabla.index:
            ranking = columna_ranking[caracteristica]
            # "-" marks a feature absent from this technique's table; NaN appears
            # when a row was created by reindexing with a label that did not
            # exist. Neither is a ranking, and int(NaN) would raise.
            if pd.isna(ranking) or ranking == "-":
                continue
            ranking_valores[tecnica].append((caracteristica, int(ranking)))

# Mean ranking and appearance count per technique and feature. Keys are
# inserted in order of first appearance.
ranking_medio = {tecnica: {} for tecnica in tecnicas}
apariciones_count = {tecnica: {} for tecnica in tecnicas}

for tecnica, pares in ranking_valores.items():
    rankings_por_caracteristica = {}
    for caracteristica, ranking in pares:
        rankings_por_caracteristica.setdefault(caracteristica, []).append(ranking)
    for caracteristica, valores in rankings_por_caracteristica.items():
        ranking_medio[tecnica][caracteristica] = sum(valores) / len(valores)
        apariciones_count[tecnica][caracteristica] = len(valores)

# Show the raw pairs, the appearance counts and the mean rankings.
print(ranking_valores)
print(apariciones_count)
print(ranking_medio)

In [None]:
# Techniques summarised in the table.
techniques = ['Breakdown', 'Shapley', 'Lime']

# Column data keyed by (technique, sub-column).
data_dict = {}

# Union of the features seen by any technique, in order of first appearance.
# dict.fromkeys keeps the row order reproducible between sessions, which a set
# does not.
all_caract = list(dict.fromkeys(
    c for technique in techniques for c in ranking_medio[technique]
))

# For every technique store the mean ranking and the appearance count of each
# feature; "-" marks a feature the technique never reported.
for technique in techniques:
    data_dict[(technique, "Ranking Medio")] = [ranking_medio[technique].get(c, "-") for c in all_caract]
    data_dict[(technique, "Conteo")] = [apariciones_count[technique].get(c, "-") for c in all_caract]

# Build the summary DataFrame (one row per feature).
df_resumen_rus_fp = pd.DataFrame(data_dict, index=all_caract)

df_resumen_rus_fp

In [None]:
columnas_ranking = [(tech, "Ranking Medio") for tech in techniques]
columnas_conteo = [(tech, "Conteo") for tech in techniques]

# Overall mean ranking, ignoring the "-" placeholders. The columns mix numbers
# and "-" (object dtype), so convert them to numeric explicitly instead of
# relying on replace() to downcast the result.
df_resumen_rus_fp[("General", "Ranking")] = (
    df_resumen_rus_fp[columnas_ranking].apply(pd.to_numeric, errors='coerce').mean(axis=1)
)

# Total number of appearances, counting "-" as zero.
df_resumen_rus_fp[("General", "Conteo Total")] = (
    df_resumen_rus_fp[columnas_conteo]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
    .sum(axis=1)
    .astype(int)
)

# Show the updated summary DataFrame.
df_resumen_rus_fp

In [None]:
# Number of features (rows) in the summary.
num_caract = len(df_resumen_rus_fp)

# Position of every feature by mean ranking and by total count.
posicion_ranking = df_resumen_rus_fp[("General", "Ranking")].rank(ascending=True)
posicion_conteo = df_resumen_rus_fp[("General", "Conteo Total")].rank(ascending=True)

# Weights: a lower mean ranking and a higher count both give a larger weight.
peso_rango = 1 - ((posicion_ranking - 1) / num_caract)
peso_conteo = posicion_conteo / num_caract

# The final score is the sum of both weights; the overall ranking orders the
# features by that score (highest first, ties share the lowest position).
puntaje = peso_rango + peso_conteo
df_resumen_rus_fp[("General", "Ranking")] = puntaje.rank(ascending=False, method="min")

df_resumen_rus_fp

In [None]:
# Current columns of the summary DataFrame.
columns = df_resumen_rus_fp.columns

# The two "General" sub-columns go first...
general_subcolumns = [("General", subcolumna) for subcolumna in ("Ranking", "Conteo Total")]

# ...followed by every per-technique column in its current order.
technique_subcolumns = [columna for columna in columns if columna[0] != "General"]

new_columns = [*general_subcolumns, *technique_subcolumns]

# Rebuild the DataFrame with the reordered columns.
df_resumen_rus_fp = df_resumen_rus_fp[new_columns]

In [None]:
# Order the rows by the overall ranking, lowest value first.
df_resumen_rus_fp.sort_values(by=("General", "Ranking"), ascending=True, inplace=True)

# Show each technique's mean ranking with two decimals; values that are not
# int/float (such as the "-" placeholder) are left untouched.
for tech in techniques:
    columna = (tech, "Ranking Medio")
    df_resumen_rus_fp[columna] = [
        f"{valor:.2f}" if isinstance(valor, (int, float)) else valor
        for valor in df_resumen_rus_fp[columna]
    ]

df_resumen_rus_fp

### **Instancia FN MAX:**

In [None]:
# Explain the FN MAX instance with Break Down, Shapley and the surrogate
# (LIME) explanation, all with the same fixed seed.
breakdown_fn_max, shap_fn_max = (
    exp.predict_parts(df_instancia_fn_max, type=tipo, random_state=42)
    for tipo in ("break_down", "shap")
)
lime_fn_max = exp.predict_surrogate(df_instancia_fn_max, random_state=42)

# Result tables of the three explanations.
breakdown_fn_df_max, shap_fn_df_max, lime_fn_df_max = (
    explicacion.result for explicacion in (breakdown_fn_max, shap_fn_max, lime_fn_max)
)

In [None]:
# Plot the Break Down explanation of the FN MAX instance.
breakdown_fn_max.plot()

In [None]:
# Plot the Shapley explanation of the FN MAX instance.
shap_fn_max.plot()

In [None]:
# Plot the surrogate (LIME) explanation of the FN MAX instance.
lime_fn_max.plot()

In [None]:
# Turn the raw Break Down, Shapley and LIME tables of the FN MAX instance into
# top-5 tables with the columns Variable, Ranking and Signo.
etiquetas_signo = {1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'}

# --- Break Down: keep name, contribution and sign ---
breakdown_fn_df_max = breakdown_fn_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): index labels 0 and 26 are presumably the intercept and final
# prediction rows of a 25-feature explanation -- confirm if the feature set changes.
breakdown_fn_df_max = breakdown_fn_df_max.drop(index=[0, 26])
breakdown_fn_df_max['sign'] = breakdown_fn_df_max['sign'].replace(etiquetas_signo)
breakdown_fn_df_max = breakdown_fn_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- Shapley ---
shap_fn_df_max = shap_fn_df_max.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): assumes the last 25 rows are the per-feature values to report -- confirm.
# .copy() gives an independent frame for the column assignment below.
shap_fn_df_max = shap_fn_df_max.tail(25).copy()
shap_fn_df_max['sign'] = shap_fn_df_max['sign'].replace(etiquetas_signo)
shap_fn_df_max = shap_fn_df_max.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- LIME ---
# Copy first so the new columns are not added to the frame this one came from.
lime_fn_df_max = lime_fn_df_max.copy()
# LIME describes each row with a condition such as "x <= 1.00" or
# "0.00 < x <= 1.00". Taking the first identifier-like token finds the feature
# name in both forms; splitting on spaces returns the number in the second form.
lime_fn_df_max["Variable"] = lime_fn_df_max["variable"].str.extract(r'(?<![\w.])([A-Za-z_]\w*)', expand=False)
lime_fn_df_max["Signo"] = lime_fn_df_max["effect"].apply(evaluar_valor)
lime_fn_df_max = lime_fn_df_max.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_fn_df_max = lime_fn_df_max.drop(columns=['variable'])

# Add a ranking column (1 = largest absolute contribution). method='min' gives
# whole-number ranks even with ties, so the int cast never truncates an
# averaged rank.
breakdown_fn_df_max['Ranking'] = breakdown_fn_df_max['contribution'].abs().rank(ascending=False, method='min').astype(int)
breakdown_fn_df_max.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
breakdown_fn_df_max = breakdown_fn_df_max[['Variable', 'Ranking', 'contribution', 'Signo']]

shap_fn_df_max['Ranking'] = shap_fn_df_max['contribution'].abs().rank(ascending=False, method='min').astype(int)
shap_fn_df_max.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_fn_df_max = shap_fn_df_max[['Variable', 'Ranking', 'contribution', 'Signo']]

lime_fn_df_max['Ranking'] = lime_fn_df_max['effect'].abs().rank(ascending=False, method='min').astype(int)
lime_fn_df_max = lime_fn_df_max.head(5)
lime_fn_df_max = lime_fn_df_max[['Variable', 'Ranking', 'effect', 'Signo']]

# Drop the contribution columns; only the ranking and the sign are reported.
breakdown_fn_df_max = breakdown_fn_df_max.drop(columns=['contribution'])
shap_fn_df_max = shap_fn_df_max.drop(columns=['contribution'])
lime_fn_df_max = lime_fn_df_max.drop(columns=['effect'])

# Keep the 5 largest contributions in absolute value.
breakdown_fn_df_max = breakdown_fn_df_max.head(5)
breakdown_fn_df_max = breakdown_fn_df_max.reset_index(drop=True)

shap_fn_df_max = shap_fn_df_max.head(5)
shap_fn_df_max = shap_fn_df_max.reset_index(drop=True)

# Feature names are already parsed from interval conditions above, so no
# position-based correction of the Variable column is applied here.
lime_fn_df_max = lime_fn_df_max.reset_index(drop=True)

print(breakdown_fn_df_max)
print(shap_fn_df_max)
print(lime_fn_df_max)

In [None]:
#lime_fn_max.show_in_notebook()

In [None]:
# Build the comparison table for the "max" FN instance: one row per variable
# that appears in the top-5 of any technique, with its ranking and sign under
# Break Down, Shapley and LIME ('-' when the technique did not report it).
breakdown_features = list(breakdown_fn_df_max['Variable'])
shapley_features = list(shap_fn_df_max['Variable'])
lime_features = list(lime_fn_df_max['Variable'])
# Unique variables in first-seen order. dict keys keep insertion order, so the
# printed list and the row order are identical on every run; iterating a set
# of strings is not stable across interpreter sessions.
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

df_final_max = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_max['Variable'] = list(all_features)
df_final_max= df_final_max[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# Column group in the final table -> per-technique top-5 table.
tablas_por_tecnica = {
    'Breakdown': breakdown_fn_df_max,
    'Shapley': shap_fn_df_max,
    'Lime': lime_fn_df_max,
}

for feature in df_final_max['Variable']:
    # Row(s) of the final table that belong to this variable.
    fila_destino = df_final_max['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # The technique did not rank this variable in its top 5.
            ranking, signo = '-', '-'
        else:
            ranking = coincidencias.iloc[0]['Ranking']
            signo = coincidencias.iloc[0]['Signo']
        df_final_max.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_max.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_max

In [None]:
# Use the variable name as the row label so rows can be addressed by feature.
df_final_max.set_index('Variable', inplace=True)
# Preferred display order of the variables.
nuevo_orden = ["parallel_changed_file_num", "developer_num", "commit_num", "line_added", "messages_min", "delete_frequency"]

# Reorder with the labels that really exist, then append any variable that is
# missing from the preferred list. A plain reindex(nuevo_orden) silently drops
# unlisted variables and creates all-NaN rows for absent ones, and those NaN
# rankings cannot be converted by the int(ranking) aggregation further down.
orden_presente = [variable for variable in nuevo_orden if variable in df_final_max.index]
orden_restante = [variable for variable in df_final_max.index if variable not in nuevo_orden]
df_final_max = df_final_max.reindex(orden_presente + orden_restante)

df_final_max

### **Instancia FN MEDIANA:**

In [None]:
# Explain the "mediana" FN instance with the three local techniques, passing
# the same fixed random_state to each call.
breakdown_fn_mediana = exp.predict_parts(
    df_instancia_fn_mediana, type="break_down", random_state=42
)
shap_fn_mediana = exp.predict_parts(
    df_instancia_fn_mediana, type="shap", random_state=42
)
lime_fn_mediana = exp.predict_surrogate(df_instancia_fn_mediana, random_state=42)

# Keep the tabular result of each explanation for the ranking tables below.
breakdown_fn_df_mediana, shap_fn_df_mediana, lime_fn_df_mediana = (
    breakdown_fn_mediana.result,
    shap_fn_mediana.result,
    lime_fn_mediana.result,
)

In [None]:
# Draw the Break Down explanation of the "mediana" FN instance.
breakdown_fn_mediana.plot()

In [None]:
# Draw the Shapley explanation of the "mediana" FN instance.
shap_fn_mediana.plot()

In [None]:
# Draw the LIME surrogate explanation of the "mediana" FN instance.
lime_fn_mediana.plot()

In [None]:
# Reduce the three explanation results of the "mediana" FN instance to
# top-5 tables with the columns Variable / Ranking / Signo.

# --- Break Down ---
breakdown_fn_df_mediana = breakdown_fn_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): rows labelled 0 and 26 are removed by label - presumably the
# intercept and prediction rows of a 25-variable model; confirm if the feature
# set ever changes.
breakdown_fn_df_mediana = breakdown_fn_df_mediana.drop(index=[0, 26])
breakdown_fn_df_mediana['sign'] = breakdown_fn_df_mediana['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
# Largest absolute contribution first.
breakdown_fn_df_mediana = breakdown_fn_df_mediana.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- Shapley ---
shap_fn_df_mediana = shap_fn_df_mediana.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): only the last 25 rows are kept - presumably the aggregated
# per-variable contributions; confirm against the dalex result layout.
shap_fn_df_mediana = shap_fn_df_mediana.tail(25)
shap_fn_df_mediana['sign'] = shap_fn_df_mediana['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_fn_df_mediana = shap_fn_df_mediana.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- LIME ---
# LIME describes each variable as a condition such as "x <= 1.00", "x > 3.00"
# or "0.00 < x <= 1.00". Strip an optional leading "<number> <" / "<number> <="
# bound before taking the first token; otherwise interval conditions yield the
# lower bound ("0.00") instead of the variable name.
lime_fn_df_mediana["Variable"] = (
    lime_fn_df_mediana["variable"]
    .str.replace(r"^\s*[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\s*<=?\s*", "", regex=True)
    .str.split(" ")
    .str[0]
)
lime_fn_df_mediana["Signo"] = lime_fn_df_mediana["effect"].apply(evaluar_valor)
lime_fn_df_mediana = lime_fn_df_mediana.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_fn_df_mediana = lime_fn_df_mediana.drop(columns=['variable'])

# Add the ranking column (1 = largest absolute contribution) and normalise names.
breakdown_fn_df_mediana['Ranking'] = breakdown_fn_df_mediana['contribution'].abs().rank(ascending=False).astype(int)
breakdown_fn_df_mediana.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
breakdown_fn_df_mediana = breakdown_fn_df_mediana[['Variable', 'Ranking', 'contribution', 'Signo']]

shap_fn_df_mediana['Ranking'] = shap_fn_df_mediana['contribution'].abs().rank(ascending=False).astype(int)
shap_fn_df_mediana.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_fn_df_mediana = shap_fn_df_mediana[['Variable', 'Ranking', 'contribution', 'Signo']]

lime_fn_df_mediana['Ranking'] = lime_fn_df_mediana['effect'].abs().rank(ascending=False).astype(int)
lime_fn_df_mediana = lime_fn_df_mediana.head(5)
lime_fn_df_mediana = lime_fn_df_mediana[['Variable', 'Ranking', 'effect', 'Signo']]

# Drop the raw contribution columns; only ranking and sign are compared.
breakdown_fn_df_mediana = breakdown_fn_df_mediana.drop(columns=['contribution'])
shap_fn_df_mediana = shap_fn_df_mediana.drop(columns=['contribution'])
lime_fn_df_mediana = lime_fn_df_mediana.drop(columns=['effect'])

# Keep the five largest contributions in absolute value.
breakdown_fn_df_mediana = breakdown_fn_df_mediana.head(5)
breakdown_fn_df_mediana = breakdown_fn_df_mediana.reset_index(drop=True)

shap_fn_df_mediana = shap_fn_df_mediana.head(5)
shap_fn_df_mediana = shap_fn_df_mediana.reset_index(drop=True)

lime_fn_df_mediana = lime_fn_df_mediana.reset_index(drop=True)


print(breakdown_fn_df_mediana)
print(shap_fn_df_mediana)
print(lime_fn_df_mediana)

In [None]:
#lime_fn_mediana.show_in_notebook()

In [None]:
# Build the comparison table for the "mediana" FN instance: one row per
# variable that appears in the top-5 of any technique, with its ranking and
# sign under Break Down, Shapley and LIME ('-' when not reported).
breakdown_features = list(breakdown_fn_df_mediana['Variable'])
shapley_features = list(shap_fn_df_mediana['Variable'])
lime_features = list(lime_fn_df_mediana['Variable'])
# Unique variables in first-seen order. dict keys keep insertion order, so the
# printed list and the row order are identical on every run; iterating a set
# of strings is not stable across interpreter sessions.
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

df_final_mediana = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_mediana['Variable'] = list(all_features)
df_final_mediana= df_final_mediana[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# Column group in the final table -> per-technique top-5 table.
tablas_por_tecnica = {
    'Breakdown': breakdown_fn_df_mediana,
    'Shapley': shap_fn_df_mediana,
    'Lime': lime_fn_df_mediana,
}

for feature in df_final_mediana['Variable']:
    # Row(s) of the final table that belong to this variable.
    fila_destino = df_final_mediana['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # The technique did not rank this variable in its top 5.
            ranking, signo = '-', '-'
        else:
            ranking = coincidencias.iloc[0]['Ranking']
            signo = coincidencias.iloc[0]['Signo']
        df_final_mediana.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_mediana.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_mediana

In [None]:
# Use the variable name as the row label so rows can be addressed by feature.
df_final_mediana.set_index('Variable', inplace=True)
# Preferred display order of the variables.
nuevo_orden = ["parallel_changed_file_num", "developer_num", "commit_num", "line_added", "messages_min", "delete_frequency"]

# Reorder with the labels that really exist, then append any variable that is
# missing from the preferred list. A plain reindex(nuevo_orden) silently drops
# unlisted variables and creates all-NaN rows for absent ones, and those NaN
# rankings cannot be converted by the int(ranking) aggregation further down.
orden_presente = [variable for variable in nuevo_orden if variable in df_final_mediana.index]
orden_restante = [variable for variable in df_final_mediana.index if variable not in nuevo_orden]
df_final_mediana = df_final_mediana.reindex(orden_presente + orden_restante)

df_final_mediana

### **Instancia FN MIN:**

In [None]:
# Explain the "min" FN instance with the three local techniques, passing the
# same fixed random_state to each call.
breakdown_fn_min = exp.predict_parts(
    df_instancia_fn_min, type="break_down", random_state=42
)
shap_fn_min = exp.predict_parts(
    df_instancia_fn_min, type="shap", random_state=42
)
lime_fn_min = exp.predict_surrogate(df_instancia_fn_min, random_state=42)

# Keep the tabular result of each explanation for the ranking tables below.
breakdown_fn_df_min, shap_fn_df_min, lime_fn_df_min = (
    breakdown_fn_min.result,
    shap_fn_min.result,
    lime_fn_min.result,
)

In [None]:
# Draw the Break Down explanation of the "min" FN instance.
breakdown_fn_min.plot()

In [None]:
# Draw the Shapley explanation of the "min" FN instance.
shap_fn_min.plot()

In [None]:
# Draw the LIME surrogate explanation of the "min" FN instance.
lime_fn_min.plot()

In [None]:
# Reduce the three explanation results of the "min" FN instance to top-5
# tables with the columns Variable / Ranking / Signo.

# --- Break Down ---
breakdown_fn_df_min = breakdown_fn_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): rows labelled 0 and 26 are removed by label - presumably the
# intercept and prediction rows of a 25-variable model; confirm if the feature
# set ever changes.
breakdown_fn_df_min = breakdown_fn_df_min.drop(index=[0, 26])
breakdown_fn_df_min['sign'] = breakdown_fn_df_min['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
# Largest absolute contribution first.
breakdown_fn_df_min = breakdown_fn_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- Shapley ---
shap_fn_df_min = shap_fn_df_min.loc[:, ['variable_name', 'contribution', 'sign']]
# NOTE(review): only the last 25 rows are kept - presumably the aggregated
# per-variable contributions; confirm against the dalex result layout.
shap_fn_df_min = shap_fn_df_min.tail(25)
shap_fn_df_min['sign'] = shap_fn_df_min['sign'].replace({1.0: 'Positivo', 0.0: 'Nulo', -1.0: 'Negativo'})
shap_fn_df_min = shap_fn_df_min.sort_values(by='contribution', key=lambda x: abs(x), ascending=False)

# --- LIME ---
# LIME describes each variable as a condition such as "x <= 1.00", "x > 3.00"
# or "0.00 < x <= 1.00". Strip an optional leading "<number> <" / "<number> <="
# bound before taking the first token; otherwise interval conditions yield the
# lower bound ("0.00") instead of the variable name. This replaces the former
# positional overrides (rows 2 and 3 forced to 'line_added' / 'commit_num'),
# which would silently mislabel rows if the LIME ordering changed.
lime_fn_df_min["Variable"] = (
    lime_fn_df_min["variable"]
    .str.replace(r"^\s*[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?\s*<=?\s*", "", regex=True)
    .str.split(" ")
    .str[0]
)
lime_fn_df_min["Signo"] = lime_fn_df_min["effect"].apply(evaluar_valor)
lime_fn_df_min = lime_fn_df_min.sort_values(by='effect', key=lambda x: abs(x), ascending=False)
lime_fn_df_min = lime_fn_df_min.drop(columns=['variable'])

# Add the ranking column (1 = largest absolute contribution) and normalise names.
breakdown_fn_df_min['Ranking'] = breakdown_fn_df_min['contribution'].abs().rank(ascending=False).astype(int)
breakdown_fn_df_min.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
breakdown_fn_df_min = breakdown_fn_df_min[['Variable', 'Ranking', 'contribution', 'Signo']]

shap_fn_df_min['Ranking'] = shap_fn_df_min['contribution'].abs().rank(ascending=False).astype(int)
shap_fn_df_min.rename(columns={'sign': 'Signo', 'variable_name':'Variable'}, inplace=True)
shap_fn_df_min = shap_fn_df_min[['Variable', 'Ranking', 'contribution', 'Signo']]

lime_fn_df_min['Ranking'] = lime_fn_df_min['effect'].abs().rank(ascending=False).astype(int)
lime_fn_df_min = lime_fn_df_min.head(5)
lime_fn_df_min = lime_fn_df_min[['Variable', 'Ranking', 'effect', 'Signo']]

# Drop the raw contribution columns; only ranking and sign are compared.
breakdown_fn_df_min = breakdown_fn_df_min.drop(columns=['contribution'])
shap_fn_df_min = shap_fn_df_min.drop(columns=['contribution'])
lime_fn_df_min = lime_fn_df_min.drop(columns=['effect'])

# Keep the five largest contributions in absolute value.
breakdown_fn_df_min = breakdown_fn_df_min.head(5)
breakdown_fn_df_min = breakdown_fn_df_min.reset_index(drop=True)

shap_fn_df_min = shap_fn_df_min.head(5)
shap_fn_df_min = shap_fn_df_min.reset_index(drop=True)

lime_fn_df_min = lime_fn_df_min.reset_index(drop=True)

print(breakdown_fn_df_min)
print(shap_fn_df_min)
print(lime_fn_df_min)

In [None]:
#lime_fn_min.show_in_notebook()

In [None]:
# Build the comparison table for the "min" FN instance: one row per variable
# that appears in the top-5 of any technique, with its ranking and sign under
# Break Down, Shapley and LIME ('-' when the technique did not report it).
breakdown_features = list(breakdown_fn_df_min['Variable'])
shapley_features = list(shap_fn_df_min['Variable'])
lime_features = list(lime_fn_df_min['Variable'])
# Unique variables in first-seen order. dict keys keep insertion order, so the
# printed list and the row order are identical on every run; iterating a set
# of strings is not stable across interpreter sessions.
all_features = list(dict.fromkeys(breakdown_features + shapley_features + lime_features))

df_final_min = pd.DataFrame(index=range(len(all_features)), columns=columns_multi)
df_final_min['Variable'] = list(all_features)
df_final_min= df_final_min[['Variable', 'Breakdown', 'Shapley', 'Lime']]

print(breakdown_features)
print(shapley_features)
print(lime_features)
print(all_features)

# Column group in the final table -> per-technique top-5 table.
tablas_por_tecnica = {
    'Breakdown': breakdown_fn_df_min,
    'Shapley': shap_fn_df_min,
    'Lime': lime_fn_df_min,
}

for feature in df_final_min['Variable']:
    # Row(s) of the final table that belong to this variable.
    fila_destino = df_final_min['Variable'] == feature
    for tecnica, tabla in tablas_por_tecnica.items():
        coincidencias = tabla[tabla['Variable'] == feature]
        if coincidencias.empty:
            # The technique did not rank this variable in its top 5.
            ranking, signo = '-', '-'
        else:
            ranking = coincidencias.iloc[0]['Ranking']
            signo = coincidencias.iloc[0]['Signo']
        df_final_min.loc[fila_destino, (tecnica, 'Ranking')] = ranking
        df_final_min.loc[fila_destino, (tecnica, 'Signo')] = signo

df_final_min

In [None]:
# Use the variable name as the row label so rows can be addressed by feature.
df_final_min.set_index('Variable', inplace=True)
# Preferred display order of the variables.
nuevo_orden = ["parallel_changed_file_num", "developer_num", "commit_num", "line_added", "messages_min", "delete_frequency"]

# Reorder with the labels that really exist, then append any variable that is
# missing from the preferred list. A plain reindex(nuevo_orden) silently drops
# unlisted variables and creates all-NaN rows for absent ones, and those NaN
# rankings cannot be converted by the int(ranking) aggregation further down.
orden_presente = [variable for variable in nuevo_orden if variable in df_final_min.index]
orden_restante = [variable for variable in df_final_min.index if variable not in nuevo_orden]
df_final_min = df_final_min.reindex(orden_presente + orden_restante)

df_final_min

### **FN General:**

In [None]:
# Aggregate the three FN instance tables (max, mediana, min) per technique:
#   ranking_valores   -> list of (variable, ranking) pairs, '-' entries skipped
#   ranking_medio     -> mean ranking of each variable
#   apariciones_count -> number of instance tables in which the variable appears
tablas_fn = (df_final_max, df_final_mediana, df_final_min)

ranking_valores = {'Breakdown': [], 'Shapley': [], 'Lime': []}

# Collect the (variable, ranking) pairs, visiting max, mediana and min in turn.
for tecnica, pares in ranking_valores.items():
    for tabla in tablas_fn:
        columna_ranking = tabla[(tecnica, 'Ranking')]
        for caracteristica in tabla.index:
            ranking = columna_ranking[caracteristica]
            if ranking != "-":
                pares.append((caracteristica, int(ranking)))

# Group the rankings of each variable, keeping first-appearance order.
rankings_agrupados = {tecnica: {} for tecnica in ranking_valores}
for tecnica, pares in ranking_valores.items():
    for caracteristica, ranking in pares:
        rankings_agrupados[tecnica].setdefault(caracteristica, []).append(ranking)

# Mean ranking per technique and variable.
ranking_medio = {
    tecnica: {
        caracteristica: sum(valores) / len(valores)
        for caracteristica, valores in grupos.items()
    }
    for tecnica, grupos in rankings_agrupados.items()
}

# Number of appearances per technique and variable.
apariciones_count = {
    tecnica: {
        caracteristica: len(valores)
        for caracteristica, valores in grupos.items()
    }
    for tecnica, grupos in rankings_agrupados.items()
}

# Show the raw pairs, the appearance counts and the mean rankings.
print(ranking_valores)
print(apariciones_count)
print(ranking_medio)

In [None]:
# Build the summary table: one row per variable, and for each technique its
# mean ranking and appearance count ('-' when the technique never ranked it).
techniques = ['Breakdown', 'Shapley', 'Lime']

# Column label (technique, measure) -> list of values aligned with all_caract.
data_dict = {}

# Every variable ranked by at least one technique, in first-seen order. dict
# keys keep insertion order, so the row order is the same on every run
# (iterating a set of strings is not stable across interpreter sessions).
all_caract = list(dict.fromkeys(
    caracteristica
    for technique in techniques
    for caracteristica in ranking_medio[technique]
))

for technique in techniques:
    data_dict[(technique, "Ranking Medio")] = [
        ranking_medio[technique].get(c, "-") for c in all_caract
    ]
    data_dict[(technique, "Conteo")] = [
        apariciones_count[technique].get(c, "-") for c in all_caract
    ]

# Tuple keys produce two-level columns: technique / measure.
df_resumen_rus_fn = pd.DataFrame(data_dict, index=all_caract)

df_resumen_rus_fn

In [None]:
# Add the "General" columns: mean of the per-technique mean rankings and the
# total number of appearances, ignoring the '-' placeholders.
columnas_ranking = [(tech, "Ranking Medio") for tech in techniques]
columnas_conteo = [(tech, "Conteo") for tech in techniques]

# Convert explicitly with pd.to_numeric ('-' becomes NaN) instead of relying on
# replace() to downcast the object columns, which pandas has deprecated.
df_resumen_rus_fn[("General", "Ranking")] = (
    df_resumen_rus_fn[columnas_ranking]
    .apply(pd.to_numeric, errors="coerce")
    .mean(axis=1)
)

# Missing counts contribute 0 to the total; keep the result as integers.
df_resumen_rus_fn[("General", "Conteo Total")] = (
    df_resumen_rus_fn[columnas_conteo]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
    .astype(int)
    .sum(axis=1)
)

# Show the updated summary table.
df_resumen_rus_fn

In [None]:
# Turn the general mean ranking and the total count into a single final ranking.
num_caract = df_resumen_rus_fn.shape[0]

col_ranking = ("General", "Ranking")
col_conteo = ("General", "Conteo Total")
col_peso_rango = ("General", "Peso Rango")
col_peso_conteo = ("General", "Peso Conteo")
col_puntaje = ("General", "Puntaje")

# Weight in (0, 1]: a lower mean ranking gives a higher weight ...
posicion_ranking = df_resumen_rus_fn[col_ranking].rank(ascending=True)
df_resumen_rus_fn[col_peso_rango] = 1 - ((posicion_ranking - 1) / num_caract)
# ... and a higher appearance count gives a higher weight.
posicion_conteo = df_resumen_rus_fn[col_conteo].rank(ascending=True)
df_resumen_rus_fn[col_peso_conteo] = posicion_conteo / num_caract

# Score = sum of both weights; the final ranking orders the scores from high to
# low, tied variables sharing the best position (method="min").
df_resumen_rus_fn[col_puntaje] = (
    df_resumen_rus_fn[col_peso_rango] + df_resumen_rus_fn[col_peso_conteo]
)
df_resumen_rus_fn[col_ranking] = df_resumen_rus_fn[col_puntaje].rank(ascending=False, method="min")

# Remove the helper sub-columns.
df_resumen_rus_fn.drop(["Peso Rango", "Peso Conteo", "Puntaje"], axis=1, level=1, inplace=True)

df_resumen_rus_fn

In [None]:
# Reorder the columns so the "General" group comes first, followed by the
# per-technique columns in their current order.
columns = df_resumen_rus_fn.columns

# Boolean mask marking the columns whose top level is "General".
general_columns = columns.get_level_values(0) == "General"

# "General" sub-columns in the order they should be displayed.
general_subcolumns = [("General", "Ranking"), ("General", "Conteo Total")]

# Everything that is not "General" keeps its relative position.
technique_subcolumns = [
    columna for columna, es_general in zip(columns, general_columns) if not es_general
]

new_columns = general_subcolumns + technique_subcolumns

# Select the columns in the new order.
df_resumen_rus_fn = df_resumen_rus_fn[new_columns]

In [None]:
def _formato_dos_decimales(valor):
    """Return numbers as text with two decimals; leave other values (e.g. '-') as they are."""
    if isinstance(valor, (int, float)):
        return f"{valor:.2f}"
    return valor


# Best (lowest) general ranking first.
df_resumen_rus_fn.sort_values(by=("General", "Ranking"), ascending=True, inplace=True)

# Format the mean ranking of every technique for display.
for tech in techniques:
    columna = (tech, "Ranking Medio")
    df_resumen_rus_fn[columna] = df_resumen_rus_fn[columna].apply(_formato_dos_decimales)

df_resumen_rus_fn

## **LOCAL GENERAL:**

Verdadero Positivo:

In [None]:
# Combine the per-algorithm summaries of the true-positive instances into one
# table with a final ranking and a final appearance count per variable.
dfs_resumen = [df_resumen_rf_vp, df_resumen_brf_vp, df_resumen_gb_vp, df_resumen_ada_vp, df_resumen_rus_vp]
algorithms = ["RandomForest", "BalancedRandomForest", "GradientBoosting", "ADABoost", "RUSBoost"]

# Every variable present in any summary, in first-seen order. dict keys keep
# insertion order, so the table is built identically on every run (iterating a
# set of strings is not stable across interpreter sessions).
index_list = list(dict.fromkeys(
    variable for resumen in dfs_resumen for variable in resumen.index
))

df_resumen_final = pd.DataFrame(index=index_list)

# One "Ranking" and one "Conteo Total" column per algorithm, aligned on the
# variable name; variables an algorithm never ranked stay missing.
for algorithm, df_resumen in zip(algorithms, dfs_resumen):
    df_resumen_final[(algorithm, 'Ranking')] = df_resumen[('General', 'Ranking')]
    df_resumen_final[(algorithm, 'Conteo Total')] = df_resumen[('General', 'Conteo Total')]

columnas_ranking = [(algo, "Ranking") for algo in algorithms]
columnas_conteo = [(algo, "Conteo Total") for algo in algorithms]

# Aggregate while the values are still numeric, before the '-' placeholder is
# written. This avoids turning '-' back into NaN/0 with replace(), which relied
# on an object-column downcast that pandas has deprecated.
ranking_promedio = df_resumen_final[columnas_ranking].apply(pd.to_numeric, errors="coerce").mean(axis=1)
conteo_final = df_resumen_final[columnas_conteo].apply(pd.to_numeric, errors="coerce").sum(axis=1)

# Show '-' where an algorithm has no value for a variable.
df_resumen_final = df_resumen_final.fillna('-')

df_resumen_final["Ranking Final"] = ranking_promedio
df_resumen_final[("Conteo Final")] = conteo_final

# Number of variables, used to normalise the rank-based weights.
num_caract = df_resumen_final.shape[0]

# A lower mean ranking and a higher count both give a higher weight.
df_resumen_final[("Peso Rango")] = 1 - ((df_resumen_final[("Ranking Final")].rank(ascending=True) - 1) / num_caract)
df_resumen_final[("Peso Conteo")] = df_resumen_final[("Conteo Final")].rank(ascending=True) / num_caract

# Score = sum of both weights; the final ranking orders scores from high to low.
df_resumen_final[("Puntaje")] = df_resumen_final[("Peso Rango")] + df_resumen_final[("Peso Conteo")]
df_resumen_final[("Ranking Final")] = df_resumen_final[("Puntaje")].rank(ascending=False, method="min")

# Remove the helper columns.
df_resumen_final.drop(["Peso Rango", "Peso Conteo", "Puntaje"], axis=1, inplace=True)

df_resumen_final.sort_values(by=("Ranking Final"), ascending=True, inplace=True)

# Display numbers without decimals; placeholders such as '-' are left as they are.
for columna in columnas_ranking + columnas_conteo + ["Ranking Final", "Conteo Final"]:
    df_resumen_final[columna] = df_resumen_final[columna].apply(lambda x: f"{x:.0f}" if isinstance(x, (int, float)) else x)

df_resumen_final

Verdadero Negativo:

In [None]:
# Combine the per-algorithm summaries of the true-negative instances into one
# table with a final ranking and a final appearance count per variable.
dfs_resumen = [df_resumen_rf_vn, df_resumen_brf_vn, df_resumen_gb_vn, df_resumen_ada_vn, df_resumen_rus_vn]
algorithms = ["RandomForest", "BalancedRandomForest", "GradientBoosting", "ADABoost", "RUSBoost"]

# Every variable present in any summary, in first-seen order. dict keys keep
# insertion order, so the table is built identically on every run (iterating a
# set of strings is not stable across interpreter sessions).
index_list = list(dict.fromkeys(
    variable for resumen in dfs_resumen for variable in resumen.index
))

df_resumen_final = pd.DataFrame(index=index_list)

# One "Ranking" and one "Conteo Total" column per algorithm, aligned on the
# variable name; variables an algorithm never ranked stay missing.
for algorithm, df_resumen in zip(algorithms, dfs_resumen):
    df_resumen_final[(algorithm, 'Ranking')] = df_resumen[('General', 'Ranking')]
    df_resumen_final[(algorithm, 'Conteo Total')] = df_resumen[('General', 'Conteo Total')]

columnas_ranking = [(algo, "Ranking") for algo in algorithms]
columnas_conteo = [(algo, "Conteo Total") for algo in algorithms]

# Aggregate while the values are still numeric, before the '-' placeholder is
# written. This avoids turning '-' back into NaN/0 with replace(), which relied
# on an object-column downcast that pandas has deprecated.
ranking_promedio = df_resumen_final[columnas_ranking].apply(pd.to_numeric, errors="coerce").mean(axis=1)
conteo_final = df_resumen_final[columnas_conteo].apply(pd.to_numeric, errors="coerce").sum(axis=1)

# Show '-' where an algorithm has no value for a variable.
df_resumen_final = df_resumen_final.fillna('-')

df_resumen_final["Ranking Final"] = ranking_promedio
df_resumen_final[("Conteo Final")] = conteo_final

# Number of variables, used to normalise the rank-based weights.
num_caract = df_resumen_final.shape[0]

# A lower mean ranking and a higher count both give a higher weight.
df_resumen_final[("Peso Rango")] = 1 - ((df_resumen_final[("Ranking Final")].rank(ascending=True) - 1) / num_caract)
df_resumen_final[("Peso Conteo")] = df_resumen_final[("Conteo Final")].rank(ascending=True) / num_caract

# Score = sum of both weights; the final ranking orders scores from high to low.
df_resumen_final[("Puntaje")] = df_resumen_final[("Peso Rango")] + df_resumen_final[("Peso Conteo")]
df_resumen_final[("Ranking Final")] = df_resumen_final[("Puntaje")].rank(ascending=False, method="min")

# Remove the helper columns.
df_resumen_final.drop(["Peso Rango", "Peso Conteo", "Puntaje"], axis=1, inplace=True)

df_resumen_final.sort_values(by=("Ranking Final"), ascending=True, inplace=True)

# Display numbers without decimals; placeholders such as '-' are left as they are.
for columna in columnas_ranking + columnas_conteo + ["Ranking Final", "Conteo Final"]:
    df_resumen_final[columna] = df_resumen_final[columna].apply(lambda x: f"{x:.0f}" if isinstance(x, (int, float)) else x)

df_resumen_final

Falso Positivo:

In [None]:
# Combine the per-algorithm summaries of the false-positive instances into one
# table with a final ranking and a final appearance count per variable.
dfs_resumen = [df_resumen_rf_fp, df_resumen_brf_fp, df_resumen_gb_fp, df_resumen_ada_fp, df_resumen_rus_fp]
algorithms = ["RandomForest", "BalancedRandomForest", "GradientBoosting", "ADABoost", "RUSBoost"]

# Every variable present in any summary, in first-seen order. dict keys keep
# insertion order, so the table is built identically on every run (iterating a
# set of strings is not stable across interpreter sessions).
index_list = list(dict.fromkeys(
    variable for resumen in dfs_resumen for variable in resumen.index
))

df_resumen_final = pd.DataFrame(index=index_list)

# One "Ranking" and one "Conteo Total" column per algorithm, aligned on the
# variable name; variables an algorithm never ranked stay missing.
for algorithm, df_resumen in zip(algorithms, dfs_resumen):
    df_resumen_final[(algorithm, 'Ranking')] = df_resumen[('General', 'Ranking')]
    df_resumen_final[(algorithm, 'Conteo Total')] = df_resumen[('General', 'Conteo Total')]

columnas_ranking = [(algo, "Ranking") for algo in algorithms]
columnas_conteo = [(algo, "Conteo Total") for algo in algorithms]

# Aggregate while the values are still numeric, before the '-' placeholder is
# written. This avoids turning '-' back into NaN/0 with replace(), which relied
# on an object-column downcast that pandas has deprecated.
ranking_promedio = df_resumen_final[columnas_ranking].apply(pd.to_numeric, errors="coerce").mean(axis=1)
conteo_final = df_resumen_final[columnas_conteo].apply(pd.to_numeric, errors="coerce").sum(axis=1)

# Show '-' where an algorithm has no value for a variable.
df_resumen_final = df_resumen_final.fillna('-')

df_resumen_final["Ranking Final"] = ranking_promedio
df_resumen_final[("Conteo Final")] = conteo_final

# Number of variables, used to normalise the rank-based weights.
num_caract = df_resumen_final.shape[0]

# A lower mean ranking and a higher count both give a higher weight.
df_resumen_final[("Peso Rango")] = 1 - ((df_resumen_final[("Ranking Final")].rank(ascending=True) - 1) / num_caract)
df_resumen_final[("Peso Conteo")] = df_resumen_final[("Conteo Final")].rank(ascending=True) / num_caract

# Score = sum of both weights; the final ranking orders scores from high to low.
df_resumen_final[("Puntaje")] = df_resumen_final[("Peso Rango")] + df_resumen_final[("Peso Conteo")]
df_resumen_final[("Ranking Final")] = df_resumen_final[("Puntaje")].rank(ascending=False, method="min")

# Remove the helper columns.
df_resumen_final.drop(["Peso Rango", "Peso Conteo", "Puntaje"], axis=1, inplace=True)

df_resumen_final.sort_values(by=("Ranking Final"), ascending=True, inplace=True)

# Display numbers without decimals; placeholders such as '-' are left as they are.
for columna in columnas_ranking + columnas_conteo + ["Ranking Final", "Conteo Final"]:
    df_resumen_final[columna] = df_resumen_final[columna].apply(lambda x: f"{x:.0f}" if isinstance(x, (int, float)) else x)

df_resumen_final

Falso Negativo:

In [None]:
# Combine the per-algorithm summaries of the false-negative instances into one
# table with a final ranking and a final appearance count per variable.
dfs_resumen = [df_resumen_rf_fn, df_resumen_brf_fn, df_resumen_gb_fn, df_resumen_ada_fn, df_resumen_rus_fn]
algorithms = ["RandomForest", "BalancedRandomForest", "GradientBoosting", "ADABoost", "RUSBoost"]

# Every variable present in any summary, in first-seen order. dict keys keep
# insertion order, so the table is built identically on every run (iterating a
# set of strings is not stable across interpreter sessions).
index_list = list(dict.fromkeys(
    variable for resumen in dfs_resumen for variable in resumen.index
))

df_resumen_final = pd.DataFrame(index=index_list)

# One "Ranking" and one "Conteo Total" column per algorithm, aligned on the
# variable name; variables an algorithm never ranked stay missing.
for algorithm, df_resumen in zip(algorithms, dfs_resumen):
    df_resumen_final[(algorithm, 'Ranking')] = df_resumen[('General', 'Ranking')]
    df_resumen_final[(algorithm, 'Conteo Total')] = df_resumen[('General', 'Conteo Total')]

columnas_ranking = [(algo, "Ranking") for algo in algorithms]
columnas_conteo = [(algo, "Conteo Total") for algo in algorithms]

# Aggregate while the values are still numeric, before the '-' placeholder is
# written. This avoids turning '-' back into NaN/0 with replace(), which relied
# on an object-column downcast that pandas has deprecated.
ranking_promedio = df_resumen_final[columnas_ranking].apply(pd.to_numeric, errors="coerce").mean(axis=1)
conteo_final = df_resumen_final[columnas_conteo].apply(pd.to_numeric, errors="coerce").sum(axis=1)

# Show '-' where an algorithm has no value for a variable.
df_resumen_final = df_resumen_final.fillna('-')

df_resumen_final["Ranking Final"] = ranking_promedio
df_resumen_final[("Conteo Final")] = conteo_final

# Number of variables, used to normalise the rank-based weights.
num_caract = df_resumen_final.shape[0]

# A lower mean ranking and a higher count both give a higher weight.
df_resumen_final[("Peso Rango")] = 1 - ((df_resumen_final[("Ranking Final")].rank(ascending=True) - 1) / num_caract)
df_resumen_final[("Peso Conteo")] = df_resumen_final[("Conteo Final")].rank(ascending=True) / num_caract

# Score = sum of both weights; the final ranking orders scores from high to low.
df_resumen_final[("Puntaje")] = df_resumen_final[("Peso Rango")] + df_resumen_final[("Peso Conteo")]
df_resumen_final[("Ranking Final")] = df_resumen_final[("Puntaje")].rank(ascending=False, method="min")

# Remove the helper columns.
df_resumen_final.drop(["Peso Rango", "Peso Conteo", "Puntaje"], axis=1, inplace=True)

df_resumen_final.sort_values(by=("Ranking Final"), ascending=True, inplace=True)

# Display numbers without decimals; placeholders such as '-' are left as they are.
for columna in columnas_ranking + columnas_conteo + ["Ranking Final", "Conteo Final"]:
    df_resumen_final[columna] = df_resumen_final[columna].apply(lambda x: f"{x:.0f}" if isinstance(x, (int, float)) else x)

df_resumen_final

# **Resultados:**

**RENDIMIENTO MODELOS:**

In [None]:
# Plot the five confusion matrices side by side with ONE shared colour scale,
# so that the single colour bar drawn at the end is valid for every panel.
classifiers = ["RandomForest", "BalancedRF", "GradientBoosting", "AdaBoost", "RUSBoost"]
matrices_confusion = [cm_rf, cm_brf, cm_gb, cm_ada, cm_rus]
class_names = ["0", "1"]

# Global value range across all matrices. Without it each heatmap is
# normalised to its own min/max and the shared colour bar (built from the last
# image) would only describe the last panel.
valor_min = min(np.min(matriz) for matriz in matrices_confusion)
valor_max = max(np.max(matriz) for matriz in matrices_confusion)
umbral_texto = (valor_min + valor_max) / 2.0

fig, axes = plt.subplots(1, 5, figsize=(15, 5), sharey="row")

for i, (cm, classifier_name) in enumerate(zip(matrices_confusion, classifiers)):

    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)

    disp.plot(ax=axes[i], cmap="Blues")
    axes[i].set_title(classifier_name)

    # Apply the shared colour range to this panel
    disp.im_.set_clim(valor_min, valor_max)

    # The cell text colour was picked against this matrix's own midpoint;
    # re-pick it against the shared midpoint so the numbers stay readable.
    color_claro, color_oscuro = disp.im_.cmap(0), disp.im_.cmap(1.0)
    valores = np.asarray(cm)
    for posicion, texto in np.ndenumerate(disp.text_):
        texto.set_color(color_oscuro if valores[posicion] < umbral_texto else color_claro)

    # Drop the per-panel colour bar and the repeated axis labels
    disp.im_.colorbar.remove()
    disp.ax_.set_xlabel('')
    if i!=0:
        disp.ax_.set_ylabel('')

plt.tight_layout()
# Single x-axis caption for the whole row of panels
fig.text(0.4, 0.1, "Predicted label", ha="left")
# All images now share the same range, so any of them can feed the colour bar
fig.colorbar(disp.im_, ax=axes)
plt.show()

In [None]:
# Display score_result as the cell output (the table plotted in the following
# cells: a 'Modelo' column plus one column per metric).
score_result

In [None]:
# Grouped bar chart of score_result: one cluster per model, one bar per metric.
metricas = ['Accuracy', 'Recall', 'Precision', 'F1-score']
colores = ['#3498DB', '#F7DC6F','#EC7063', '#58D68D']

ancho_barra = 0.15
num_modelos = len(score_result)
x = range(num_modelos)

lienzo = plt.figure(figsize=(9, 6))
eje = lienzo.gca()

# Each metric is drawn one bar-width to the right of the previous one
for i, (metrica, color_barra) in enumerate(zip(metricas, colores)):
    desplazamiento = i * ancho_barra
    posiciones_x = [pos + desplazamiento for pos in x]
    valores_metrica = score_result[metrica]
    eje.bar(posiciones_x, valores_metrica, width=ancho_barra, label=metrica, color=color_barra)

# Labels and title; tick marks sit at the centre of each model's cluster
centro_grupo = (len(metricas) - 1) * ancho_barra / 2
eje.set_xlabel('Modelo')
eje.set_ylabel('Valor de la Métrica')
eje.set_title('Comparación de Rendimiento Python')
eje.set_xticks([pos + centro_grupo for pos in x])
eje.set_xticklabels(score_result['Modelo'], rotation=45, ha="right")
eje.legend(loc='lower right')

# Render the finished figure
lienzo.tight_layout()
plt.show()

In [None]:
# One separate bar chart per column of score_result, skipping the first
# column (presumably 'Modelo', which is used as the x axis).
variables = list(score_result.columns)
del variables[0]

for var in variables:
    score_result.plot(kind='bar', x='Modelo', y=var, rot=0)

**RENDIMIENTO INVERSO:**

In [None]:
# Display score_inv_result as the cell output (the table plotted in the
# following cells: a 'Modelo' column plus one column per metric).
score_inv_result

In [None]:
# Grouped bar chart of score_inv_result: one cluster per model, one bar per metric.
metricas = ['Accuracy', 'Recall', 'Precision', 'F1-score']
colores = ['#3498DB', '#F7DC6F','#EC7063', '#58D68D']

ancho_barra = 0.15
num_modelos = len(score_inv_result)
x = range(num_modelos)

lienzo_inv = plt.figure(figsize=(9, 6))
eje_inv = lienzo_inv.gca()

# Each metric is drawn one bar-width to the right of the previous one
for i, (metrica, color_barra) in enumerate(zip(metricas, colores)):
    desplazamiento = i * ancho_barra
    posiciones_x = [pos + desplazamiento for pos in x]
    valores_metrica = score_inv_result[metrica]
    eje_inv.bar(posiciones_x, valores_metrica, width=ancho_barra, label=metrica, color=color_barra)

# Labels and title; tick marks sit at the centre of each model's cluster
centro_grupo = (len(metricas) - 1) * ancho_barra / 2
eje_inv.set_xlabel('Modelo')
eje_inv.set_ylabel('Valor de la Métrica')
eje_inv.set_title('Comparación de Rendimiento Python (Inverso)')
eje_inv.set_xticks([pos + centro_grupo for pos in x])
eje_inv.set_xticklabels(score_inv_result['Modelo'], rotation=45, ha="right")
eje_inv.legend(loc='lower right')

# Render the finished figure
lienzo_inv.tight_layout()
plt.show()

In [None]:
# One separate green bar chart per column of score_inv_result, skipping the
# first column (presumably 'Modelo', which is used as the x axis).
variables_inv = list(score_inv_result.columns)
del variables_inv[0]

for var in variables_inv:
    score_inv_result.plot(kind='bar', x='Modelo', y=var, rot=0, color='green')

In [None]:
# Side-by-side comparison of score_result (left) and score_inv_result (right).
# Both panels share one y axis: the right panel hides its y tick labels, so the
# bar heights are only comparable if the two scales are identical.
metricas = ['Accuracy', 'Recall', 'Precision', 'F1-score']
colores = ['#3498DB', '#F7DC6F', '#EC7063', '#58D68D']

num_modelos = len(score_result)
ancho_barra = 0.15
x = range(num_modelos)


def _dibujar_barras_agrupadas(eje, resultados, titulo):
    """Draw a grouped bar chart of `resultados` on the Axes `eje`.

    One cluster of bars is drawn per row of `resultados` and one bar per entry
    of the module-level `metricas` list, coloured with the matching entry of
    `colores` and spaced by `ancho_barra`. Tick labels come from the 'Modelo'
    column. Positions are derived from `resultados` itself, so the two tables
    may have a different number of rows.
    """
    posiciones_base = range(len(resultados))
    for i, metrica in enumerate(metricas):
        eje.bar(
            [pos + i * ancho_barra for pos in posiciones_base],
            resultados[metrica],
            width=ancho_barra,
            label=metrica,
            color=colores[i],
        )
    # Tick marks sit at the centre of each cluster
    centro_grupo = (len(metricas) - 1) * ancho_barra / 2
    eje.set_xlabel('Modelo')
    eje.set_title(titulo)
    eje.set_xticks([pos + centro_grupo for pos in posiciones_base])
    eje.set_xticklabels(resultados['Modelo'], rotation=45, ha="right")
    eje.legend(loc='lower right')


# Two panels in one row with a shared y axis
fig_comparacion, (eje_izq, eje_der) = plt.subplots(1, 2, figsize=(14, 6), sharey=True)

# Left panel: score_result (carries the y-axis label for both panels)
_dibujar_barras_agrupadas(eje_izq, score_result, 'Comparación de Rendimiento Python')
eje_izq.set_ylabel('Valor de la Métrica')

# Right panel: score_inv_result, with y tick labels hidden
_dibujar_barras_agrupadas(eje_der, score_inv_result, 'Comparación de Rendimiento Python (Inverso)')
eje_der.tick_params(labelleft=False)

# Adjust spacing so the panels do not overlap
fig_comparacion.tight_layout()

# Show the combined figure
plt.show()
