In [None]:
import pandas as pd
import numpy as np
from scipy.stats import linregress, spearmanr
from sklearn.metrics import mean_squared_error

# 1) Load the prediction files of the two models being compared.
#    The first file becomes df2 (Model A), the second df3 (Model B).
df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        'predicciones/ByCancer/prediccion_1.csv',
        'predicciones/ByCancer/prediccion_2.csv',
    )
)

# 2) Concordance Index (CI) implemented with NumPy only
def concordance_index_np(y_true, y_pred):
    """Return the concordance index between true and predicted values.

    Every unordered pair (i, j) whose true values differ is "permissible".
    A permissible pair scores 1 when the predictions are ordered the same
    way as the true values, 0.5 when the two predictions are equal, and 0
    otherwise.  The index is the total score divided by the number of
    permissible pairs.

    Args:
        y_true: 1-D array-like of observed values.
        y_pred: 1-D array-like of predicted values, same length as y_true.

    Returns:
        The concordance index as a float, or NaN when there is no
        permissible pair (fewer than two samples, or all true values equal).

    Raises:
        ValueError: if y_true and y_pred do not have the same shape.
    """
    # Convert up front so that indexing is always positional; a pandas
    # Series with a non-default index would otherwise be indexed by label.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    concordant = 0.0
    permissible = 0
    # Compare sample i against all later samples at once, so each unordered
    # pair is visited exactly once while the inner loop runs inside NumPy.
    for i in range(len(y_true) - 1):
        t_i, p_i = y_true[i], y_pred[i]
        t_rest, p_rest = y_true[i + 1:], y_pred[i + 1:]

        comparable = t_i != t_rest
        same_order = ((p_i < p_rest) & (t_i < t_rest)) | \
                     ((p_i > p_rest) & (t_i > t_rest))
        tied_pred = p_i == p_rest

        permissible += np.count_nonzero(comparable)
        # same_order already implies the true values differ; tied
        # predictions only count when the pair is permissible.
        concordant += np.count_nonzero(same_order)
        concordant += 0.5 * np.count_nonzero(comparable & tied_pred)

    return float(concordant / permissible) if permissible > 0 else np.nan

# 3) Compute the key metrics for one group (one cancer type)
def compute_metrics(df_group):
    """Compute error and correlation metrics for one group of predictions.

    Args:
        df_group: DataFrame with a 'Label' column (used as the true values)
            and a 'Pred' column (used as the predicted values).

    Returns:
        A pandas Series with the entries 'MSE', 'RMSE', 'Pearson',
        'Spearman' and 'CIndex'.  'Pearson' is NaN when the group has fewer
        than two rows or all its labels are identical; 'Spearman' is NaN
        when the group has fewer than two rows.
    """
    y_true = df_group['Label'].values
    y_pred = df_group['Pred'].values

    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)

    # A correlation needs at least two points.  In addition, linregress
    # raises ValueError when every x value is identical, which would abort
    # a whole groupby.apply because of one degenerate group; report NaN for
    # such groups instead.
    has_pairs = len(y_true) >= 2
    labels_vary = has_pairs and np.amin(y_true) != np.amax(y_true)

    if labels_vary:
        # Only the correlation coefficient of the fit is needed.
        pearson_r = linregress(y_true, y_pred).rvalue
    else:
        pearson_r = np.nan

    if has_pairs:
        spearman_rho, _ = spearmanr(y_true, y_pred)
    else:
        spearman_rho = np.nan

    ci = concordance_index_np(y_true, y_pred)

    return pd.Series({
        'MSE': mse,
        'RMSE': rmse,
        'Pearson': pearson_r,
        'Spearman': spearman_rho,
        'CIndex': ci
    })

# 4) Group by TCGA_DESC and compute the metrics of every cancer type
_METRIC_NAMES = ['MSE', 'RMSE', 'Pearson', 'Spearman', 'CIndex']


def _metrics_by_cancer(df):
    """Return one row of compute_metrics output per TCGA_DESC value."""
    # Select the two columns compute_metrics reads, so that apply() does not
    # operate on the grouping column (newer pandas versions warn about that).
    return (
        df.groupby('TCGA_DESC')[['Label', 'Pred']]
        .apply(compute_metrics)
        .reset_index()
        .rename(columns={'TCGA_DESC': 'Cancer'})
    )


# NOTE: the variable suffixes are B/C while the printed label and the output
# file names say A/B; metrics_B comes from df2 and metrics_C from df3.
metrics_B = _metrics_by_cancer(df2)
metrics_C = _metrics_by_cancer(df3)

# 5) Join both tables and compute the difference (C - B) for every metric.
#    The join is an inner join: a cancer type present in only one of the
#    two tables does not appear in the result.
metrics_diff = pd.merge(
    metrics_C, metrics_B,
    on='Cancer',
    how='inner',
    suffixes=('_C', '_B')
)

for col in _METRIC_NAMES:
    metrics_diff[f'{col}_diff'] = metrics_diff[f'{col}_C'] - metrics_diff[f'{col}_B']

# 6) Keep only the columns of interest (the differences)
diff_cols = [f'{col}_diff' for col in _METRIC_NAMES]
diff_table = metrics_diff[['Cancer'] + diff_cols]

# 7) Mean difference of every metric between the two models
mean_diffs = diff_table[diff_cols].mean()

print("Diferencia media entre modelos A y B:")
print(diff_table)
print(mean_diffs)

# 8) Save the per-cancer difference table to a CSV file
diff_table.to_csv('predicciones/ByCancer/diferencias_modelos_A_B.csv', index=False)
# 9) Save the mean differences to a CSV file
mean_diffs.to_csv('predicciones/ByCancer/diferencias_medias_A_B.csv', header=True)



  metrics_B = df2.groupby('TCGA_DESC').apply(compute_metrics).reset_index().rename(columns={'TCGA_DESC': 'Cancer'})


Diferencia media entre modelos A y B:
          Cancer  MSE_diff  RMSE_diff  Pearson_diff  Spearman_diff  \
0            ACC -0.094757  -0.060179     -0.018448      -0.081818   
1            ALL -0.167013  -0.088673      0.000838      -0.036040   
2           BLCA -0.569799  -0.256629      0.023079       0.034978   
3           BRCA -0.198364  -0.080991      0.001566      -0.025235   
4           CESC -0.546623  -0.221627      0.020854       0.024841   
5            CLL  0.275846   0.196491     -0.042468      -0.123377   
6         COREAD  0.271863   0.125487     -0.026334      -0.030421   
7           DLBC -0.713121  -0.302223      0.015781       0.006746   
8           ESCA -0.659572  -0.284893      0.021254       0.040816   
9            GBM  0.008231   0.003731     -0.004656      -0.000580   
10          HNSC -0.072056  -0.036329     -0.003661      -0.015531   
11          KIRC  0.103105   0.050602     -0.023851      -0.023406   
12          LAML  0.137182   0.065407     -0.032501 

  metrics_C = df3.groupby('TCGA_DESC').apply(compute_metrics).reset_index().rename(columns={'TCGA_DESC': 'Cancer'})
