## Análisis y Limpieza de Datos

In [1]:
import pandas as pd
import numpy as np
import os

def ordenar_gen(df):
    """Return a copy of *df* reduced to 'Paciente' plus its ordered Gen_<n> columns.

    Columns named ``Gen_<n>`` are detected by name and sorted by their numeric
    index, so they are kept regardless of how many columns the frame has or
    in which order they arrive. Any other column is dropped. Missing values
    in the generation columns are replaced by 0 and the columns are cast to
    ``int``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an optional 'Paciente' column and any number of
        ``Gen_<n>`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame; the input frame is not modified.
    """
    prefijo = 'Gen_'
    # Pair every generation column with its numeric index so the sort is
    # numeric (Gen_2 before Gen_10) instead of lexicographic.
    generaciones = sorted(
        (int(col[len(prefijo):]), col)
        for col in df.columns
        if isinstance(col, str)
        and col.startswith(prefijo)
        and col[len(prefijo):].isdecimal()
    )
    columnas_gen = [col for _, col in generaciones]
    columnas_fijas = ['Paciente'] if 'Paciente' in df.columns else []

    # Explicit copy: the assignment below must not write into a view of the
    # caller's frame (and must not raise chained-assignment warnings).
    resultado = df[columnas_fijas + columnas_gen].copy()
    if columnas_gen:
        # Only the generation columns are filled/cast, whether or not
        # 'Paciente' is present.
        resultado[columnas_gen] = resultado[columnas_gen].fillna(0).astype(int)
    return resultado


def ordenar_filtrar_datos(path, nombre_csv):
    """Read a CSV file and return its contents after passing through ordenar_gen.

    Parameters
    ----------
    path : str
        Directory that contains the CSV file.
    nombre_csv : str
        File name of the CSV, joined to *path*.

    Returns
    -------
    pandas.DataFrame
        The result of ``ordenar_gen`` applied to the loaded table, with the
        'Total Ramas' column removed beforehand when it is present.
    """
    ruta_csv = os.path.join(path, nombre_csv)
    datos = pd.read_csv(ruta_csv)
    # Drop the "Total Ramas" column only when the file actually has it
    if 'Total Ramas' in datos.columns:
        datos = datos.drop('Total Ramas', axis=1)
    return ordenar_gen(datos)
    
def arreglar_error_gen(df,gen_inicio):
    """Build a new frame whose generation columns are shifted left by *gen_inicio*.

    The first column of *df* is carried over unchanged. Each source column
    ``Gen_<i>``, for ``i`` from *gen_inicio* up to (but not including) the
    number of columns matching ``Gen_\\d+``, is stored as
    ``Gen_<i - gen_inicio>`` in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column identifies the row and which has columns
        named ``Gen_<i>``.
    gen_inicio : int
        Index of the source generation that becomes ``Gen_0`` in the result.

    Returns
    -------
    pandas.DataFrame
        New frame with the first column of *df* followed by the renumbered
        generation columns.
    """
    # The first column is kept as the row identifier
    col_identificador = df.columns[0]
    columnas = {col_identificador: df[col_identificador]}

    # Number of columns whose name matches the Gen_<n> pattern
    total_generaciones = df.filter(regex=r"Gen_\d+").shape[1]
    for origen in range(gen_inicio, total_generaciones):
        destino = origen - gen_inicio
        columnas[f"Gen_{destino}"] = df[f"Gen_{origen}"]

    return pd.DataFrame(columnas)


In [7]:
# Define the directory and file name of the input CSV
path = "a:/main/workspaces/procesado_datos/"
csv_001 = "conteo_metodo_automatico.csv"
#csv_debug = "conteos_ramas_debug.csv"

#df_debug = ordenar_filtrar_datos(path,csv_debug)
# Load the CSV through ordenar_filtrar_datos
df_main = ordenar_filtrar_datos(path,csv_001)
#df_main.to_csv('conteo_raw.csv', index=False)
# Preview the first rows
df_main.head()

Unnamed: 0,Paciente,Gen_0,Gen_1,Gen_2,Gen_3,Gen_4,Gen_5,Gen_6,Gen_7,Gen_8,Gen_9,Gen_10,Gen_11,Gen_12,Gen_13
0,ATM_001_0000 segmentation,1,2,4,9,15,10,4,4,0,0,0,0,0,0
1,ATM_002_0000 segmentation,1,2,5,11,12,5,0,0,0,0,0,0,0,0
2,ATM_003_0000 segmentation,1,2,4,8,15,16,11,4,2,2,0,0,0,0
3,ATM_004_0000 segmentation,1,2,4,8,17,22,17,7,6,2,0,0,0,0
4,ATM_005_0000 segmentation,1,1,2,4,9,15,14,2,4,4,0,0,0,0


In [8]:
# Identify rows with possible counting errors: keep every row that does NOT
# satisfy Gen_0 == 1, Gen_1 == 2 and Gen_2 >= 4
df_filtrado = df_main.query('not (Gen_0 == 1 and Gen_1 == 2 and Gen_2 >= 4)')
df_filtrado

Unnamed: 0,Paciente,Gen_0,Gen_1,Gen_2,Gen_3,Gen_4,Gen_5,Gen_6,Gen_7,Gen_8,Gen_9,Gen_10,Gen_11,Gen_12,Gen_13
4,ATM_005_0000 segmentation,1,1,2,4,9,15,14,2,4,4,0,0,0,0
17,ATM_018_0000 segmentation,1,1,2,4,9,15,6,4,0,0,0,0,0,0
22,ATM_023_0000 segmentation,1,1,2,4,9,15,8,4,0,0,0,0,0,0
24,ATM_025_0000 segmentation,1,1,2,4,8,14,19,14,11,5,2,1,0,0
29,ATM_030_0000 segmentation,1,1,2,4,8,17,13,6,6,4,0,0,0,0


In [9]:
# Build the list of paths used to re-process the flagged patients
lista = df_filtrado["Paciente"].astype(str).tolist()
# Each path has the form ./Data/Airways/<name>/<name>
adc = ["./Data/Airways/" + name + "/" + name for name in lista]
adc

['./Data/Airways/ATM_005_0000 segmentation/ATM_005_0000 segmentation',
 './Data/Airways/ATM_018_0000 segmentation/ATM_018_0000 segmentation',
 './Data/Airways/ATM_023_0000 segmentation/ATM_023_0000 segmentation',
 './Data/Airways/ATM_025_0000 segmentation/ATM_025_0000 segmentation',
 './Data/Airways/ATM_030_0000 segmentation/ATM_030_0000 segmentation']

In [10]:
# Para los casos en que una segunda pasada del script no funciona, hay que corregirlos manualmente
# path = "/home/alan/workspaces/Skeleton/"
# csv_debug = "conteos_ramas_debug.csv"
# Aplicar funcion para reordenar conteo de generaciones
# df_debug = ordenar_filtrar_datos(path,csv_debug)
# df_debug



In [11]:
# Same selection as df_filtrado: rows that do not satisfy
# Gen_0 == 1, Gen_1 == 2 and Gen_2 >= 4.
# NOTE(review): the name `filter` shadows the Python builtin for the rest of
# the notebook session; a later cell reads it, so rename both together.
filter = df_main.query('not (Gen_0 == 1 and Gen_1 == 2 and Gen_2 >= 4)')
filter

Unnamed: 0,Paciente,Gen_0,Gen_1,Gen_2,Gen_3,Gen_4,Gen_5,Gen_6,Gen_7,Gen_8,Gen_9,Gen_10,Gen_11,Gen_12,Gen_13
4,ATM_005_0000 segmentation,1,1,2,4,9,15,14,2,4,4,0,0,0,0
17,ATM_018_0000 segmentation,1,1,2,4,9,15,6,4,0,0,0,0,0,0
22,ATM_023_0000 segmentation,1,1,2,4,9,15,8,4,0,0,0,0,0,0
24,ATM_025_0000 segmentation,1,1,2,4,8,14,19,14,11,5,2,1,0,0
29,ATM_030_0000 segmentation,1,1,2,4,8,17,13,6,6,4,0,0,0,0


In [None]:

# Rebuild the flagged rows starting at Gen_1, so Gen_1 becomes Gen_0,
# Gen_2 becomes Gen_1, and so on
algo = arreglar_error_gen(filter,1)
algo

Unnamed: 0,Paciente,Gen_0,Gen_1,Gen_2,Gen_3,Gen_4,Gen_5,Gen_6,Gen_7,Gen_8,Gen_9,Gen_10,Gen_11,Gen_12
4,ATM_005_0000 segmentation,1,2,4,9,15,14,2,4,4,0,0,0,0
17,ATM_018_0000 segmentation,1,2,4,9,15,6,4,0,0,0,0,0,0
22,ATM_023_0000 segmentation,1,2,4,9,15,8,4,0,0,0,0,0,0
24,ATM_025_0000 segmentation,1,2,4,8,14,19,14,11,5,2,1,0,0
29,ATM_030_0000 segmentation,1,2,4,8,17,13,6,6,4,0,0,0,0


In [None]:


# Step 1: index both frames by 'Paciente' so rows are matched by patient
df_main = df_main.set_index('Paciente')
# The shifted frame has fewer Gen_* columns than df_main. Add the missing
# columns filled with 0 so that update() also overwrites the trailing
# generations of the corrected rows instead of leaving their old counts.
algo = algo.set_index('Paciente').reindex(columns=df_main.columns, fill_value=0)

# Step 2: overwrite the matching rows of df_main with the corrected values
df_main.update(algo)

# Step 3: reset the index to return to the original structure
df_main = df_main.reset_index()

In [25]:
# Save the changes
# NOTE(review): the file name is relative, so it is written to the current
# working directory rather than to `path` — confirm this is intended
df_main.to_csv('conteo_metodo_automatico.csv', index=False, sep=",")

A partir de aquí, el análisis de datos se hace con el CSV ya corregido (variable `prime`).

In [6]:
#prime = ordenar_filtrar_datos(path,"~/workspaces/pib/conteo_arreglado.csv")
#prime = ordenar_filtrar_datos("~/workspaces/pib/","conteo_arreglado.csv")
# Load the corrected counts, read with ';' as the delimiter
prime = pd.read_csv("~/workspaces/pib/conteo_arreglado.csv", delimiter=";") 
# Summary statistics of the numeric columns, rounded to two decimals
prime.describe().round(2)

Unnamed: 0,Gen_0,Gen_1,Gen_2,Gen_3,Gen_4,Gen_5,Gen_6,Gen_7,Gen_8,Gen_9,Gen_10,Gen_11,Gen_12,Gen_13
count,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0
mean,1.0,2.0,4.03,8.87,18.63,34.7,45.0,35.23,21.43,14.43,7.07,2.93,0.93,0.27
std,0.0,0.0,0.18,0.94,1.99,5.02,12.77,14.86,11.04,7.96,4.86,3.47,1.44,0.87
min,1.0,2.0,4.0,6.0,13.0,20.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,2.0,4.0,8.25,18.0,33.0,38.25,25.0,12.5,8.25,3.25,0.0,0.0,0.0
50%,1.0,2.0,4.0,9.0,19.0,36.0,48.0,36.0,22.0,13.5,6.5,2.0,0.0,0.0
75%,1.0,2.0,4.0,9.0,20.0,38.0,52.0,43.75,26.5,21.5,11.5,4.0,2.0,0.0
max,1.0,2.0,5.0,10.0,21.0,41.0,65.0,70.0,47.0,30.0,17.0,12.0,4.0,4.0


In [7]:
# Sum the counts of all generations for each row
# assumes the first column is the non-numeric patient identifier — TODO confirm
slice_df = prime.iloc[:, 1:]
hey = slice_df.sum(axis=1).tolist()

In [8]:
import seaborn as sns
import matplotlib.pyplot as plt

# Melt the generation columns into long format: one row per (generation, count)
df_melted = prime.melt(
    value_vars=[col for col in prime.columns if col.startswith('Gen_')],
    var_name='Generación',
    value_name='Conteo'
)
df_melted['Generación'] = df_melted['Generación'].str.replace('Gen_', '').astype(int)  # Convert to an integer