In [30]:
import pandas as pd

In [31]:
# Cargar todos los CSV que has subido
df_cancer = pd.read_csv("data/analisis_cancer.csv")
df_sangre = pd.read_csv("data/analisis_sangre_dataset.csv")
df_historial = pd.read_csv("data/historial_medico.csv")
df_imagenes = pd.read_csv("data/historial_medico_imagenes.csv")

# Mostrar una vista rápida de cada uno para saber qué contiene
resumen = {
    "analisis_cancer.csv": df_cancer.head(),
    "analisis_sangre_dataset.csv": df_sangre.head(),
    "historial_medico.csv": df_historial.head(),
    "historial_medico_imagenes.csv": df_imagenes.head()
}

resumen


{'analisis_cancer.csv':    id  cancer_stage  tumor_size early_detection inflammatory_bowel_disease  \
 0   1             3    2.788441              No                         No   
 1   2             1    1.049699              No                        Yes   
 2   3             3    8.339153              No                         No   
 3   4             3    7.361716              No                         No   
 4   5             1    7.561065              No                         No   
 
   relapse  
 0      No  
 1      No  
 2      No  
 3      No  
 4      No  ,
 'analisis_sangre_dataset.csv':     id  Hemoglobina  Plaquetas  Globulos blancos  Globulos rojos  Glucosa  HDL
 0  109         10.4     180000              5700             3.7       77   25
 1  150         13.8     320000              7500             5.4       92   30
 2  194         13.5     370000              8500             5.1       90   29
 3  171         12.7     290000              7800             4.8      

In [35]:


# Renombrar por si acaso 'Id' estuviera aún en alguno
df_historial = df_historial.rename(columns={"Id": "id"})
df_imagenes = df_imagenes.rename(columns={"Id": "id"})

# Unir los datasets usando 'id'
df_completo = df_historial.merge(df_cancer, on="id", how="left")
df_completo = df_completo.merge(df_sangre, on="id", how="left")
df_completo = df_completo.merge(df_imagenes, on="id", how="left")

# Mostrar tabla combinada
print(df_completo.head())  # Ver las primeras filas
print(df_completo.columns)  # Ver las columnas disponibles
print(df_completo.shape)  # Ver dimensiones (filas, columnas)


   id Sexo  Age Family history smoke alcohol     obesity      diet  \
0   1    M   77             No    No     Yes  Overweight       Low   
1   2    M   59             No    No      No  Overweight  Moderate   
2   3    M   83             No    No      No       Obese      High   
3   4    M   66             No   Yes      No      Normal       Low   
4   5    F   79             No   Yes      No  Overweight       Low   

  Screening_History Healthcare_Access  ... early_detection  \
0           Regular          Moderate  ...              No   
1           Regular              High  ...              No   
2           Regular          Moderate  ...              No   
3             Never              High  ...              No   
4             Never              High  ...              No   

   inflammatory_bowel_disease  relapse Hemoglobina Plaquetas Globulos blancos  \
0                          No       No        12.3    290000             7400   
1                         Yes       No      

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1. Cargar datos (en tu caso ya tienes df_completo)
df = df_completo.copy()

# 2. Convertir variable objetivo a 0 y 1
df['Survival_Prediction'] = df['Survival_Prediction'].map({'Yes': 1, 'No': 0})

# 3. Eliminar columnas no útiles (id, imagename)
df = df.drop(columns=['id', 'imagename'])

# 4. Codificar variables categóricas
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = LabelEncoder().fit_transform(df[col])

# 5. Separar features y target
X = df.drop(columns=['Survival_Prediction'])
y = df['Survival_Prediction']

# 6. Dividir en train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 7. Entrenar modelo
model = RandomForestClassifier(n_estimators=200, max_depth=6, random_state=42)
model.fit(X_train, y_train)

# 8. Predecir
y_pred = model.predict(X_test)

# 9. Evaluar
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Matriz de confusión:\n", confusion_matrix(y_test, y_pred))
print("Informe de clasificación:\n", classification_report(y_test, y_pred))


Accuracy: 0.675
Matriz de confusión:
 [[11  8]
 [ 5 16]]
Informe de clasificación:
               precision    recall  f1-score   support

           0       0.69      0.58      0.63        19
           1       0.67      0.76      0.71        21

    accuracy                           0.68        40
   macro avg       0.68      0.67      0.67        40
weighted avg       0.68      0.68      0.67        40



In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Copia del dataset
df = df_completo.copy()

# Convertir variable objetivo
df["Survival_Prediction"] = df["Survival_Prediction"].map({"Yes": 1, "No": 0})

# Eliminar columnas que no aportan directamente al modelo
df = df.drop(columns=["id", "imagename"])

# Codificar variables categóricas
for col in df.columns:
    if df[col].dtype == "object":
        df[col] = LabelEncoder().fit_transform(df[col])

# Separar features y target
X = df.drop(columns=["Survival_Prediction"])
y = df["Survival_Prediction"]

# Separar en train y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Definir la rejilla de hiperparámetros
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [4, 6, 8],
    "min_samples_split": [2, 4, 6],
    "min_samples_leaf": [1, 2, 4]
}

# Inicializar GridSearchCV
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

# Ejecutar búsqueda
grid_search.fit(X_train, y_train)

# Predecir con el mejor modelo
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluar
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
mejores_parametros = grid_search.best_params_

accuracy, conf_matrix, report, mejores_parametros


(0.625,
 array([[ 8, 11],
        [ 4, 17]]),
 '              precision    recall  f1-score   support\n\n           0       0.67      0.42      0.52        19\n           1       0.61      0.81      0.69        21\n\n    accuracy                           0.62        40\n   macro avg       0.64      0.62      0.61        40\nweighted avg       0.64      0.62      0.61        40\n',
 {'max_depth': 4,
  'min_samples_leaf': 2,
  'min_samples_split': 2,
  'n_estimators': 200})

In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Copia del dataframe original
df = df_completo.copy()

# Convertir variable objetivo
df["Survival_Prediction"] = df["Survival_Prediction"].map({"Yes": 1, "No": 0})

# Eliminar columna 'id' si existe
if "id" in df.columns:
    df = df.drop(columns=["id"])

# Codificar variables categóricas
for col in df.columns:
    if df[col].dtype == "object":
        df[col] = LabelEncoder().fit_transform(df[col])

# Separar variables
X = df.drop(columns=["Survival_Prediction"])
y = df["Survival_Prediction"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Grid de hiperparámetros
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [4, 6, 8],
    "min_samples_split": [2, 4, 6],
    "min_samples_leaf": [1, 2, 4]
}

# Búsqueda de hiperparámetros con validación cruzada
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

# Entrenamiento
grid_search.fit(X_train, y_train)

# Predicción y evaluación
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Mejores hiperparámetros:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Matriz de confusión:\n", confusion_matrix(y_test, y_pred))
print("Informe de clasificación:\n", classification_report(y_test, y_pred))


Mejores hiperparámetros: {'max_depth': 4, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy: 1.0
Matriz de confusión:
 [[19  0]
 [ 0 21]]
Informe de clasificación:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        21

    accuracy                           1.00        40
   macro avg       1.00      1.00      1.00        40
weighted avg       1.00      1.00      1.00        40



In [40]:
# Obtener la importancia de cada variable del mejor modelo
importances = best_model.feature_importances_
features = X.columns

# Combinar en un DataFrame y ordenarlo
feat_importance = pd.DataFrame({
    "Variable": features,
    "Importancia": importances
}).sort_values(by="Importancia", ascending=False)

# Mostrar top 15
print("Top variables más importantes en el modelo:\n")
print(feat_importance.head(15).to_string(index=False))


Top variables más importantes en el modelo:

                  Variable  Importancia
                 imagename     0.506500
                tumor_size     0.215068
                 Plaquetas     0.040198
               Hemoglobina     0.031345
            Globulos rojos     0.029458
          Globulos blancos     0.029333
                       Age     0.022502
                   Glucosa     0.021674
         Healthcare_Access     0.019984
                      diet     0.016785
                       HDL     0.010900
            Family history     0.010418
              cancer_stage     0.009090
                   obesity     0.006502
inflammatory_bowel_disease     0.006271


In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Copia del dataset original
df = df_completo.copy()

# Eliminar columnas que no deben usarse
df = df.drop(columns=["imagename", "id"], errors="ignore")

# Convertir variable objetivo
df["Survival_Prediction"] = df["Survival_Prediction"].map({"Yes": 1, "No": 0})

# Codificar variables categóricas
for col in df.columns:
    if df[col].dtype == "object":
        df[col] = LabelEncoder().fit_transform(df[col])

# Separar datos
X = df.drop(columns=["Survival_Prediction"])
y = df["Survival_Prediction"]

# División train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hiperparámetros a probar
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [4, 6, 8],
    "min_samples_split": [2, 4],
    "min_samples_leaf": [1, 2]
}

# Grid search con validación cruzada
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

# Evaluación
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Mejores hiperparámetros:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Matriz de confusión:\n", confusion_matrix(y_test, y_pred))
print("Informe de clasificación:\n", classification_report(y_test, y_pred))


Mejores hiperparámetros: {'max_depth': 4, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy: 0.625
Matriz de confusión:
 [[ 8 11]
 [ 4 17]]
Informe de clasificación:
               precision    recall  f1-score   support

           0       0.67      0.42      0.52        19
           1       0.61      0.81      0.69        21

    accuracy                           0.62        40
   macro avg       0.64      0.62      0.61        40
weighted avg       0.64      0.62      0.61        40



In [42]:
# Entrenamiento con class_weight='balanced' para penalizar más los errores en la clase minoritaria (clase 0)
model_weighted = RandomForestClassifier(
    n_estimators=200,
    max_depth=4,
    min_samples_split=2,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=42
)

# Entrenar
model_weighted.fit(X_train, y_train)

# Evaluación
y_pred_weighted = model_weighted.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_weighted)
conf_matrix = confusion_matrix(y_test, y_pred_weighted)
report = classification_report(y_test, y_pred_weighted)

accuracy, conf_matrix, report


(0.65,
 array([[ 8, 11],
        [ 3, 18]]),
 '              precision    recall  f1-score   support\n\n           0       0.73      0.42      0.53        19\n           1       0.62      0.86      0.72        21\n\n    accuracy                           0.65        40\n   macro avg       0.67      0.64      0.63        40\nweighted avg       0.67      0.65      0.63        40\n')

In [43]:
import joblib

# Guardar el modelo entrenado
joblib.dump(model_weighted, "modelo_supervivencia_rf.pkl")

print("✅ Modelo guardado como modelo_supervivencia_rf.pkl")


✅ Modelo guardado como modelo_supervivencia_rf.pkl


In [51]:
# Prepara datos
df_corr = df_completo.drop(columns=["id", "imagename"], errors="ignore").copy()
from sklearn.preprocessing import LabelEncoder
for col in df_corr.columns:
    if df_corr[col].dtype == "object":
        df_corr[col] = LabelEncoder().fit_transform(df_corr[col])

# Calcular correlación y guardar como CSV
correlation_matrix = df_corr.corr()
correlation_matrix.to_csv("matriz_correlacion.csv")

print("✅ Matriz de correlación guardada como matriz_correlacion.csv")


✅ Matriz de correlación guardada como matriz_correlacion.csv


In [52]:
import pandas as pd

# Cargar la matriz de correlación exportada (ya codificada)
cor_matrix = pd.read_csv("matriz_correlacion.csv", index_col=0)

# Umbral de correlación (puedes ajustarlo)
umbral = 0.85

# Encontrar columnas altamente correlacionadas
columnas_correladas = set()
for i in range(len(cor_matrix.columns)):
    for j in range(i):
        if abs(cor_matrix.iloc[i, j]) > umbral:
            colname = cor_matrix.columns[i]
            columnas_correladas.add(colname)

# Mostrar columnas que se van a eliminar
print("Columnas eliminadas por alta correlación (> 0.85):")
print(sorted(columnas_correladas))

# Aplicar la eliminación al DataFrame original codificado
# (Asegúrate de usar un DataFrame con las mismas columnas que esta matriz)
df_codificado = df_completo.drop(columns=["id", "imagename"], errors="ignore").copy()

# Codificar categorías como antes
from sklearn.preprocessing import LabelEncoder
for col in df_codificado.columns:
    if df_codificado[col].dtype == "object":
        df_codificado[col] = LabelEncoder().fit_transform(df_codificado[col])

# Eliminar las columnas altamente correlacionadas
df_filtrado = df_codificado.drop(columns=columnas_correladas)

# Guardar resultado
df_filtrado.to_csv("df_sin_correlaciones_altas.csv", index=False)
print("✅ Dataset guardado como df_sin_correlaciones_altas.csv")


Columnas eliminadas por alta correlación (> 0.85):
['Globulos blancos', 'Globulos rojos', 'Glucosa', 'HDL', 'Plaquetas']
✅ Dataset guardado como df_sin_correlaciones_altas.csv


In [53]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Cargar el dataset limpio
df = pd.read_csv("df_sin_correlaciones_altas.csv")

# Variable objetivo
y = df["Survival_Prediction"]
X = df.drop(columns=["Survival_Prediction"])

# División train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Modelo con pesos balanceados
model = RandomForestClassifier(
    n_estimators=200,
    max_depth=4,
    min_samples_split=2,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=42
)

# Entrenar
model.fit(X_train, y_train)

# Evaluación
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Matriz de confusión:\n", confusion_matrix(y_test, y_pred))
print("Informe de clasificación:\n", classification_report(y_test, y_pred))


Accuracy: 0.825
Matriz de confusión:
 [[15  4]
 [ 3 18]]
Informe de clasificación:
               precision    recall  f1-score   support

           0       0.83      0.79      0.81        19
           1       0.82      0.86      0.84        21

    accuracy                           0.82        40
   macro avg       0.83      0.82      0.82        40
weighted avg       0.83      0.82      0.82        40



In [54]:
import joblib

# Guardar modelo entrenado
joblib.dump(model, "modelo_final_rf.pkl")

print("✅ Modelo guardado como modelo_final_rf.pkl")


✅ Modelo guardado como modelo_final_rf.pkl


In [None]:
import joblib

# Cargar el modelo
modelo_cargado = joblib.load("modelo_final_rf.pkl")

# Usarlo para predecir
y_pred = modelo_cargado.predict(X_test)



[1 1 1 1 0 0 1 0 1 1 1 0 0 0 0 0 1 0 1 1 0 1 0 1 1 0 1 0 0 1 1 1 1 1 0 0 1
 0 0 1]


In [57]:
import pandas as pd

df = pd.read_csv("df_sin_correlaciones_altas.csv")
print(df.drop(columns=["Survival_Prediction"]).columns.tolist())


['Sexo', 'Age', 'Family history', 'smoke', 'alcohol', 'obesity', 'diet', 'Screening_History', 'Healthcare_Access', 'cancer_stage', 'tumor_size', 'early_detection', 'inflammatory_bowel_disease', 'relapse', 'Hemoglobina']


In [None]:
# Repetir el proceso tras el reinicio del entorno

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

# Cargar el dataset limpio sin variables altamente correlacionadas
df = pd.read_csv("df_sin_correlaciones_altas.csv")

# Separar variables y target
y = df["Survival_Prediction"]
X = df.drop(columns=["Survival_Prediction"])

# Detectar columnas categóricas para aplicar One-Hot Encoding
columnas_categoricas = [
    "Sexo", "Family history", "smoke", "alcohol", "obesity", "diet",
    "Screening_History", "Healthcare_Access", "early_detection",
    "inflammatory_bowel_disease", "relapse"
]

# Las restantes se dejan como numéricas
columnas_numericas = [col for col in X.columns if col not in columnas_categoricas]

# Definir el preprocesador
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), columnas_categoricas),
        ("num", StandardScaler(), columnas_numericas)
    ]
)

# Crear el pipeline con RandomForest
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(
        n_estimators=200,
        max_depth=4,
        min_samples_split=2,
        min_samples_leaf=2,
        class_weight="balanced",
        random_state=42
    ))
])

# Dividir en entrenamiento y test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenar el modelo
pipeline.fit(X_train, y_train)

# Evaluar
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Guardar el pipeline completo
joblib.dump(pipeline, "modelo_rf_ohe.pkl")

accuracy, conf_matrix, report


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/modelo_rf_ohe.pkl'