In [73]:
import pandas as pd

# Cargar los datasets que vamos a usar
df_historial = pd.read_csv("data/historial_medico.csv")
df_cancer = pd.read_csv("data/analisis_cancer.csv")

# Unir por 'id'
df_completo = df_historial.merge(df_cancer, on="id", how="inner")

# Verificar la unión
print(df_completo.head())
print(df_completo.columns)
print(df_completo.shape)


   id Sexo  Age Family history smoke alcohol     obesity      diet  \
0   1    M   63             No   Yes     Yes      Normal  Moderate   
1   2    M   64            Yes   Yes      No      Normal       Low   
2   3    F   50             No   Yes      No      Normal  Moderate   
3   4    M   67            Yes    No      No      Normal  Moderate   
4   5    M   81             No   Yes      No  Overweight  Moderate   

  Screening_History Healthcare_Access Survival_Prediction cancer_stage  \
0             Never              High                 Yes    Localized   
1         Irregular          Moderate                 Yes    Localized   
2           Regular          Moderate                 Yes    Localized   
3           Regular               Low                 Yes    Localized   
4             Never              High                 Yes    Localized   

   tumor_size early_detection inflammatory_bowel_disease relapse  
0          48             Yes                         No      No  


In [74]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Cargar datos (ya lo hiciste tú antes)
df = df_completo.copy()  # o como lo hayas llamado

# Eliminar columna 'id' si está
df = df.drop(columns=["id"], errors="ignore")

# Codificar columnas categóricas
for col in df.columns:
    if df[col].dtype == "object":
        df[col] = LabelEncoder().fit_transform(df[col])

# Mostrar primeras filas y dtypes
print(df.head())
print(df.dtypes)
print(df.shape)


   Sexo  Age  Family history  smoke  alcohol  obesity  diet  \
0     1   63               0      1        1        0     2   
1     1   64               1      1        0        0     1   
2     0   50               0      1        0        0     2   
3     1   67               1      0        0        0     2   
4     1   81               0      1        0        2     2   

   Screening_History  Healthcare_Access  Survival_Prediction  cancer_stage  \
0                  1                  0                    1             0   
1                  0                  2                    1             0   
2                  2                  2                    1             0   
3                  2                  1                    1             0   
4                  1                  0                    1             0   

   tumor_size  early_detection  inflammatory_bowel_disease  relapse  
0          48                1                           0        0  
1          3

In [None]:
import pandas as pd

# Usamos el DataFrame ya codificado
cor_matrix = df.corr(numeric_only=True)

# Guardar matriz de correlación completa
cor_matrix.to_csv("matriz_correlacion.csv")

# === 1. Eliminar columnas con correlación mutua alta ===
umbral = 0.85
columnas_correladas = set()

for i in range(len(cor_matrix.columns)):
    for j in range(i):
        if abs(cor_matrix.iloc[i, j]) > umbral:
            colname = cor_matrix.columns[i]
            columnas_correladas.add(colname)

print("🔴 Columnas eliminadas por alta correlación (> 0.85):")
print(sorted(columnas_correladas))

# === 2. Obtener correlación con la variable objetivo ===
cor_target = cor_matrix["Survival_Prediction"].drop("Survival_Prediction")
cor_target_ordenado = cor_target.abs().sort_values(ascending=False)

print("\n🔵 Variables más correlacionadas con 'Survival_Prediction':")
print(cor_target_ordenado)

# Opcional: guardar correlaciones con la variable objetivo
cor_target_ordenado.to_csv("correlacion_con_objetivo.csv")


🔴 Columnas eliminadas por alta correlación (> 0.85):
[]


KeyError: 'Survival_Prediction'

In [None]:
import pandas as pd

# Cargar los datasets subidos por el usuario
df_historial = pd.read_csv("data/historial_medico.csv")
df_cancer = pd.read_csv("data/analisis_cancer.csv")

# Unir los datasets por la columna 'id'
df = pd.merge(df_historial, df_cancer, on="id", how="inner")

# Seleccionar solo las columnas necesarias para el nuevo modelo
columnas_utiles = [
    "Sexo",
    "tumor_size",
    "relapse",
    "early_detection",
    "inflammatory_bowel_disease",
    "obesity",
    "Family history",
    "Survival_Prediction"
]

df_modelo = df[columnas_utiles].dropna()

# Mostrar un ejemplo de los datos limpios
df_modelo.head()


Unnamed: 0,Sexo,tumor_size,relapse,early_detection,inflammatory_bowel_disease,obesity,Family history,Survival_Prediction
0,M,48,No,Yes,No,Normal,No,Yes
1,M,33,No,Yes,No,Normal,Yes,Yes
2,F,34,No,No,No,Normal,No,Yes
3,M,34,No,No,No,Normal,Yes,Yes
4,M,31,No,Yes,No,Overweight,No,Yes


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Cargar el dataset preparado
df = pd.read_csv("/mnt/data/df_modelo_final.csv")

# Separar variables
y = df["Survival_Prediction"]
X = df.drop(columns=["Survival_Prediction"])

# Columnas categóricas y numéricas
columnas_categoricas = ["Sexo", "relapse", "early_detection", "inflammatory_bowel_disease", "obesity", "Family history"]
columnas_numericas = ["tumor_size"]

# Preprocesador
preprocessor = ColumnTransformer(transformers=[
    ("cat", OneHotEncoder(handle_unknown="ignore"), columnas_categoricas),
    ("num", StandardScaler(), columnas_numericas)
])

# Modelo
modelo = RandomForestClassifier(
    n_estimators=200,
    max_depth=4,
    min_samples_split=2,
    min_samples_leaf=2,
    class_weight={0: 1.5, 1: 1},
    random_state=42
)

# Pipeline
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", modelo)
])

# División de datos
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenar
pipeline.fit(X_train, y_train)

# Evaluar
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Guardar modelo
modelo_path = "modelo_rf_ampliado.pkl"
joblib.dump(pipeline, modelo_path)

accuracy, conf_matrix, report, modelo_path


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/df_modelo_final.csv'