In [76]:
import pandas as pd

# Cargar los datasets que vamos a usar
df_historial = pd.read_csv("data/historial_medico.csv")
df_cancer = pd.read_csv("data/analisis_cancer.csv")

# Unir por 'id'
df_completo = df_historial.merge(df_cancer, on="id", how="inner")

# Verificar la unión
print(df_completo.head())
print(df_completo.columns)
print(df_completo.shape)


   id Sexo  Age Family history smoke alcohol     obesity      diet  \
0   1    M   63             No   Yes     Yes      Normal  Moderate   
1   2    M   64            Yes   Yes      No      Normal       Low   
2   3    F   50             No   Yes      No      Normal  Moderate   
3   4    M   67            Yes    No      No      Normal  Moderate   
4   5    M   81             No   Yes      No  Overweight  Moderate   

  Screening_History Healthcare_Access Survival_Prediction cancer_stage  \
0             Never              High                 Yes    Localized   
1         Irregular          Moderate                 Yes    Localized   
2           Regular          Moderate                 Yes    Localized   
3           Regular               Low                 Yes    Localized   
4             Never              High                 Yes    Localized   

   tumor_size early_detection inflammatory_bowel_disease relapse  
0          48             Yes                         No      No  


In [77]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Cargar datos (ya lo hiciste tú antes)
df = df_completo.copy()  # o como lo hayas llamado

# Eliminar columna 'id' si está
df = df.drop(columns=["id"], errors="ignore")

# Codificar columnas categóricas
for col in df.columns:
    if df[col].dtype == "object":
        df[col] = LabelEncoder().fit_transform(df[col])

# Mostrar primeras filas y dtypes
print(df.head())
print(df.dtypes)
print(df.shape)


   Sexo  Age  Family history  smoke  alcohol  obesity  diet  \
0     1   63               0      1        1        0     2   
1     1   64               1      1        0        0     1   
2     0   50               0      1        0        0     2   
3     1   67               1      0        0        0     2   
4     1   81               0      1        0        2     2   

   Screening_History  Healthcare_Access  Survival_Prediction  cancer_stage  \
0                  1                  0                    1             0   
1                  0                  2                    1             0   
2                  2                  2                    1             0   
3                  2                  1                    1             0   
4                  1                  0                    1             0   

   tumor_size  early_detection  inflammatory_bowel_disease  relapse  
0          48                1                           0        0  
1          3

In [78]:
import pandas as pd

# Usamos el DataFrame ya codificado
cor_matrix = df.corr(numeric_only=True)

# Guardar matriz de correlación completa
cor_matrix.to_csv("matriz_correlacion.csv")

# === 1. Eliminar columnas con correlación mutua alta ===
umbral = 0.85
columnas_correladas = set()

for i in range(len(cor_matrix.columns)):
    for j in range(i):
        if abs(cor_matrix.iloc[i, j]) > umbral:
            colname = cor_matrix.columns[i]
            columnas_correladas.add(colname)

print("🔴 Columnas eliminadas por alta correlación (> 0.85):")
print(sorted(columnas_correladas))

# === 2. Obtener correlación con la variable objetivo ===
cor_target = cor_matrix["Survival_Prediction"].drop("Survival_Prediction")
cor_target_ordenado = cor_target.abs().sort_values(ascending=False)

print("\n🔵 Variables más correlacionadas con 'Survival_Prediction':")
print(cor_target_ordenado)

# Opcional: guardar correlaciones con la variable objetivo
cor_target_ordenado.to_csv("correlacion_con_objetivo.csv")


🔴 Columnas eliminadas por alta correlación (> 0.85):
[]

🔵 Variables más correlacionadas con 'Survival_Prediction':
relapse                       0.443826
tumor_size                    0.018327
early_detection               0.015699
Sexo                          0.012890
inflammatory_bowel_disease    0.011989
obesity                       0.009300
Family history                0.008924
Age                           0.003754
diet                          0.002968
Screening_History             0.002691
smoke                         0.002642
Healthcare_Access             0.001841
alcohol                       0.001200
cancer_stage                  0.000122
Name: Survival_Prediction, dtype: float64


In [81]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# === 1. Cargar los datos ===
df_historial = pd.read_csv("data/historial_medico.csv")
df_cancer = pd.read_csv("data/analisis_cancer.csv")
df = pd.merge(df_historial, df_cancer, on="id", how="inner")

# === 2. Seleccionar columnas relevantes ===
columnas_utiles = [
    "Sexo",
    "tumor_size",
    "relapse",
    "early_detection",
    "inflammatory_bowel_disease",
    "obesity",
    "Family history",
    "Survival_Prediction"
]

df_modelo = df[columnas_utiles].dropna()

# === 3. Dividir en X e y ===
X = df_modelo.drop(columns=["Survival_Prediction"])
y = df_modelo["Survival_Prediction"]

# === 4. Separar tipos de columnas ===
columnas_categoricas = [col for col in X.columns if X[col].dtype == "object"]
columnas_numericas = [col for col in X.columns if col not in columnas_categoricas]

# === 5. Preprocesamiento ===
preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), columnas_numericas),
    ("cat", OneHotEncoder(drop="first", sparse_output=False), columnas_categoricas)
])

# === 6. Modelo ===
modelo = RandomForestClassifier(
    n_estimators=200,
    max_depth=4,
    class_weight="balanced",
    random_state=42
)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", modelo)
])

# === 7. Entrenamiento y evaluación ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print("🔍 Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Matriz de confusión:")
print(confusion_matrix(y_test, y_pred))
print("\n📝 Informe de clasificación:")
print(classification_report(y_test, y_pred))

# === 8. Guardar el modelo ===
joblib.dump(pipeline, "modelo_rf_v2.pkl")
print("✅ Modelo guardado como modelo_rf_v2.pkl")


🔍 Accuracy: 0.693

📊 Matriz de confusión:
[[440 548]
 [ 66 946]]

📝 Informe de clasificación:
              precision    recall  f1-score   support

          No       0.87      0.45      0.59       988
         Yes       0.63      0.93      0.75      1012

    accuracy                           0.69      2000
   macro avg       0.75      0.69      0.67      2000
weighted avg       0.75      0.69      0.67      2000

✅ Modelo guardado como modelo_rf_v2.pkl
