In [18]:
import pandas as pd
import os

In [19]:
# Ruta base del proyecto (ajustar si cambias de ordenador)
BASE_DIR = r"D:\Proyecto 1 - Neumonia"
DATA_DIR = os.path.join(BASE_DIR, "data")
IMG_DIR = os.path.join(DATA_DIR, "images")
CSV_PATH = os.path.join(DATA_DIR, "Data_Entry_2017_v2020.csv")
RESULTS_DIR = os.path.join(BASE_DIR, "results")

print("Ruta base:", BASE_DIR)

Ruta base: D:\Proyecto 1 - Neumonia


In [13]:
df = pd.read_csv(CSV_PATH)

In [14]:
df.head()

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Sex,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y]
0,00000001_000.png,Cardiomegaly,0,1,57,M,PA,2682,2749,0.143,0.143
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168
3,00000002_000.png,No Finding,0,2,80,M,PA,2500,2048,0.171,0.171
4,00000003_001.png,Hernia,0,3,74,F,PA,2500,2048,0.168,0.168


In [15]:
print("Total filas:", len(df))
print("Columnas:", df.columns.tolist())

Total filas: 112120
Columnas: ['Image Index', 'Finding Labels', 'Follow-up #', 'Patient ID', 'Patient Age', 'Patient Sex', 'View Position', 'OriginalImage[Width', 'Height]', 'OriginalImagePixelSpacing[x', 'y]']


In [24]:
import os

IMG_DIR = r"D:\Proyecto 1 - Neumonia\data\images"

df_pneumonia = df[df['Finding Labels'].str.contains('Pneumonia')]

# Contamos cuántas imágenes existen físicamente
available = df_pneumonia['Image Index'].apply(lambda x: os.path.exists(os.path.join(IMG_DIR, x)))
print("Total Pneumonía etiquetadas:", len(df_pneumonia))
print("De esas, disponibles en tu carpeta:", available.sum())



Total Pneumonía etiquetadas: 1431
De esas, disponibles en tu carpeta: 184


In [None]:
df_pneumonia['exists'] = df_pneumonia['Image Index'].apply(lambda x: os.path.exists(os.path.join(IMG_DIR, x)))
df_pneu_avail = df_pneumonia[df_pneumonia['exists']].copy()

In [27]:
df_normal = df[df['Finding Labels'] == 'No Finding'].copy()

# 2️⃣ Tomamos una muestra de 1000 filas para no revisar las 60 000
df_normal_sample = df_normal.sample(1000, random_state=42).copy()

# 3️⃣ Verificamos cuáles de esas imágenes existen físicamente
df_normal_sample['exists'] = df_normal_sample['Image Index'].apply(
    lambda x: os.path.exists(os.path.join(IMG_DIR, x))
)

# 4️⃣ Nos quedamos con las que realmente existen
df_norm_avail = df_normal_sample[df_normal_sample['exists']].copy()

# 5️⃣ Revisamos cuántas hay
print("Imágenes normales disponibles:", len(df_norm_avail))

Imágenes normales disponibles: 161


In [28]:
# 1️⃣ Igualamos el número de imágenes
n_samples = min(len(df_pneu_avail), len(df_norm_avail))

df_pneu_bal = df_pneu_avail.sample(n_samples, random_state=42).copy()
df_norm_bal = df_norm_avail.sample(n_samples, random_state=42).copy()

# 2️⃣ Añadimos las etiquetas binarias
df_pneu_bal['label'] = 1   # neumonía
df_norm_bal['label'] = 0   # normales

# 3️⃣ Unimos ambos subconjuntos y mezclamos aleatoriamente
df_subset = pd.concat([df_pneu_bal, df_norm_bal], ignore_index=True)
df_subset = df_subset.sample(frac=1, random_state=42).reset_index(drop=True)

# 4️⃣ Comprobamos el resultado
print("Total imágenes en el subset:", len(df_subset))
print(df_subset['label'].value_counts())


Total imágenes en el subset: 322
label
0    161
1    161
Name: count, dtype: int64


In [29]:
subset_path = os.path.join(IMG_DIR, "..", "subset_balanced.csv")
df_subset.to_csv(subset_path, index=False)
print("Subset guardado en:", subset_path)

Subset guardado en: D:\Proyecto 1 - Neumonia\data\images\..\subset_balanced.csv


In [30]:
import cv2
import numpy as np

def preprocess_image(path):
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        return None  # por si falta algún archivo
    img = cv2.resize(img, (128, 128))
    img = img / 255.0
    img = img.flatten()
    return img

In [31]:
path = os.path.join(IMG_DIR, df_subset.iloc[0]['Image Index'])
img_vector = preprocess_image(path)
print(img_vector.shape, img_vector.min(), img_vector.max())

(16384,) 0.00392156862745098 0.8745098039215686


In [33]:
X = []
y = []

for i, row in df_subset.iterrows():
    path = os.path.join(IMG_DIR, row['Image Index'])
    img_vector = preprocess_image(path)
    if img_vector is not None:
        X.append(img_vector)
        y.append(row['label'])

X = np.array(X)
y = np.array(y)

print(X.shape, y.shape)

(322, 16384) (322,)


In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2,         # 20% de los datos para test
    random_state=42,       # semilla aleatoria (para que siempre salga igual)
    stratify=y             # mantiene equilibrio de clases 0/1
)


In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# 1️⃣ Crear el modelo
model = LogisticRegression(max_iter=200, solver='liblinear')

# 2️⃣ Entrenar
model.fit(X_train, y_train)

# 3️⃣ Predecir sobre el conjunto de test
y_pred = model.predict(X_test)

# 4️⃣ Calcular métricas
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print(f"Accuracy: {acc:.3f}")
print(f"AUC: {auc:.3f}")


Accuracy: 0.600
AUC: 0.599


In [37]:
cm = confusion_matrix(y_test, y_pred)
print("Matriz de confusión:")
print(cm)


Matriz de confusión:
[[21 12]
 [14 18]]


In [38]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score

# 1️⃣ Crear el modelo
svm_model = SVC(kernel='rbf', C=1, gamma='scale', probability=True)

# 2️⃣ Entrenar
svm_model.fit(X_train, y_train)

# 3️⃣ Predecir
y_pred_svm = svm_model.predict(X_test)

# 4️⃣ Evaluar
acc_svm = accuracy_score(y_test, y_pred_svm)
auc_svm = roc_auc_score(y_test, y_pred_svm)

print(f"SVM Accuracy: {acc_svm:.3f}")
print(f"SVM AUC: {auc_svm:.3f}")

# 5️⃣ Matriz de confusión
cm_svm = confusion_matrix(y_test, y_pred_svm)
print("Matriz de confusión:")
print(cm_svm)


SVM Accuracy: 0.585
SVM AUC: 0.585
Matriz de confusión:
[[19 14]
 [13 19]]


In [39]:
from sklearn.decomposition import PCA
import numpy as np

# 1️⃣ Definimos el PCA
# n_components puede ser un número (ej. 100) o una fracción (ej. 0.95 = conservar 95% de la varianza)
pca = PCA(n_components=0.95, random_state=42)

# 2️⃣ Ajustamos PCA solo con los datos de entrenamiento
pca.fit(X_train)

# 3️⃣ Transformamos train y test
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

print("Tamaño original:", X_train.shape)
print("Tamaño reducido:", X_train_pca.shape)


Tamaño original: (257, 16384)
Tamaño reducido: (257, 83)


In [44]:
model = LogisticRegression(max_iter=200, solver='liblinear')
model.fit(X_train_pca, y_train)
y_pred = model.predict(X_test_pca)
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print(f"Accuracy: {acc_svm:.3f}")
print(f"SVM AUC: {auc_svm:.3f}")

Accuracy: 0.615
SVM AUC: 0.615


In [45]:
svm = SVC(kernel='rbf', C=1, gamma='scale')
svm.fit(X_train_pca, y_train)
y_pred = svm.predict(X_test_pca)

acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print(f"SVM (PCA) Accuracy: {acc:.3f}")
print(f"SVM (PCA) AUC: {auc:.3f}")


SVM (PCA) Accuracy: 0.600
SVM (PCA) AUC: 0.599


In [46]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

# 1️⃣ Crear modelo
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# 2️⃣ Entrenar
rf.fit(X_train_pca, y_train)

# 3️⃣ Predecir
y_pred_rf = rf.predict(X_test_pca)

# 4️⃣ Evaluar
acc_rf = accuracy_score(y_test, y_pred_rf)
auc_rf = roc_auc_score(y_test, y_pred_rf)

print(f"Random Forest Accuracy: {acc_rf:.3f}")
print(f"Random Forest AUC: {auc_rf:.3f}")

cm_rf = confusion_matrix(y_test, y_pred_rf)
print("Matriz de confusión:")
print(cm_rf)


Random Forest Accuracy: 0.615
Random Forest AUC: 0.615
Matriz de confusión:
[[21 12]
 [13 19]]


In [47]:
pip install xgboost


Collecting xgboost
  Downloading xgboost-3.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.1-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   -------- ------------------------------- 14.9/72.0 MB 94.1 MB/s eta 0:00:01
   ------------------ --------------------- 32.8/72.0 MB 90.5 MB/s eta 0:00:01
   ---------------------- ----------------- 41.2/72.0 MB 84.5 MB/s eta 0:00:01
   --------------------------- ------------ 48.8/72.0 MB 62.1 MB/s eta 0:00:01
   ---------------------------------------  71.8/72.0 MB 71.6 MB/s eta 0:00:01
   ---------------------------------------- 72.0/72.0 MB 64.7 MB/s  0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-3.1.1
Note: you may need to restart the kernel to use updated packages.


In [50]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
# 1️⃣ Crear modelo
xgb_model = xgb.XGBClassifier(
    n_estimators=200,      # número de árboles
    learning_rate=0.05,    # tasa de aprendizaje (más bajo = más estable)
    max_depth=4,           # profundidad de los árboles
    subsample=0.8,         # usa 80% de los datos en cada árbol
    colsample_bytree=0.8,  # usa 80% de las features en cada árbol
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

# 2️⃣ Entrenar
xgb_model.fit(X_train_pca, y_train)

# 3️⃣ Predecir
y_pred_xgb = xgb_model.predict(X_test_pca)

# 4️⃣ Evaluar
acc_xgb = accuracy_score(y_test, y_pred_xgb)
auc_xgb = roc_auc_score(y_test, y_pred_xgb)
cm_xgb = confusion_matrix(y_test, y_pred_xgb)

print(f"XGBoost Accuracy: {acc_xgb:.3f}")
print(f"XGBoost AUC: {auc_xgb:.3f}")
print("Matriz de confusión:")
print(cm_xgb)


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Accuracy: 0.554
XGBoost AUC: 0.554
Matriz de confusión:
[[18 15]
 [14 18]]
