# CheXNet
Uso della CheXNet per migliorare gli embedding e le predizioni.\
Per prima cosa carico la CheXNet a partire dall'architettura DenseNet121 e dai pesi scaricati nell'apposita directory.

In [1]:
import os, pandas as pd, numpy as np
from tqdm import tqdm
from keras import models, applications
from keras.api.preprocessing.image import load_img, img_to_array

# --- Paths and inference settings -------------------------------------------
root = '../..'
dataset_path = f"{root}/datasets/covid_cxr"
chexnet_path = f"{root}/models/CheXNet.keras"
batch = 32          # number of images sent to the network per predict call
size = (224, 224)   # target size passed to load_img for every image

# First run only: build a DenseNet121 with a 14-way classification head,
# load the weight-only checkpoint into it and cache everything as a single
# .keras file so later runs can skip this step.
if not os.path.exists(chexnet_path):
    densenet = applications.densenet.DenseNet121(
        weights=None,
        include_top=True,
        input_shape=(224, 224, 3),
        classes=14,
    )
    densenet.load_weights(f"{root}/models/weight_only/CheXNet.h5")
    densenet.save(chexnet_path)

# Always work with the model as restored from the cached file.
base = models.load_model(chexnet_path, compile=True)

A questo punto uso la CheXNet, caricata sopra, per estrarre gli embedding di tutte le immagini.\
Per farlo devo rimuovere l'ultimo layer, quello di classificazione.

In [2]:
# Cut the network just before its final (classification) layer: the output of
# the second-to-last layer becomes the embedding returned by `cheXnet`.
penultimate_layer = base.layers[-2]
cheXnet = models.Model(inputs=base.input, outputs=penultimate_layer.output)

# The extractor is only used for inference, so freeze every layer.
for layer in cheXnet.layers:
    layer.trainable = False

A questo punto carico l'elenco delle immagini del dataset e trasformo ogni immagine nel relativo embedding.

In [3]:
# Read the file listing of every split (train / val / test) and keep the
# per-split frames in `all_files`, in that order, so the split sizes can be
# recovered later.
all_files = []
for t in ['train', 'val', 'test']:
    split_df = pd.read_csv(f"{dataset_path}/{t}.txt", delimiter=' ', header=None)
    # Column 1 holds the bare file name; prefix it with the split directory.
    split_df[1] = split_df[1].apply(lambda x: f"{dataset_path}/{t}/{x}")
    all_files.append(split_df)

# ignore_index=True gives the concatenated frame a unique 0..N-1 index
# instead of repeating each split's own 0-based index.
df = pd.concat(all_files, ignore_index=True)
df.columns = ['id', 'filename', 'class', 'source']

# Encode the class labels as integer codes. pd.factorize numbers the classes
# in order of first appearance and already returns a numpy array.
predictions = pd.factorize(df['class'])[0]

# Create the embeddings for the images, one row per image, in `df` order.
images = df['filename']
# 1024 is the width of the embedding produced by `cheXnet`; a mismatch would
# make the slice assignment below raise.
embeddings = np.zeros((len(images), 1024), dtype="float32")

pending = []  # decoded images waiting to be sent through the network
for i, img_name in enumerate(tqdm(images)):
    img = load_img(img_name, target_size=size, color_mode='rgb')
    pending.append(img_to_array(img))

    # Flush when the batch is full, or on the very last image so that a
    # final partial batch is not lost.
    if len(pending) == batch or i == len(images) - 1:
        # img_to_array yields raw 0-255 pixel values; the DenseNet graph does
        # not normalise its input, so apply the DenseNet preprocessing here.
        # NOTE(review): assumes the CheXNet weights were trained with the
        # standard ImageNet normalisation used by densenet.preprocess_input
        # - confirm against the source of the weights.
        batch_pixels = applications.densenet.preprocess_input(np.array(pending))
        batch_embeddings = cheXnet.predict(batch_pixels, verbose=0)
        # The batch covers the last len(pending) images, ending at index i.
        embeddings[i - len(pending) + 1:i + 1] = batch_embeddings
        pending = []

100%|██████████| 84818/84818 [11:11<00:00, 126.29it/s]


In [4]:
# Cumulative end offsets of the three splits inside the concatenated arrays.
# `all_files` holds the train, val and test listings in that order.
n_train, n_val, n_test = (len(split) for split in all_files[:3])
train_tot = n_train
val_tot = train_tot + n_val
test_tot = val_tot + n_test
print(f"Train: {n_train}, Validation: {n_val}, Test: {n_test}")

# Slice embeddings and labels with the same boundaries and store everything
# in a single .npz archive next to the dataset directory.
split_bounds = {
    "train": slice(None, train_tot),
    "val": slice(train_tot, val_tot),
    "test": slice(val_tot, test_tot),
}
arrays = {}
for split_name, bounds in split_bounds.items():
    arrays[f"x_{split_name}"] = embeddings[bounds]
    arrays[f"y_{split_name}"] = predictions[bounds]

np.savez(f"{dataset_path}_embeddings.npz", **arrays)

Train: 67863, Validation: 8473, Test: 8482
