In [1]:
import os
import numpy as np
from skimage.io import imread
from skimage.transform import resize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
import os, shutil, gdown, zipfile
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
# Crear directorio para almacenar los datos descargados
download_dir = "downloaded_data"
os.makedirs(download_dir, exist_ok = True)

url = "https://drive.google.com/uc?export=download&id=19ZgvHREcc3SGYST_Nczl5XiHHgwvFPbp"
output = download_dir + '/dataset-xray.zip'
gdown.download(url, output , quiet = False)
print("Descarga Completa!")

print("Extrayendo archivos...")
with zipfile.ZipFile(output, 'r') as zip_ref:
    # Extrae todos los archivos en el directorio de destino
    zip_ref.extractall(download_dir)
print("Datos extraidos!")

Downloading...
From (uriginal): https://drive.google.com/uc?export=download&id=19ZgvHREcc3SGYST_Nczl5XiHHgwvFPbp
From (redirected): https://drive.google.com/uc?export=download&id=19ZgvHREcc3SGYST_Nczl5XiHHgwvFPbp&confirm=t&uuid=0a80f9a1-b131-47fb-b67a-941875856e42
To: d:\NextCloud\github\BigData-ML-Proyectos\Trabajo Final\downloaded_data\dataset-xray.zip
100%|██████████| 199M/199M [00:16<00:00, 12.2MB/s] 


Descarga Completa!
Extrayendo archivos...
Datos extraidos!


In [3]:
# Definir las categorías
categories = ["COVID-19", "NEUMONIA", "NORMAL"]

# Cargar datos de las carpetas extraídas
data = []
label = []

print("Cargando datos...")
for category in categories:
    print(f"Cargando radiografias de {category}...")
    for file in os.listdir(download_dir + "/" + category):
        dir = download_dir + "/" + category + "/" + file
        img = imread(dir, as_gray = True)
        img = resize(img, (256, 256))
        data.append(img.flatten())
        label.append(categories.index(category))
print("Todos los datos cargados!")

Cargando datos...
Cargando radiografias de COVID-19...
Cargando radiografias de NEUMONIA...
Cargando radiografias de NORMAL...
Todos los datos cargados!


In [4]:
print("Eliminando el directorio descargado...")
shutil.rmtree(download_dir)
print("Directorio eliminado.")

Eliminando el directorio descargado...
Directorio eliminado.


In [5]:
data = np.asarray(data)
label = np.asarray(label)

x_train, x_test, y_train, y_test = train_test_split(data, label, test_size = 0.2, shuffle = True, stratify = label, random_state = 42)

In [12]:
# Definimos un diccionario que especifica las combinaciones de hiperparámetros a explorar
param_grid = {
    'C': [0.1, 1, 10, 15],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto', 0.1, 1, 10]
}

# Creamos un modelo SVC
grid_svc = SVC()

# Creamos un objeto GridSearch para la busqueda de parametros y ajustamos a los datos de entrenamiento
grid_search = GridSearchCV(grid_svc, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)

# Obtenemos los mejores hiperparametros encontrados
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Imprimimos el mejor modelo junto a su puntuacion
print("Model:", best_params)
print("Best Score:", best_score)

Model: {'C': 15, 'gamma': 'scale', 'kernel': 'rbf'}
Best Score: 0.9086984536082474


In [13]:
param_grid = {
    'n_neighbors': [3,5,7],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
    'algorithm': ['auto', 'ball_tree', 'kd_tree'],
    'leaf_size': [10, 20, 30, 40, 50]
}

grid_knn = KNeighborsClassifier()

# Creamos un objeto GridSearch para la busqueda de parametros y ajustamos a los datos de entrenamiento
grid_search = GridSearchCV(grid_knn, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)

# Obtenemos los mejores hiperparametros encontrados
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Imprimimos el mejor modelo junto a su puntuacion
print("Model:", best_params)
print("Best Score:", best_score)

Model: {'algorithm': 'auto', 'leaf_size': 10, 'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}
Best Score: 0.8505369415807561


In [11]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2', None]
}

grid_dtc = DecisionTreeClassifier()

# Creamos un objeto GridSearch para la busqueda de parametros y ajustamos a los datos de entrenamiento
grid_search = GridSearchCV(grid_dtc, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)

# Obtenemos los mejores hiperparametros encontrados
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Imprimimos el mejor modelo junto a su puntuacion
print("Model:", best_params)
print("Best Score:", best_score)

540 fits failed out of a total of 2160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
330 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Martín\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Martín\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\Martín\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra

Model: {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Score: 0.7761383161512028


In [6]:
param_grid = {
    'var_smoothing': np.logspace(0, -9, num=100)
}

grid_gnb = GaussianNB()

# Creamos un objeto GridSearch para la busqueda de parametros y ajustamos a los datos de entrenamiento
grid_search = GridSearchCV(grid_gnb, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)

# Obtenemos los mejores hiperparametros encontrados
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Imprimimos el mejor modelo junto a su puntuacion
print("Model:", best_params)
print("Best Score:", best_score)

Model: {'var_smoothing': 0.008111308307896872}
Best Score: 0.7966924398625429


In [6]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_rtc = RandomForestClassifier(random_state = 42)

# Creamos un objeto GridSearch para la busqueda de parametros y ajustamos a los datos de entrenamiento
grid_search = GridSearchCV(grid_rtc, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)

# Obtenemos los mejores hiperparametros encontrados
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Imprimimos el mejor modelo junto a su puntuacion
print("Model:", best_params)
print("Best Score:", best_score)

Model: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Best Score: 0.8653135738831615


In [8]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_gb = GradientBoostingClassifier(random_state = 42)

# Creamos un objeto GridSearch para la busqueda de parametros y ajustamos a los datos de entrenamiento
grid_search = GridSearchCV(grid_gb, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)

# Obtenemos los mejores hiperparametros encontrados
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Imprimimos el mejor modelo junto a su puntuacion
print("Model:", best_params)
print("Best Score:", best_score)

KeyboardInterrupt: 

In [6]:
param_grid = {
    'hidden_layer_sizes': [(50,), (100, 50), (100, 100)],
    'activation': ['logistic', 'tanh', 'relu'],
    'alpha': [0.0001, 0.001, 0.01],
    'max_iter': [200, 400, 600]
}

grid_mlp = MLPClassifier(random_state = 42)

# Creamos un objeto GridSearch para la busqueda de parametros y ajustamos a los datos de entrenamiento
grid_search = GridSearchCV(grid_mlp, param_grid, cv=5, n_jobs=-1)
grid_search.fit(x_train, y_train)

# Obtenemos los mejores hiperparametros encontrados
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Imprimimos el mejor modelo junto a su puntuacion
print("Model:", best_params)
print("Best Score:", best_score)

KeyboardInterrupt: 