In [1]:
# Import the dependencies.
import csv
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report,accuracy_score
# Import the train_test_learn module
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression
from collections import Counter
import tensorflow as tf


In [2]:
COVID2020_training_data_df=pd.read_csv('COVID2020_TRAINING_DATA.CSV')

In [3]:
#Preview Dataframe from CSV
COVID2020_training_data_df.head()

Unnamed: 0.1,Unnamed: 0,ORIGEN,SECTOR,ENTIDAD_UM,SEXO,ENTIDAD_NAC,ENTIDAD_RES,MUNICIPIO_RES,TIPO_PACIENTE,FECHA_INGRESO,...,CARDIOVASCULAR,OBESIDAD,RENAL_CRONICA,TABAQUISMO,OTRO_CASO,RESULTADO_LAB,RESULTADO_ANTIGENO,CLASIFICACION_FINAL,MIGRANTE,UCI
0,0,1,6,24,1,24,24,28,1,2020-12-06,...,2,2,2,2,2,1,97,3,99,97
1,1,1,6,24,2,24,24,28,1,2020-02-20,...,2,2,2,2,1,97,97,6,99,97
2,2,1,12,14,1,14,14,85,1,2020-04-22,...,2,2,2,2,2,2,97,7,99,97
3,3,2,12,8,2,8,8,37,1,2020-07-28,...,2,2,2,2,2,1,97,3,99,97
4,4,1,12,9,2,9,9,7,1,2020-08-18,...,2,2,2,2,2,2,97,7,99,97


In [4]:
#Remove Innamed:0 column
COVID2020_training_data_df.drop(columns=['Unnamed: 0'],inplace=True)
COVID2020_training_data_df.head()

Unnamed: 0,ORIGEN,SECTOR,ENTIDAD_UM,SEXO,ENTIDAD_NAC,ENTIDAD_RES,MUNICIPIO_RES,TIPO_PACIENTE,FECHA_INGRESO,FECHA_SINTOMAS,...,CARDIOVASCULAR,OBESIDAD,RENAL_CRONICA,TABAQUISMO,OTRO_CASO,RESULTADO_LAB,RESULTADO_ANTIGENO,CLASIFICACION_FINAL,MIGRANTE,UCI
0,1,6,24,1,24,24,28,1,2020-12-06,2020-11-30,...,2,2,2,2,2,1,97,3,99,97
1,1,6,24,2,24,24,28,1,2020-02-20,2020-02-18,...,2,2,2,2,1,97,97,6,99,97
2,1,12,14,1,14,14,85,1,2020-04-22,2020-04-18,...,2,2,2,2,2,2,97,7,99,97
3,2,12,8,2,8,8,37,1,2020-07-28,2020-07-20,...,2,2,2,2,2,1,97,3,99,97
4,1,12,9,2,9,9,7,1,2020-08-18,2020-08-17,...,2,2,2,2,2,2,97,7,99,97


In [5]:
#Review columns
COVID2020_training_data_df.dtypes

ORIGEN                  int64
SECTOR                  int64
ENTIDAD_UM              int64
SEXO                    int64
ENTIDAD_NAC             int64
ENTIDAD_RES             int64
MUNICIPIO_RES           int64
TIPO_PACIENTE           int64
FECHA_INGRESO          object
FECHA_SINTOMAS         object
FECHA_DEF              object
INTUBADO                int64
NEUMONIA                int64
EDAD                    int64
EMBARAZO                int64
HABLA_LENGUA_INDIG      int64
INDIGENA                int64
DIABETES                int64
EPOC                    int64
ASMA                    int64
INMUSUPR                int64
HIPERTENSION            int64
OTRA_COM                int64
CARDIOVASCULAR          int64
OBESIDAD                int64
RENAL_CRONICA           int64
TABAQUISMO              int64
OTRO_CASO               int64
RESULTADO_LAB           int64
RESULTADO_ANTIGENO      int64
CLASIFICACION_FINAL     int64
MIGRANTE                int64
UCI                     int64
dtype: obj

In [6]:
#Replace FECHA_DEF with PATIENT_STATUS (0) alive, (1) dead. FECHA_DEF< 9999-99-99 = DEAD
COVID2020_training_data_df['PATIENT_STATUS']=COVID2020_training_data_df['FECHA_DEF'].apply(lambda x: 0 if x == '9999-99-99' else 1)

In [7]:
#Review that patients with FECHA_DEF == 9999-99-99 are ALIVE (0)
COVID2020_training_data_df[['FECHA_DEF','PATIENT_STATUS']].head(10)

Unnamed: 0,FECHA_DEF,PATIENT_STATUS
0,9999-99-99,0
1,9999-99-99,0
2,9999-99-99,0
3,9999-99-99,0
4,9999-99-99,0
5,9999-99-99,0
6,9999-99-99,0
7,9999-99-99,0
8,9999-99-99,0
9,9999-99-99,0


In [8]:
#Review that patients with FECHA_DEF =! 9999-99-99 are DEAD (1)
COVID2020_training_data_df[['FECHA_DEF','PATIENT_STATUS']].tail(10)

Unnamed: 0,FECHA_DEF,PATIENT_STATUS
422530,2021-02-07,1
422531,2020-07-26,1
422532,2020-04-27,1
422533,2020-12-29,1
422534,2021-01-02,1
422535,2020-12-19,1
422536,2020-04-28,1
422537,2020-12-08,1
422538,2020-09-12,1
422539,2020-05-14,1


In [9]:
#Drop FECHA_DEF column
COVID2020_training_data_df.drop(columns=['FECHA_DEF'],inplace=True)

In [10]:
#DROP OTHER COLUMNS THAT ARE NOT IMPORTANT FOR THE ANALYSIS
COVID2020_training_data_df.drop(columns=['ENTIDAD_UM','ENTIDAD_NAC','MUNICIPIO_RES','FECHA_INGRESO'],inplace=True)

In [11]:
COVID2020_training_data_df.nunique()

ORIGEN                   2
SECTOR                  13
SEXO                     2
ENTIDAD_RES             32
TIPO_PACIENTE            2
FECHA_SINTOMAS         366
INTUBADO                 4
NEUMONIA                 3
EDAD                   112
EMBARAZO                 4
HABLA_LENGUA_INDIG       3
INDIGENA                 3
DIABETES                 3
EPOC                     3
ASMA                     3
INMUSUPR                 3
HIPERTENSION             3
OTRA_COM                 3
CARDIOVASCULAR           3
OBESIDAD                 3
RENAL_CRONICA            3
TABAQUISMO               3
OTRO_CASO                3
RESULTADO_LAB            4
RESULTADO_ANTIGENO       3
CLASIFICACION_FINAL      7
MIGRANTE                 3
UCI                      4
PATIENT_STATUS           2
dtype: int64

In [12]:
#Create bin for EDAD

# Define bin edges
bin_ages = list(range(0, max(COVID2020_training_data_df['EDAD']) + 10, 10))

# Create labels for the bins
bin_labels = [f"{i + 5}" for i in bin_ages[:-1]]

# Add a new column 'Age Group' with the bin labels
COVID2020_training_data_df['Age_Group'] = pd.cut(COVID2020_training_data_df['EDAD'], bins=bin_ages, labels=bin_labels, include_lowest=True)

#Drop EDAD column
COVID2020_training_data_df.drop(columns=['EDAD'],inplace=True)

# Display the result
COVID2020_training_data_df['Age_Group']

0         55
1         55
2         45
3         35
4         55
          ..
422535    85
422536    35
422537    85
422538    35
422539    45
Name: Age_Group, Length: 422540, dtype: category
Categories (12, object): ['5' < '15' < '25' < '35' ... '85' < '95' < '105' < '115']

In [13]:
#Create bin for FECHA_SINTOMAS

# Convert the 'Date' column to datetime format
COVID2020_training_data_df['FECHA_SINTOMAS'] = pd.to_datetime(COVID2020_training_data_df['FECHA_SINTOMAS'])

# Create a new column 'Month' with the month of each date
COVID2020_training_data_df['SYMPTOMS_MONTH'] = COVID2020_training_data_df['FECHA_SINTOMAS'].dt.month

#Drop FECHA_INGRESO column
COVID2020_training_data_df.drop(columns=['FECHA_SINTOMAS'],inplace=True)

# Display the result
COVID2020_training_data_df['SYMPTOMS_MONTH']

0         11
1          2
2          4
3          7
4          8
          ..
422535    12
422536     4
422537    12
422538     9
422539     4
Name: SYMPTOMS_MONTH, Length: 422540, dtype: int32

In [14]:
COVID2020_training_data_df.nunique()

ORIGEN                  2
SECTOR                 13
SEXO                    2
ENTIDAD_RES            32
TIPO_PACIENTE           2
INTUBADO                4
NEUMONIA                3
EMBARAZO                4
HABLA_LENGUA_INDIG      3
INDIGENA                3
DIABETES                3
EPOC                    3
ASMA                    3
INMUSUPR                3
HIPERTENSION            3
OTRA_COM                3
CARDIOVASCULAR          3
OBESIDAD                3
RENAL_CRONICA           3
TABAQUISMO              3
OTRO_CASO               3
RESULTADO_LAB           4
RESULTADO_ANTIGENO      3
CLASIFICACION_FINAL     7
MIGRANTE                3
UCI                     4
PATIENT_STATUS          2
Age_Group              12
SYMPTOMS_MONTH         12
dtype: int64

In [15]:
COVID2020_training_data_df.head()

Unnamed: 0,ORIGEN,SECTOR,SEXO,ENTIDAD_RES,TIPO_PACIENTE,INTUBADO,NEUMONIA,EMBARAZO,HABLA_LENGUA_INDIG,INDIGENA,...,TABAQUISMO,OTRO_CASO,RESULTADO_LAB,RESULTADO_ANTIGENO,CLASIFICACION_FINAL,MIGRANTE,UCI,PATIENT_STATUS,Age_Group,SYMPTOMS_MONTH
0,1,6,1,24,1,97,2,2,2,2,...,2,2,1,97,3,99,97,0,55,11
1,1,6,2,24,1,97,2,97,2,2,...,2,1,97,97,6,99,97,0,55,2
2,1,12,1,14,1,97,2,2,2,2,...,2,2,2,97,7,99,97,0,45,4
3,2,12,2,8,1,97,2,97,2,2,...,2,2,1,97,3,99,97,0,35,7
4,1,12,2,9,1,97,2,97,2,2,...,2,2,2,97,7,99,97,0,55,8


In [16]:
# Split our preprocessed data into our features and target arrays
y=COVID2020_training_data_df['PATIENT_STATUS']
X=COVID2020_training_data_df.drop(columns=['PATIENT_STATUS'])

# Split the preprocessed data into a training and testing dataset
X_train,X_test,y_train,y_test=train_test_split(X, y)

In [17]:
X.to_csv('X_data.csv')
y.to_csv('y_data.csv')

In [18]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [19]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid'])
    
    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=50,
        step=5), activation=activation, input_dim=28))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=50,
            step=5),
            activation=activation))
    
    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])
    
    return nn_model

In [20]:
# Import the kerastuner library
import keras_tuner as kt

tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2)

Using TensorFlow backend
Reloading Tuner from ./untitled_project/tuner0.json


In [21]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

In [22]:
best_parameters=tuner.get_best_hyperparameters(2)

for i in best_parameters:
    print(i.values)

{'activation': 'relu', 'first_units': 21, 'num_layers': 3, 'units_0': 31, 'units_1': 36, 'units_2': 16, 'units_3': 46, 'units_4': 46, 'units_5': 21, 'tuner/epochs': 20, 'tuner/initial_epoch': 7, 'tuner/bracket': 1, 'tuner/round': 1, 'tuner/trial_id': '0049'}
{'activation': 'relu', 'first_units': 36, 'num_layers': 1, 'units_0': 41, 'units_1': 16, 'units_2': 36, 'units_3': 41, 'units_4': 31, 'units_5': 46, 'tuner/epochs': 20, 'tuner/initial_epoch': 7, 'tuner/bracket': 1, 'tuner/round': 1, 'tuner/trial_id': '0022'}


In [25]:
#Get the best model
best_model=tuner.get_best_models(1)[0]



In [26]:
# Export our model to HDF5 file
file='covid2020_trained_model.h5'
best_model.save(file, save_format='h5')

  saving_api.save_model(


In [27]:
#Evaluate model
model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

3302/3302 - 1s - loss: 0.0963 - accuracy: 0.9581 - 951ms/epoch - 288us/step
Loss: 0.09633636474609375, Accuracy: 0.9581104516983032


In [28]:
#Review quantity of layers an neurons in the model
best_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 21)                609       
                                                                 
 dense_1 (Dense)             (None, 31)                682       
                                                                 
 dense_2 (Dense)             (None, 36)                1152      
                                                                 
 dense_3 (Dense)             (None, 16)                592       
                                                                 
 dense_4 (Dense)             (None, 1)                 17        
                                                                 
Total params: 3052 (11.92 KB)
Trainable params: 3052 (11.92 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
