# PRIMEIRO PROJECTO DE MACHINE LEARNING
## Neste Projecto, será desenvolvido um modelo de machine learning para o diagnóstico de COVID-19 usando dados de exames de rotina dos pacientes COVID-19 e saudáveis do Hospital Israelita Albert Einstein
Link do documento: https://github.com/AlexandreCOBRE/code/blob/main/covid19dataset.xlsx

In [1]:
# Fluxograma de execução do Projecto
## Passo 1: Importação e visualização do banco de dados
## Passo 2: Mineração de dados
## Passo 3: Análise exploratória
## Passo 4: Seleção de variáveis
## Passo 5: Divisão dos dados de treinamento e de teste
## Passo 6: Treinamento de diversos algoritmos
## Passo 7: Seleção do melhor algoritmo
## Passo 8: Otimização dos hiperparâmetros
## Passo 9: Teste do melhor algoritmo
## Passo 10: Fazer o deploy do modelo

In [2]:
# Step 1: import and inspect the dataset
from pathlib import Path

import pandas as pd

# Load the dataset file into the Colab runtime
from google.colab import files

# Only ask for an interactive upload when "dataset.xlsx" is not already in the
# working directory. files.upload() blocks until the user picks a file, which
# stalls "Restart & Run All"; and Colab typically stores a re-upload of an
# existing name under a suffixed name, so prompting again would not change the
# file that the next cell reads.
uploaded = {}
if not Path("dataset.xlsx").exists():
  uploaded = files.upload()


Saving dataset.xlsx to dataset.xlsx


In [3]:
# Read the Excel workbook "dataset.xlsx" into a DataFrame
dataset_df = pd.read_excel("dataset.xlsx")
# Bare last expression: the notebook renders the frame as the cell output
dataset_df

Unnamed: 0.1,Unnamed: 0,Sex,Age,CA,CK,CREA,ALP,GGT,GLU,AST,...,MO,EO,BA,NET,LYT,MOT,EOT,BAT,Suspect,target
0,A00345_2020-03-25,1.0,82.0,2.090000,,1.150000,95.0,40.0,78.000000,26.0,...,9.500000,2.900000,0.500000,6.400000,1.200000,0.800000,0.300000,0.000000,1.0,0
1,A00741_2020-03-04,1.0,58.0,2.110000,,1.000000,80.0,147.0,106.000000,41.0,...,7.300000,0.300000,0.100000,5.450000,0.750000,0.500000,0.000000,0.000000,1.0,0
2,A00605_2020-04-15,0.0,82.0,2.270000,138.0,0.755000,123.5,176.5,106.000000,114.0,...,9.500000,1.700000,0.900000,3.600000,2.600000,0.700000,0.100000,0.100000,0.5,0
3,A00417_2020-02-24,1.0,79.0,2.070000,73.0,1.810000,62.0,36.5,96.000000,28.0,...,10.000000,8.500000,0.500000,0.400000,0.500000,0.100000,0.100000,0.000000,1.0,0
4,A00042_2020-04-05,0.0,9.0,2.290000,104.0,0.640000,131.0,16.0,105.000000,25.0,...,,,,,,,,,0.5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1731,2052,0.0,66.0,2.145700,,0.550000,,54.0,,22.5,...,13.650000,3.450000,0.500000,2.320000,1.040000,0.560000,0.140000,0.020000,0.0,1
1732,2053,0.0,80.0,2.140710,,0.550000,,,91.200000,22.2,...,6.266667,0.316667,0.266667,6.840000,1.373333,0.530000,0.023333,0.020000,1.0,1
1733,2055,1.0,58.0,2.012633,79.0,0.736667,42.0,20.0,100.333333,21.0,...,11.800000,1.366667,0.100000,1.886667,1.116667,0.410000,0.050000,0.003333,1.0,1
1734,2057,1.0,76.0,,,,76.0,22.0,,26.0,...,7.200000,2.200000,0.200000,4.150000,1.100000,0.420000,0.130000,0.010000,0.0,1


In [4]:
## Step 2: Data mining
## 2.1. Look for potential problems in the dataset (column dtypes and non-null counts)
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1736 entries, 0 to 1735
Data columns (total 36 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  1736 non-null   object 
 1   Sex         1736 non-null   float64
 2   Age         1682 non-null   float64
 3   CA          1643 non-null   float64
 4   CK          704 non-null    float64
 5   CREA        1662 non-null   float64
 6   ALP         1262 non-null   float64
 7   GGT         1300 non-null   float64
 8   GLU         1638 non-null   float64
 9   AST         1638 non-null   float64
 10  ALT         1640 non-null   float64
 11  LDH         1433 non-null   float64
 12  PCR         1639 non-null   float64
 13  KAL         1656 non-null   float64
 14  NAT         1663 non-null   float64
 15  UREA        1060 non-null   float64
 16  WBC         1673 non-null   float64
 17  RBC         1673 non-null   float64
 18  HGB         1673 non-null   float64
 19  HCT         1673 non-null  

In [5]:
## 2.2. Remove the low-information variables "Unnamed: 0" and "Suspect"
# Both columns are dropped in one call; a missing column still raises KeyError.
dataset_df = dataset_df.drop(columns = ["Unnamed: 0", "Suspect"])

In [6]:
# Show the data after removing the two variables
display(dataset_df)

Unnamed: 0,Sex,Age,CA,CK,CREA,ALP,GGT,GLU,AST,ALT,...,LY,MO,EO,BA,NET,LYT,MOT,EOT,BAT,target
0,1.0,82.0,2.090000,,1.150000,95.0,40.0,78.000000,26.0,21.000000,...,13.400000,9.500000,2.900000,0.500000,6.400000,1.200000,0.800000,0.300000,0.000000,0
1,1.0,58.0,2.110000,,1.000000,80.0,147.0,106.000000,41.0,36.000000,...,11.200000,7.300000,0.300000,0.100000,5.450000,0.750000,0.500000,0.000000,0.000000,0
2,0.0,82.0,2.270000,138.0,0.755000,123.5,176.5,106.000000,114.0,63.000000,...,36.500000,9.500000,1.700000,0.900000,3.600000,2.600000,0.700000,0.100000,0.100000,0
3,1.0,79.0,2.070000,73.0,1.810000,62.0,36.5,96.000000,28.0,38.500000,...,44.000000,10.000000,8.500000,0.500000,0.400000,0.500000,0.100000,0.100000,0.000000,0
4,0.0,9.0,2.290000,104.0,0.640000,131.0,16.0,105.000000,25.0,13.000000,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1731,0.0,66.0,2.145700,,0.550000,,54.0,,22.5,21.500000,...,25.600000,13.650000,3.450000,0.500000,2.320000,1.040000,0.560000,0.140000,0.020000,1
1732,0.0,80.0,2.140710,,0.550000,,,91.200000,22.2,27.800000,...,17.466667,6.266667,0.316667,0.266667,6.840000,1.373333,0.530000,0.023333,0.020000,1
1733,1.0,58.0,2.012633,79.0,0.736667,42.0,20.0,100.333333,21.0,35.000000,...,31.933333,11.800000,1.366667,0.100000,1.886667,1.116667,0.410000,0.050000,0.003333,1
1734,1.0,76.0,,,,76.0,22.0,,26.0,22.000000,...,18.900000,7.200000,2.200000,0.200000,4.150000,1.100000,0.420000,0.130000,0.010000,1


In [7]:
## 2.3. Fill missing values with the median of each column
# numeric_only=True restricts the medians to numeric columns. Without it,
# pandas >= 2.0 raises a TypeError as soon as the frame holds any non-numeric
# column (for example when this cell is re-run after "target" has been turned
# into text labels). Columns without a median are simply left untouched by
# fillna, so the result is unchanged when every column is numeric.
# NOTE(review): the medians are computed on the full dataset, before the
# train/test split planned for a later step; consider fitting the imputation
# on the training portion only to avoid leaking test information.
dataset_df = dataset_df.fillna(dataset_df.median(numeric_only = True))

In [8]:
# Show the frame again to inspect the result of the fill step
display(dataset_df)

Unnamed: 0,Sex,Age,CA,CK,CREA,ALP,GGT,GLU,AST,ALT,...,LY,MO,EO,BA,NET,LYT,MOT,EOT,BAT,target
0,1.0,82.0,2.090000,86.0,1.150000,95.0,40.0,78.000000,26.0,21.000000,...,13.400000,9.500000,2.900000,0.500000,6.400000,1.200000,0.800000,0.300000,0.000000,0
1,1.0,58.0,2.110000,86.0,1.000000,80.0,147.0,106.000000,41.0,36.000000,...,11.200000,7.300000,0.300000,0.100000,5.450000,0.750000,0.500000,0.000000,0.000000,0
2,0.0,82.0,2.270000,138.0,0.755000,123.5,176.5,106.000000,114.0,63.000000,...,36.500000,9.500000,1.700000,0.900000,3.600000,2.600000,0.700000,0.100000,0.100000,0
3,1.0,79.0,2.070000,73.0,1.810000,62.0,36.5,96.000000,28.0,38.500000,...,44.000000,10.000000,8.500000,0.500000,0.400000,0.500000,0.100000,0.100000,0.000000,0
4,0.0,9.0,2.290000,104.0,0.640000,131.0,16.0,105.000000,25.0,13.000000,...,16.600000,7.450000,0.200000,0.300000,5.300000,1.195000,0.500000,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1731,0.0,66.0,2.145700,86.0,0.550000,71.5,54.0,104.000000,22.5,21.500000,...,25.600000,13.650000,3.450000,0.500000,2.320000,1.040000,0.560000,0.140000,0.020000,1
1732,0.0,80.0,2.140710,86.0,0.550000,71.5,34.0,91.200000,22.2,27.800000,...,17.466667,6.266667,0.316667,0.266667,6.840000,1.373333,0.530000,0.023333,0.020000,1
1733,1.0,58.0,2.012633,79.0,0.736667,42.0,20.0,100.333333,21.0,35.000000,...,31.933333,11.800000,1.366667,0.100000,1.886667,1.116667,0.410000,0.050000,0.003333,1
1734,1.0,76.0,2.200000,86.0,0.940000,76.0,22.0,104.000000,26.0,22.000000,...,18.900000,7.200000,2.200000,0.200000,4.150000,1.100000,0.420000,0.130000,0.010000,1


In [11]:
## 2.4. Inspect the dataset summary after the cleaning steps
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1736 entries, 0 to 1735
Data columns (total 34 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Sex     1736 non-null   float64
 1   Age     1736 non-null   float64
 2   CA      1736 non-null   float64
 3   CK      1736 non-null   float64
 4   CREA    1736 non-null   float64
 5   ALP     1736 non-null   float64
 6   GGT     1736 non-null   float64
 7   GLU     1736 non-null   float64
 8   AST     1736 non-null   float64
 9   ALT     1736 non-null   float64
 10  LDH     1736 non-null   float64
 11  PCR     1736 non-null   float64
 12  KAL     1736 non-null   float64
 13  NAT     1736 non-null   float64
 14  UREA    1736 non-null   float64
 15  WBC     1736 non-null   float64
 16  RBC     1736 non-null   float64
 17  HGB     1736 non-null   float64
 18  HCT     1736 non-null   float64
 19  MCV     1736 non-null   float64
 20  MCH     1736 non-null   float64
 21  MCHC    1736 non-null   float64
 22  

In [12]:
## 2.5. Label the dependent variable "target": 1 = COVID-19 and 0 = Healthy
# A single replace call maps both numeric codes to their text labels.
dataset_df["target"] = dataset_df["target"].replace({1: "COVID-19", 0: "Healthy"})

In [13]:
# Show the first rows after encoding the samples as Healthy / COVID-19
display(dataset_df.head())

Unnamed: 0,Sex,Age,CA,CK,CREA,ALP,GGT,GLU,AST,ALT,...,LY,MO,EO,BA,NET,LYT,MOT,EOT,BAT,target
0,1.0,82.0,2.09,86.0,1.15,95.0,40.0,78.0,26.0,21.0,...,13.4,9.5,2.9,0.5,6.4,1.2,0.8,0.3,0.0,Healthy
1,1.0,58.0,2.11,86.0,1.0,80.0,147.0,106.0,41.0,36.0,...,11.2,7.3,0.3,0.1,5.45,0.75,0.5,0.0,0.0,Healthy
2,0.0,82.0,2.27,138.0,0.755,123.5,176.5,106.0,114.0,63.0,...,36.5,9.5,1.7,0.9,3.6,2.6,0.7,0.1,0.1,Healthy
3,1.0,79.0,2.07,73.0,1.81,62.0,36.5,96.0,28.0,38.5,...,44.0,10.0,8.5,0.5,0.4,0.5,0.1,0.1,0.0,Healthy
4,0.0,9.0,2.29,104.0,0.64,131.0,16.0,105.0,25.0,13.0,...,16.6,7.45,0.2,0.3,5.3,1.195,0.5,0.0,0.0,Healthy


In [14]:

# Print the frame summary (column dtypes and non-null counts)
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1736 entries, 0 to 1735
Data columns (total 34 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Sex     1736 non-null   float64
 1   Age     1736 non-null   float64
 2   CA      1736 non-null   float64
 3   CK      1736 non-null   float64
 4   CREA    1736 non-null   float64
 5   ALP     1736 non-null   float64
 6   GGT     1736 non-null   float64
 7   GLU     1736 non-null   float64
 8   AST     1736 non-null   float64
 9   ALT     1736 non-null   float64
 10  LDH     1736 non-null   float64
 11  PCR     1736 non-null   float64
 12  KAL     1736 non-null   float64
 13  NAT     1736 non-null   float64
 14  UREA    1736 non-null   float64
 15  WBC     1736 non-null   float64
 16  RBC     1736 non-null   float64
 17  HGB     1736 non-null   float64
 18  HCT     1736 non-null   float64
 19  MCV     1736 non-null   float64
 20  MCH     1736 non-null   float64
 21  MCHC    1736 non-null   float64
 22  

# Passo 3:  Análise exploratória




In [17]:
## 3.1. Compare how each biomarker is distributed between COVID-19 patients and the healthy group

### Step 1: import plotly to build the charts
import plotly.express as px

### Step 2: build one histogram per column, with bars split by "target"
# The loop also covers "target" itself; that chart shows the number of
# samples in each group.
for biomarcador in dataset_df.columns:
  gráfico = px.histogram(
      dataset_df,
      x = biomarcador,
      color = "target",
      text_auto = True,
      # A title per figure so each chart can be identified on its own
      title = f"Distribution of {biomarcador} by target",
  )

  ## Step 3: render the chart
  gráfico.show()

In [18]:
## Criando os insights:
## No geral, os níveis de todos os biomarcadores variaram entre os pacientes COVID-19 e os saudáveis
## No geral, os pacientes COVID-19 tiveram níveis de biomarcadores mais altos do que os pacientes saudáveis
## A infecção por COVID-19 parece estar modificando o perfil bioquímico dos pacientes
## Todas as variáveis parecem ser importantes para a análise dos dois grupos de pacientes
## Essas diferenças mostram que um estudo aprofundado de machine learning supervisionado é justificável

In [22]:
## Save the dataset
# index = False leaves the row index out of the CSV file
dataset_df.to_csv("dataset_tratado.csv", index = False)