In [4]:
# Importando os módulos necessários
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# Carregando o dataset
df = pd.read_excel('dataset.xlsx')

# Removendo colunas que não serão utilizadas
df = df.drop(columns=['Patient ID', 'Patient addmited to regular ward (1=yes, 0=no)', 'Patient addmited to semi-intensive unit (1=yes, 0=no)', 'Patient addmited to intensive care unit (1=yes, 0=no)'])

# Mapeando os valores categóricos para numéricos
for col in df.columns:
    if df[col].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(df[col].values))
        df[col] = lbl.transform(df[col].values)

# Removendo colunas com valores faltantes, com um threshold de 90%
df = df[df.columns[df.isna().sum()/df.shape[0] < 0.9]]

# Preenchendo os valores faltantes com a mediana
df = df.fillna(df.median())

# Salva um arquivo auxiliar do dataset tratado
df.to_excel('dataset_tra.xlsx', index=False)

# Carrega o dataset tratado e normaliza os dados
df_aux = pd.read_excel('dataset_tra.xlsx')
df_aux = df_aux.drop(columns=['SARS-Cov-2 exam result'])
df_normalized = preprocessing.normalize(df_aux)

In [10]:
# Árvore de decisão

#variáveis de entrada e saída
X = df_normalized
Y = df['SARS-Cov-2 exam result']

# Divide o dataset em conjuntos de treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=101)

# Define o modelo e o treina
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

print('Avaliação da Árvore de Decisão\n')

# Imprime as métricas de avaliação da árvore de decisão
prds = model.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, prds).ravel()
print(f'True Negatives: {tn}, False Positives: {fp}, False Negatives: {fn}, True Positives: {tp}', '\n',
      'Accuracy:', (accuracy_score(y_test, prds)), '\n',
      'Classification Report:\n', (classification_report(y_test, prds)))

Avaliação da Árvore de Decisão

True Negatives: 1504, False Positives: 28, False Negatives: 152, True Positives: 10 
 Accuracy: 0.8937426210153483 
 Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.98      0.94      1532
           1       0.26      0.06      0.10       162

    accuracy                           0.89      1694
   macro avg       0.59      0.52      0.52      1694
weighted avg       0.85      0.89      0.86      1694



In [11]:
# KNN

# divide os dados em conjuntos de treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=101)

# define o modelo KNN e ajusta o modelo aos dados de treino
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

print('Avaliação do KNN\n')

#Imprime as métricas de avaliação do KNN

prds = model.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, prds).ravel()
print(f'True Negatives: {tn}, False Positives: {fp}, False Negatives: {fn}, True Positives: {tp}', '\n',
'Accuracy:', (accuracy_score(y_test, prds)), '\n',
'Classification Report:\n', (classification_report(y_test, prds)))

Avaliação do KNN

True Negatives: 1528, False Positives: 4, False Negatives: 160, True Positives: 2 
 Accuracy: 0.9031877213695395 
 Classification Report:
               precision    recall  f1-score   support

           0       0.91      1.00      0.95      1532
           1       0.33      0.01      0.02       162

    accuracy                           0.90      1694
   macro avg       0.62      0.50      0.49      1694
weighted avg       0.85      0.90      0.86      1694



In [12]:
# Naive Bayes

# Criação do vetorizador CountVectorizer
vectorizer = CountVectorizer()
# Transformação dos dados de entrada (concatenação de colunas) em um vetor
X = vectorizer.fit_transform(df['Patient age quantile'].astype(str) + ' ' + df['Leukocytes'].astype(str) + ' ' + df['Platelets'].astype(str) + ' ' + df['Monocytes'].astype(str) + ' ' + df['Red blood Cells'].astype(str))
Y = df['SARS-Cov-2 exam result']

# Separação dos dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=101)

# Criação do modelo Naive Bayes e Treinamento do modelo com os dados de treino
model = MultinomialNB()
model.fit(X_train, y_train)

print('Avaliação do Naive Bayes\n')

#Imprime as métricas de avaliação do Naive Bayes

prds = model.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, prds).ravel()
print(f'True Negatives: {tn}, False Positives: {fp}, False Negatives: {fn}, True Positives: {tp}', '\n',
'Accuracy:', (accuracy_score(y_test, prds)), '\n',
'Classification Report:\n', (classification_report(y_test, prds)))

Avaliação do Naive Bayes

True Negatives: 1381, False Positives: 151, False Negatives: 135, True Positives: 27 
 Accuracy: 0.8311688311688312 
 Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.90      0.91      1532
           1       0.15      0.17      0.16       162

    accuracy                           0.83      1694
   macro avg       0.53      0.53      0.53      1694
weighted avg       0.84      0.83      0.83      1694

