# Import dos dados

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [4]:
import skops.io as sio

# Funcoes Helpers

In [24]:
def calculate_precision(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy}")

    # Classification report
    print("Classification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion matrix
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

In [3]:
def export_model(model, model_name):
    sio.dump(model, model_name + ".skops")

# Tratamento dos Dados

In [5]:
df = pd.read_csv("dengue_sinan.csv", usecols=[
    "FEBRE",
    "MIALGIA",
    "CEFALEIA",
    "EXANTEMA",
    "VOMITO",
    "NAUSEA",
    "DOR_COSTAS",
    "CONJUNTVIT",
    "ARTRITE",
    "ARTRALGIA",
    "PETEQUIA_N",
    "LEUCOPENIA",
    "LACO",
    "DOR_RETRO",
    "CLASSI_FIN",
    "CRITERIO",
    "ALRM_HIPOT",
    "ALRM_PLAQ",
    "ALRM_VOM",
    "ALRM_LETAR",
    "ALRM_SANG",
    "ALRM_HEMAT",
    "ALRM_HEPAT",
    "ALRM_LIQ",
    "GRAV_PULSO",
    "GRAV_CONV",
    "GRAV_ENCH",
    "GRAV_INSUF",
    "GRAV_TAQUI",
    "GRAV_EXTRE",
    "GRAV_HIPOT",
    "GRAV_HEMAT",
    "GRAV_MELEN",
    "GRAV_METRO",
    "GRAV_SANG",
    "GRAV_AST",
    "GRAV_MIOC",
    "GRAV_CONSC",
    "GRAV_ORGAO",
    "DIABETES", # Doencas pre-existentes
    "HEMATOLOG",
    "HEPATOPAT",
    "RENAL",
    "HIPERTENSA",
    "ACIDO_PEPT",
    "AUTO_IMUNE",
    "RESUL_SORO", # Resultados exames
    "RESUL_NS1",
    "RESUL_VI_N",
    "RESUL_PCR_",
], low_memory=False)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 620211 entries, 0 to 620210
Data columns (total 50 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   FEBRE       540788 non-null  float64
 1   MIALGIA     540788 non-null  float64
 2   CEFALEIA    540788 non-null  float64
 3   EXANTEMA    540788 non-null  float64
 4   VOMITO      540788 non-null  float64
 5   NAUSEA      540788 non-null  float64
 6   DOR_COSTAS  540788 non-null  float64
 7   CONJUNTVIT  540788 non-null  float64
 8   ARTRITE     540788 non-null  float64
 9   ARTRALGIA   540788 non-null  float64
 10  PETEQUIA_N  540788 non-null  float64
 11  LEUCOPENIA  540788 non-null  float64
 12  LACO        540788 non-null  float64
 13  DOR_RETRO   540788 non-null  float64
 14  DIABETES    540788 non-null  float64
 15  HEMATOLOG   540788 non-null  float64
 16  HEPATOPAT   540788 non-null  float64
 17  RENAL       540788 non-null  float64
 18  HIPERTENSA  540788 non-null  float64
 19  AC

Removendo as linhas com "valores criticos" nulos

In [7]:
df.dropna(subset=[
    "FEBRE",
    "MIALGIA",
    "CEFALEIA",
    "EXANTEMA",
    "VOMITO",
    "NAUSEA",
    "DOR_COSTAS",
    "CONJUNTVIT",
    "ARTRITE",
    "ARTRALGIA",
    "PETEQUIA_N",
    "LEUCOPENIA",
    "LACO",
    "DOR_RETRO",
    "CLASSI_FIN",
    "CRITERIO"
    ], inplace=True, how='any')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 379442 entries, 2 to 615873
Data columns (total 50 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   FEBRE       379442 non-null  float64
 1   MIALGIA     379442 non-null  float64
 2   CEFALEIA    379442 non-null  float64
 3   EXANTEMA    379442 non-null  float64
 4   VOMITO      379442 non-null  float64
 5   NAUSEA      379442 non-null  float64
 6   DOR_COSTAS  379442 non-null  float64
 7   CONJUNTVIT  379442 non-null  float64
 8   ARTRITE     379442 non-null  float64
 9   ARTRALGIA   379442 non-null  float64
 10  PETEQUIA_N  379442 non-null  float64
 11  LEUCOPENIA  379442 non-null  float64
 12  LACO        379442 non-null  float64
 13  DOR_RETRO   379442 non-null  float64
 14  DIABETES    379442 non-null  float64
 15  HEMATOLOG   379442 non-null  float64
 16  HEPATOPAT   379442 non-null  float64
 17  RENAL       379442 non-null  float64
 18  HIPERTENSA  379442 non-null  float64
 19  ACIDO_P

## Removendo os dados que nao fazem sentido para a analise

In [9]:
df = df[df['CLASSI_FIN'].isin([5, 10, 11, 12])]

Removendo os criticos e alarmes sem dados dos alarmes e criticos

In [10]:
grav = [
    "GRAV_PULSO",
    "GRAV_CONV",
    "GRAV_ENCH",
    "GRAV_INSUF",
    "GRAV_TAQUI",
    "GRAV_EXTRE",
    "GRAV_HIPOT",
    "GRAV_HEMAT",
    "GRAV_MELEN",
    "GRAV_METRO",
    "GRAV_SANG",
    "GRAV_AST",
    "GRAV_MIOC",
    "GRAV_CONSC",
    "GRAV_ORGAO"
]
df = df[~(df['CLASSI_FIN'] == 12 & df[grav].isna().all(axis=1))]

In [11]:
alarm = [
    "ALRM_HIPOT",
    "ALRM_PLAQ",
    "ALRM_VOM",
    "ALRM_LETAR",
    "ALRM_SANG",
    "ALRM_HEMAT",
    "ALRM_HEPAT",
    "ALRM_LIQ",
]
df = df[~(df['CLASSI_FIN'] == 11 & df[alarm].isna().all(axis=1))]

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 371754 entries, 2 to 615873
Data columns (total 50 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   FEBRE       371754 non-null  float64
 1   MIALGIA     371754 non-null  float64
 2   CEFALEIA    371754 non-null  float64
 3   EXANTEMA    371754 non-null  float64
 4   VOMITO      371754 non-null  float64
 5   NAUSEA      371754 non-null  float64
 6   DOR_COSTAS  371754 non-null  float64
 7   CONJUNTVIT  371754 non-null  float64
 8   ARTRITE     371754 non-null  float64
 9   ARTRALGIA   371754 non-null  float64
 10  PETEQUIA_N  371754 non-null  float64
 11  LEUCOPENIA  371754 non-null  float64
 12  LACO        371754 non-null  float64
 13  DOR_RETRO   371754 non-null  float64
 14  DIABETES    371754 non-null  float64
 15  HEMATOLOG   371754 non-null  float64
 16  HEPATOPAT   371754 non-null  float64
 17  RENAL       371754 non-null  float64
 18  HIPERTENSA  371754 non-null  float64
 19  ACIDO_P

## Tratando os Resultados dos testes

In [13]:
resul = [
    "RESUL_SORO", # Resultados exames
    "RESUL_NS1",
    "RESUL_VI_N",
    "RESUL_PCR_",
]

df[resul] = df[resul].replace(3, 0)
df[resul] = df[resul].replace(4, 0)

# Mudando os numeros para facilitar visualizacao

In [14]:
resul = [
    "RESUL_SORO", # Resultados exames
    "RESUL_NS1",
    "RESUL_VI_N",
    "RESUL_PCR_",
    "CRITERIO"
]


df[df.columns.difference(resul)] = df[df.columns.difference(resul)].fillna(value=2.0)
df[resul] = df[resul].fillna(value=0)
df[df.columns.difference(resul)] = df[df.columns.difference(resul)].replace(2, 0)

In [15]:
df.describe()

Unnamed: 0,FEBRE,MIALGIA,CEFALEIA,EXANTEMA,VOMITO,NAUSEA,DOR_COSTAS,CONJUNTVIT,ARTRITE,ARTRALGIA,...,GRAV_EXTRE,GRAV_HIPOT,GRAV_HEMAT,GRAV_MELEN,GRAV_METRO,GRAV_SANG,GRAV_AST,GRAV_MIOC,GRAV_CONSC,GRAV_ORGAO
count,371754.0,371754.0,371754.0,371754.0,371754.0,371754.0,371754.0,371754.0,371754.0,371754.0,...,371754.0,371754.0,371754.0,371754.0,371754.0,371754.0,371754.0,371754.0,371754.0,371754.0
mean,0.852093,0.731941,0.753224,0.202042,0.238873,0.325223,0.263637,0.038835,0.099934,0.263053,...,0.00046,0.000336,0.000379,0.000304,0.000129,4.6e-05,6.5e-05,2.4e-05,0.000336,0.000161
std,0.355008,0.442949,0.431136,0.401524,0.426396,0.468459,0.440605,0.193201,0.299913,0.440292,...,0.021442,0.018334,0.019472,0.017432,0.011362,0.006762,0.008035,0.00492,0.018334,0.012703
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Removendo tabelas com dengue mas sem informacao

In [16]:
df = df[~((df['CLASSI_FIN'].isin([1])) & (df.drop(columns=['CLASSI_FIN']).eq(0)).all(axis=1))]

In [17]:

df['CLASSI_FIN'] = df['CLASSI_FIN'].replace(5, 0)
df['CLASSI_FIN'] = df['CLASSI_FIN'].replace(10, 1)
df['CLASSI_FIN'] = df['CLASSI_FIN'].replace(11, 1)
df['CLASSI_FIN'] = df['CLASSI_FIN'].replace(12, 1)

In [18]:
df = df[~(df['CRITERIO'] == 3)]

# Trainando

In [19]:
X = df.drop(["CLASSI_FIN"], axis=1)
y = df['CLASSI_FIN']

In [20]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)

In [25]:
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform', metric='euclidean')
knn.fit(X_train, y_train)
y_val_pred = knn.predict(X_val)
calculate_precision(y_val, y_val_pred)

print('---------')
y_test_pred = knn.predict(X_test)
calculate_precision(y_test, y_test_pred)

Accuracy: 0.7382806471075192
Classification Report:
              precision    recall  f1-score   support

         0.0       0.68      0.73      0.71     30727
         1.0       0.79      0.74      0.76     40544

    accuracy                           0.74     71271
   macro avg       0.73      0.74      0.74     71271
weighted avg       0.74      0.74      0.74     71271

Confusion Matrix:
[[22491  8236]
 [10417 30127]]
---------
Accuracy: 0.7387156066282219
Classification Report:
              precision    recall  f1-score   support

         0.0       0.68      0.73      0.71     30689
         1.0       0.79      0.74      0.76     40582

    accuracy                           0.74     71271
   macro avg       0.73      0.74      0.74     71271
weighted avg       0.74      0.74      0.74     71271

Confusion Matrix:
[[22550  8139]
 [10483 30099]]


In [26]:
dt_classifier = DecisionTreeClassifier(criterion='entropy', min_samples_split=2)
dt_classifier.fit(X_train, y_train)
y_val_pred = dt_classifier.predict(X_val)
calculate_precision(y_val, y_val_pred)

print('--------')

y_test_pred = dt_classifier.predict(X_test)
calculate_precision(y_test, y_test_pred)

Accuracy: 0.7795035849083077
Classification Report:
              precision    recall  f1-score   support

         0.0       0.82      0.63      0.71     30727
         1.0       0.76      0.89      0.82     40544

    accuracy                           0.78     71271
   macro avg       0.79      0.76      0.77     71271
weighted avg       0.79      0.78      0.77     71271

Confusion Matrix:
[[19308 11419]
 [ 4296 36248]]
--------
Accuracy: 0.7821414039370852
Classification Report:
              precision    recall  f1-score   support

         0.0       0.82      0.64      0.72     30689
         1.0       0.76      0.89      0.82     40582

    accuracy                           0.78     71271
   macro avg       0.79      0.76      0.77     71271
weighted avg       0.79      0.78      0.78     71271

Confusion Matrix:
[[19511 11178]
 [ 4349 36233]]


In [27]:
rf_classifier = RandomForestClassifier(n_estimators=100, criterion='entropy')
rf_classifier.fit(X_train, y_train)
y_val_pred = rf_classifier.predict(X_val)
calculate_precision(y_val, y_val_pred)

print('------')

y_val_pred = rf_classifier.predict(X_test)
calculate_precision(y_test, y_test_pred)

Accuracy: 0.7844986039202481
Classification Report:
              precision    recall  f1-score   support

         0.0       0.85      0.61      0.71     30727
         1.0       0.76      0.92      0.83     40544

    accuracy                           0.78     71271
   macro avg       0.80      0.76      0.77     71271
weighted avg       0.80      0.78      0.78     71271

Confusion Matrix:
[[18768 11959]
 [ 3400 37144]]
------
Accuracy: 0.7821414039370852
Classification Report:
              precision    recall  f1-score   support

         0.0       0.82      0.64      0.72     30689
         1.0       0.76      0.89      0.82     40582

    accuracy                           0.78     71271
   macro avg       0.79      0.76      0.77     71271
weighted avg       0.79      0.78      0.78     71271

Confusion Matrix:
[[19511 11178]
 [ 4349 36233]]


In [28]:
logistic_classifier = LogisticRegression(max_iter=100, penalty='l2', solver='lbfgs')
logistic_classifier.fit(X_train, y_train)
y_pred = logistic_classifier.predict(X_val)
calculate_precision(y_val, y_val_pred)

print('-------')

y_val_pred = logistic_classifier.predict(X_test)
calculate_precision(y_test, y_test_pred)

Accuracy: 0.5234527367372424
Classification Report:
              precision    recall  f1-score   support

         0.0       0.43      0.31      0.36     30727
         1.0       0.57      0.68      0.62     40544

    accuracy                           0.52     71271
   macro avg       0.50      0.50      0.49     71271
weighted avg       0.51      0.52      0.51     71271

Confusion Matrix:
[[ 9604 21123]
 [12841 27703]]
-------
Accuracy: 0.7821414039370852
Classification Report:
              precision    recall  f1-score   support

         0.0       0.82      0.64      0.72     30689
         1.0       0.76      0.89      0.82     40582

    accuracy                           0.78     71271
   macro avg       0.79      0.76      0.77     71271
weighted avg       0.79      0.78      0.78     71271

Confusion Matrix:
[[19511 11178]
 [ 4349 36233]]


In [29]:
mlp_classifier = MLPClassifier(hidden_layer_sizes=(2,))
mlp_classifier.fit(X_train, y_train)
y_pred = mlp_classifier.predict(X_val)
calculate_precision(y_val, y_val_pred)

print('----')
y_val_pred = mlp_classifier.predict(X_test)
calculate_precision(y_test, y_test_pred)

Accuracy: 0.5259362152909318
Classification Report:
              precision    recall  f1-score   support

         0.0       0.43      0.29      0.34     30727
         1.0       0.57      0.71      0.63     40544

    accuracy                           0.53     71271
   macro avg       0.50      0.50      0.49     71271
weighted avg       0.51      0.53      0.51     71271

Confusion Matrix:
[[ 8874 21853]
 [11934 28610]]
----
Accuracy: 0.7821414039370852
Classification Report:
              precision    recall  f1-score   support

         0.0       0.82      0.64      0.72     30689
         1.0       0.76      0.89      0.82     40582

    accuracy                           0.78     71271
   macro avg       0.79      0.76      0.77     71271
weighted avg       0.79      0.78      0.78     71271

Confusion Matrix:
[[19511 11178]
 [ 4349 36233]]


In [None]:
export_model(dt_classifier, "DecisionTree")
export_model(knn, "knn")
export_model(rf_classifier, "RandomForest")
export_model(mlp_classifier, "NeuralNetwork")
export_model(logistic_classifier, "Logistic")