In [4]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

# Importando os dados -----------------------------------------------------
str_path = "./data-raw"

#filenames = glob.glob(os.path.join(str_path, "*.dbc"))

#db = pd.concat([read.dbc(filename) for filename in filenames], keys=tools.file_path_sans_ext(basename(filenames)))
db = pd.read_csv('hepatite_br.csv', encoding='ISO-8859-1').drop(columns='Unnamed: 0')

# Visualizando a estrutura dos dados --------------------------------------
print(db)
print(db.info())
print(db.describe())

# Organizando a base ------------------------------------------------------
tidy_db = db.copy()

tidy_db['EXPOSICAO'] = tidy_db.apply(
    lambda row: 1 if (row['SEXUAL'] == 1 or row['OUTRAS'] == 2) else (0 if (row['SEXUAL'] == 3 and row['OUTRAS'] == 3) else None),
    axis=1
)

tidy_db = tidy_db[['CS_SEXO', 'CS_RACA', 'CS_ESCOL_N', 'HEPATITE_N', 'HEPATITA', 'HEPATITB', 'HIV', 'OUTRA_DST', 'EXPOSICAO',
                   'ANTIHAVIGM','GEN_VHC', 'CLASSI_FIN', 'FORMA', 'CLAS_ETIOL']]

hepa_data = tidy_db[(tidy_db['HEPATITE_N'] == 2) & (tidy_db['CS_SEXO'] != "I") & (tidy_db['HEPATITA'] != 9) & 
                    (tidy_db['HEPATITB'] != 9) & (tidy_db['HIV'] != 9) & (tidy_db['OUTRA_DST'] != 9)].copy()

hepa_data = hepa_data[['CS_SEXO', 'HEPATITA', 'HEPATITB', 'HIV', 'OUTRA_DST', 'EXPOSICAO', 'ANTIHAVIGM', 'GEN_VHC', 'CLASSI_FIN']]

# Usando o encoder
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

hepa_data_copy = hepa_data.copy()
hepa_data_copy['CS_SEXO'] = label_encoder.fit_transform(hepa_data_copy['CS_SEXO'])

# Fill na
hepa_data_copy = hepa_data_copy.fillna(0)
# Dividindo o conjunto de dados em treinamento e teste
display(hepa_data_copy)
X = hepa_data_copy.drop(columns=['CLASSI_FIN'])
y = hepa_data_copy['CLASSI_FIN']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# randomForest ------------------------------------------------------------
# Criando modelo de classificação
rf_model = RandomForestClassifier(random_state=123)
rf_model.fit(X_train, y_train)

# Fazendo previsões no conjunto de testes
rf_predictions = rf_model.predict(X_test)

# Avaliando o desempenho do modelo
rf_confusion_matrix = confusion_matrix(y_test, rf_predictions)
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions, average='weighted')
rf_recall = recall_score(y_test, rf_predictions, average='weighted')

print("\nRandom Forest:")
print(rf_confusion_matrix)
print(f"Acurácia: {rf_accuracy}")
print(f"Precision: {rf_precision}")
print(f"Recall: {rf_recall}")

# ----------------------- OUTROS TESTES ------------------ #

# naiveBayes --------------------------------------------------------------
# Criando modelo de classificação Naïve Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Fazendo previsões no conjunto de testes
nb_predictions = nb_model.predict(X_test)

# Avaliando o desempenho do modelo
nb_confusion_matrix = confusion_matrix(y_test, nb_predictions)
nb_accuracy = accuracy_score(y_test, nb_predictions)
nb_precision = precision_score(y_test, nb_predictions, average='weighted')
nb_recall = recall_score(y_test, nb_predictions, average='weighted')

print("\nNaïve Bayes:")
print(nb_confusion_matrix)
print(f"Acurácia: {nb_accuracy}")
print(f"Precision: {nb_precision}")
print(f"Recall: {nb_recall}")


# nnet --------------------------------------------------------------------
# Criando modelo de rede neural
nn_model = MLPClassifier(hidden_layer_sizes=(10,), activation='logistic', max_iter=100, random_state=123)
nn_model.fit(X_train, y_train)


# Fazendo previsões no conjunto de testes
nn_predictions = nn_model.predict(X_test)

# Avaliando o desempenho do modelo
nn_confusion_matrix = confusion_matrix(y_test, nn_predictions)
nn_accuracy = accuracy_score(y_test, nn_predictions)
nn_precision = precision_score(y_test, nn_predictions, average='weighted')
nn_recall = recall_score(y_test, nn_predictions, average='weighted')

print("\nRede Neural:")
print(nn_confusion_matrix)
print(f"Acurácia: {nn_accuracy}")
print(f"Precision: {nn_precision}")
print(f"Recall: {nn_recall}")

# gradientBoosting --------------------------------------------------------
# Criando modelo Gradient Boosting
gb_model = GradientBoostingClassifier(random_state=123)
gb_model.fit(X_train, y_train)

# Fazendo previsões no conjunto de testes
gb_predictions = gb_model.predict(X_test)

# Avaliando o desempenho do modelo
gb_confusion_matrix = confusion_matrix(y_test, gb_predictions)
gb_accuracy = accuracy_score(y_test, gb_predictions)
gb_precision = precision_score(y_test, gb_predictions, average='weighted')
gb_recall = recall_score(y_test, gb_predictions, average='weighted')

print("\nGradient Boosting:")
print(gb_confusion_matrix)
print(f"Acurácia: {gb_accuracy}")
print(f"Precision: {gb_precision}")
print(f"Recall: {gb_recall}")

  db = pd.read_csv('hepatite_br.csv', encoding='ISO-8859-1').drop(columns='Unnamed: 0')


        TP_NOT ID_AGRAVO  DT_NOTIFIC  SEM_NOT  NU_ANO  SG_UF_NOT  ID_MUNICIP  \
0            2       B19  2007-01-02   200701    2007       11.0      110140   
1            2       B19  2007-01-22   200704    2007       11.0      110030   
2            2       B19  2007-01-01   200701    2007       11.0      110030   
3            2       B19  2007-01-05   200701    2007       11.0      110004   
4            2       B19  2007-01-09   200702    2007       11.0      110045   
...        ...       ...         ...      ...     ...        ...         ...   
594230       2       B19  2020-12-24   202052    2020       53.0      530010   
594231       2       B19  2020-12-28   202053    2020       53.0      530010   
594232       2       B19  2020-12-28   202053    2020       53.0      530010   
594233       2       B19  2020-12-29   202053    2020       53.0      530010   
594234       2       B19  2020-12-30   202053    2020       53.0      530010   

        ID_REGIONA  DT_SIN_PRI  SEM_PRI

Unnamed: 0,CS_SEXO,HEPATITA,HEPATITB,HIV,OUTRA_DST,EXPOSICAO,ANTIHAVIGM,GEN_VHC,CLASSI_FIN
0,0,3.0,3.0,2.0,2.0,0.0,4.0,7.0,1.0
1,0,3.0,2.0,2.0,2.0,0.0,4.0,7.0,1.0
3,0,3.0,1.0,2.0,1.0,0.0,4.0,7.0,1.0
4,0,3.0,1.0,2.0,2.0,0.0,4.0,9.0,1.0
5,0,3.0,1.0,2.0,2.0,0.0,4.0,7.0,4.0
...,...,...,...,...,...,...,...,...,...
594223,0,1.0,1.0,2.0,2.0,0.0,4.0,7.0,1.0
594224,1,3.0,2.0,2.0,2.0,0.0,4.0,0.0,1.0
594225,0,3.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0
594226,0,1.0,1.0,2.0,2.0,0.0,4.0,7.0,1.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Random Forest:
[[    0   111     0     0     0     0]
 [    0 46462     0     2    23    28]
 [    0   155     0     0     0     0]
 [    0   930     0     0     1     0]
 [    0  3237     0     0    23     1]
 [    0  4135     0     1     2     2]]
Acurácia: 0.8434852031281186
Precision: 0.7452058368099114
Recall: 0.8434852031281186

Naïve Bayes:
[[    0   109     0     0     0     2]
 [    1 44492    18     0   629  1375]
 [    0   151     0     0     2     2]
 [    0   900     0     0     6    25]
 [    0  3073     1     0    53   134]
 [    0  3837     0     0    51   252]]
Acurácia: 0.8128209315406528
Precision: 0.7292198714812633
Recall: 0.8128209315406528


  _warn_prf(average, modifier, msg_start, len(result))



Rede Neural:
[[    0   111     0     0     0     0]
 [    0 46515     0     0     0     0]
 [    0   155     0     0     0     0]
 [    0   931     0     0     0     0]
 [    0  3261     0     0     0     0]
 [    0  4140     0     0     0     0]]
Acurácia: 0.8439932502313429
Precision: 0.7123246064360661
Recall: 0.8439932502313429

Gradient Boosting:
[[    0   111     0     0     0     0]
 [    1 46500     0     0    14     0]
 [    0   155     0     0     0     0]
 [    0   930     0     0     1     0]
 [    0  3240     0     0    21     0]
 [    0  4138     0     0     2     0]]
Acurácia: 0.8441021174677481
Precision: 0.745298007401944
Recall: 0.8441021174677481


  _warn_prf(average, modifier, msg_start, len(result))
