In [1]:
import pandas as pd
import numpy as np
import json
import tqdm
import re
import nltk

import seaborn as sns
import matplotlib.pyplot as plt


from string import punctuation
from scipy.stats import norm
import scipy.special as sc

import warnings
import subprocess
import sys

from os.path import exists
from collections import defaultdict

from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.metrics import classification_report, f1_score, accuracy_score, roc_auc_score, recall_score, precision_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB, CategoricalNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import label_binarize
# TensorFlow/Keras imports
# import tensorflow as tf
# from keras.models import Sequential
# from keras.layers import LSTM, GRU,SimpleRNN, Dense, Activation, Dropout, Embedding, BatchNormalization
# from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
# from keras.layers import TextVectorization
#from keras.preprocessing import sequence, text as txt
# from keras.callbacks import EarlyStopping


nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import fetch_20newsgroups

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\70275727459\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\70275727459\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\70275727459\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\70275727459\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### 0.2. Load data

In [2]:
path = r'C:\Desktop\IntersectBayesClassifier\ohs_data\ohsumed-clean-disease.csv'
data = pd.read_csv(path).dropna().reset_index(drop=True)

In [3]:
data

Unnamed: 0,file_name,text,Category
0,11,haemophilus influenzae meningitis prolong hosp...,Bacterial Infections and Mycoses
1,21,augmentation mentoplasty use mersilene mesh ma...,Bacterial Infections and Mycoses
2,33,multiple intracranial mucoceles associate para...,Bacterial Infections and Mycoses
3,52,replacement aortic valve cusp neonatal endocar...,Bacterial Infections and Mycoses
4,157,mucosal intussusception avoid ascend cholangit...,Bacterial Infections and Mycoses
...,...,...,...
34384,50113,pituitary hormone response hormone secondary a...,Pathological Conditions
34385,50126,effect antiarrhythmic drug canine atrial flutt...,Pathological Conditions
34386,50144,effect immediate postoperative enteral nutriti...,Pathological Conditions
34387,50148,effect enteral fat emulsion fat absorption obs...,Pathological Conditions


In [4]:
text_col = "text"
label_col = "Category"

In [5]:
data[label_col].value_counts()

Category
Neoplasms                                              6070
Cardiovascular Diseases                                4657
Nervous System Diseases                                2844
Bacterial Infections and Mycoses                       2540
Digestive System Diseases                              2144
Pathological Conditions                                1924
Respiratory Tract Diseases                             1650
Urologic and Male Genital Diseases                     1583
Disorders of Environmental Origin                      1572
Musculoskeletal Diseases                               1376
Immunologic Diseases                                   1308
Nutritional and Metabolic Diseases                     1049
Virus Diseases                                          983
Female Genital Diseases and Pregnancy Complications     902
Skin and Connective Tissue Diseases                     798
Eye Diseases                                            639
Hemic and Lymphatic Diseases   

In [6]:
categories_to_remove = [
    'Pathological Conditions', 'Respiratory Tract Diseases',
    'Urologic and Male Genital Diseases', 'Disorders of Environmental Origin',
    'Musculoskeletal Diseases', 'Immunologic Diseases',
    'Nutritional and Metabolic Diseases', 'Virus Diseases',
    'Female Genital Diseases and Pregnancy Complications',
    'Skin and Connective Tissue Diseases', 'Eye Diseases',
    'Hemic and Lymphatic Diseases', 'Neonatal Diseases and Abnormalities',
    'Otorhinolaryngologic Diseases', 'Parasitic Diseases',
    'Stomatognathic Diseases', 'Endocrine Diseases', 'Animal Diseases'
]


data_filter = data[~data['Category'].isin(categories_to_remove)]
data_filter[label_col].value_counts()

Category
Neoplasms                           6070
Cardiovascular Diseases             4657
Nervous System Diseases             2844
Bacterial Infections and Mycoses    2540
Digestive System Diseases           2144
Name: count, dtype: int64

In [7]:
X = data_filter[text_col].values.reshape(-1,1)  # Features
y = data_filter[label_col]  # Target

sampling_strategy_under = {
    "Neoplasms": 2000,
    "Cardiovascular Diseases": 2000,
    "Nervous System Diseases": 2000,
    "Bacterial Infections and Mycoses": 2000,
    "Digestive System Diseases": 2000
}



undersampler = RandomUnderSampler(sampling_strategy=sampling_strategy_under, random_state=42) #reduzir número de amostras
X_under, y_under = undersampler.fit_resample(X, y)


data_df = pd.DataFrame({
    text_col: X_under.flatten(),  # converter de matriz 2D para 1D
    label_col: y_under
})

print("Distribuição final das categorias:")
print(data_df[label_col].value_counts())

Distribuição final das categorias:
Category
Bacterial Infections and Mycoses    2000
Cardiovascular Diseases             2000
Digestive System Diseases           2000
Neoplasms                           2000
Nervous System Diseases             2000
Name: count, dtype: int64


In [8]:
text_col = "text"
label_col = "Category"

In [9]:
data_df = data_df.sample(frac=1, random_state=42).reset_index(drop=True)

## 1. Create Words data structures

In [10]:
word_encoding_dict = dict()
label_encoding_dict = dict()

presence_word_dict = dict()
presence_label_dict = dict()

num_words = num_labels = 0
for i, (text, label) in enumerate(zip(data_df[text_col], data_df[label_col])):
    # Label encoding
    if label in label_encoding_dict:
        presence_label_dict[label_encoding_dict[label]].add(i)
    else:
        num_labels += 1
        label_encoding_dict[label] = num_labels
        presence_label_dict[num_labels] = {i}
    # Text encoding
    splitted = text.split(" ")
    for word in splitted:
        if word in word_encoding_dict:
            presence_word_dict[word_encoding_dict[word]].add(i)
        else:
            num_words += 1
            word_encoding_dict[word] = num_words
            presence_word_dict[num_words] = {i}

In [82]:
# prob_matrix = np.zeros((200, 5))
# prob_matrix

In [11]:
print(label_encoding_dict)

{'Neoplasms': 1, 'Digestive System Diseases': 2, 'Bacterial Infections and Mycoses': 3, 'Cardiovascular Diseases': 4, 'Nervous System Diseases': 5}


# Intersect Bayes

In [21]:
matrix_intersection = {}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
#train_df, test_df = train_test_split(data_df, test_size=0.2, random_state=42)

fold_metrics = []

for fold, (train_index, test_index) in enumerate(kf.split(data_df)):
    print(f"Fold {fold + 1}")

    # split test and train data
    train_data = data_df.iloc[train_index]
    test_data = data_df.iloc[test_index]

    word_encoding_dict = {}
    presence_word_dict = {}
    label_encoding_dict = {}
    num_words = 0

    # recreates de dictioanries
    for i, (text, label) in enumerate(zip(train_data[text_col], train_data[label_col])):
        if label not in label_encoding_dict:
            label_encoding_dict[label] = len(label_encoding_dict) + 1
        splitted = text.split(" ")
        for word in splitted:
            if word not in word_encoding_dict:
                num_words += 1
                word_encoding_dict[word] = num_words
                presence_word_dict[num_words] = set()
            presence_word_dict[word_encoding_dict[word]].add(i)

y_pred = np.zeros(len(test_data))
y_true = np.zeros(len(test_data))
# prob_matrix = np.zeros((10000, 5))
prob_matrix = np.zeros((len(test_data), len(label_encoding_dict))) # matrix of probabilities


for i, (text, target) in enumerate(zip(test_data[text_col], test_data[label_col])):
    splitted = list(set(text.split(" ")))
    clusters = []
    for j in range(len(splitted) - 1):
        if splitted[j] not in word_encoding_dict:
            continue
        a = word_encoding_dict[splitted[j]]
        for k in range(j + 1, len(splitted)):
            if splitted[k] not in word_encoding_dict:
                continue
            b = word_encoding_dict[splitted[k]]
            min_index = min(a, b)
            max_index = max(a, b)

            if min_index in matrix_intersection:
                if max_index in matrix_intersection[min_index]:
                    intersection = matrix_intersection[min_index][max_index]
                else:
                    intersection = len(presence_word_dict[a] & presence_word_dict[b])
                    matrix_intersection[min_index][max_index] = intersection
            else:
                matrix_intersection[min_index] = dict()
                intersection = len(presence_word_dict[a] & presence_word_dict[b])
                matrix_intersection[min_index][max_index] = intersection
            if intersection > 0:
                clusters.append({
                    "a": min_index,
                    "b": max_index,
                    "intersection": intersection
                })
            #if len(intersection) > 0:
            #    clusters.append({
            #        "a": min_index,
            #        "b": max_index,
            #        "intersection": len(intersection)
            #    })

    # Sort the clusters
    sorted_clusters = sorted(clusters, key=lambda x: x['intersection'], reverse=True)
    # Create the partition
    covered_elements = set()
    remaining = len(splitted)
    partition = []
    k = 0.5
    probs = {}
    for cluster in sorted_clusters:
        a = cluster['a']
        b = cluster['b']
        if a not in covered_elements and b not in covered_elements:
            #partition.append(cluster)
            covered_elements.add(a)
            covered_elements.add(b)
            remaining -= 2
            counts = train_data.iloc[list(presence_word_dict[a] &
                                    presence_word_dict[b])]['Category'].value_counts()
            # Compute the probabilities for the cluster
            for count in counts.items():
                label = label_encoding_dict[count[0]]
                prob = (k + count[1]) / (2 * k + counts.sum())
                if label in probs:
                    probs[label].append(prob)
                else:
                    probs[label] = [prob]

        if remaining < 1: # There is no one left to add with a intersection
            break
    # Compute the prior probabilities (log-space to prevent precision issues)
    prioris = {}
    for label, prob_list in probs.items():
        category_name = list(label_encoding_dict.keys())[list(label_encoding_dict.values()).index(label)]
        p = 0  # Start with 0 for summation in log-space

        # Sum the log-probabilities for the present elements
        for prob in prob_list:
            p += np.log(prob)
        prioris[category_name] = p

        # Account for the clusters where the label is absent
        absent_clusters = len(partition) - len(prob_list)
        if absent_clusters == len(partition):
          p = np.log(k / (2 * k))
          # if absent_clusters > 0:
          #   p += absent_clusters * np.log(k / (2 * k))

        prioris[label] = p

    # Compute the global prior counts for each label
    count_labels = train_data[label_col].value_counts().to_dict()
    count_labels = {label_encoding_dict[label]: count_labels[label] for label in count_labels}

    # Naive Bayes denominator (in log-space for stability)
    log_denominator = None
    for label in count_labels:
        log_value = prioris[label] + np.log(count_labels[label])
        log_denominator = np.logaddexp(log_denominator, log_value) if log_denominator is not None else log_value

    # Determine the label with the maximum posterior probability
    max_prob = -np.inf
    max_label = None
    posterioris = {}
    for label in count_labels:
        category_name = list(label_encoding_dict.keys())[list(label_encoding_dict.values()).index(label)]
        log_numerator = prioris[label] + np.log(count_labels[label])
        posterior_prob = np.exp(log_numerator - log_denominator)  # Convert back to probability space
        posterioris[label] = posterior_prob
        if posterior_prob > max_prob:
            max_prob = posterior_prob
            max_label = category_name
    
    # Fill the probability matrix
    for label, prob in posterioris.items():
        if label in label_encoding_dict:
            prob_matrix[i, label_encoding_dict[label]] = prob


    y_pred[i] = label_encoding_dict[max_label]
    y_true[i] = label_encoding_dict[target]

    # Result
    print(f"Predicted Label: {max_label}, Probability: {max_prob} | Real: {label_encoding_dict[target]}")
    if i > 500:
        break

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Predicted Label: Digestive System Diseases, Probability: 0.7262529102106959 | Real: 3
Predicted Label: Cardiovascular Diseases, Probability: 0.9985360859688912 | Real: 3
Predicted Label: Cardiovascular Diseases, Probability: 0.9964604003198373 | Real: 1
Predicted Label: Bacterial Infections and Mycoses, Probability: 0.9367840753087537 | Real: 2
Predicted Label: Neoplasms, Probability: 0.9947922964083964 | Real: 1
Predicted Label: Digestive System Diseases, Probability: 0.817349934851859 | Real: 3
Predicted Label: Cardiovascular Diseases, Probability: 0.9999951989167146 | Real: 4
Predicted Label: Cardiovascular Diseases, Probability: 0.999990882638276 | Real: 4
Predicted Label: Cardiovascular Diseases, Probability: 0.9998393937513661 | Real: 4
Predicted Label: Bacterial Infections and Mycoses, Probability: 0.9763496652307931 | Real: 1
Predicted Label: Cardiovascular Diseases, Probability: 0.9959210832683846 | Real: 2
Predicted Label: Bacterial Infectio

In [85]:
# y_pred[:100]

In [86]:
# print("Categorias e seus respectivos labels:")
# for category, label in label_encoding_dict.items():
#     print(f"Label: {label}, Categoria: {category}")

In [22]:
# just to print the results

num_classes = len(label_encoding_dict)
y_true_binary = np.zeros((len(y_true), num_classes))

for i, label in enumerate(y_true):
  y_true_binary[i, int(label) - 1] = 1


prob_matrix = np.exp(prob_matrix) / np.sum(np.exp(prob_matrix), axis=1, keepdims=True)


roc_auc_scores = [roc_auc_score(y_true_binary[:, i], prob_matrix[:, i]) for i in range(num_classes)]

f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
acc = accuracy_score(y_true, y_pred)
recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)

print("INTERSECT BAYES")
print(f"Fold {fold + 1} - Precision: {precision:.4f}, Recall: {recall:.4f}, Accuracy: {acc:.4f}, F1: {f1:.4f}, ROC AUC: {np.mean(roc_auc_scores):.4f}")

fold_metrics.append({
  "precision": precision,
  "recall": recall,
  "accuracy": acc,
  "f1": f1,
  "roc_auc": np.mean(roc_auc_scores)
})


final_metrics = {metric: np.mean([fold[metric] for fold in fold_metrics]) for metric in fold_metrics[0]}
print("\nResultados finais (Média dos 5 Folds):")
for metric, value in final_metrics.items():
  print(f"{metric.capitalize()}: {value:.4f}")


INTERSECT BAYES
Fold 5 - Precision: 0.8802, Recall: 0.8545, Accuracy: 0.8545, F1: 0.8519, ROC AUC: 0.5000

Resultados finais (Média dos 5 Folds):
Precision: 0.8802
Recall: 0.8545
Accuracy: 0.8545
F1: 0.8519
Roc_auc: 0.5000


# Multinomial Naive bayes

In [89]:
def CountOrTfidf_vec(train_texts, test_texts, method):
    if method == "count":
        vectorizer = CountVectorizer(min_df=5, max_features=30000)
    elif method == "tfidf":
        vectorizer = TfidfVectorizer(min_df=5, max_features=30000)
    else:
        raise ValueError("O método deve ser 'count' ou 'tfidf'")
    x_train = vectorizer.fit_transform(train_texts)
    x_test = vectorizer.transform(test_texts)
    return x_train, x_test

In [90]:
f1_mnb, auc_mnb, acc_mnb, recall_mnb, precision_mnb = (
    np.zeros(5), np.zeros(5), np.zeros(5), np.zeros(5), np.zeros(5)
)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_index, test_index) in enumerate(kf.split(data_df)):
    print(f"Fold {fold + 1}/5")

    train = data_df.iloc[train_index]
    test = data_df.iloc[test_index].head(500) # limit the number of samples

    x_trainCorT, x_testCorT = CountOrTfidf_vec(train[text_col], test[text_col], "count")

    nb = MultinomialNB()
    nb.fit(x_trainCorT, train[label_col])

    pred_mnb = nb.predict(x_testCorT)
    pred_proba_mnb = nb.predict_proba(x_testCorT)

    f1_mnb[fold] = f1_score(test[label_col], pred_mnb, average="weighted")
    auc_mnb[fold] = roc_auc_score(test[label_col], pred_proba_mnb, multi_class="ovr")
    acc_mnb[fold] = accuracy_score(test[label_col], pred_mnb)
    recall_mnb[fold] = recall_score(test[label_col], pred_mnb, average="weighted")
    precision_mnb[fold] = precision_score(test[label_col], pred_mnb, average="weighted")


print("MULTINOMIAL NAIVE BAYES")
print(f"F1-Score: {np.mean(f1_mnb):.4f}")
print(f"ROC-AUC: {np.mean(auc_mnb):.4f}")
print(f"Accuracy: {np.mean(acc_mnb):.4f}")
print(f"Recall: {np.mean(recall_mnb):.4f}")
print(f"Precision: {np.mean(precision_mnb):.4f}")


#break

Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
Fold 5/5
MULTINOMIAL NAIVE BAYES
F1-Score: 0.8455
ROC-AUC: 0.9640
Accuracy: 0.8455
Recall: 0.8455
Precision: 0.8462


# Naive Bayes

In [91]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_metrics = []


for fold, (train_index, test_index) in enumerate(kf.split(data_df)):
    print(f"Fold {fold + 1}")

    # split test and train data
    train_data = data_df.iloc[train_index]
    test_data = data_df.iloc[test_index]

    # create dictionaries
    word_freq_per_class = {}
    class_freq = {}
    total_words_per_class = {}
    vocab = set()

    # Construir frequências com os dados de treino
    for text, label in zip(train_data[text_col], train_data[label_col]):
        if label not in word_freq_per_class:
            word_freq_per_class[label] = {}
            class_freq[label] = 0
            total_words_per_class[label] = 0

        class_freq[label] += 1
        words = text.split()
        for word in words:
            vocab.add(word)
            if word not in word_freq_per_class[label]:
                word_freq_per_class[label][word] = 0
            word_freq_per_class[label][word] += 1
            total_words_per_class[label] += 1

    vocab_size = len(vocab)
    k = 0.5  # Laplace smoothing

    # predict vectors
    y_pred = []
    y_true = []

    for i, (text, true_label) in enumerate(zip(test_data[text_col], test_data[label_col])):
        # if i >= 100:
        #   break
        words = text.split()

        # calculate the probabilities for each class
        class_probabilities = {}
        for label in class_freq:
            log_prob = np.log(class_freq[label] / len(train_data))  # probability prior

            for word in words:
                word_count = word_freq_per_class[label].get(word, 0)
                # conditional probability
                log_prob += np.log((word_count + k) / (total_words_per_class[label] + k * vocab_size))

            class_probabilities[label] = log_prob

        # choose class with highest probability
        predicted_label = max(class_probabilities, key=class_probabilities.get)
        predicted_probability = np.exp(class_probabilities[label])
        y_pred.append(predicted_label)
        y_true.append(true_label)

        print(f"Previsto: {predicted_label}, Real: {true_label}")

        if i > 500:
          break

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
Previsto: Bacterial Infections and Mycoses, Real: Bacterial Infections and Mycoses
Previsto: Cardiovascular Diseases, Real: Cardiovascular Diseases
Previsto: Digestive System Diseases, Real: Digestive System Diseases
Previsto: Cardiovascular Diseases, Real: Cardiovascular Diseases
Previsto: Neoplasms, Real: Bacterial Infections and Mycoses
Previsto: Digestive System Diseases, Real: Digestive System Diseases
Previsto: Cardiovascular Diseases, Real: Cardiovascular Diseases
Previsto: Digestive System Diseases, Real: Digestive System Diseases
Previsto: Digestive System Diseases, Real: Digestive System Diseases
Previsto: Bacterial Infections and Mycoses, Real: Cardiovascular Diseases
Previsto: Digestive System Diseases, Real: Digestive System Diseases
Previsto: Bacterial Infections and Mycoses, Real: Bacterial Infections and Mycoses
Previsto: Neoplasms, Real: Neoplasms
Previsto: Bacterial Infections and Mycoses, Real: 

In [92]:
classes = list(class_freq.keys())
y_true_bin = label_binarize(y_true, classes=classes)
y_pred_bin = label_binarize(y_pred, classes=classes)

roc_auc_nb = roc_auc_score(y_true_bin, y_pred_bin, average="macro", multi_class="ovr")

f1_nb = f1_score(y_true, y_pred, average="weighted", zero_division=0)
acc_nb = accuracy_score(y_true, y_pred)
recall_nb = recall_score(y_true, y_pred, average="weighted", zero_division=0)
precision_nb = precision_score(y_true, y_pred, average="weighted", zero_division=0)

fold_metrics.append({
  "f1": f1_nb,
  "accuracy": acc_nb,
  "recall": recall_nb,
  "precision": precision_nb,
  "roc_auc": roc_auc_nb
    })


metrics_df = pd.DataFrame(fold_metrics)


print("\nMétricas médias:")
print(metrics_df.mean())


Métricas médias:
f1           0.832384
accuracy     0.832500
recall       0.832500
precision    0.833877
roc_auc      0.896097
dtype: float64


# Resultado final

In [96]:
results = {
    "Metricas com todas as amostras": ["Precision", "Recall", "Accuracy", "F1", "ROC AUC"],

    "Intersect Bayes": [
        f"{precision:.4f}",
        f"{recall:.4f}",
        f"{acc:.4f}",
        f"{f1:.4f}",
        f"{np.mean(roc_auc_scores):.4f}",
    ],

    "Naive Bayes": [
        f"{precision_nb:.4f}",
        f"{recall_nb:.4f}",
        f"{acc_nb:.4f}",
        f"{f1_nb:.4f}",
        f"{roc_auc_nb:.4f}",
    ],

    "Multinomial Naive Bayes": [
        f"{np.mean(precision_mnb):.4f}",
        f"{np.mean(recall_mnb):.4f}",
        f"{np.mean(acc_mnb):.4f}",
        f"{np.mean(f1_mnb):.4f}",
        f"{np.mean(auc_mnb):.4f}",
    ],
}


df_results = pd.DataFrame(results)

df_results


Unnamed: 0,Metricas com todas as amostras,Intersect Bayes,Naive Bayes,Multinomial Naive Bayes
0,Precision,0.51,0.8339,0.8462
1,Recall,0.431,0.8325,0.8455
2,Accuracy,0.431,0.8325,0.8455
3,F1,0.4103,0.8324,0.8455
4,ROC AUC,0.5,0.8961,0.964
