## Preliminaries

In [4]:
import pandas as pd
import numpy as np
# import json
# import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import norm
from collections import defaultdict
import scipy.special as sc

import warnings
import subprocess
import sys

from os.path import exists

from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.metrics import classification_report, f1_score, accuracy_score, roc_auc_score, recall_score, precision_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB, CategoricalNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
def kde(column, target, data, ax=None, title=None, xlabel="", ylabel=""):
    """Overlay the kernel density estimates of ``data[column]`` per class.

    Rows whose ``data[target]`` equals ``'ham'`` are drawn in red and rows
    equal to ``'spam'`` in blue, both on the same axes.

    Args:
        column: Name of the column holding the values to plot.
        target: Name of the column holding the 'ham' / 'spam' labels.
        data: Table indexed by column name (a DataFrame in this notebook).
        ax: Axes to draw on; a new 15x3 figure is created when omitted.
        title: Optional axes title.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
    """
    if ax is None:
        _, ax = plt.subplots(1, 1, figsize=(15, 3))

    values = data[column]
    ham_rows = data[target] == 'ham'
    spam_rows = data[target] == 'spam'

    axes = sns.kdeplot(values[ham_rows], ax=ax, color="Red")
    axes = sns.kdeplot(values[spam_rows], ax=axes, color="Blue")

    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    if title is not None:
        axes.set_title(title)

## Licitações

In [3]:
# Country whose processed collusion dataset is loaded; it is interpolated into
# the CSV file name below.
country = "Italy"

# Path is relative to the notebook's working directory.
collusion = pd.read_csv(f"1-s2/DB_Collusion_{country}_processed.csv")

In [None]:
# Build one "document" per tender: the competitors of every row that shares a
# tender id are joined with single spaces, and the label is taken from the
# first row seen for that tender.
tenders, labels = {}, {}
for row in collusion.itertuples():
    if row.Tender not in tenders:
        tenders[row.Tender] = f"{row.Competitors}"
        labels[row.Tender] = row.Collusive_competitor_original
    else:
        tenders[row.Tender] += f" {row.Competitors}"

In [None]:
# One row per tender: the concatenated competitor string and its label.
data = pd.DataFrame({"Competitors": tenders, "Collusive": labels})
# Keep the original index (the tender keys of the dicts above) as a column,
# because the next line discards it.
data['id'] = data.index
# Shuffle the rows reproducibly and renumber them 0..n-1.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

## Carregar dados

In [None]:
# Directory holding the cleaned datasets.
# NOTE(review): looks like a Google Drive mount path (Colab) - confirm.
path = "/content/drive/MyDrive/Mestrado_códigos/SpamCode/emailNPL/notebooks"

# Logical dataset name -> CSV file name (without extension).
files = {
    "email": "CleanEmail",
}

# NOTE: this rebinds `data`, replacing the tender table built in the previous
# section. Rows with any missing value are dropped and the index is renumbered.
data = pd.read_csv(f"{path}/{files['email']}.csv", index_col=0).dropna().reset_index(drop=True)
# NOTE(review): with the next line commented out the email table has no 'id'
# column, yet the "Train/Test split" cell filters on data['id'].
# data['id'] = data.index
data.head(1)

In [15]:
# Set the names of the text column and of the label column here.
# `text` is the column holding the document, `target` the column holding the
# 'ham' / 'spam' label. The commented lines are the settings for the tender
# dataset, where the 0/1 label is first mapped to 'ham'/'spam'.
text = 'Email'
#text = "Competitors"
#target = "Collusive"
#data[target] = data[target].apply(lambda x: 'ham' if x == 0 else 'spam')
target = 'Spam/Ham'

### K-Fold

In [None]:
# Number of cross-validation folds.
K = 5
# Round-robin assignment: the row at position i goes to fold i % K.
# The evaluation loops further down iterate over range(5) directly.
data['fold'] = [i % K for i in range(len(data))]

In [None]:


# Number of whitespace-separated tokens in each document.
word_number = [len(doc.split()) for doc in data[text]]

# Total token count over the whole dataset.
result_word = sum(word_number)

print(f"total de palavras: {result_word}")

In [None]:
# Print the column names, dtypes and non-null counts of the loaded table.
data.info()

### Train/Test split

In [None]:
# Train / test split: 80% of the rows, drawn at random without replacement,
# go to `train`; the remaining rows go to `test`.
np.random.seed(42)

#dict2_bidding = df['l'].unique()

# Row POSITIONS (0..len(data)-1) selected for training.
train_indices = np.random.choice(len(data), int(np.round(len(data) * 0.8)), replace=False)

# Select by position with a boolean mask. The previous version filtered on
# data['id'], which does not exist for the email table (KeyError) and holds
# tender ids - not row positions - for the tender table.
in_train = np.zeros(len(data), dtype=bool)
in_train[train_indices] = True

train = data.loc[in_train, :]
test = data.loc[~in_train, :]

## Create Words Dict

In [None]:
# Inverted indexes over the whole vocabulary (no max_features cap).
# Suffix 0 = non-spam ('ham'), suffix 1 = spam (every other label).
def create_words_dict(train, text, target):
    """Build word -> set-of-row-positions indexes for a labelled corpus.

    Each document is split on single spaces and every distinct token is
    indexed once per document, keyed by the document's position in the
    iteration order of ``train[text]``.

    Args:
        train: Table supporting ``train[text]`` and ``train[target]``.
        text: Name of the column holding the documents.
        target: Name of the column holding the labels.

    Returns:
        ``(words_dict_1, words_dict_0, words_dict)``: the index restricted to
        non-'ham' documents, the index restricted to 'ham' documents, and the
        index over all documents.
    """
    ham_index, spam_index, full_index = {}, {}, {}

    for position, (document, label) in enumerate(zip(train[text], train[target])):
        class_index = ham_index if label == 'ham' else spam_index
        for token in set(document.split(" ")):
            full_index.setdefault(token, set()).add(position)
            class_index.setdefault(token, set()).add(position)

    print(f"words_dict_0: {len(ham_index)} words_dict_1:{len(spam_index)}")

    return spam_index, ham_index, full_index

In [None]:
# Index the FULL dataset (not only a training split); the cross-validation
# cells below rebuild these dictionaries per fold.
words_dict_1, words_dict_0, words_dict = create_words_dict(data, text, target)
# Number of distinct tokens over all documents.
vocabulary_size = len(words_dict)

## Redes Neurais

In [None]:
# Length, in single-space-separated tokens, of the longest document; used
# below as the padded sequence length.
max_len = data[text].apply(lambda x: len(x.split(" "))).max()

#### Vectorizer

In [None]:
# Skip-gram training-set builder with negative sampling for int-encoded
# sentences.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
    """Build (target, context, label) skip-gram examples with negative samples.

    For every positive skip-gram pair found in a sequence, ``num_ns`` negative
    context candidates are drawn, and the example's context vector is the
    positive context word followed by those negatives.

    NOTE(review): relies on ``tf`` (TensorFlow) and ``tqdm`` being available in
    the notebook namespace; neither is imported in the visible cells.

    Args:
        sequences: Iterable of int-encoded sentences.
        window_size: Skip-gram window size.
        num_ns: Number of negative samples per positive pair.
        vocab_size: Vocabulary size (sampling table size and sampler range).
        seed: Seed forwarded to the candidate sampler.

    Returns:
        Three parallel lists ``(targets, contexts, labels)``; each label is
        ``[1] + [0] * num_ns`` as an int64 constant.
    """
    targets, contexts, labels = [], [], []

    keras_sequence = tf.keras.preprocessing.sequence
    # Sampling table covering `vocab_size` tokens.
    sampling_table = keras_sequence.make_sampling_table(vocab_size)
    # Same label layout for every example: the first context slot is the true one.
    label_values = [1] + [0] * num_ns

    for sentence in tqdm.tqdm(sequences):
        # Positive pairs only; negatives are drawn separately below.
        skip_grams, _ = keras_sequence.skipgrams(
            sentence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0)

        for center_word, positive_word in skip_grams:
            true_class = tf.expand_dims(
                tf.constant([positive_word], dtype="int64"), 1)
            negatives, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=true_class,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                seed=seed,
                name="negative_sampling")

            # One example: positive context first, then the negative candidates.
            targets.append(center_word)
            contexts.append(tf.concat([tf.squeeze(true_class, 1), negatives], 0))
            labels.append(tf.constant(label_values, dtype="int64"))

    return targets, contexts, labels

### LSTM

In [None]:
# A simple LSTM with glove embeddings and one dense layer
# NOTE(review): the pretrained `weights=` argument is commented out while
# `trainable=False` is kept, so the embedding layer appears to stay at its
# initial values - confirm this is intended.
# NOTE(review): Sequential, Embedding, LSTM and Dense are not imported in the
# visible cells; presumably they come from Keras.
model = Sequential()
# Embedding: (vocabulary_size + 1) rows, 25 dimensions, inputs of length max_len.
model.add(Embedding(vocabulary_size + 1,
                    25,
                    #weights=[embedding_matrix],
                    input_length=max_len,
                    trainable=False))

# 50-unit LSTM followed by a single sigmoid output for the binary label.
model.add(LSTM(50, dropout=0.3, recurrent_dropout=0.3))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])

model.summary()

In [None]:
# using keras tokenizer here
# NOTE(review): `txt` and `sequence` are not imported in the visible cells;
# presumably keras.preprocessing.text and keras.preprocessing.sequence.
token = txt.Tokenizer(num_words=None)

# The tokenizer vocabulary is fitted on train AND test documents together.
token.fit_on_texts(list(train[text]) + list(test[text]))

# Encode each document as a list of integer word ids.
xtrain_seq = token.texts_to_sequences(train[text])
xvalid_seq = token.texts_to_sequences(test[text])

#zero pad the sequences
xtrain_pad = sequence.pad_sequences(xtrain_seq, maxlen=max_len)
xvalid_pad = sequence.pad_sequences(xvalid_seq, maxlen=max_len)

# word -> integer id mapping learned by the tokenizer.
word_index = token.word_index

In [None]:
# Train for 5 epochs on the padded training sequences; labels are encoded as
# 1 for 'spam' and 0 for anything else.
model.fit(xtrain_pad, np.array(train[target].apply(lambda x: 1 if x == 'spam' else 0).values), epochs=5, batch_size=512)

## IntersectBayesClassifier

In [None]:
# Converts the partition produced by the exact version into solution rows
def solution_clusters(js):
    """Turn an exact-solver result into the list-of-groups format used here.

    Reads the module-level names ``entry``, ``entry_dict``, ``words_dict``,
    ``words_dict_0`` and ``words_dict_1``.
    NOTE(review): ``entry`` and ``entry_dict`` are not defined in the visible
    cells; presumably the current document and an element-id -> word mapping.

    Args:
        js: Mapping with a ``'Clusters'`` list; each cluster has
            ``'Intersection'`` and ``'Elements'`` entries.

    Returns:
        A list of dicts with keys ``'a'``, ``'b'``, ``'int'``, ``'int_0'`` and
        ``'int_1'``: one per cluster with a non-empty intersection, followed by
        one singleton group per leftover word that is in ``words_dict``.
    """
    unassigned = set(entry.split(' '))
    groups = []

    for cluster in js['Clusters']:
        if len(cluster['Intersection']) == 0:
            continue

        members = cluster['Elements']
        first = entry_dict[members[0]]
        second = entry_dict[members[1]]

        # Per-class co-occurrence; stays empty unless both words occur in the class.
        ham_docs = spam_docs = set()
        if first in words_dict_0 and second in words_dict_0:
            ham_docs = words_dict_0[first] & words_dict_0[second]
        if first in words_dict_1 and second in words_dict_1:
            spam_docs = words_dict_1[first] & words_dict_1[second]

        groups.append({
            'a': first,
            'b': second,
            'int': len(cluster['Intersection']),
            'int_0': len(ham_docs),
            'int_1': len(spam_docs)
        })

        for member in members:
            unassigned.remove(entry_dict[member])
        if len(unassigned) <= 1:
            break

    # Leftover words known to the vocabulary become singleton groups.
    groups.extend(
        {
            'a': word,
            'b': word,
            'int': len(words_dict[word]),
            'int_0': len(words_dict_0[word]) if word in words_dict_0 else 0,
            'int_1': len(words_dict_1[word]) if word in words_dict_1 else 0
        }
        for word in unassigned
        if word in words_dict
    )

    return groups

In [None]:
# Class that builds the candidate groups (word pairs)
class Pairs:
    """Generate candidate word pairs for a document, with cached counts.

    For two distinct words of a document, the candidate carries the number of
    indexed documents containing both words overall (``int``), among the
    class-0 documents (``int_0``) and among the class-1 documents (``int_1``).
    Counts are memoised per instance in ``pairs_matrix``.

    Args:
        words_dict: word -> set of document positions, all documents.
        words_dict_0: word -> set of document positions, class-0 documents.
        words_dict_1: word -> set of document positions, class-1 documents.
    """

    def __init__(self, words_dict, words_dict_0, words_dict_1):
        # pairs_matrix[a][b] and pairs_matrix[b][a] both hold [int, int_0, int_1].
        self.pairs_matrix = dict()
        self.words_dict = words_dict
        self.words_dict_1 = words_dict_1
        self.words_dict_0 = words_dict_0

    def _pair_counts(self, a, b):
        """Return ``[int, int_0, int_1]`` for (a, b), computing it on first use."""
        cached = self.pairs_matrix.get(a, {}).get(b)
        if cached is not None:
            return cached

        no_docs = set()
        vec = [
            len(self.words_dict[a] & self.words_dict[b]),
            # Use the instance's own per-class indexes. The previous code read
            # the module-level `words_dict_0` here, ignoring the constructor
            # argument.
            len(self.words_dict_0.get(a, no_docs) & self.words_dict_0.get(b, no_docs)),
            len(self.words_dict_1.get(a, no_docs) & self.words_dict_1.get(b, no_docs)),
        ]
        # Store symmetrically so the lookup works in either word order.
        self.pairs_matrix.setdefault(a, {})[b] = vec
        self.pairs_matrix.setdefault(b, {})[a] = vec
        return vec

    def get_pairs(self, entry):
        """Return the candidate pairs of ``entry`` that co-occur at least once.

        Args:
            entry: Document text; tokens are separated by single spaces.

        Returns:
            A list of dicts with keys ``'a'``, ``'b'``, ``'int'``, ``'int_0'``
            and ``'int_1'``. Words missing from ``words_dict`` are ignored and
            pairs whose overall co-occurrence is zero are omitted.
        """
        pairs = []
        words = list(set(entry.split(' ')))
        for j, word_a in enumerate(words):
            if word_a not in self.words_dict:
                continue
            for word_b in words[j + 1:]:
                if word_b not in self.words_dict:
                    continue
                total, in_class_0, in_class_1 = self._pair_counts(word_a, word_b)
                if total > 0:
                    pairs.append({
                        'a': word_a,
                        'b': word_b,
                        'int': total,
                        'int_0': in_class_0,
                        'int_1': in_class_1
                    })

        return pairs

In [None]:
# Greedy heuristic
# pairs = candidate groups, assumed to be sorted by intersection size
# entry = text of the email
def get_solution(pairs, entry):
    """Greedily partition the words of ``entry`` into pairs and singletons.

    Candidates are visited in the given order; a candidate is accepted when
    both of its words are still unassigned. Reads the module-level
    ``words_dict``, ``words_dict_0`` and ``words_dict_1`` for the singletons.

    Args:
        pairs: Table of candidates exposing ``itertuples()`` with fields
            ``a``, ``b``, ``int``, ``int_0`` and ``int_1`` (may be empty).
        entry: Document text; tokens are separated by single spaces.

    Returns:
        A list of dicts with keys ``'a'``, ``'b'``, ``'int'``, ``'int_0'`` and
        ``'int_1'``: the accepted pairs followed by one singleton group for
        each leftover word present in ``words_dict``.
    """
    unassigned = set(entry.split(' '))
    partition = []

    candidates = pairs.itertuples() if len(pairs) > 0 else ()
    for candidate in candidates:
        # Skip a group that would reuse an already assigned word.
        if candidate.a not in unassigned or candidate.b not in unassigned:
            continue
        partition.append({
            'a': candidate.a,
            'b': candidate.b,
            'int': candidate.int,
            'int_0': candidate.int_0,
            'int_1': candidate.int_1
        })
        unassigned.remove(candidate.a)
        unassigned.remove(candidate.b)
        if len(unassigned) <= 1:
            break

    # Words left outside the partition become unit groups. Creating one group
    # or several makes no difference to the objective function.
    partition.extend(
        {
            'a': word,
            'b': word,
            'int': len(words_dict[word]),
            'int_0': len(words_dict_0[word]) if word in words_dict_0 else 0,
            'int_1': len(words_dict_1[word]) if word in words_dict_1 else 0
        }
        for word in unassigned
        if word in words_dict
    )

    return partition

### Predição utilizando particionamentos $P^{spam}$ e $P^{ham}$

In [None]:
%%time
# Intersect Bayes variant that builds two partitions per test document: one
# from candidates counted on the 'ham' index only (P^ham) and one from
# candidates counted on the 'spam' index only (P^spam).
# Smoothing constant added to the count ratios below.
k = 0.001

# Per-fold metric buffers; they stay at zero for folds that are not evaluated.
f1_2, auc_2, acc_2, recall_2, precision_2 = (
    np.zeros(5), np.zeros(5), np.zeros(5), np.zeros(5), np.zeros(5)
)

for fold in range(5):
    # NOTE(review): this unconditional break leaves the loop before any fold
    # is evaluated, so everything below is currently dead code and the
    # metrics reported for this variant are all zeros - confirm this
    # experiment is meant to be disabled.
    break
    print(f"Fold {fold}/5")

    train = data.loc[data['fold'] != fold, :]
    test = data.loc[data['fold'] == fold, :]

    # Indexes are rebuilt from the training folds only.
    words_dict_1, words_dict_0, words_dict = create_words_dict(train, text, target)
    # Each generator sees only one per-class index; the other is empty.
    obj_pairs_0 = Pairs(words_dict, words_dict_0, {})
    obj_pairs_1 = Pairs(words_dict, {}, words_dict_1)

    # Class priors (relative frequencies) and raw class counts in train.
    prioris = train[target].value_counts(normalize=True)
    counts = train[target].value_counts(normalize=False)

    xtest_prob_0, xtest_prob_1 = np.zeros(len(test)), np.zeros(len(test))
    solutions = np.zeros(len(test))
    for i, entry in enumerate(test[text]):
        print(f"{i}/{len(test)}")
        try:
            pairs_0 = obj_pairs_0.get_pairs(entry)
            pairs_1 = obj_pairs_1.get_pairs(entry)

            pairs_0 = pd.DataFrame(pairs_0)
            pairs_1 = pd.DataFrame(pairs_1)
            # The greedy heuristic expects candidates ordered by decreasing
            # intersection size.
            if len(pairs_0) > 0:
                pairs_0 = pairs_0.sort_values(by='int', ascending=False)
            if len(pairs_1) > 0:
                pairs_1 = pairs_1.sort_values(by='int', ascending=False)

            solution_0 = pd.DataFrame(get_solution(pairs_0, entry)).sort_values(by='int', ascending=False)
            solution_1 = pd.DataFrame(get_solution(pairs_1, entry)).sort_values(by='int', ascending=False)

            # Product over groups of the smoothed per-class ratio.
            priori_words_0 = priori_words_1 = 1
            solutions[i] = len(solution_0) + len(solution_1)
            for row in solution_0.itertuples():
                    priori_words_0 *= (k + row.int_0) / (2*k + row.int)
            for row in solution_1.itertuples():
                    priori_words_1 *= (k + row.int_1) / (2*k + row.int)

            # Normalise the two class scores so they sum to one.
            xtest_prob_0[i] = (priori_words_0 * prioris['ham']) / (priori_words_0 * prioris['ham'] +
                                                                priori_words_1 * prioris['spam'])
            xtest_prob_1[i] = (priori_words_1 * prioris['spam']) / (priori_words_0 * prioris['ham'] +
                                                                 priori_words_1 * prioris['spam'])
            #print(f"I = {solution['int'].sum()} I_0 = {solution['int_0'].sum()} I_1 = {solution['int_1'].sum()}")
        except Exception as e:
            # Best effort: a document that cannot be scored keeps 0 for both classes.
            print(e)
            #print(f"Can't label {i}, '{entry}' words not present in train")

    # Evaluate: predict spam when its score is strictly larger; NaN scores are
    # replaced by 0.5 for the AUC.
    pred = xtest_prob_0 < xtest_prob_1
    pred_proba = np.nan_to_num(xtest_prob_1, nan=0.5)
    f1_2[fold] = f1_score(test[target].apply(lambda x: True if x == 'spam' else False),
                             pred)
    auc_2[fold] = roc_auc_score(test[target].apply(lambda x: True if x == 'spam' else False),
                             pred_proba)
    acc_2[fold] = accuracy_score(test[target].apply(lambda x: True if x == 'spam' else False),
                             pred)
    recall_2[fold] = recall_score(test[target].apply(lambda x: True if x == 'spam' else False),
                             pred)
    precision_2[fold] = precision_score(test[target].apply(lambda x: True if x == 'spam' else False),
                             pred)

In [None]:
# Mean +/- standard deviation of each metric over the fold buffers of the
# two-partition variant.
for _metric_name, _fold_scores in (
    ("Precision", precision_2),
    ("Recall", recall_2),
    ("Accuracy", acc_2),
    ("F1", f1_2),
    ("ROC AUC", auc_2),
):
    print(f"{_metric_name}: {_fold_scores.mean():.4f} +/- {_fold_scores.std():.4f}")

### Predição utilizando particionamento único

In [None]:
#intersect bayes
%%time
k = 0.001

f1, auc, acc, recall, precision = (
    np.zeros(5), np.zeros(5), np.zeros(5), np.zeros(5), np.zeros(5)
)

for fold in range(5):
    print(f"Fold {fold}/5")

    train = data.loc[data['fold'] != fold, :]
    test = data.loc[data['fold'] == fold, :]

    words_dict_1, words_dict_0, words_dict = create_words_dict(train, text, target)
    obj_pairs = Pairs(words_dict, words_dict_0, words_dict_1)

    prioris = train[target].value_counts(normalize=True)
    counts = train[target].value_counts(normalize=False)

    xtest_prob_0, xtest_prob_1 = np.zeros(len(test)), np.zeros(len(test))
    solutions = np.zeros(len(test))
    for i, entry in enumerate(test[text]):
        print(f"{i}/{len(test)}")
        try:
            pairs = pd.DataFrame(obj_pairs.get_pairs(entry))
            if len(pairs) > 0:
                pairs = pairs.sort_values(by='int', ascending=False)
            solution = pd.DataFrame(get_solution(pairs, entry)).sort_values(by='int', ascending=False)
            priori_words_0 = priori_words_1 = 1
            solutions[i] = len(solution)
            for row in solution.itertuples():
                    priori_words_0 *= (k + row.int_0) / (2*k + row.int)
                    priori_words_1 *= (k + row.int_1) / (2*k + row.int)

            xtest_prob_0[i] = (priori_words_0 * prioris['ham']) / (priori_words_0 * prioris['ham'] +
                                                                priori_words_1 * prioris['spam'])
            xtest_prob_1[i] = (priori_words_1 * prioris['spam']) / (priori_words_0 * prioris['ham'] +
                                                                 priori_words_1 * prioris['spam'])
            #print(f"I = {solution['int'].sum()} I_0 = {solution['int_0'].sum()} I_1 = {solution['int_1'].sum()}")
        except Exception as e:
            print(f"Can't label {i}, '{entry}' words not present in train")

     # Evaluate
    pred = xtest_prob_0 < xtest_prob_1
    pred_proba = np.nan_to_num(xtest_prob_1, nan=0.5)
    f1[fold] = f1_score(test[target].apply(lambda x: True if x == 'spam' else False),
                             pred)
    auc[fold] = roc_auc_score(test[target].apply(lambda x: True if x == 'spam' else False),
                             pred_proba)
    acc[fold] = accuracy_score(test[target].apply(lambda x: True if x == 'spam' else False),
                             pred)
    recall[fold] = recall_score(test[target].apply(lambda x: True if x == 'spam' else False),
                             pred)
    precision[fold] = precision_score(test[target].apply(lambda x: True if x == 'spam' else False),
                             pred)
    #break

In [None]:
# Mean +/- standard deviation of each metric over the fold buffers of the
# single-partition Intersect Bayes run.
for _metric_name, _fold_scores in (
    ("Recall", recall),
    ("Precision", precision),
    ("Accuracy", acc),
    ("F1", f1),
    ("ROC AUC", auc),
):
    print(f"{_metric_name}: {_fold_scores.mean():.4f} +/- {_fold_scores.std():.4f}")

In [None]:
#naive bayes
%%time
k = 0.001

f1_pure, auc_pure, acc_pure, recall_pure, precision_pure = (
    np.zeros(5), np.zeros(5), np.zeros(5), np.zeros(5), np.zeros(5)
)

for fold in range(5):
    print(f"Fold {fold}/5")

    train = data.loc[data['fold'] != fold, :]
    test = data.loc[data['fold'] == fold, :]

    words_dict_1, words_dict_0, words_dict = create_words_dict(train, text, target)

    prioris = train[target].value_counts(normalize=True)
    counts = train[target].value_counts(normalize=False)

    xtest_prob_0_pure, xtest_prob_1_pure = np.zeros(len(test)), np.zeros(len(test))

    for i, entry in enumerate(test[text]):
        priori_words_0 = priori_words_1 = 1
        for word in set(entry.split(" ")):
            if word in words_dict_0:
                priori_words_0 *= (k + len(words_dict_0[word])) / (2*k + counts[0])
            else:
                priori_words_0 *= k / 2*k
            if word in words_dict_1:
                priori_words_1 *= (k + len(words_dict_1[word])) / (2*k + counts[1])
            else:
                priori_words_1 *= k / 2*k

        xtest_prob_0_pure[i] = (priori_words_0 * prioris['ham']) / (priori_words_0 * prioris['ham'] +
                                                               priori_words_1 * prioris['spam'])
        xtest_prob_1_pure[i] = (priori_words_1 * prioris['spam']) / (priori_words_0 * prioris['ham'] +
                                                                priori_words_1 * prioris['spam'])
    # Evaluate
    pred_pure = xtest_prob_0_pure < xtest_prob_1_pure
    pred_proba_pure = np.nan_to_num(xtest_prob_1_pure, nan=0.5)
    f1_pure[fold] = f1_score(test[target].apply(lambda x: True if x == 'spam' else False),
                             pred_pure)
    auc_pure[fold] = roc_auc_score(test[target].apply(lambda x: True if x == 'spam' else False),
                             pred_proba_pure)
    acc_pure[fold] = accuracy_score(test[target].apply(lambda x: True if x == 'spam' else False),
                             pred_pure)
    recall_pure[fold] = recall_score(test[target].apply(lambda x: True if x == 'spam' else False),
                             pred_pure)
    precision_pure[fold] = precision_score(test[target].apply(lambda x: True if x == 'spam' else False),
                             pred_pure)

In [None]:
# Mean +/- standard deviation of each metric over the fold buffers of the
# Naive Bayes baseline.
for _metric_name, _fold_scores in (
    ("Recall", recall_pure),
    ("Precision", precision_pure),
    ("Accuracy", acc_pure),
    ("F1", f1_pure),
    ("ROC AUC", auc_pure),
):
    print(f"{_metric_name}: {_fold_scores.mean():.4f} +/- {_fold_scores.std():.4f}")

### Vectorizing

In [None]:
def CountOrTfidf_vec(train, test, opt):
    """Vectorise train and test documents with a count or TF-IDF model.

    The vectoriser is fitted on ``train`` only and then applied to ``test``.
    Both variants keep terms present in at least 5 documents, capped at
    30000 features.

    Args:
        train: Iterable of training documents.
        test: Iterable of test documents.
        opt: ``"tfidf"`` or ``"count"``.

    Returns:
        ``(train_matrix, test_matrix)`` as produced by the vectoriser.

    Raises:
        ValueError: If ``opt`` is not one of the supported names (previously
            this surfaced as an UnboundLocalError).
    """
    if opt not in ("tfidf", "count"):
        raise ValueError(f"opt must be 'tfidf' or 'count', got {opt!r}")

    if opt == "tfidf":
        transformer = TfidfVectorizer(min_df=5, max_features=30000)
    else:
        transformer = CountVectorizer(min_df=5, max_features=30000)

    train_CorTf = transformer.fit_transform(train)
    test_CorTf = transformer.transform(test)
    return train_CorTf, test_CorTf

In [None]:
# Multinomial Naive Bayes baseline on bag-of-words counts.
# Per-fold metric buffers.
f1_mnb, auc_mnb, acc_mnb, recall_mnb, precision_mnb = (
    np.zeros(5), np.zeros(5), np.zeros(5), np.zeros(5), np.zeros(5)
)

for fold in range(5):
    print(f"Fold {fold}/5")

    train = data.loc[data['fold'] != fold, :]
    test = data.loc[data['fold'] == fold, :]

    x_trainCorT, x_testCorT = CountOrTfidf_vec(train[text], test[text], "count")
    nb = MultinomialNB()
    # The sparse matrices are passed as they are; MultinomialNB accepts sparse
    # input, so densifying with .toarray() only cost memory.
    nb.fit(x_trainCorT, train[target].apply(lambda x: 1 if x == 'spam' else 0))

    pred_mnb = nb.predict(x_testCorT)
    # Column 1 is the probability of class 1 (spam).
    pred_proba_mnb = nb.predict_proba(x_testCorT)[:, 1]

    y_true = test[target] == 'spam'
    f1_mnb[fold] = f1_score(y_true, pred_mnb)
    auc_mnb[fold] = roc_auc_score(y_true, pred_proba_mnb)
    acc_mnb[fold] = accuracy_score(y_true, pred_mnb)
    recall_mnb[fold] = recall_score(y_true, pred_mnb)
    precision_mnb[fold] = precision_score(y_true, pred_mnb)
    # A trailing `break` used to stop after fold 0, which left the other four
    # buffer entries at zero and distorted the reported mean and std.

In [None]:
# Mean +/- standard deviation of each metric over the fold buffers of the
# Multinomial Naive Bayes baseline.
for _metric_name, _fold_scores in (
    ("Precision", precision_mnb),
    ("Recall", recall_mnb),
    ("Accuracy", acc_mnb),
    ("F1", f1_mnb),
    ("ROC AUC", auc_mnb),
):
    print(f"{_metric_name}: {_fold_scores.mean():.4f} +/- {_fold_scores.std():.4f}")

In [None]:
# pred_proba_pure and pred_proba hold the spam scores of the LAST fold
# evaluated by the Naive Bayes and Intersect Bayes loops (fold 4, since both
# iterate over range(5)). `test` may have been rebound to another fold by a
# later cell, so rebuild the matching rows here; .copy() avoids writing into
# a slice of `data`.
last_fold = 4
test = data.loc[data['fold'] == last_fold, :].copy()
test['pure'] = pred_proba_pure
test['intersect'] = pred_proba
#test['mnb'] = pred_proba_mnb

In [None]:
# Side-by-side density plots of the spam score given by each classifier,
# split by the true class of the test rows.
fig, axs = plt.subplots(1, 2, figsize=(15, 3), dpi=500)
plt.rcParams.update({'font.size': 14})

kde('pure', 'Spam/Ham', test, ax=axs[0], title="Naive Bayes")
axs[0].set_ylim([0,25])
kde('intersect', 'Spam/Ham', test, ax=axs[1], title="Intersect Bayes")
axs[1].set_ylim([0,25])
#kde('mnb', 'Spam/Ham', test, ax=axs[2])
# Raw string: "\o" is not a valid escape sequence and triggers a warning in a
# normal string literal. The label text is unchanged. kde() draws the 'ham'
# curve first, so the first label is the non-spam class.
fig.legend([r"$\overline{spam}$", "$spam$"])

# Shared axis captions for both panels.
fig.text(0.5, 0.00, 'Probabilidade', ha='center')
fig.text(0.08, 0.5, 'Frequência', va='center', rotation='vertical')
fig.savefig("kde.png")

### testando novos dados. - IB com MNB

In [None]:
%%time
def _group_id(group):
    """Return an id for a group that does not depend on the word order.

    The two words are sorted before joining, so the pair (a, b) and the pair
    (b, a) map to the same token. Candidate pairs are generated from a set of
    words, whose iteration order is not stable across documents.
    """
    return "_".join(sorted((str(group['a']), str(group['b']))))


# Builds a dictionary of groups
def create_group_dict(train, text, target, solution_func):
    """Index the groups produced for each training document.

    Args:
        train: Table supporting ``train[text]``.
        text: Name of the column holding the documents.
        target: Unused; kept for signature compatibility.
        solution_func: Callable mapping a document to its list of groups
            (dicts with at least the keys ``'a'`` and ``'b'``).

    Returns:
        A ``defaultdict(set)`` mapping each group id to the positions of the
        documents whose solution contains that group.
    """
    group_dict = defaultdict(set)
    for i, entry in enumerate(train[text]):
        for group in solution_func(entry):
            group_dict[_group_id(group)].add(i)
    return group_dict

# Replaces the words of a document with the ids of its groups
def replace_with_groups(entry, solution_func):
    """Rewrite ``entry`` as the space-separated ids of its groups.

    Args:
        entry: Document text.
        solution_func: Callable mapping a document to its list of groups
            (dicts with at least the keys ``'a'`` and ``'b'``).

    Returns:
        The group ids joined by single spaces, in solution order; an empty
        string when the solution is empty.
    """
    return ' '.join(_group_id(group) for group in solution_func(entry))

# Computes the frequency of each word
def calculate_word_frequencies(data, text_column):
    """Count how many times each whitespace-separated token occurs.

    Args:
        data: Table supporting ``data[text_column]``.
        text_column: Name of the column holding the documents.

    Returns:
        A ``defaultdict(int)`` mapping each token to its total number of
        occurrences over all documents (unseen tokens read as 0).
    """
    occurrences = defaultdict(int)
    all_tokens = (token for document in data[text_column] for token in document.split())
    for token in all_tokens:
        occurrences[token] += 1
    return occurrences

# Class that builds candidate word pairs
class Pairs:
    """Candidate word-pair generator restricted to sufficiently frequent words.

    Args:
        words_dict: word -> set of document positions, all documents.
        words_dict_0: word -> set of document positions, class-0 documents.
        words_dict_1: word -> set of document positions, class-1 documents.
        word_frequency: word -> number of occurrences.
        min_threshold: A word is considered only when its frequency is
            strictly greater than this value.
    """

    def __init__(self, words_dict, words_dict_0, words_dict_1, word_frequency, min_threshold):
        self.words_dict = words_dict
        self.words_dict_0 = words_dict_0
        self.words_dict_1 = words_dict_1
        self.word_frequency = word_frequency
        self.min_threshold = min_threshold
        # Kept for compatibility; this version does not cache pair counts.
        self.pairs_matrix = dict()

    def get_pairs(self, entry):
        """Return the pairs of frequent words of ``entry`` that co-occur.

        Args:
            entry: Document text; tokens are separated by single spaces.

        Returns:
            A list of dicts with keys ``'a'``, ``'b'``, ``'int'``, ``'int_0'``
            and ``'int_1'`` for every pair of distinct words that are both in
            ``words_dict`` and share at least one document.
        """
        # Keep only the words above the minimum frequency.
        frequent = [
            token
            for token in set(entry.split(' '))
            if self.word_frequency[token] > self.min_threshold
        ]

        found = []
        for position, first in enumerate(frequent):
            for second in frequent[position + 1:]:
                if not (first in self.words_dict and second in self.words_dict):
                    continue
                shared = self.words_dict[first] & self.words_dict[second]
                shared_0 = self.words_dict_0.get(first, set()) & self.words_dict_0.get(second, set())
                shared_1 = self.words_dict_1.get(first, set()) & self.words_dict_1.get(second, set())

                if len(shared) > 0:
                    found.append({
                        'a': first,
                        'b': second,
                        'int': len(shared),
                        'int_0': len(shared_0),
                        'int_1': len(shared_1)
                    })
        return found


# Greedy heuristic
def get_solution(pairs, entry):
    """Greedily partition the words of ``entry`` into pairs and singletons.

    Candidates are visited in the given order and accepted when both of their
    words are still unassigned. Every leftover word becomes a singleton group
    with all counts set to 0.

    Args:
        pairs: Sequence of candidate dicts with at least the keys ``'a'`` and
            ``'b'`` (may be empty).
        entry: Document text; tokens are separated by single spaces.

    Returns:
        A list with the accepted candidate dicts (the same objects) followed
        by the singleton groups.
    """
    unassigned = set(entry.split(' '))
    chosen = []

    candidates = pairs if len(pairs) > 0 else []
    for candidate in candidates:
        first, second = candidate['a'], candidate['b']
        if first not in unassigned or second not in unassigned:
            continue
        chosen.append(candidate)
        unassigned.remove(first)
        unassigned.remove(second)
        if len(unassigned) <= 1:
            break

    chosen.extend(
        {'a': word, 'b': word, 'int': 0, 'int_0': 0, 'int_1': 0}
        for word in unassigned
    )
    return chosen

# Applies Multinomial Naive Bayes on the group-encoded documents
def train_mnb(train, test, text, target, solution_func):
    """Train and evaluate a MultinomialNB on documents rewritten as group ids.

    Each document is replaced by the ids of the groups in its solution
    (``replace_with_groups``), vectorised with a ``CountVectorizer`` fitted on
    the training documents, and classified with ``MultinomialNB``.

    The encoded documents are kept in local variables. The previous version
    overwrote ``train[text]`` and ``test[text]`` in place, destroying the
    callers' original text and double-encoding it on a second call.

    Args:
        train: Training table supporting ``train[text]`` and ``train[target]``.
        test: Test table with the same columns.
        text: Name of the column holding the documents.
        target: Name of the column holding the labels.
        solution_func: Callable mapping a document to its list of groups.

    Returns:
        ``(accuracy, f1, precision, recall, roc_auc)``. F1, precision and
        recall are weighted averages over the classes; ROC AUC is computed
        from the predicted probability of the second class in
        ``clf.classes_``.
    """
    train_docs = train[text].apply(lambda x: replace_with_groups(x, solution_func))
    test_docs = test[text].apply(lambda x: replace_with_groups(x, solution_func))

    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(train_docs)
    X_test = vectorizer.transform(test_docs)

    clf = MultinomialNB()
    clf.fit(X_train, train[target])

    predictions = clf.predict(X_test)
    probabilities = clf.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(test[target], predictions)
    f1 = f1_score(test[target], predictions, average='weighted')
    precision = precision_score(test[target], predictions, average='weighted')
    recall = recall_score(test[target], predictions, average='weighted')
    roc_auc = roc_auc_score(test[target], probabilities)

    return accuracy, f1, precision, recall, roc_auc

if __name__ == "__main__":

    # 80/20 stratified train/test split
    train, test = train_test_split(data, test_size=0.2, random_state=42, stratify=data[target])

    # Word frequencies, counted on the training documents only
    word_frequency = calculate_word_frequencies(train, text)

    # Minimum frequency a word must exceed to take part in a pair
    min_threshold = 2

    # Build the word -> documents indexes from the training rows. They used to
    # be created as empty defaultdicts and never filled, so get_pairs() found
    # no known word and every document degenerated into singleton groups.
    words_dict_1, words_dict_0, words_dict = create_words_dict(train, text, target)
    obj_pairs = Pairs(words_dict, words_dict_0, words_dict_1, word_frequency, min_threshold)

    # Builds the solution (partition into groups) of one document
    def solution_func(entry):
        pairs = obj_pairs.get_pairs(entry)
        # The greedy heuristic expects the candidates ordered by decreasing
        # intersection size, as in the cross-validation cells above.
        pairs = sorted(pairs, key=lambda pair: pair['int'], reverse=True)
        return get_solution(pairs, entry)
        # solution = get_solution(pairs, entry)
        # print(f"Grupos criados para o texto '{entry}': {solution}")  # Optional: for debugging
        # return solution

    accuracy, f1, precision, recall, roc_auc = train_mnb(train, test, text, target, solution_func)
    print(f"Accuracy: {accuracy}")
    print(f"F1-Score: {f1}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"Roc-Auc: {roc_auc}")