## Biliotecas

In [2]:
import pandas as pd
import numpy as np
# import json
# import tqdm

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import norm
from collections import defaultdict
import scipy.special as sc

import warnings
import subprocess
import sys

from os.path import exists

from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.metrics import classification_report, f1_score, accuracy_score, roc_auc_score, recall_score, precision_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB, CategoricalNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

## Importe de Dados

In [4]:
path = "/content/drive/MyDrive/Mestrado_códigos/SpamCode/emailNPL/notebooks/corpusFakeRecogna/fakeNews_cleanTESTE.csv"


data = pd.read_csv(path).dropna().reset_index(drop=True)
data['id'] = data.index
data.head()

Unnamed: 0,Categoria,Classe,Texto,id
0,entretenimento,0,papa francisco preso sob acusação tráfico cria...,0
1,saúde,1,equador prepara cova coletiva mortos covid gua...,1
2,saúde,1,air france voltará operar voo direto pequimpar...,2
3,saúde,1,marfrig intensifica venda carne brasil eua apó...,3
4,entretenimento,0,parciais eleições alternaram vezes vitória aéc...,4


In [5]:
data['Classe'] = data['Classe'].astype(int).replace({0: 'fakenews', 1: 'real'})
data.head()

Unnamed: 0,Categoria,Classe,Texto,id
0,entretenimento,fakenews,papa francisco preso sob acusação tráfico cria...,0
1,saúde,real,equador prepara cova coletiva mortos covid gua...,1
2,saúde,real,air france voltará operar voo direto pequimpar...,2
3,saúde,real,marfrig intensifica venda carne brasil eua apó...,3
4,entretenimento,fakenews,parciais eleições alternaram vezes vitória aéc...,4


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11902 entries, 0 to 11901
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Categoria  11902 non-null  object
 1   Classe     11902 non-null  object
 2   Texto      11902 non-null  object
 3   id         11902 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 372.1+ KB


In [6]:
text = 'Texto'
target = 'Classe'

## K-Fold

In [None]:
K = 5
data['fold'] = [i % K for i in range(len(data))]

## Train / Test Split

In [None]:
# Train / test split
np.random.seed(42)

#dict2_bidding = df['l'].unique()

train_indices = np.random.choice(len(data), int(np.round(len(data) * 0.8)), replace=False)

train = data.loc[data['id'].isin(train_indices), :]
test = data.loc[~data['id'].isin(train_indices), :]

## Create Words Dict

In [None]:
# count words without max_feature. 0  = fake news and 1 = real
def create_words_dict(train, text, target):
    words_dict_0, words_dict_1, words_dict = dict(), dict(), dict()

    for i, (entry, yi) in enumerate(zip(train[text], train[target])):
        for word in set(entry.split(" ")):
            if word in words_dict:
                words_dict[word].add(i)
            else:
                words_dict[word] = {i}
            if yi == 'real':
                if word in words_dict_0:
                    words_dict_0[word].add(i)
                else:
                    words_dict_0[word] = {i}
            else:
                if word in words_dict_1:
                    words_dict_1[word].add(i)
                else:
                    words_dict_1[word] = {i}

    print(f"words_dict_0: {len(words_dict_0)} words_dict_1:{len(words_dict_1)}")

    return words_dict_1, words_dict_0, words_dict

In [None]:
words_dict_1, words_dict_0, words_dict = create_words_dict(data, text, target)
vocabulary_size = len(words_dict)

words_dict_0: 37426 words_dict_1:37822


## IntersectBayesClassifier

In [None]:
# Função para processar o particionamento da versão exata
def solution_clusters(js):
    remaining = set(entry.split(' '))
    solution = []
    for cluster in js['Clusters']:
        if len(cluster['Intersection']) > 0:
            a = entry_dict[cluster['Elements'][0]]
            b = entry_dict[cluster['Elements'][1]]
            intersection_0 = intersection_1 = set()
            if a in words_dict_0 and b in words_dict_0:
                intersection_0 = words_dict_0[a] & words_dict_0[b]
            if a in words_dict_1 and b in words_dict_1:
                intersection_1 = words_dict_1[a] & words_dict_1[b]
            solution.append({
                'a': a,
                'b': b,
                'int': len(cluster['Intersection']),
                'int_0': len(intersection_0),
                'int_1': len(intersection_1)
            })
            for element in cluster['Elements']:
                remaining.remove(entry_dict[element])
            if len(remaining) <= 1:
                break

    for word in remaining:
        if word in words_dict:
            solution.append({
                    'a': word,
                    'b': word,
                    'int': len(words_dict[word]),
                    'int_0': len(words_dict_0[word]) if word in words_dict_0 else 0,
                    'int_1': len(words_dict_1[word]) if word in words_dict_1 else 0
                })

    return solution

In [None]:
# Classe que cria os candidatos a serem grupos
class Pairs:
    def __init__(self, words_dict, words_dict_0, words_dict_1):
        self.pairs_matrix = dict()
        self.words_dict = words_dict
        self.words_dict_1 = words_dict_1
        self.words_dict_0 = words_dict_0

    def get_pairs(self, entry):
        pairs = []
        words = list(set(entry.split(' ')))
        for j in range(len(words)):
            for k in range(j+1, len(words)):
                if words[j] not in self.words_dict or words[k] not in self.words_dict:
                    continue
                if words[j] in self.pairs_matrix and words[k] in self.pairs_matrix[words[j]]:
                    if self.pairs_matrix[words[j]][words[k]][0] > 0:
                        pairs.append({
                            'a': words[j],
                            'b': words[k],
                            'int': self.pairs_matrix[words[j]][words[k]][0],
                            'int_0': self.pairs_matrix[words[j]][words[k]][1],
                            'int_1': self.pairs_matrix[words[j]][words[k]][2]
                        })
                    continue

                intersection = self.words_dict[words[j]] & self.words_dict[words[k]]
                intersection_0, intersection_1 = dict(), dict()
                if words[j] in self.words_dict_0 and words[k] in self.words_dict_0:
                    intersection_0 = words_dict_0[words[j]] & self.words_dict_0[words[k]]
                if words[j] in self.words_dict_1 and words[k] in self.words_dict_1:
                    intersection_1 = self.words_dict_1[words[j]] & self.words_dict_1[words[k]]

                vec = [len(intersection), len(intersection_0), len(intersection_1)]
                if words[j] not in self.pairs_matrix:
                    self.pairs_matrix[words[j]] = {words[k]: vec}
                else:
                    self.pairs_matrix[words[j]][words[k]] = vec
                if words[k] not in self.pairs_matrix:
                    self.pairs_matrix[words[k]] = {words[j]: vec}
                else:
                    self.pairs_matrix[words[k]][words[j]] = vec

                if len(intersection) > 0:
                    pairs.append({
                        'a': words[j],
                        'b': words[k],
                        'int': self.pairs_matrix[words[j]][words[k]][0],
                        'int_0': self.pairs_matrix[words[j]][words[k]][1],
                        'int_1': self.pairs_matrix[words[j]][words[k]][2]
                    })

        return pairs

In [None]:
# Heuristica gulosa
# pairs = candidatos, supoe-se que está ordenado pelo tamanho da interseção
# entry = texto do email
def get_solution(pairs, entry):
    remaining = set(entry.split(' '))
    solution = []
    # Se houver grupos
    if len(pairs) > 0:
        # Para cada grupo
        for row in pairs.itertuples():
            # Se o grupo pode ser adicionado no particionamento, o adicone
            if row.a in remaining and row.b in remaining:
                solution.append({
                    'a': row.a,
                    'b': row.b,
                    'int': row.int,
                    'int_0': row.int_0,
                    'int_1': row.int_1
                })
                remaining.remove(row.a)
                remaining.remove(row.b)
                if len(remaining) <= 1:
                    break

    # Caso existam palavras fora do particionamento, criar grupos unitários para elas.
    # Tanto faz criar um único ou vários, não muda a função objetivo
    for word in remaining:
        if word in words_dict:
            solution.append({
                    'a': word,
                    'b': word,
                    'int': len(words_dict[word]),
                    'int_0': len(words_dict_0[word]) if word in words_dict_0 else 0,
                    'int_1': len(words_dict_1[word]) if word in words_dict_1 else 0
                })

    return solution

### Predição utilizando particionamentos $P^{fakenews}$ e $P^{real}$

In [None]:
%%time
k = 0.001

f1_2, auc_2, acc_2, recall_2, precision_2 = (
    np.zeros(5), np.zeros(5), np.zeros(5), np.zeros(5), np.zeros(5)
)

for fold in range(5):
    break
    print(f"Fold {fold}/5")

    train = data.loc[data['fold'] != fold, :]
    test = data.loc[data['fold'] == fold, :]

    words_dict_1, words_dict_0, words_dict = create_words_dict(train, text, target)
    obj_pairs_0 = Pairs(words_dict, words_dict_0, {})
    obj_pairs_1 = Pairs(words_dict, {}, words_dict_1)

    prioris = train[target].value_counts(normalize=True)
    counts = train[target].value_counts(normalize=False)

    xtest_prob_0, xtest_prob_1 = np.zeros(len(test)), np.zeros(len(test))
    solutions = np.zeros(len(test))
    for i, entry in enumerate(test[text]):
        print(f"{i}/{len(test)}")
        try:
            pairs_0 = obj_pairs_0.get_pairs(entry)
            pairs_1 = obj_pairs_1.get_pairs(entry)

            pairs_0 = pd.DataFrame(pairs_0)
            pairs_1 = pd.DataFrame(pairs_1)
            if len(pairs_0) > 0:
                pairs_0 = pairs_0.sort_values(by='int', ascending=False)
            if len(pairs_1) > 0:
                pairs_1 = pairs_1.sort_values(by='int', ascending=False)

            solution_0 = pd.DataFrame(get_solution(pairs_0, entry)).sort_values(by='int', ascending=False)
            solution_1 = pd.DataFrame(get_solution(pairs_1, entry)).sort_values(by='int', ascending=False)

            priori_words_0 = priori_words_1 = 1
            solutions[i] = len(solution_0) + len(solution_1)
            for row in solution_0.itertuples():
                    priori_words_0 *= (k + row.int_0) / (2*k + row.int)
            for row in solution_1.itertuples():
                    priori_words_1 *= (k + row.int_1) / (2*k + row.int)

            xtest_prob_0[i] = (priori_words_0 * prioris['real']) / (priori_words_0 * prioris['real'] +
                                                                priori_words_1 * prioris['fakenews'])
            xtest_prob_1[i] = (priori_words_1 * prioris['fakenews']) / (priori_words_0 * prioris['real'] +
                                                                 priori_words_1 * prioris['fakenews'])
            #print(f"I = {solution['int'].sum()} I_0 = {solution['int_0'].sum()} I_1 = {solution['int_1'].sum()}")
        except Exception as e:
            print(e)
            #print(f"Can't label {i}, '{entry}' words not present in train")

     # Evaluate
    pred = xtest_prob_0 < xtest_prob_1
    pred_proba = np.nan_to_num(xtest_prob_1, nan=0.5)
    f1_2[fold] = f1_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                             pred)
    auc_2[fold] = roc_auc_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                             pred_proba)
    acc_2[fold] = accuracy_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                             pred)
    recall_2[fold] = recall_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                             pred)
    precision_2[fold] = precision_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                             pred)

CPU times: user 30 µs, sys: 0 ns, total: 30 µs
Wall time: 33.4 µs


In [None]:
print(f"Precision: {precision_2.mean():.4f} +/- {precision_2.std():.4f}")
print(f"Recall: {recall_2.mean():.4f} +/- {recall_2.std():.4f}")
print(f"Accuracy: {acc_2.mean():.4f} +/- {acc_2.std():.4f}")
print(f"F1: {f1_2.mean():.4f} +/- {f1_2.std():.4f}")
print(f"ROC AUC: {auc_2.mean():.4f} +/- {auc_2.std():.4f}")

Precision: 0.0000 +/- 0.0000
Recall: 0.0000 +/- 0.0000
Accuracy: 0.0000 +/- 0.0000
F1: 0.0000 +/- 0.0000
ROC AUC: 0.0000 +/- 0.0000


# Intersect bayes usando a probabilidade do MNB


In [None]:
#intersect bayes
%%time
k = 0.001

f1, auc, acc, recall, precision = (
    np.zeros(5), np.zeros(5), np.zeros(5), np.zeros(5), np.zeros(5)
)

for fold in range(5):
    print(f"Fold {fold}/5")

    train = data.loc[data['fold'] != fold, :]
    test = data.loc[data['fold'] == fold, :]

    words_dict_1, words_dict_0, words_dict = create_words_dict(train, text, target)
    obj_pairs = Pairs(words_dict, words_dict_0, words_dict_1)

    prioris = train[target].value_counts(normalize=True)
    print(prioris)
    # counts = train[target].value_counts(normalize=False)

    xtest_prob_0, xtest_prob_1 = np.zeros(len(test)), np.zeros(len(test))
    solutions = np.zeros(len(test))
    for i, entry in enumerate(test[text]):
        print(f"{i}/{len(test)}")
        try:
            pairs = pd.DataFrame(obj_pairs.get_pairs(entry))
            if len(pairs) > 0:
                pairs = pairs.sort_values(by='int', ascending=False)
            solution = pd.DataFrame(get_solution(pairs, entry)).sort_values(by='int', ascending=False)
            priori_words_0 = prioris['real']
            priori_words_1 = prioris['fakenews']
            solutions[i] = len(solution)
            for row in solution.itertuples():
              priori_words_0 *= (k + row.int_0) / (k * len(words_dict_0) + sum(len(v) for v in words_dict_0.values()))
              priori_words_1 *= (k + row.int_1) / (k * len(words_dict_1) + sum(len(v) for v in words_dict_1.values()))


              xtest_prob_0[i] = (priori_words_0 * prioris['real']) / (priori_words_0 * prioris['real'] +
                                                                priori_words_1 * prioris['fakenews'])
              xtest_prob_1[i] = (priori_words_1 * prioris['fakenews']) / (priori_words_0 * prioris['real'] +
                                                                 priori_words_1 * prioris['fakenews'])
            #print(f"I = {solution['int'].sum()} I_0 = {solution['int_0'].sum()} I_1 = {solution['int_1'].sum()}")
        except Exception as e:
            print(f"Can't label {i}, '{entry}' words not present in train")

     # Evaluate
    pred = xtest_prob_0 < xtest_prob_1
    pred_proba = np.nan_to_num(xtest_prob_1, nan=0.5)
    f1[fold] = f1_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                                pred)
    auc[fold] = roc_auc_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                                pred_proba)
    acc[fold] = accuracy_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                                pred)
    recall[fold] = recall_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                                pred)
    precision[fold] = precision_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                                pred)
    #break

Fold 0/5
words_dict_0: 33176 words_dict_1:33009
Classe
real        0.501943
fakenews    0.498057
Name: proportion, dtype: float64
0/2381
1/2381
2/2381
3/2381
4/2381
5/2381
6/2381
7/2381
8/2381
9/2381
10/2381
11/2381
12/2381
13/2381
14/2381
15/2381
16/2381
17/2381
18/2381
19/2381
20/2381
21/2381
22/2381
23/2381
24/2381
25/2381
26/2381
27/2381
28/2381
29/2381
30/2381
31/2381
32/2381
33/2381
34/2381
35/2381
36/2381
37/2381
38/2381
39/2381
40/2381
41/2381
42/2381
43/2381
44/2381
45/2381
46/2381
47/2381
48/2381
49/2381
50/2381
51/2381
52/2381
53/2381
54/2381
55/2381
56/2381
57/2381
58/2381
59/2381
60/2381
61/2381
62/2381
63/2381
64/2381
65/2381
66/2381
67/2381
68/2381
69/2381
70/2381
71/2381
72/2381
73/2381
74/2381
75/2381
76/2381
77/2381
78/2381
79/2381
80/2381
81/2381
82/2381
83/2381
84/2381
85/2381
86/2381
87/2381
88/2381
89/2381
90/2381
91/2381
92/2381
93/2381
94/2381
95/2381
96/2381
97/2381
98/2381
99/2381
100/2381
101/2381
102/2381
103/2381
104/2381
105/2381
106/2381
107/2381
108/2381



156/2381
157/2381
158/2381
159/2381
160/2381
161/2381
162/2381
163/2381
164/2381
165/2381
166/2381
167/2381
168/2381
169/2381
170/2381
171/2381
172/2381
173/2381
174/2381
175/2381
176/2381
177/2381
178/2381
179/2381
180/2381
181/2381
182/2381
183/2381
184/2381
185/2381
186/2381
187/2381
188/2381
189/2381
190/2381
191/2381
192/2381
193/2381
194/2381
195/2381
196/2381
197/2381
198/2381
199/2381
200/2381
201/2381
202/2381
203/2381
204/2381
205/2381
206/2381
207/2381
208/2381
209/2381
210/2381
211/2381
212/2381
213/2381
214/2381
215/2381
216/2381
217/2381
218/2381
219/2381
220/2381
221/2381
222/2381
223/2381
224/2381
225/2381
226/2381
227/2381
228/2381
229/2381
230/2381
231/2381
232/2381
233/2381
234/2381
235/2381
236/2381
237/2381
238/2381
239/2381
240/2381
241/2381
242/2381
243/2381
244/2381
245/2381
246/2381
247/2381
248/2381
249/2381
250/2381
251/2381
252/2381
253/2381
254/2381
255/2381
256/2381
257/2381
258/2381
259/2381
260/2381
261/2381
262/2381
263/2381
264/2381
265/2381
266/2381
2



1080/2381
1081/2381
1082/2381
1083/2381
1084/2381
1085/2381
1086/2381
1087/2381
1088/2381
1089/2381
1090/2381
1091/2381
1092/2381
1093/2381
1094/2381
1095/2381
1096/2381
1097/2381
1098/2381
1099/2381
1100/2381
1101/2381
1102/2381
1103/2381
1104/2381
1105/2381
1106/2381
1107/2381
1108/2381
1109/2381
1110/2381
1111/2381
1112/2381
1113/2381
1114/2381
1115/2381
1116/2381
1117/2381
1118/2381
1119/2381
1120/2381
1121/2381
1122/2381
1123/2381
1124/2381
1125/2381
1126/2381
1127/2381
1128/2381
1129/2381
1130/2381
1131/2381
1132/2381
1133/2381
1134/2381
1135/2381
1136/2381
1137/2381
1138/2381
1139/2381
1140/2381
1141/2381
1142/2381
1143/2381
1144/2381
1145/2381
1146/2381
1147/2381
1148/2381
1149/2381
1150/2381
1151/2381
1152/2381
1153/2381
1154/2381
1155/2381
1156/2381
1157/2381
1158/2381
1159/2381
1160/2381
1161/2381
1162/2381
1163/2381
1164/2381
1165/2381
1166/2381
1167/2381
1168/2381
1169/2381
1170/2381
1171/2381
1172/2381
1173/2381
1174/2381
1175/2381
1176/2381
1177/2381
1178/2381
1179/2381




3/2381
4/2381
5/2381
6/2381
7/2381
8/2381
9/2381
10/2381
11/2381
12/2381
13/2381
14/2381
15/2381
16/2381
17/2381
18/2381
19/2381
20/2381
21/2381
22/2381
23/2381
24/2381
25/2381
26/2381
27/2381
28/2381
29/2381
30/2381
31/2381
32/2381
33/2381
34/2381
35/2381
36/2381
37/2381
38/2381
39/2381
40/2381
41/2381
42/2381
43/2381
44/2381
45/2381
46/2381
47/2381
48/2381
49/2381
50/2381
51/2381
52/2381
53/2381
54/2381
55/2381
56/2381
57/2381
58/2381
59/2381
60/2381
61/2381
62/2381
63/2381
64/2381
65/2381
66/2381
67/2381
68/2381
69/2381
70/2381
71/2381
72/2381
73/2381
74/2381
75/2381
76/2381
77/2381
78/2381
79/2381
80/2381
81/2381
82/2381
83/2381
84/2381
85/2381
86/2381
87/2381
88/2381
89/2381
90/2381
91/2381
92/2381
93/2381
94/2381
95/2381
96/2381
97/2381
98/2381
99/2381
100/2381
101/2381
102/2381
103/2381
104/2381
105/2381
106/2381
107/2381
108/2381
109/2381
110/2381
111/2381
112/2381
113/2381
114/2381
115/2381
116/2381
117/2381
118/2381
119/2381
120/2381
121/2381
122/2381
123/2381
124/2381
125/23



3/2380
4/2380
5/2380
6/2380
7/2380
8/2380
9/2380
10/2380
11/2380
12/2380
13/2380
14/2380
15/2380
16/2380
17/2380
18/2380
19/2380
20/2380
21/2380
22/2380
23/2380
24/2380
25/2380
26/2380
27/2380
28/2380
29/2380
30/2380
31/2380
32/2380
33/2380
34/2380
35/2380
36/2380
37/2380
38/2380
39/2380
40/2380
41/2380
42/2380
43/2380
44/2380
45/2380
46/2380
47/2380
48/2380
49/2380
50/2380
51/2380
52/2380
53/2380
54/2380
55/2380
56/2380
57/2380
58/2380
59/2380
60/2380
61/2380
62/2380
63/2380
64/2380
65/2380
66/2380
67/2380
68/2380
69/2380
70/2380
71/2380
72/2380
73/2380
74/2380
75/2380
76/2380
77/2380
78/2380
79/2380
80/2380
81/2380
82/2380
83/2380
84/2380
85/2380
86/2380
87/2380
88/2380
89/2380
90/2380
91/2380
92/2380
93/2380
94/2380
95/2380
96/2380
97/2380
98/2380
99/2380
100/2380
101/2380
102/2380
103/2380
104/2380
105/2380
106/2380
107/2380
108/2380
109/2380
110/2380
111/2380
112/2380
113/2380
114/2380
115/2380
116/2380
117/2380
118/2380
119/2380
120/2380
121/2380
122/2380
123/2380
124/2380
125/23



17/2380
18/2380
19/2380
20/2380
21/2380
22/2380
23/2380
24/2380
25/2380
26/2380
27/2380
28/2380
29/2380
30/2380
31/2380
32/2380
33/2380
34/2380
35/2380
36/2380
37/2380
38/2380
39/2380
40/2380
41/2380
42/2380
43/2380
44/2380
45/2380
46/2380
47/2380
48/2380
49/2380
50/2380
51/2380
52/2380
53/2380
54/2380
55/2380
56/2380
57/2380
58/2380
59/2380
60/2380
61/2380
62/2380
63/2380
64/2380
65/2380
66/2380
67/2380
68/2380
69/2380
70/2380
71/2380
72/2380
73/2380
74/2380
75/2380
76/2380
77/2380
78/2380
79/2380
80/2380
81/2380
82/2380
83/2380
84/2380
85/2380
86/2380
87/2380
88/2380
89/2380
90/2380
91/2380
92/2380
93/2380
94/2380
95/2380
96/2380
97/2380
98/2380
99/2380
100/2380
101/2380
102/2380
103/2380
104/2380
105/2380
106/2380
107/2380
108/2380
109/2380
110/2380
111/2380
112/2380
113/2380
114/2380
115/2380
116/2380
117/2380
118/2380
119/2380
120/2380
121/2380
122/2380
123/2380
124/2380
125/2380
126/2380
127/2380
128/2380
129/2380
130/2380
131/2380
132/2380
133/2380
134/2380
135/2380
136/2380
137



14/2380
15/2380
16/2380
17/2380
18/2380
19/2380
20/2380
21/2380
22/2380
23/2380
24/2380
25/2380
26/2380
27/2380
28/2380
29/2380
30/2380
31/2380
32/2380
33/2380
34/2380
35/2380
36/2380
37/2380
38/2380
39/2380
40/2380
41/2380
42/2380
43/2380
44/2380
45/2380
46/2380
47/2380
48/2380
49/2380
50/2380
51/2380
52/2380
53/2380
54/2380
55/2380
56/2380
57/2380
58/2380
59/2380
60/2380
61/2380
62/2380
63/2380
64/2380
65/2380
66/2380
67/2380
68/2380
69/2380
70/2380
71/2380
72/2380
73/2380
74/2380
75/2380
76/2380
77/2380
78/2380
79/2380
80/2380
81/2380
82/2380
83/2380
84/2380
85/2380
86/2380
87/2380
88/2380
89/2380
90/2380
91/2380
92/2380
93/2380
94/2380
95/2380
96/2380
97/2380
98/2380
99/2380
100/2380
101/2380
102/2380
103/2380
104/2380
105/2380
106/2380
107/2380
108/2380
109/2380
110/2380
111/2380
112/2380
113/2380
114/2380
115/2380
116/2380
117/2380
118/2380
119/2380
120/2380
121/2380
122/2380
123/2380
124/2380
125/2380
126/2380
127/2380
128/2380
129/2380
130/2380
131/2380
132/2380
133/2380
134/23

In [None]:
print(f"Recall: {recall.mean():.4f} +/- {recall.std():.4f}")
print(f"Precision: {precision.mean():.4f} +/- {precision.std():.4f}")
print(f"Accuracy: {acc.mean():.4f} +/- {acc.std():.4f}")
print(f"F1: {f1.mean():.4f} +/- {f1.std():.4f}")
print(f"ROC AUC: {auc.mean():.4f} +/- {auc.std():.4f}")

Recall: 0.8520 +/- 0.0128
Precision: 0.9460 +/- 0.0079
Accuracy: 0.9015 +/- 0.0068
F1: 0.8964 +/- 0.0062
ROC AUC: 0.9687 +/- 0.0027


# Predição utilizando particionamento único - IB

In [None]:
#intersect bayes
%%time
k = 0.001

f1, auc, acc, recall, precision = (
    np.zeros(5), np.zeros(5), np.zeros(5), np.zeros(5), np.zeros(5)
)

for fold in range(5):
    print(f"Fold {fold}/5")

    train = data.loc[data['fold'] != fold, :]
    test = data.loc[data['fold'] == fold, :]

    words_dict_1, words_dict_0, words_dict = create_words_dict(train, text, target)
    obj_pairs = Pairs(words_dict, words_dict_0, words_dict_1)

    prioris = train[target].value_counts(normalize=True)
    counts = train[target].value_counts(normalize=False)

    xtest_prob_0, xtest_prob_1 = np.zeros(len(test)), np.zeros(len(test))
    solutions = np.zeros(len(test))
    for i, entry in enumerate(test[text]):
        print(f"{i}/{len(test)}")
        try:
            pairs = pd.DataFrame(obj_pairs.get_pairs(entry))
            if len(pairs) > 0:
                pairs = pairs.sort_values(by='int', ascending=False)
            solution = pd.DataFrame(get_solution(pairs, entry)).sort_values(by='int', ascending=False)
            priori_words_0 = priori_words_1 = 1
            solutions[i] = len(solution)
            for row in solution.itertuples():
                    priori_words_0 *= (k + row.int_0) / (2*k + row.int)
                    priori_words_1 *= (k + row.int_1) / (2*k + row.int)

            xtest_prob_0[i] = (priori_words_0 * prioris['real']) / (priori_words_0 * prioris['real'] +
                                                                priori_words_1 * prioris['fakenews'])
            xtest_prob_1[i] = (priori_words_1 * prioris['fakenews']) / (priori_words_0 * prioris['real'] +
                                                                 priori_words_1 * prioris['fakenews'])
            #print(f"I = {solution['int'].sum()} I_0 = {solution['int_0'].sum()} I_1 = {solution['int_1'].sum()}")
        except Exception as e:
            print(f"Can't label {i}, '{entry}' words not present in train")

     # Evaluate
    pred = xtest_prob_0 < xtest_prob_1
    pred_proba = np.nan_to_num(xtest_prob_1, nan=0.5)
    f1[fold] = f1_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                             pred)
    auc[fold] = roc_auc_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                             pred_proba)
    acc[fold] = accuracy_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                             pred)
    recall[fold] = recall_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                             pred)
    precision[fold] = precision_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                             pred)
    #break

[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
2146/2380
2147/2380
2148/2380
2149/2380
2150/2380
2151/2380
2152/2380
2153/2380
2154/2380
2155/2380
2156/2380
2157/2380
2158/2380
2159/2380
2160/2380
2161/2380
2162/2380
2163/2380
2164/2380
2165/2380
2166/2380
2167/2380
2168/2380
2169/2380
2170/2380
2171/2380
2172/2380
2173/2380
2174/2380
2175/2380
2176/2380
2177/2380
2178/2380
2179/2380
2180/2380
2181/2380
2182/2380
2183/2380
2184/2380
2185/2380
2186/2380
2187/2380
2188/2380
2189/2380
2190/2380
2191/2380
2192/2380
2193/2380
2194/2380
2195/2380
2196/2380
2197/2380
2198/2380
2199/2380
2200/2380
2201/2380
2202/2380
2203/2380
2204/2380
2205/2380
2206/2380
2207/2380
2208/2380
2209/2380
2210/2380
2211/2380
2212/2380
2213/2380
2214/2380
2215/2380
2216/2380
2217/2380
2218/2380
2219/2380
2220/2380
2221/2380
2222/2380
2223/2380
2224/2380
2225/2380
2226/2380
2227/2380
2228/2380
2229/2380
2230/2380
2231/2380
2232/2380
2233/2380
2234/2380
2235/2380
2236/2380
2237/2380
2238/23

In [None]:
print(f"Recall: {recall.mean():.4f} +/- {recall.std():.4f}")
print(f"Precision: {precision.mean():.4f} +/- {precision.std():.4f}")
print(f"Accuracy: {acc.mean():.4f} +/- {acc.std():.4f}")
print(f"F1: {f1.mean():.4f} +/- {f1.std():.4f}")
print(f"ROC AUC: {auc.mean():.4f} +/- {auc.std():.4f}")

Recall: 0.8315 +/- 0.0111
Precision: 0.9633 +/- 0.0082
Accuracy: 0.8998 +/- 0.0064
F1: 0.8924 +/- 0.0060
ROC AUC: 0.9728 +/- 0.0032


In [None]:
#naive bayes
%%time
k = 0.001

f1_pure, auc_pure, acc_pure, recall_pure, precision_pure = (
    np.zeros(5), np.zeros(5), np.zeros(5), np.zeros(5), np.zeros(5)
)

for fold in range(5):
    print(f"Fold {fold}/5")

    train = data.loc[data['fold'] != fold, :]
    test = data.loc[data['fold'] == fold, :]

    words_dict_1, words_dict_0, words_dict = create_words_dict(train, text, target)

    prioris = train[target].value_counts(normalize=True)
    counts = train[target].value_counts(normalize=False)

    xtest_prob_0_pure, xtest_prob_1_pure = np.zeros(len(test)), np.zeros(len(test))

    for i, entry in enumerate(test[text]):
        priori_words_0 = priori_words_1 = 1
        for word in set(entry.split(" ")):
            if word in words_dict_0:
                priori_words_0 *= (k + len(words_dict_0[word])) / (2*k + counts[0])
            else:
                priori_words_0 *= k / 2*k
            if word in words_dict_1:
                priori_words_1 *= (k + len(words_dict_1[word])) / (2*k + counts[1])
            else:
                priori_words_1 *= k / 2*k

        xtest_prob_0_pure[i] = (priori_words_0 * prioris['real']) / (priori_words_0 * prioris['real'] +
                                                               priori_words_1 * prioris['fakenews'])
        xtest_prob_1_pure[i] = (priori_words_1 * prioris['fakenews']) / (priori_words_0 * prioris['real'] +
                                                                priori_words_1 * prioris['fakenews'])
    # Evaluate
    pred_pure = xtest_prob_0_pure < xtest_prob_1_pure
    pred_proba_pure = np.nan_to_num(xtest_prob_1_pure, nan=0.5)
    f1_pure[fold] = f1_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                             pred_pure)
    auc_pure[fold] = roc_auc_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                             pred_proba_pure)
    acc_pure[fold] = accuracy_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                             pred_pure)
    recall_pure[fold] = recall_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                             pred_pure)
    precision_pure[fold] = precision_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                             pred_pure)

Fold 0/5
words_dict_0: 33176 words_dict_1:33009




Fold 1/5
words_dict_0: 33080 words_dict_1:33100




Fold 2/5
words_dict_0: 32931 words_dict_1:33220




Fold 3/5
words_dict_0: 32837 words_dict_1:33371




Fold 4/5
words_dict_0: 32981 words_dict_1:33243




CPU times: user 29.1 s, sys: 331 ms, total: 29.4 s
Wall time: 29.8 s


In [None]:
print(f"Recall: {recall_pure.mean():.4f} +/- {recall_pure.std():.4f}")
print(f"Precision: {precision_pure.mean():.4f} +/- {precision_pure.std():.4f}")
print(f"Accuracy: {acc_pure.mean():.4f} +/- {acc_pure.std():.4f}")
print(f"F1: {f1_pure.mean():.4f} +/- {f1_pure.std():.4f}")
print(f"ROC AUC: {auc_pure.mean():.4f} +/- {auc_pure.std():.4f}")

Recall: 0.8121 +/- 0.0114
Precision: 0.9680 +/- 0.0076
Accuracy: 0.8925 +/- 0.0060
F1: 0.8832 +/- 0.0055
ROC AUC: 0.9727 +/- 0.0028


### Vectorizing

In [None]:
def CountOrTfidf_vec(train,test,opt):
#does the transformation for given type of vectorizer
    if(opt == "tfidf"):
        transformer = TfidfVectorizer(min_df=5, max_features=30000)
    elif(opt == "count"):
        transformer = CountVectorizer(min_df=5, max_features=30000)
    train_CorTf = transformer.fit_transform(train)
    test_CorTf = transformer.transform(test)
    return train_CorTf,test_CorTf

In [None]:
#multinomial Naive Bayes
%%time
f1_mnb, auc_mnb, acc_mnb, recall_mnb, precision_mnb = (
    np.zeros(5), np.zeros(5), np.zeros(5), np.zeros(5), np.zeros(5)
)

for fold in range(5):
    print(f"Fold {fold}/5")

    train = data.loc[data['fold'] != fold, :]
    test = data.loc[data['fold'] == fold, :]

    x_trainCorT, x_testCorT = CountOrTfidf_vec(train[text], test[text], "count")
    nb = MultinomialNB()
    nb.fit(x_trainCorT.toarray(), train[target].apply(lambda x: 1 if x == 'fakenews' else 0))

    pred_mnb = nb.predict(x_testCorT.toarray())
    pred_proba_mnb = nb.predict_proba(x_testCorT.toarray())[:, 1]

    f1_mnb[fold] = f1_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                             pred_mnb)
    auc_mnb[fold] = roc_auc_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                             pred_proba_mnb)
    acc_mnb[fold] = accuracy_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                             pred_mnb)
    recall_mnb[fold] = recall_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                             pred_mnb)
    precision_mnb[fold] = precision_score(test[target].apply(lambda x: True if x == 'fakenews' else False),
                             pred_mnb)
    #break

Fold 0/5
Fold 1/5
Fold 2/5
Fold 3/5
Fold 4/5
CPU times: user 40.7 s, sys: 5.29 s, total: 46 s
Wall time: 45.7 s


In [None]:
print(f"Precision: {precision_mnb.mean():.4f} +/- {precision_mnb.std():.4f}")
print(f"Recall: {recall_mnb.mean():.4f} +/- {recall_mnb.std():.4f}")
print(f"Accuracy: {acc_mnb.mean():.4f} +/- {acc_mnb.std():.4f}")
print(f"F1: {f1_mnb.mean():.4f} +/- {f1_mnb.std():.4f}")
print(f"ROC AUC: {auc_mnb.mean():.4f} +/- {auc_mnb.std():.4f}")

Precision: 0.9087 +/- 0.0077
Recall: 0.9039 +/- 0.0058
Accuracy: 0.9065 +/- 0.0048
F1: 0.9063 +/- 0.0039
ROC AUC: 0.9657 +/- 0.0045


In [None]:
test['pure'] = pred_proba_pure
test['intersect'] = pred_proba
#test['mnb'] = pred_proba_mnb

In [None]:
def kde(column, target, data, ax=None, title=None, xlabel="", ylabel=""):
    if ax is None: fig, ax = plt.subplots(1, 1, figsize=(15, 3))
    g = sns.kdeplot(data[column][(data[target] == 'real')], ax=ax, color="Red")
    g = sns.kdeplot(data[column][(data[target] == 'fakenews')], ax=g, color="Blue")
    g.set_xlabel(xlabel)
    g.set_ylabel(ylabel)
    if title is not None: g.set_title(title)
    #g = g.legend(['Não conluio', 'Conluio'])

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(15, 3), dpi=500)
plt.rcParams.update({'font.size': 14})

kde('pure', 'Classe', test, ax=axs[0], title="Naive Bayes")
axs[0].set_ylim([0,25])
kde('intersect', 'Classe', test, ax=axs[1], title="Intersect Bayes")
axs[1].set_ylim([0,25])
#kde('mnb', 'Spam/Ham', test, ax=axs[2])
fig.legend(["$\overline{fakenews}$", "$fakenews$"])

fig.text(0.5, 0.00, 'Probabilidade', ha='center')
fig.text(0.08, 0.5, 'Frequência', va='center', rotation='vertical')
fig.savefig("kde.png")

### testar novo código - Intersect Bayes com MNB

In [18]:
%%time
# cria um dicionário de grupos
def create_group_dict(train, text, target, solution_func):
    group_dict = defaultdict(set)
    for i, entry in enumerate(train[text]):
        pairs = solution_func(entry)  # interseção de grupos de palavras
        for group in pairs:
            group_id = f"{group['a']}_{group['b']}" # id único
            group_dict[group_id].add(i)
    return group_dict

# substituir palavras pelos ids dos grupos
def replace_with_groups(entry, solution_func):
    pairs = solution_func(entry)  # pares de palavras
    transformed_text = []
    for group in pairs:
        group_id = f"{group['a']}_{group['b']}" # id único
        transformed_text.append(group_id)
    return ' '.join(transformed_text)

# calcular frequência das palavras 
def calculate_word_frequencies(data, text_column):
    word_frequency = defaultdict(int) # dicionário para contar a frequência de palavras
    for entry in data[text_column]:
        for word in entry.split():
            word_frequency[word] += 1
    return word_frequency

# classe para criar pares de palavras
class Pairs:
    def __init__(self, words_dict, words_dict_0, words_dict_1, word_frequency, min_threshold):
        self.pairs_matrix = dict()
        self.words_dict = words_dict
        self.words_dict_1 = words_dict_1
        self.words_dict_0 = words_dict_0
        self.word_frequency = word_frequency
        self.min_threshold = min_threshold

    def get_pairs(self, entry):
        pairs = []
        words = list(set(entry.split(' ')))

        # filtrar palavras acima da frequencia minima
        words = [word for word in words if self.word_frequency[word] > self.min_threshold]

        for j in range(len(words)):
            for k in range(j+1, len(words)):
                if words[j] not in self.words_dict or words[k] not in self.words_dict:
                    continue
                intersection = self.words_dict[words[j]] & self.words_dict[words[k]]
                intersection_0 = self.words_dict_0.get(words[j], set()) & self.words_dict_0.get(words[k], set())
                intersection_1 = self.words_dict_1.get(words[j], set()) & self.words_dict_1.get(words[k], set())

                if len(intersection) > 0:
                    pairs.append({
                        'a': words[j],
                        'b': words[k],
                        'int': len(intersection),
                        'int_0': len(intersection_0),
                        'int_1': len(intersection_1)
                    })
        return pairs

# heuristica gulosa
def get_solution(pairs, entry):
    remaining = set(entry.split(' '))
    solution = []
    if len(pairs) > 0:
        for row in pairs:
            if row['a'] in remaining and row['b'] in remaining:
                solution.append(row)
                remaining.remove(row['a'])
                remaining.remove(row['b'])
                if len(remaining) <= 1:
                    break
    for word in remaining:
        solution.append({'a': word, 'b': word, 'int': 0, 'int_0': 0, 'int_1': 0})
    return solution

# aplicação do mnb
def train_mnb(train, test, text, target, solution_func):
   
    train[text] = train[text].apply(lambda x: replace_with_groups(x, solution_func))
    test[text] = test[text].apply(lambda x: replace_with_groups(x, solution_func))

    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(train[text])
    X_test = vectorizer.transform(test[text])

    clf = MultinomialNB()
    clf.fit(X_train, train[target])

    predictions = clf.predict(X_test)
    probabilities = clf.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(test[target], predictions)
    f1 = f1_score(test[target], predictions, average='weighted')
    precision = precision_score(test[target], predictions, average='weighted')
    recall = recall_score(test[target], predictions, average='weighted')
    roc_auc = roc_auc_score(test[target], probabilities)

    return accuracy, f1, precision, recall, roc_auc

if __name__ == "__main__":

    # 80/20 treino/teste
    train, test = train_test_split(data, test_size=0.2, random_state=42, stratify=data[target])

    # frequencia das palavras
    word_frequency = calculate_word_frequencies(train, text)

    # limiar de frequencia minima das palavras
    min_threshold = 2

    # criação dos dicionários de palavras
    words_dict_1, words_dict_0, words_dict = defaultdict(set), defaultdict(set), defaultdict(set)
    obj_pairs = Pairs(words_dict, words_dict_0, words_dict_1, word_frequency, min_threshold)

    # cria a solução
    def solution_func(entry):
        pairs = obj_pairs.get_pairs(entry)
        return get_solution(pairs, entry)


    accuracy, f1, precision, recall, roc_auc = train_mnb(train, test, text, target, solution_func)
    print(f"Accuracy: {accuracy}")
    print(f"F1-Score: {f1}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"Roc-Auc: {roc_auc}")

Accuracy: 0.9231415371692566
F1-Score: 0.9231096374575507
Precision: 0.9238608943870871
Recall: 0.9231415371692566
Roc-Auc: 0.9767390583437405
CPU times: user 5.41 s, sys: 68.2 ms, total: 5.48 s
Wall time: 5.59 s
