# Data loader

In [36]:
import pandas as pd

from sklearn.model_selection import RepeatedStratifiedKFold
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1410)

class DatasetLoader:
    def get(self, name):
        print(f"loading {name}")
        if name == "fnkdd":
            data = pd.read_csv("/home/rkozik/Desktop/swarog_exp_disk/datasets/fakenewskdd/train.csv",sep="\t")
            data.head()
            body = data["text"].values
            labels = data["label"].values
            total_number_of_claims = data.shape[0]
            print("total_number_of_claims=",total_number_of_claims)
            print("labels fake=",sum(labels),"real=", len(labels)-sum(labels))
        
        if name == "mmcovid":
            data = pd.read_csv("/media/rkozik/02FF-A831/data/swarog/datasets/mmcovid/news_collection.csv",sep="\t")
            data["label"] = [ 1 if v =="fake" else 0 for v in data["label"]]
            data["text"] = [ str(v) for v in data["text"]]
            data = data[data["lang"] == "en"]

            body = data["text"].values
            labels = data["label"].values
            total_number_of_claims = data.shape[0]
            print("total_number_of_claims=",total_number_of_claims)
            print("labels fake=",sum(labels),"real=", len(labels)-sum(labels))
        
        if name == "liar":
            data = pd.read_csv("/media/rkozik/02FF-A831/data/swarog/datasets/liar.csv", sep="\t",encoding="utf-8")
            def mpx(x):
                if x in [0,2]:
                    return 0
                elif x in [4,5]:
                    return 1
                else:
                    return -1
            data["text"] = data["statement"]
            data["label"] = [mpx(x) for x in data["label"]]
            data=data[ data["label"] != -1] 
            body = data["text"].values
            labels = data["label"].values
            total_number_of_claims = data.shape[0]
            print("total_number_of_claims=",total_number_of_claims)
            print("labels fake=",sum(labels),"real=", len(labels)-sum(labels))

        if name == "covidfn":
            data = pd.read_csv("covid_fake_news.csv", sep="\t")
            body = data["title"].values
            labels = data["label"].values
            total_number_of_claims = data.shape[0]
            print("total_number_of_claims=",total_number_of_claims)
            print("labels fake=",sum(labels),"real=", len(labels)-sum(labels))
        
        return body, labels, total_number_of_claims

# Utility functions

In [97]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from imblearn.metrics import geometric_mean_score
from IPython.display import clear_output



class Metrics:
    def __init__(self):
        self.scores = {
            'Accuracy': {'func': accuracy_score},
            'Balanced Accuracy': {'func': balanced_accuracy_score},
            'F1': {'func': f1_score},
            'Precision': {'func': precision_score},
            'Recall': {'func': recall_score},
            'G-mean': {'func': geometric_mean_score}
        }
        
        for score_name, score_dict in self.scores.items():
            score_dict["list"] = []
            score_dict["lab"] = []

    def update(self, actual, prediction):
        for score_name, score_dict in self.scores.items():
            if score_name in ["F1","Precision","Recall","G-mean"]:
                scorvaln = score_dict['func'](actual, prediction, average=None)
                score_dict['lab'].append(scorvaln)
                scorval = score_dict['func'](actual, prediction, average="weighted")
                score_dict['list'].append(scorval)
                #print(score_name, scorval, scorvaln)  
            else:
                scorval=score_dict['func'](actual, prediction)
                score_dict['list'].append(scorval)
                
    def print_table(self, labels=None):
        # Print stats
        scores = self.scores
        numlabels = scores["F1"]["lab"][0].shape[0]
        scores["F1"]["lab"][0].shape[0] 
        head = "  %-20s  %-10s  " +  numlabels * " %-10s  " 
        headv = ["Score", "Average"]
        if labels:
            headv.extend([labels[i] for i in range(numlabels)])
        else:
            headv.extend(["Lab:"+str(i+1) for i in range(numlabels)])
        row=head % tuple(headv)
        # table header
        print("―"*len(row))
        print(row)
        print("―"*len(row))
        # table rows
        for score_name, score_dict in sorted(scores.items()) :
            headv = [score_name, np.mean(score_dict['list'])*100, np.std(score_dict['list'])*100]
            for i in range(numlabels):
                if score_name in ["F1","Precision","Recall", "G-mean"]:
                    head = "  %-20s  %4.1f ± %4.1f  " + numlabels* "%4.1f ± %4.1f  "
                    vals = [v[i] for v in scores[score_name]["lab"]]
                    headv.append(np.mean(vals)*100)
                    headv.append(np.std(vals)*100)
                else:
                    head = "  %-20s  %4.1f ± %4.1f  " + numlabels * "%-11s  " 
                    headv.append("-")
            print(head % tuple(headv))
        print("―"*len(row))


def get_graph_node_stats(vec, nearestDocIDs, y_train, bodyTrainTFIDF):   
    vecdense = vec.toarray()[0]
    docids = nearestDocIDs
    trlabels = np.array(y_train)
    labsum = trlabels[docids].sum()
    
    ivec = []
    labmask = []
    for hitdocid in docids:
        value=bodyTrainTFIDF[hitdocid].toarray()[0]
        intersection = (vecdense>0)*(value>0)
        ivec.append(intersection.sum())
        labmask.append(trlabels[hitdocid])
        
    masked_ivec =  np.array(ivec)*np.array(labmask)   
    masked_ivec_neg =  np.array(ivec)*(-1*(np.array(labmask)-1)) 
    ivec = np.array(ivec)
    masked_ivec = np.array(masked_ivec)
    masked_ivec_neg = np.array(masked_ivec_neg)
    
    newvec = [labsum, (vecdense>0).sum(),ivec.max(), ivec.max(), masked_ivec.max(), masked_ivec.min(), masked_ivec_neg.max(), masked_ivec_neg.min()]
    return newvec

# Swarog Model

In [101]:
import numpy as np
import bentoml
from bentoml.io import NumpyNdarray
from bentoml.io import JSON
from annoy import AnnoyIndex
import re

import transformers
transformers.logging.set_verbosity_error()
from torch.utils.data import DataLoader 

from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import TFDistilBertModel, DistilBertTokenizerFast
import torch
from scipy.sparse import csr_matrix

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import pickle5 as pickle
import sqlite3
from tqdm import tqdm
 
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("using device:", device)

if "disilbert_model" not in locals():
    disilbert_tokenizer =  AutoTokenizer.from_pretrained("distilbert-base-uncased")
    disilbert_model = AutoModel.from_pretrained("distilbert-base-uncased")
    handle = disilbert_model.to(device)

class SwarogModel:
    def __init__(self):
        self.tokenizer =  disilbert_tokenizer
        self.model = disilbert_tokenizer
        self.max_length = 256
        self.model_name = disilbert_model

    def fit(self, X=None, y=None):
        pass
    
    def encode(self, txt):
        return self.tokenizer(txt, max_length=self.max_length, 
                              truncation=True, padding=True, return_tensors="pt")

    def transform(self, X=None):
        dataloader = DataLoader(X, batch_size=4, shuffle=False)
        allembeds = []
        for batch in tqdm(dataloader):
            batchenc = disilbert_tokenizer(batch, max_length=256, 
                                           truncation=True, padding=True, return_tensors="pt")
            input_ids = batchenc['input_ids'].to(device)
            attention_mask = batchenc['attention_mask'].to(device)
            batchout = disilbert_model(input_ids, attention_mask=attention_mask, 
                                       output_hidden_states=True)
            embeds = [vec[0].cpu().detach().numpy() for vec in batchout[1][-1]]
            allembeds.extend(embeds)
        return np.array(allembeds)
    
    def train(self, body, labels):
        embeddings = self.transform(body)
        self.cls = LogisticRegression(max_iter=1000)
        self.cls.fit(embeddings, labels)
        self.train_prob = self.cls.predict_proba(embeddings)
        
    def predict(self, body):
        embeddings = self.transform(body)
        self.test_prob = self.cls.predict_proba(embeddings)
        return  self.cls.predict(embeddings) 

using device: cuda


# Graph Model

In [104]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression

# Download stopwords list
nltk.download('wordnet')
nltk.download('punkt')
stop_words = set(stopwords.words('english')) 

# Interface lemma tokenizer from nltk with sklearn
class LemmaTokenizer:
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`', "'"]
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if t not in self.ignore_tokens]
    

class TrustexModel:
    def __init__(self):
        # Lemmatize the stop words
        self.tokenizer=LemmaTokenizer()
        self.token_stop = self.tokenizer(' '.join(stop_words))
        
    def tfidf(self,body):
        self.tfidf_vectorizer = TfidfVectorizer(stop_words=self.token_stop)
        self.tfidf_vectorizer.fit(body)
        self.vocabulary_tfidf_words = self.tfidf_vectorizer.get_feature_names_out()
        self.bodyTrainTFIDF = self.tfidf_vectorizer.transform(body)
        
    def create_graph(self, body, labels):
        self.nn = NearestNeighbors(n_neighbors=10)
        self.nn.fit(self.bodyTrainTFIDF)
        knn_d,knn_idx = self.nn.kneighbors(self.bodyTrainTFIDF)
        self.graph_knn = []
        self.train_labels = labels
        from tqdm import tqdm
        for id, topIDs in tqdm(enumerate(knn_idx), total=knn_idx.shape[0]):
            vec = self.bodyTrainTFIDF[id]
            newvec = get_graph_node_stats(vec, topIDs[1:], labels, self.bodyTrainTFIDF)
            self.graph_knn.append(newvec)
        print("avg. nodes sim.=",np.mean([x[2]/x[1] for x in self.graph_knn]))

    def graph_transform_test_data(self, body):
        self.bodyTestTFIDF = self.tfidf_vectorizer.transform(body) 
        knn_test_d,knn_test_idx = self.nn.kneighbors(self.bodyTestTFIDF)
        self.graph_test_knn = []
        for id, topIDs in tqdm(enumerate(knn_test_idx), total=knn_test_idx.shape[0]):
            vec = self.bodyTestTFIDF[id]
            newvec = get_graph_node_stats(vec, topIDs[1:], self.train_labels, self.bodyTrainTFIDF)
            self.graph_test_knn.append(newvec)        
              
    def train(self, body, labels):
        print("Building similarity graph")
        self.tfidf(body)
        self.create_graph(body, labels)
          
        self.cls = LogisticRegression(max_iter=10000)
        self.cls.fit(self.graph_knn, labels)

    def predict(self, body):
        self.graph_transform_test_data(body)
        y_pred = self.cls.predict(self.graph_test_knn)
        return y_pred
    


[nltk_data] Downloading package wordnet to /home/rkozik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/rkozik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Infrence 

In [111]:
from sklearn.ensemble import RandomForestClassifier

class Inference:
    def train(self,graph, content, labels):
        newX=[]
        for i,vec in enumerate(content):
            v2 = np.append(content[i], graph[i])
            newX.append(v2)
            
        self.inf = RandomForestClassifier(max_depth=12)
        self.inf.fit(newX, labels)
        
    def predict(self, graph, content):
        newTest=[]
        for i,vec in enumerate(content):
            v2 = np.append(content[i], graph[i])
            newTest.append(v2)
    
        return self.inf.predict(newTest)

# Experiments

In [120]:
loader = DatasetLoader()

for dataset in ["covidfn"]:
    body, labels, total_number_of_claims = loader.get(dataset)
    X=range(0,total_number_of_claims)
    
    trustex_quality = Metrics()
    swarog_quality = Metrics()
    inf_quality = Metrics()
    

    for fold_idx, (train, test) in enumerate(rskf.split(X, labels)):
        print(f"fold-{fold_idx}")    
        
        swarog = SwarogModel()
        trustex = TrustexModel()
        inference = Inference()
    
        trustex.train(body[train],labels[train])
        ypred = trustex.predict(body[test])
        trustex_quality.update(labels[test], ypred)
        
        swarog.train(body[train],labels[train])
        ypred = swarog.predict(body[test])
        swarog_quality.update(labels[test], ypred)
        
        inference.train(trustex.graph_knn, swarog.train_prob, labels[train])
        newpred = inference.predict(trustex.graph_test_knn, swarog.test_prob)
        inf_quality.update(labels[test], newpred)


print("Symbolic:")
trustex_quality.print_table()
print("Deep:")
swarog_quality.print_table()
print("Both:")
inf_quality.print_table()


loading covidfn
total_number_of_claims= 8972
labels fake= 461 real= 8511
fold-0
Building similarity graph


100%|█████████████████████████████████████| 8074/8074 [00:07<00:00, 1093.85it/s]


avg. nodes sim.= 0.49843772108989287


100%|███████████████████████████████████████| 898/898 [00:00<00:00, 1088.30it/s]
100%|██████████████████████████████████████| 2019/2019 [00:10<00:00, 191.12it/s]
100%|████████████████████████████████████████| 225/225 [00:01<00:00, 189.11it/s]


fold-1
Building similarity graph


100%|█████████████████████████████████████| 8074/8074 [00:07<00:00, 1092.86it/s]


avg. nodes sim.= 0.49894484811768425


100%|███████████████████████████████████████| 898/898 [00:00<00:00, 1092.27it/s]
100%|██████████████████████████████████████| 2019/2019 [00:10<00:00, 193.19it/s]
100%|████████████████████████████████████████| 225/225 [00:01<00:00, 188.02it/s]


fold-2
Building similarity graph


100%|█████████████████████████████████████| 8075/8075 [00:07<00:00, 1092.63it/s]


avg. nodes sim.= 0.4963674985297739


100%|███████████████████████████████████████| 897/897 [00:00<00:00, 1087.05it/s]
100%|██████████████████████████████████████| 2019/2019 [00:10<00:00, 192.61it/s]
100%|████████████████████████████████████████| 225/225 [00:01<00:00, 187.77it/s]


fold-3
Building similarity graph


100%|█████████████████████████████████████| 8075/8075 [00:07<00:00, 1073.99it/s]


avg. nodes sim.= 0.49683872879472807


100%|███████████████████████████████████████| 897/897 [00:00<00:00, 1080.23it/s]
100%|██████████████████████████████████████| 2019/2019 [00:10<00:00, 192.23it/s]
100%|████████████████████████████████████████| 225/225 [00:01<00:00, 188.12it/s]


fold-4
Building similarity graph


100%|█████████████████████████████████████| 8075/8075 [00:07<00:00, 1079.12it/s]


avg. nodes sim.= 0.4980695776280309


100%|███████████████████████████████████████| 897/897 [00:00<00:00, 1079.36it/s]
100%|██████████████████████████████████████| 2019/2019 [00:10<00:00, 192.47it/s]
100%|████████████████████████████████████████| 225/225 [00:01<00:00, 189.58it/s]


fold-5
Building similarity graph


100%|█████████████████████████████████████| 8075/8075 [00:07<00:00, 1070.05it/s]


avg. nodes sim.= 0.4961970478447988


100%|███████████████████████████████████████| 897/897 [00:00<00:00, 1070.26it/s]
100%|██████████████████████████████████████| 2019/2019 [00:10<00:00, 192.86it/s]
100%|████████████████████████████████████████| 225/225 [00:01<00:00, 186.20it/s]


fold-6
Building similarity graph


100%|█████████████████████████████████████| 8075/8075 [00:07<00:00, 1068.29it/s]


avg. nodes sim.= 0.49845898081323436


100%|███████████████████████████████████████| 897/897 [00:00<00:00, 1064.38it/s]
100%|██████████████████████████████████████| 2019/2019 [00:10<00:00, 189.82it/s]
100%|████████████████████████████████████████| 225/225 [00:01<00:00, 188.01it/s]


fold-7
Building similarity graph


100%|█████████████████████████████████████| 8075/8075 [00:07<00:00, 1079.03it/s]


avg. nodes sim.= 0.5007491060296677


100%|███████████████████████████████████████| 897/897 [00:00<00:00, 1076.63it/s]
100%|██████████████████████████████████████| 2019/2019 [00:10<00:00, 193.53it/s]
100%|████████████████████████████████████████| 225/225 [00:01<00:00, 187.62it/s]


fold-8
Building similarity graph


100%|█████████████████████████████████████| 8075/8075 [00:07<00:00, 1079.94it/s]


avg. nodes sim.= 0.4971720455277567


100%|███████████████████████████████████████| 897/897 [00:00<00:00, 1077.40it/s]
100%|██████████████████████████████████████| 2019/2019 [00:10<00:00, 192.91it/s]
100%|████████████████████████████████████████| 225/225 [00:01<00:00, 188.60it/s]


fold-9
Building similarity graph


100%|█████████████████████████████████████| 8075/8075 [00:07<00:00, 1087.33it/s]


avg. nodes sim.= 0.49983059103851946


100%|███████████████████████████████████████| 897/897 [00:00<00:00, 1081.61it/s]
100%|██████████████████████████████████████| 2019/2019 [00:10<00:00, 192.91it/s]
100%|████████████████████████████████████████| 225/225 [00:01<00:00, 189.65it/s]


Symbolic:
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Score                 Average      Lab:1        Lab:2       
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Accuracy              96.4 ±  0.4  -            -            
  Balanced Accuracy     69.6 ±  3.1  -            -            
  F1                    95.8 ±  0.5  98.1 ±  0.2  52.9 ±  6.3  
  G-mean                64.1 ±  4.4  62.7 ±  4.8  62.7 ±  4.8  
  Precision             96.0 ±  0.6  96.8 ±  0.3  80.4 ±  8.4  
  Recall                96.4 ±  0.4  99.5 ±  0.3  39.7 ±  6.2  
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
Deep:
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Score                 Average      Lab:1        Lab:2       
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Accuracy              97.5 ±  0.4  -            -            
  Balanced Accuracy     81.2 ±  2.7  -            -            
  F1                    97.3 ± 

In [117]:
loader = DatasetLoader()

for dataset in ["fnkdd"]:
    body, labels, total_number_of_claims = loader.get(dataset)
    X=range(0,total_number_of_claims)
    
    trustex_quality = Metrics()
    swarog_quality = Metrics()
    inf_quality = Metrics()
    

    for fold_idx, (train, test) in enumerate(rskf.split(X, labels)):
        print(f"fold-{fold_idx}")    
        
        swarog = SwarogModel()
        trustex = TrustexModel()
        inference = Inference()
    
        trustex.train(body[train],labels[train])
        ypred = trustex.predict(body[test])
        trustex_quality.update(labels[test], ypred)
        
        swarog.train(body[train],labels[train])
        ypred = swarog.predict(body[test])
        swarog_quality.update(labels[test], ypred)
        
        inference.train(trustex.graph_knn, swarog.train_prob, labels[train])
        newpred = inference.predict(trustex.graph_test_knn, swarog.test_prob)
        inf_quality.update(labels[test], newpred)

        
        
print("Symbolic:")
trustex_quality.print_table()

print("Deep:")
swarog_quality.print_table()

print("Both:")
inf_quality.print_table()
        
        

loading fnkdd
total_number_of_claims= 4986
labels fake= 2014 real= 2972
fold-0
Building similarity graph


100%|██████████████████████████████████████| 4487/4487 [00:07<00:00, 611.51it/s]


avg. nodes sim.= 0.4514347825623094


100%|████████████████████████████████████████| 499/499 [00:00<00:00, 607.56it/s]
100%|███████████████████████████████████████| 1122/1122 [00:15<00:00, 73.01it/s]
100%|█████████████████████████████████████████| 125/125 [00:01<00:00, 76.64it/s]


fold-1
Building similarity graph


100%|██████████████████████████████████████| 4487/4487 [00:07<00:00, 608.65it/s]


avg. nodes sim.= 0.4574446733552907


100%|████████████████████████████████████████| 499/499 [00:00<00:00, 606.50it/s]
100%|███████████████████████████████████████| 1122/1122 [00:15<00:00, 73.40it/s]
100%|█████████████████████████████████████████| 125/125 [00:01<00:00, 73.32it/s]


fold-2
Building similarity graph


100%|██████████████████████████████████████| 4487/4487 [00:07<00:00, 614.44it/s]


avg. nodes sim.= 0.4538385544167052


100%|████████████████████████████████████████| 499/499 [00:00<00:00, 609.09it/s]
100%|███████████████████████████████████████| 1122/1122 [00:15<00:00, 73.44it/s]
100%|█████████████████████████████████████████| 125/125 [00:01<00:00, 71.82it/s]


fold-3
Building similarity graph


100%|██████████████████████████████████████| 4487/4487 [00:07<00:00, 610.67it/s]


avg. nodes sim.= 0.4578061771417086


100%|████████████████████████████████████████| 499/499 [00:00<00:00, 612.23it/s]
100%|███████████████████████████████████████| 1122/1122 [00:15<00:00, 73.00it/s]
100%|█████████████████████████████████████████| 125/125 [00:01<00:00, 74.64it/s]


fold-4
Building similarity graph


100%|██████████████████████████████████████| 4487/4487 [00:07<00:00, 618.78it/s]


avg. nodes sim.= 0.45472603140799983


100%|████████████████████████████████████████| 499/499 [00:00<00:00, 616.01it/s]
100%|███████████████████████████████████████| 1122/1122 [00:15<00:00, 73.54it/s]
100%|█████████████████████████████████████████| 125/125 [00:01<00:00, 67.44it/s]


fold-5
Building similarity graph


100%|██████████████████████████████████████| 4487/4487 [00:07<00:00, 615.14it/s]


avg. nodes sim.= 0.4503095908366874


100%|████████████████████████████████████████| 499/499 [00:00<00:00, 610.73it/s]
100%|███████████████████████████████████████| 1122/1122 [00:15<00:00, 73.27it/s]
100%|█████████████████████████████████████████| 125/125 [00:01<00:00, 71.77it/s]


fold-6
Building similarity graph


100%|██████████████████████████████████████| 4488/4488 [00:07<00:00, 615.24it/s]


avg. nodes sim.= 0.45466396898602074


100%|████████████████████████████████████████| 498/498 [00:00<00:00, 617.27it/s]
100%|███████████████████████████████████████| 1122/1122 [00:15<00:00, 73.40it/s]
100%|█████████████████████████████████████████| 125/125 [00:01<00:00, 69.75it/s]


fold-7
Building similarity graph


100%|██████████████████████████████████████| 4488/4488 [00:07<00:00, 614.39it/s]


avg. nodes sim.= 0.45480810729190174


100%|████████████████████████████████████████| 498/498 [00:00<00:00, 608.78it/s]
100%|███████████████████████████████████████| 1122/1122 [00:15<00:00, 73.46it/s]
100%|█████████████████████████████████████████| 125/125 [00:01<00:00, 71.51it/s]


fold-8
Building similarity graph


100%|██████████████████████████████████████| 4488/4488 [00:07<00:00, 612.03it/s]


avg. nodes sim.= 0.45980710367652594


100%|████████████████████████████████████████| 498/498 [00:00<00:00, 606.00it/s]
100%|███████████████████████████████████████| 1122/1122 [00:15<00:00, 73.15it/s]
100%|█████████████████████████████████████████| 125/125 [00:01<00:00, 74.05it/s]


fold-9
Building similarity graph


100%|██████████████████████████████████████| 4488/4488 [00:07<00:00, 614.24it/s]


avg. nodes sim.= 0.4517896759738164


100%|████████████████████████████████████████| 498/498 [00:00<00:00, 613.60it/s]
100%|███████████████████████████████████████| 1122/1122 [00:15<00:00, 73.75it/s]
100%|█████████████████████████████████████████| 125/125 [00:01<00:00, 69.45it/s]


Symbolic:
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Score                 Average      Lab:1        Lab:2       
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Accuracy              72.7 ±  1.7  -            -            
  Balanced Accuracy     70.3 ±  2.0  -            -            
  F1                    72.2 ±  1.8  78.3 ±  1.2  63.0 ±  2.8  
  G-mean                70.2 ±  2.0  69.1 ±  2.3  69.1 ±  2.3  
  Precision             72.4 ±  1.7  74.4 ±  1.7  69.4 ±  2.0  
  Recall                72.7 ±  1.7  82.8 ±  1.3  57.8 ±  3.7  
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
Deep:
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Score                 Average      Lab:1        Lab:2       
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Accuracy              76.0 ±  1.3  -            -            
  Balanced Accuracy     74.2 ±  1.4  -            -            
  F1                    75.7 ± 

In [118]:
loader = DatasetLoader()

for dataset in ["mmcovid"]:
    body, labels, total_number_of_claims = loader.get(dataset)
    X=range(0,total_number_of_claims)
    
    trustex_quality = Metrics()
    swarog_quality = Metrics()
    inf_quality = Metrics()
    

    for fold_idx, (train, test) in enumerate(rskf.split(X, labels)):
        print(f"fold-{fold_idx}")    
        
        swarog = SwarogModel()
        trustex = TrustexModel()
        inference = Inference()
    
        trustex.train(body[train],labels[train])
        ypred = trustex.predict(body[test])
        trustex_quality.update(labels[test], ypred)
        
        swarog.train(body[train],labels[train])
        ypred = swarog.predict(body[test])
        swarog_quality.update(labels[test], ypred)
        
        inference.train(trustex.graph_knn, swarog.train_prob, labels[train])
        newpred = inference.predict(trustex.graph_test_knn, swarog.test_prob)
        inf_quality.update(labels[test], newpred)

        
        
print("Symbolic:")
trustex_quality.print_table()

print("Deep:")
swarog_quality.print_table()

print("Both:")
inf_quality.print_table()
        
        

loading mmcovid
total_number_of_claims= 7332
labels fake= 2028 real= 5304
fold-0
Building similarity graph


100%|██████████████████████████████████████| 6598/6598 [00:10<00:00, 655.05it/s]


avg. nodes sim.= 0.5590873156199317


100%|████████████████████████████████████████| 734/734 [00:01<00:00, 659.42it/s]
100%|██████████████████████████████████████| 1650/1650 [00:12<00:00, 128.61it/s]
100%|████████████████████████████████████████| 184/184 [00:01<00:00, 127.04it/s]


fold-1
Building similarity graph


100%|██████████████████████████████████████| 6598/6598 [00:10<00:00, 656.40it/s]


avg. nodes sim.= 0.5607023821444866


100%|████████████████████████████████████████| 734/734 [00:01<00:00, 658.41it/s]
100%|██████████████████████████████████████| 1650/1650 [00:12<00:00, 130.12it/s]
100%|████████████████████████████████████████| 184/184 [00:01<00:00, 126.29it/s]


fold-2
Building similarity graph


100%|██████████████████████████████████████| 6599/6599 [00:10<00:00, 655.19it/s]


avg. nodes sim.= 0.5605790115887976


100%|████████████████████████████████████████| 733/733 [00:01<00:00, 654.70it/s]
100%|██████████████████████████████████████| 1650/1650 [00:12<00:00, 129.77it/s]
100%|████████████████████████████████████████| 184/184 [00:01<00:00, 128.65it/s]


fold-3
Building similarity graph


100%|██████████████████████████████████████| 6599/6599 [00:10<00:00, 657.51it/s]


avg. nodes sim.= 0.560540406674895


100%|████████████████████████████████████████| 733/733 [00:01<00:00, 657.64it/s]
100%|██████████████████████████████████████| 1650/1650 [00:12<00:00, 129.46it/s]
100%|████████████████████████████████████████| 184/184 [00:01<00:00, 135.66it/s]


fold-4
Building similarity graph


100%|██████████████████████████████████████| 6599/6599 [00:10<00:00, 652.01it/s]


avg. nodes sim.= 0.5593903874201553


100%|████████████████████████████████████████| 733/733 [00:01<00:00, 647.85it/s]
100%|██████████████████████████████████████| 1650/1650 [00:12<00:00, 129.94it/s]
100%|████████████████████████████████████████| 184/184 [00:01<00:00, 125.06it/s]


fold-5
Building similarity graph


100%|██████████████████████████████████████| 6599/6599 [00:10<00:00, 656.39it/s]


avg. nodes sim.= 0.5614634597782383


100%|████████████████████████████████████████| 733/733 [00:01<00:00, 654.08it/s]
100%|██████████████████████████████████████| 1650/1650 [00:12<00:00, 129.79it/s]
100%|████████████████████████████████████████| 184/184 [00:01<00:00, 126.10it/s]


fold-6
Building similarity graph


100%|██████████████████████████████████████| 6599/6599 [00:10<00:00, 647.87it/s]


avg. nodes sim.= 0.5609033538526145


100%|████████████████████████████████████████| 733/733 [00:01<00:00, 654.30it/s]
100%|██████████████████████████████████████| 1650/1650 [00:12<00:00, 129.97it/s]
100%|████████████████████████████████████████| 184/184 [00:01<00:00, 125.25it/s]


fold-7
Building similarity graph


100%|██████████████████████████████████████| 6599/6599 [00:10<00:00, 659.72it/s]


avg. nodes sim.= 0.5510119579638341


100%|████████████████████████████████████████| 733/733 [00:01<00:00, 659.39it/s]
100%|██████████████████████████████████████| 1650/1650 [00:12<00:00, 130.48it/s]
100%|████████████████████████████████████████| 184/184 [00:01<00:00, 124.69it/s]


fold-8
Building similarity graph


100%|██████████████████████████████████████| 6599/6599 [00:10<00:00, 657.80it/s]


avg. nodes sim.= 0.5607969017158434


100%|████████████████████████████████████████| 733/733 [00:01<00:00, 658.74it/s]
100%|██████████████████████████████████████| 1650/1650 [00:12<00:00, 130.57it/s]
100%|████████████████████████████████████████| 184/184 [00:01<00:00, 127.69it/s]


fold-9
Building similarity graph


100%|██████████████████████████████████████| 6599/6599 [00:10<00:00, 650.66it/s]


avg. nodes sim.= 0.5619162037380135


100%|████████████████████████████████████████| 733/733 [00:01<00:00, 648.80it/s]
100%|██████████████████████████████████████| 1650/1650 [00:12<00:00, 129.58it/s]
100%|████████████████████████████████████████| 184/184 [00:01<00:00, 129.61it/s]


Symbolic:
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Score                 Average      Lab:1        Lab:2       
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Accuracy              89.9 ±  1.2  -            -            
  Balanced Accuracy     84.6 ±  1.6  -            -            
  F1                    89.5 ±  1.3  93.2 ±  0.8  79.9 ±  2.4  
  G-mean                84.5 ±  1.6  83.8 ±  1.7  83.8 ±  1.7  
  Precision             89.8 ±  1.3  90.3 ±  0.9  88.4 ±  3.1  
  Recall                89.9 ±  1.2  96.3 ±  1.1  72.9 ±  2.6  
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
Deep:
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Score                 Average      Lab:1        Lab:2       
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Accuracy              93.0 ±  1.0  -            -            
  Balanced Accuracy     88.8 ±  1.8  -            -            
  F1                    92.8 ± 

In [119]:
loader = DatasetLoader()

for dataset in ["liar"]:
    body, labels, total_number_of_claims = loader.get(dataset)
    X=range(0,total_number_of_claims)
    
    trustex_quality = Metrics()
    swarog_quality = Metrics()
    inf_quality = Metrics()
    

    for fold_idx, (train, test) in enumerate(rskf.split(X, labels)):
        print(f"fold-{fold_idx}")    
        
        swarog = SwarogModel()
        trustex = TrustexModel()
        inference = Inference()
    
        trustex.train(body[train],labels[train])
        ypred = trustex.predict(body[test])
        trustex_quality.update(labels[test], ypred)
        
        swarog.train(body[train],labels[train])
        ypred = swarog.predict(body[test])
        swarog_quality.update(labels[test], ypred)
        
        inference.train(trustex.graph_knn, swarog.train_prob, labels[train])
        newpred = inference.predict(trustex.graph_test_knn, swarog.test_prob)
        inf_quality.update(labels[test], newpred)

        
        
print("Symbolic:")
trustex_quality.print_table()

print("Deep:")
swarog_quality.print_table()

print("Both:")
inf_quality.print_table()
        
        

loading liar
total_number_of_claims= 8061
labels fake= 3554 real= 4507
fold-0
Building similarity graph


100%|█████████████████████████████████████| 7254/7254 [00:06<00:00, 1092.37it/s]


avg. nodes sim.= 0.37215363446118804


100%|███████████████████████████████████████| 807/807 [00:00<00:00, 1091.27it/s]
100%|██████████████████████████████████████| 1814/1814 [00:09<00:00, 189.70it/s]
100%|████████████████████████████████████████| 202/202 [00:01<00:00, 190.51it/s]


fold-1
Building similarity graph


100%|█████████████████████████████████████| 7255/7255 [00:06<00:00, 1105.43it/s]


avg. nodes sim.= 0.3739579257438074


100%|███████████████████████████████████████| 806/806 [00:00<00:00, 1103.67it/s]
100%|██████████████████████████████████████| 1814/1814 [00:09<00:00, 192.33it/s]
100%|████████████████████████████████████████| 202/202 [00:01<00:00, 188.49it/s]


fold-2
Building similarity graph


100%|█████████████████████████████████████| 7255/7255 [00:06<00:00, 1106.68it/s]


avg. nodes sim.= 0.37313028989467983


100%|███████████████████████████████████████| 806/806 [00:00<00:00, 1103.14it/s]
100%|██████████████████████████████████████| 1814/1814 [00:09<00:00, 189.96it/s]
100%|████████████████████████████████████████| 202/202 [00:01<00:00, 187.95it/s]


fold-3
Building similarity graph


100%|█████████████████████████████████████| 7255/7255 [00:06<00:00, 1116.44it/s]


avg. nodes sim.= 0.373071933837595


100%|███████████████████████████████████████| 806/806 [00:00<00:00, 1119.73it/s]
100%|██████████████████████████████████████| 1814/1814 [00:09<00:00, 191.81it/s]
100%|████████████████████████████████████████| 202/202 [00:01<00:00, 186.07it/s]


fold-4
Building similarity graph


100%|█████████████████████████████████████| 7255/7255 [00:06<00:00, 1109.57it/s]


avg. nodes sim.= 0.37359819667052463


100%|███████████████████████████████████████| 806/806 [00:00<00:00, 1109.28it/s]
100%|██████████████████████████████████████| 1814/1814 [00:09<00:00, 191.88it/s]
100%|████████████████████████████████████████| 202/202 [00:01<00:00, 189.24it/s]


fold-5
Building similarity graph


100%|█████████████████████████████████████| 7255/7255 [00:06<00:00, 1102.61it/s]


avg. nodes sim.= 0.3739604650401444


100%|███████████████████████████████████████| 806/806 [00:00<00:00, 1098.12it/s]
100%|██████████████████████████████████████| 1814/1814 [00:09<00:00, 192.79it/s]
100%|████████████████████████████████████████| 202/202 [00:01<00:00, 187.84it/s]


fold-6
Building similarity graph


100%|█████████████████████████████████████| 7255/7255 [00:06<00:00, 1114.46it/s]


avg. nodes sim.= 0.372543491705117


100%|███████████████████████████████████████| 806/806 [00:00<00:00, 1114.24it/s]
100%|██████████████████████████████████████| 1814/1814 [00:09<00:00, 192.16it/s]
100%|████████████████████████████████████████| 202/202 [00:01<00:00, 188.09it/s]


fold-7
Building similarity graph


100%|█████████████████████████████████████| 7255/7255 [00:06<00:00, 1111.61it/s]


avg. nodes sim.= 0.3737639898557941


100%|███████████████████████████████████████| 806/806 [00:00<00:00, 1107.32it/s]
100%|██████████████████████████████████████| 1814/1814 [00:09<00:00, 192.27it/s]
100%|████████████████████████████████████████| 202/202 [00:01<00:00, 187.89it/s]


fold-8
Building similarity graph


100%|█████████████████████████████████████| 7255/7255 [00:06<00:00, 1105.11it/s]


avg. nodes sim.= 0.3735825062528571


100%|███████████████████████████████████████| 806/806 [00:00<00:00, 1101.21it/s]
100%|██████████████████████████████████████| 1814/1814 [00:09<00:00, 192.84it/s]
100%|████████████████████████████████████████| 202/202 [00:01<00:00, 184.73it/s]


fold-9
Building similarity graph


100%|█████████████████████████████████████| 7255/7255 [00:06<00:00, 1108.15it/s]


avg. nodes sim.= 0.37326784368000177


100%|███████████████████████████████████████| 806/806 [00:00<00:00, 1108.15it/s]
100%|██████████████████████████████████████| 1814/1814 [00:09<00:00, 193.18it/s]
100%|████████████████████████████████████████| 202/202 [00:01<00:00, 185.85it/s]


Symbolic:
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Score                 Average      Lab:1        Lab:2       
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Accuracy              61.2 ±  1.0  -            -            
  Balanced Accuracy     59.1 ±  1.0  -            -            
  F1                    59.8 ±  1.0  68.9 ±  0.9  48.3 ±  1.4  
  G-mean                59.0 ±  1.0  56.3 ±  1.1  56.3 ±  1.1  
  Precision             60.7 ±  1.1  62.4 ±  0.7  58.6 ±  1.8  
  Recall                61.2 ±  1.0  77.0 ±  1.8  41.2 ±  1.8  
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
Deep:
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Score                 Average      Lab:1        Lab:2       
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Accuracy              65.6 ±  1.9  -            -            
  Balanced Accuracy     64.6 ±  1.9  -            -            
  F1                    65.3 ± 

# Graphs and plots

In [None]:
import pandas as pd
import sqlite3
con = sqlite3.connect("wyniki.db")
resutls = pd.read_csv("wyniki.csv")
resutls.to_sql("wyniki", con)
con.close()

In [160]:
resutls["Metric"].value_counts()

Accuracy            12
BalancedAccuracy    12
F1                  12
G-mean              12
Precision           12
Recall              12
Name: Metric, dtype: int64

In [136]:
!rm wyniki.db

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [167]:
con = sqlite3.connect("wyniki.db")
bac = pd.read_sql("""
select Datase as Dataset, Algorithm, Value || " ± " || Std as BAC from wyniki where Metric='BalancedAccuracy'
""", con)

f1=pd.read_sql("""
select Value || " ± " || Std as F1 from wyniki where Metric='F1'
""", con)


Precision=pd.read_sql("""
select Value || " ± " || Std as Precision from wyniki where Metric='Precision'
""", con)

Recall=pd.read_sql("""
select Value || " ± " || Std as Precision from wyniki where Metric='Recall'
""", con)

G_mean=pd.read_sql("""
select Value || " ± " || Std as "G-mean" from wyniki where Metric='G-mean'
""", con)

res = pd.concat([bac,f1,G_mean], axis=1)
res

Unnamed: 0,Dataset,Algorithm,BAC,F1,G-mean
0,FN-KDD,Symbolic,70.3 ± 2.0,72.2 ± 1.8,70.2 ± 2.0
1,FN-KDD,Deep,74.2 ± 1.4,75.7 ± 1.3,74.2 ± 1.4
2,FN-KDD,Both,75.6 ± 1.3,76.9 ± 1.2,75.6 ± 1.3
3,MM-Covid,Symbolic,84.6 ± 1.6,89.5 ± 1.3,84.5 ± 1.6
4,MM-Covid,Deep,88.8 ± 1.8,92.8 ± 1.1,88.7 ± 1.8
5,MM-Covid,Both,91.6 ± 1.0,94.3 ± 0.6,91.6 ± 1.0
6,LIAR,Symbolic,59.1 ± 1.0,59.8 ± 1.0,59.0 ± 1.0
7,LIAR,Deep,64.6 ± 1.9,65.3 ± 1.9,64.5 ± 1.9
8,LIAR,Both,65.2 ± 1.5,66.0 ± 1.5,65.2 ± 1.5
9,COVID-FN,Symbolic,69.6 ± 3.1,95.8 ± 0.5,64.1 ± 4.4


In [171]:
def renameit(x):
    if x == "Symbolic":
        return "Symbolic Model"
    if x == "Deep":
        return "Deep Model"
    if x == "Both":
        return "Proposed"
res["Algorithm"]=[renameit(x) for x in res["Algorithm"]]

In [176]:
print(res.style.to_latex())

\begin{tabular}{llllll}
 & Dataset & Algorithm & BAC & F1 & G-mean \\
0 & FN-KDD & Symbolic Model & 70.3 ± 2.0 & 72.2 ± 1.8 & 70.2 ± 2.0 \\
1 & FN-KDD & Deep Model & 74.2 ± 1.4 & 75.7 ± 1.3 & 74.2 ± 1.4 \\
2 & FN-KDD & Proposed & 75.6 ± 1.3 & 76.9 ± 1.2 & 75.6 ± 1.3 \\
3 & MM-Covid & Symbolic Model & 84.6 ± 1.6 & 89.5 ± 1.3 & 84.5 ± 1.6 \\
4 & MM-Covid & Deep Model & 88.8 ± 1.8 & 92.8 ± 1.1 & 88.7 ± 1.8 \\
5 & MM-Covid & Proposed & 91.6 ± 1.0 & 94.3 ± 0.6 & 91.6 ± 1.0 \\
6 & LIAR & Symbolic Model & 59.1 ± 1.0 & 59.8 ± 1.0 & 59.0 ± 1.0 \\
7 & LIAR & Deep Model & 64.6 ± 1.9 & 65.3 ± 1.9 & 64.5 ± 1.9 \\
8 & LIAR & Proposed & 65.2 ± 1.5 & 66.0 ± 1.5 & 65.2 ± 1.5 \\
9 & COVID-FN & Symbolic Model & 69.6 ± 3.1 & 95.8 ± 0.5 & 64.1 ± 4.4 \\
10 & COVID-FN & Deep Model & 81.2 ± 2.7 & 97.3 ± 0.4 & 79.5 ± 3.3 \\
11 & COVID-FN & Proposed & 84.4 ± 3.1 & 97.3 ± 0.4 & 83.3 ± 3.6 \\
\end{tabular}



In [215]:
pd.read_sql("""
 
    with blah as (
        select Datase as Dataset, Algorithm, Metric, -Value as 'm' from wyniki where  Algorithm == 'Deep'
        union
        select Datase as Dataset, Algorithm, Metric, Value as 'm' from wyniki where   Algorithm == 'Both'
    )
    select Dataset,sum(m) as Improvement from blah where Metric in ("BalancedAccuracy") group by Dataset,Metric
    order by Improvement desc

""", con)

Unnamed: 0,Dataset,Improvement
0,COVID-FN,3.2
1,MM-Covid,2.8
2,FN-KDD,1.4
3,LIAR,0.6


In [244]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [245]:
import pybars.charts as mygraph

In [246]:
%%sql  show=graph  db=wyniki.db stack=no vertical=yes width=400 height=300 fontSize=16 itemBorderRadius=10
with blah as (
    select Datase as Dataset, Algorithm, Metric, -Value as 'm' from wyniki where  Algorithm == 'Deep'
    union
    select Datase as Dataset, Algorithm, Metric, Value as 'm' from wyniki where   Algorithm == 'Both'
)
select Dataset, Metric ,round(sum(m)+0.001,2) as Improvement from blah where Metric in ("BalancedAccuracy", "G-mean") group by Dataset,Metric
order by Dataset

using wyniki.db
