# Data loader

In [1]:
import pandas as pd

from sklearn.model_selection import RepeatedStratifiedKFold
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1410)

class DatasetLoader:
    def get(self, name):
        print(f"loading {name}")
        if name == "fnkdd":
            data = pd.read_csv("/home/rkozik/Desktop/swarog_exp_disk/datasets/fakenewskdd/train.csv",sep="\t")
            data.head()
            body = data["text"].values
            labels = 1-data["label"].values
            total_number_of_claims = data.shape[0]
            print("total_number_of_claims=",total_number_of_claims)
            print("labels fake=",sum(labels),"real=", len(labels)-sum(labels))
        
        if name == "mmcovid":
            data = pd.read_csv("/media/rkozik/02FF-A831/data/swarog/datasets/mmcovid/news_collection.csv",sep="\t")
            data["label"] = [ 1 if v =="fake" else 0 for v in data["label"]]
            data["text"] = [ str(v) for v in data["text"]]
            data = data[data["lang"] == "en"]

            body = data["text"].values
            labels = data["label"].values
            total_number_of_claims = data.shape[0]
            print("total_number_of_claims=",total_number_of_claims)
            print("labels fake=",sum(labels),"real=", len(labels)-sum(labels))
        
        if name == "liar":
            data = pd.read_csv("/media/rkozik/02FF-A831/data/swarog/datasets/liar.csv", sep="\t",encoding="utf-8")
            def mpx(x):
                if x in [0,2]:
                    return 0
                elif x in [4,5]:
                    return 1
                else:
                    return -1
            data["text"] = data["statement"]
            data["label"] = [mpx(x) for x in data["label"]]
            data=data[ data["label"] != -1] 
            body = data["text"].values
            labels = data["label"].values
            total_number_of_claims = data.shape[0]
            print("total_number_of_claims=",total_number_of_claims)
            print("labels fake=",sum(labels),"real=", len(labels)-sum(labels))

        if name == "covidfn":
            data = pd.read_csv("covid_fake_news.csv", sep=",")
            body = data["headlines"].values
            labels = 1 - data["outcome"].values
            total_number_of_claims = data.shape[0]
            print("total_number_of_claims=",total_number_of_claims)
            print("labels fake=",sum(labels),"real=", len(labels)-sum(labels))
        
        return body, labels, total_number_of_claims

# Utility functions

In [17]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from imblearn.metrics import geometric_mean_score
from IPython.display import clear_output



class Metrics:
    def __init__(self):
        self.scores = {
            'Accuracy': {'func': accuracy_score},
            'Balanced Accuracy': {'func': balanced_accuracy_score},
            'F1': {'func': f1_score},
            'Precision': {'func': precision_score},
            'Recall': {'func': recall_score},
            'G-mean': {'func': geometric_mean_score}
        }
        
        for score_name, score_dict in self.scores.items():
            score_dict["list"] = []
            score_dict["lab"] = []

    def update(self, actual, prediction):
        for score_name, score_dict in self.scores.items():
            if score_name in ["F1","Precision","Recall","G-mean"]:
                scorvaln = score_dict['func'](actual, prediction, average=None)
                score_dict['lab'].append(scorvaln)
                scorval = score_dict['func'](actual, prediction, average="weighted")
                score_dict['list'].append(scorval)
                #print(score_name, scorval, scorvaln)  
            else:
                scorval=score_dict['func'](actual, prediction)
                score_dict['list'].append(scorval)
                
    def print_table(self, labels=None):
        # Print stats
        scores = self.scores
        numlabels = scores["F1"]["lab"][0].shape[0]
        scores["F1"]["lab"][0].shape[0] 
        head = "  %-20s  %-10s  " +  numlabels * " %-10s  " 
        headv = ["Score", "Average"]
        if labels:
            headv.extend([labels[i] for i in range(numlabels)])
        else:
            headv.extend(["Lab:"+str(i+1) for i in range(numlabels)])
        row=head % tuple(headv)
        # table header
        print("―"*len(row))
        print(row)
        print("―"*len(row))
        # table rows
        for score_name, score_dict in sorted(scores.items()) :
            headv = [score_name, np.mean(score_dict['list'])*100, np.std(score_dict['list'])*100]
            for i in range(numlabels):
                if score_name in ["F1","Precision","Recall", "G-mean"]:
                    head = "  %-20s  %4.1f ± %4.1f  " + numlabels* "%4.1f ± %4.1f  "
                    vals = [v[i] for v in scores[score_name]["lab"]]
                    headv.append(np.mean(vals)*100)
                    headv.append(np.std(vals)*100)
                else:
                    head = "  %-20s  %4.1f ± %4.1f  " + numlabels * "%-11s  " 
                    headv.append("-")
            print(head % tuple(headv))
        print("―"*len(row))


def get_graph_node_stats(vec, nearestDocIDs, y_train, bodyTrainTFIDF):   
    vecdense = vec.toarray()[0]
    docids = nearestDocIDs
    trlabels = np.array(y_train)
    labsum = trlabels[docids].sum()
    
    ivec = []
    labmask = []
    for hitdocid in docids:
        value=bodyTrainTFIDF[hitdocid].toarray()[0]
        intersection = (vecdense>0)*(value>0)
        ivec.append(intersection.sum())
        labmask.append(trlabels[hitdocid])
        
    masked_ivec =  np.array(ivec)*np.array(labmask)   
    masked_ivec_neg =  np.array(ivec)*(-1*(np.array(labmask)-1)) 
    ivec = np.array(ivec)
    masked_ivec = np.array(masked_ivec)
    masked_ivec_neg = np.array(masked_ivec_neg)
    
    newvec = [labsum, (vecdense>0).sum(),ivec.max(), ivec.max(), masked_ivec.max(), masked_ivec.min(), masked_ivec_neg.max(), masked_ivec_neg.min()]
    return newvec

# Swarog Model

In [18]:
import numpy as np
import bentoml
from bentoml.io import NumpyNdarray
from bentoml.io import JSON
from annoy import AnnoyIndex
import re

import transformers
transformers.logging.set_verbosity_error()
from torch.utils.data import DataLoader 

from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import TFDistilBertModel, DistilBertTokenizerFast
import torch
from scipy.sparse import csr_matrix

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import pickle5 as pickle
import sqlite3
from tqdm import tqdm
 
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("using device:", device)

if "disilbert_model" not in locals():
    disilbert_tokenizer =  AutoTokenizer.from_pretrained("distilbert-base-uncased")
    disilbert_model = AutoModel.from_pretrained("distilbert-base-uncased")
    handle = disilbert_model.to(device)

class SwarogModel:
    def __init__(self):
        self.tokenizer =  disilbert_tokenizer
        self.model = disilbert_tokenizer
        self.max_length = 256
        self.model_name = disilbert_model

    def fit(self, X=None, y=None):
        pass
    
    def encode(self, txt):
        return self.tokenizer(txt, max_length=self.max_length, 
                              truncation=True, padding=True, return_tensors="pt")

    def transform(self, X=None):
        dataloader = DataLoader(X, batch_size=4, shuffle=False)
        allembeds = []
        for batch in tqdm(dataloader):
            batchenc = disilbert_tokenizer(batch, max_length=256, 
                                           truncation=True, padding=True, return_tensors="pt")
            input_ids = batchenc['input_ids'].to(device)
            attention_mask = batchenc['attention_mask'].to(device)
            batchout = disilbert_model(input_ids, attention_mask=attention_mask, 
                                       output_hidden_states=True)
            embeds = [vec[0].cpu().detach().numpy() for vec in batchout[1][-1]]
            allembeds.extend(embeds)
        return np.array(allembeds)
    
    def train(self, body, labels):
        embeddings = self.transform(body)
        self.cls = LogisticRegression(max_iter=1000)
        self.cls.fit(embeddings, labels)
        self.train_prob = self.cls.predict_proba(embeddings)
        
    def predict(self, body):
        embeddings = self.transform(body)
        self.test_prob = self.cls.predict_proba(embeddings)
        return  self.cls.predict(embeddings) 

using device: cuda


# Graph Model

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression

# Download stopwords list
nltk.download('wordnet')
nltk.download('punkt')
stop_words = set(stopwords.words('english')) 

# Interface lemma tokenizer from nltk with sklearn
class LemmaTokenizer:
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`', "'"]
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if t not in self.ignore_tokens]
    

class TrustexModel:
    def __init__(self):
        # Lemmatize the stop words
        self.tokenizer=LemmaTokenizer()
        self.token_stop = self.tokenizer(' '.join(stop_words))
        
    def tfidf(self,body):
        self.tfidf_vectorizer = TfidfVectorizer(stop_words=self.token_stop)
        self.tfidf_vectorizer.fit(body)
        self.vocabulary_tfidf_words = self.tfidf_vectorizer.get_feature_names_out()
        self.bodyTrainTFIDF = self.tfidf_vectorizer.transform(body)
        
    def create_graph(self, body, labels):
        self.nn = NearestNeighbors(n_neighbors=10)
        self.nn.fit(self.bodyTrainTFIDF)
        knn_d,knn_idx = self.nn.kneighbors(self.bodyTrainTFIDF)
        self.graph_knn = []
        self.train_labels = labels
        from tqdm import tqdm
        for id, topIDs in tqdm(enumerate(knn_idx), total=knn_idx.shape[0]):
            vec = self.bodyTrainTFIDF[id]
            newvec = get_graph_node_stats(vec, topIDs[1:], labels, self.bodyTrainTFIDF)
            self.graph_knn.append(newvec)
        print("avg. nodes sim.=",np.mean([x[2]/x[1] for x in self.graph_knn]))

    def graph_transform_test_data(self, body):
        self.bodyTestTFIDF = self.tfidf_vectorizer.transform(body) 
        knn_test_d,knn_test_idx = self.nn.kneighbors(self.bodyTestTFIDF)
        self.graph_test_knn = []
        for id, topIDs in tqdm(enumerate(knn_test_idx), total=knn_test_idx.shape[0]):
            vec = self.bodyTestTFIDF[id]
            newvec = get_graph_node_stats(vec, topIDs[1:], self.train_labels, self.bodyTrainTFIDF)
            self.graph_test_knn.append(newvec)        
              
    def train(self, body, labels):
        print("Building similarity graph")
        self.tfidf(body)
        self.create_graph(body, labels)
          
        self.cls = LogisticRegression(max_iter=10000)
        self.cls.fit(self.graph_knn, labels)

    def predict(self, body):
        self.graph_transform_test_data(body)
        y_pred = self.cls.predict(self.graph_test_knn)
        return y_pred
    


[nltk_data] Downloading package wordnet to /home/rkozik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/rkozik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# FTS Semantics 2.0

In [74]:
import re
import traceback
from rank_bm25 import BM25Okapi
import multiprocessing as mp
import functools
from tqdm import tqdm
import re
import traceback
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression
from subject_verb_object_extract import findSVOs, nlp
import os
import spacy
import pickle
import numpy as np

sp = spacy.load('en_core_web_sm')


def call_it2(instance, name, arg):
    "indirect caller for instance methods and multiprocessing"
    return getattr(instance, name)(arg)


class FTSSemantic2:
    def __init__(self):
        pass
    
    def tokenize_single(self, e):
        docid, txt = e
        tokens = nlp(txt)
        svos = findSVOs(tokens)
        proc_text = []
        all_lemas = []
        for xsvo in svos:
            svo = xsvo
#             if len(svo) < 3 and len(svo) > 1:
#                 svos = [svo[0],svo[1],""]
            if len(svo) == 3:
                line = " ".join(svo)
                sentence = sp(line)
                lemas =  [word.lemma_ for word in sentence]
                all_lemas.extend(lemas)
#                 proc_text.append((docid,lemas))
        return [(docid, all_lemas)]


    def tokenize(self, body,batch=32): 
        print("Extracting SVO Triples")
        tokenized_corpus = []
        tokenized_corpus_i = []
        
        with mp.Pool(processes=mp.cpu_count()) as pool:
            func_call_it = functools.partial(call_it2, self, 'tokenize_single')
            vectors = list(tqdm(pool.imap_unordered(func_call_it, enumerate(body),chunksize=batch), total=len(body)))
        
        
        for e in vectors:
            for docid, lema in e:
                tokenized_corpus.append(lema)
                tokenized_corpus_i.append(docid)
                    
        return tokenized_corpus, tokenized_corpus_i
    
    def transform_single(self, lema):
        doc_scores = self.bm25.get_scores(lema)
        topN = np.argsort(doc_scores)[::-1][:10]
        sc = doc_scores[topN]
        lab = np.array([self.training_labels[i] for i in topN])
        sc2 = [-v if lab[i]==1 else v for i,v in enumerate(sc)]
        v = [
                len(lema), np.sum(lab),
                np.mean(sc),np.max(sc),np.min(sc),
                np.mean(sc2),np.sum(sc2),np.max(sc2),np.min(sc2),
        ]
        return v
    
    def transform(self, body, batch=32):
        vectors = []
        
        with mp.Pool(processes=mp.cpu_count()) as pool:
            func_call_it = functools.partial(call_it2, self, 'transform_single')
            vectors = list(tqdm(pool.imap_unordered(func_call_it, body, chunksize=batch), total=len(body)))
        
        return vectors
        
    def train(self, body, labels):    
        tokenized_corpus, tokenized_corpus_i = self.tokenize(body)
        self.bm25 = BM25Okapi(tokenized_corpus)
        self.training_labels = labels[list(tokenized_corpus_i)]
        self.trainX = self.transform(tokenized_corpus)
        y = labels[list(tokenized_corpus_i)]
        self.cls = LogisticRegression(max_iter=10000)
        self.cls.fit(self.trainX, y)
            
    def predict(self, body, labels):
        tokenized_corpus, tokenized_corpus_i = self.tokenize(body)
        self.testX = self.transform(tokenized_corpus)
        preds = self.cls.predict(self.testX)
        
        mp={}
        for i,e in enumerate(tokenized_corpus_i):
            if e not in mp:
                mp[e]=[]
            mp[e].append(preds[i])
        
        ypred=[]
        for id,row in enumerate(body):
            if id not in mp:
                ypred.append(0)
            else:
                ypred.append(1 if np.mean(mp[e]) > 0.5 else 0)
        
        return ypred

In [72]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [75]:
fts2 = FTSSemantic2()
fts2.train(body[train],labels[train])

Extracting SVO Triples


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 9180/9180 [00:22<00:00, 407.84it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 9180/9180 [00:26<00:00, 350.34it/s]


In [76]:
ypred = fts2.predict(body[test],labels[test])

Extracting SVO Triples


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 1021/1021 [00:04<00:00, 213.13it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 1021/1021 [00:04<00:00, 212.88it/s]


In [77]:
met = Metrics()
met.update(labels[test], ypred)
met.print_table()

――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Score                 Average      Lab:1        Lab:2       
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Accuracy              95.3 ±  0.0  -            -            
  Balanced Accuracy     50.0 ±  0.0  -            -            
  F1                    93.0 ±  0.0   0.0 ±  0.0  97.6 ±  0.0  
  G-mean                21.2 ±  0.0   0.0 ±  0.0   0.0 ±  0.0  
  Precision             90.8 ±  0.0   0.0 ±  0.0  95.3 ±  0.0  
  Recall                95.3 ±  0.0   0.0 ±  0.0  100.0 ±  0.0  
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



# Infrence 

In [470]:
from sklearn.ensemble import RandomForestClassifier

class Inference:
    def train(self,graph, content, semantics, labels):
        newX=[]
        for i,vec in enumerate(content):
            v2 = np.append(content[i], graph[i]) 
            if semantics:
                v2 = np.append(v2, semantics[i])
            newX.append(v2)
            
        self.inf = RandomForestClassifier(max_depth=12)
        self.inf.fit(newX, labels)
        
    def predict(self, graph, content, semantics):
        newTest=[]
        for i,vec in enumerate(content):
            v2 = np.append(content[i], graph[i]) 
            if semantics:
                v2 = np.append(v2, semantics[i])
            newTest.append(v2)
    
        return self.inf.predict(newTest)

# Experiments

In [2]:
loader = DatasetLoader()
body, labels, total_number_of_claims = loader.get("covidfn")
X=range(0,total_number_of_claims)
train, test = list(rskf.split(X, labels))[0]

loading covidfn
total_number_of_claims= 10201
labels fake= 9727 real= 474


In [480]:
loader = DatasetLoader()

for dataset in ["fnkdd"]:
    body, labels, total_number_of_claims = loader.get(dataset)
    X=range(0,total_number_of_claims)
    
    trustex_quality = Metrics()
    swarog_quality = Metrics()
    semantics_quality = Metrics()
    inf3_quality = Metrics()
    inf2_quality = Metrics()
    

    for fold_idx, (train, test) in enumerate(rskf.split(X, labels)):
        print(f"fold-{fold_idx}")    
        
        swarog = SwarogModel()
        trustex = TrustexModel()
        inference2 = Inference()
        inference3 = Inference()
        semantics = FTSSemantic()
        
        semantics.train(body[train],labels[train])
        ypred = semantics.predict(body[test])
        semantics_quality.update(labels[test], ypred)
    
        trustex.train(body[train],labels[train])
        ypred = trustex.predict(body[test])
        trustex_quality.update(labels[test], ypred)
        
        swarog.train(body[train],labels[train])
        ypred = swarog.predict(body[test])
        swarog_quality.update(labels[test], ypred)

        inference2.train(trustex.graph_knn, swarog.train_prob, None, labels[train])
        newpred = inference2.predict(trustex.graph_test_knn, swarog.test_prob, None)
        inf2_quality.update(labels[test], newpred)

        
        inference3.train(trustex.graph_knn, swarog.train_prob, semantics.trainX, labels[train])
        newpred = inference3.predict(trustex.graph_test_knn, swarog.test_prob, semantics.testX)
        inf3_quality.update(labels[test], newpred)

        break

print("Semantic:")
semantics_quality.print_table()
print("Symbolic:")
trustex_quality.print_table()
print("Deep:")
swarog_quality.print_table()

print("Symb+Deep:")
inf2_quality.print_table()

print("ALL:")
inf3_quality.print_table()


loading covidfn
total_number_of_claims= 10201
labels fake= 9727 real= 474
fold-0
Building index


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 9180/9180 [01:24<00:00, 108.86it/s]


Creating trainX


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9180/9180 [02:08<00:00, 71.63it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1021/1021 [00:13<00:00, 78.34it/s]


Building similarity graph


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 9180/9180 [00:08<00:00, 1052.12it/s]


avg. nodes sim.= 0.6016417403761986


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 1021/1021 [00:00<00:00, 1050.97it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 2295/2295 [00:12<00:00, 185.58it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:01<00:00, 183.95it/s]


Semantic:
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Score                 Average      Lab:1        Lab:2       
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Accuracy              95.5 ±  0.0  -            -            
  Balanced Accuracy     61.0 ±  0.0  -            -            
  F1                    94.6 ±  0.0  32.4 ±  0.0  97.7 ±  0.0  
  G-mean                50.3 ±  0.0  47.6 ±  0.0  47.6 ±  0.0  
  Precision             94.4 ±  0.0  55.0 ±  0.0  96.3 ±  0.0  
  Recall                95.5 ±  0.0  22.9 ±  0.0  99.1 ±  0.0  
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
Symbolic:
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Score                 Average      Lab:1        Lab:2       
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Accuracy              97.4 ±  0.0  -            -            
  Balanced Accuracy     75.8 ±  0.0  -            -            
  F1                    97.