# Data loader

In [44]:
import pandas as pd

from sklearn.model_selection import RepeatedStratifiedKFold
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1410)

class DatasetLoader:
    def get(self, name):
        print(f"loading {name}")
        if name == "fnkdd":
            data = pd.read_csv("/home/rkozik/Desktop/swarog_exp_disk/datasets/fakenewskdd/train.csv",sep="\t")
            data.head()
            body = data["text"].values
            labels = 1-data["label"].values
            total_number_of_claims = data.shape[0]
            print("total_number_of_claims=",total_number_of_claims)
            print("labels fake=",sum(labels),"real=", len(labels)-sum(labels))
        
        if name == "mmcovid":
            data = pd.read_csv("/media/rkozik/02FF-A831/data/swarog/datasets/mmcovid/news_collection.csv",sep="\t")
            data["label"] = [ 1 if v =="fake" else 0 for v in data["label"]]
            data["text"] = [ str(v) for v in data["text"]]
            data = data[data["lang"] == "en"]

            body = data["text"].values
            labels = data["label"].values
            total_number_of_claims = data.shape[0]
            print("total_number_of_claims=",total_number_of_claims)
            print("labels fake=",sum(labels),"real=", len(labels)-sum(labels))
        
        if name == "liar":
            data = pd.read_csv("/media/rkozik/02FF-A831/data/swarog/datasets/liar.csv", sep="\t",encoding="utf-8")
            def mpx(x):
                if x in [0,2]:
                    return 0
                elif x in [4,5]:
                    return 1
                else:
                    return -1
            data["text"] = data["statement"]
            data["label"] = [mpx(x) for x in data["label"]]
            data=data[ data["label"] != -1] 
            body = data["text"].values
            labels = data["label"].values
            total_number_of_claims = data.shape[0]
            print("total_number_of_claims=",total_number_of_claims)
            print("labels fake=",sum(labels),"real=", len(labels)-sum(labels))

        if name == "covidfn":
            data = pd.read_csv("covid_fake_news.csv", sep="\t")
            body = data["title"].values
            labels = data["label"].values
            total_number_of_claims = data.shape[0]
            print("total_number_of_claims=",total_number_of_claims)
            print("labels fake=",sum(labels),"real=", len(labels)-sum(labels))
        
        return body, labels, total_number_of_claims

# Utility functions

In [3]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from imblearn.metrics import geometric_mean_score
from IPython.display import clear_output



class Metrics:
    def __init__(self):
        self.scores = {
            'Accuracy': {'func': accuracy_score},
            'Balanced Accuracy': {'func': balanced_accuracy_score},
            'F1': {'func': f1_score},
            'Precision': {'func': precision_score},
            'Recall': {'func': recall_score},
            'G-mean': {'func': geometric_mean_score}
        }
        
        for score_name, score_dict in self.scores.items():
            score_dict["list"] = []
            score_dict["lab"] = []

    def update(self, actual, prediction):
        for score_name, score_dict in self.scores.items():
            if score_name in ["F1","Precision","Recall","G-mean"]:
                scorvaln = score_dict['func'](actual, prediction, average=None)
                score_dict['lab'].append(scorvaln)
                scorval = score_dict['func'](actual, prediction, average="weighted")
                score_dict['list'].append(scorval)
                #print(score_name, scorval, scorvaln)  
            else:
                scorval=score_dict['func'](actual, prediction)
                score_dict['list'].append(scorval)
                
    def print_table(self, labels=None):
        # Print stats
        scores = self.scores
        numlabels = scores["F1"]["lab"][0].shape[0]
        scores["F1"]["lab"][0].shape[0] 
        head = "  %-20s  %-10s  " +  numlabels * " %-10s  " 
        headv = ["Score", "Average"]
        if labels:
            headv.extend([labels[i] for i in range(numlabels)])
        else:
            headv.extend(["Lab:"+str(i+1) for i in range(numlabels)])
        row=head % tuple(headv)
        # table header
        print("―"*len(row))
        print(row)
        print("―"*len(row))
        # table rows
        for score_name, score_dict in sorted(scores.items()) :
            headv = [score_name, np.mean(score_dict['list'])*100, np.std(score_dict['list'])*100]
            for i in range(numlabels):
                if score_name in ["F1","Precision","Recall", "G-mean"]:
                    head = "  %-20s  %4.1f ± %4.1f  " + numlabels* "%4.1f ± %4.1f  "
                    vals = [v[i] for v in scores[score_name]["lab"]]
                    headv.append(np.mean(vals)*100)
                    headv.append(np.std(vals)*100)
                else:
                    head = "  %-20s  %4.1f ± %4.1f  " + numlabels * "%-11s  " 
                    headv.append("-")
            print(head % tuple(headv))
        print("―"*len(row))


def get_graph_node_stats(vec, nearestDocIDs, y_train, bodyTrainTFIDF):   
    vecdense = vec.toarray()[0]
    docids = nearestDocIDs
    trlabels = np.array(y_train)
    labsum = trlabels[docids].sum()
    
    ivec = []
    labmask = []
    for hitdocid in docids:
        value=bodyTrainTFIDF[hitdocid].toarray()[0]
        intersection = (vecdense>0)*(value>0)
        ivec.append(intersection.sum())
        labmask.append(trlabels[hitdocid])
        
    masked_ivec =  np.array(ivec)*np.array(labmask)   
    masked_ivec_neg =  np.array(ivec)*(-1*(np.array(labmask)-1)) 
    ivec = np.array(ivec)
    masked_ivec = np.array(masked_ivec)
    masked_ivec_neg = np.array(masked_ivec_neg)
    
    newvec = [labsum, (vecdense>0).sum(),ivec.max(), ivec.max(), masked_ivec.max(), masked_ivec.min(), masked_ivec_neg.max(), masked_ivec_neg.min()]
    return newvec

# Swarog Model

In [4]:
import numpy as np
import bentoml
from bentoml.io import NumpyNdarray
from bentoml.io import JSON
from annoy import AnnoyIndex
import re

import transformers
transformers.logging.set_verbosity_error()
from torch.utils.data import DataLoader 

from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import TFDistilBertModel, DistilBertTokenizerFast
import torch
from scipy.sparse import csr_matrix

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import pickle5 as pickle
import sqlite3
from tqdm import tqdm
 
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("using device:", device)

if "disilbert_model" not in locals():
    disilbert_tokenizer =  AutoTokenizer.from_pretrained("distilbert-base-uncased")
    disilbert_model = AutoModel.from_pretrained("distilbert-base-uncased")
    handle = disilbert_model.to(device)

class SwarogModel:
    def __init__(self):
        self.tokenizer =  disilbert_tokenizer
        self.model = disilbert_tokenizer
        self.max_length = 256
        self.model_name = disilbert_model

    def fit(self, X=None, y=None):
        pass
    
    def encode(self, txt):
        return self.tokenizer(txt, max_length=self.max_length, 
                              truncation=True, padding=True, return_tensors="pt")

    def transform(self, X=None):
        dataloader = DataLoader(X, batch_size=4, shuffle=False)
        allembeds = []
        for batch in tqdm(dataloader):
            batchenc = disilbert_tokenizer(batch, max_length=256, 
                                           truncation=True, padding=True, return_tensors="pt")
            input_ids = batchenc['input_ids'].to(device)
            attention_mask = batchenc['attention_mask'].to(device)
            batchout = disilbert_model(input_ids, attention_mask=attention_mask, 
                                       output_hidden_states=True)
            embeds = [vec[0].cpu().detach().numpy() for vec in batchout[1][-1]]
            allembeds.extend(embeds)
        return np.array(allembeds)
    
    def train(self, body, labels):
        embeddings = self.transform(body)
        self.cls = LogisticRegression(max_iter=1000)
        self.cls.fit(embeddings, labels)
        self.train_prob = self.cls.predict_proba(embeddings)
        
    def predict(self, body):
        embeddings = self.transform(body)
        self.test_prob = self.cls.predict_proba(embeddings)
        return  self.cls.predict(embeddings) 

using device: cuda


# Graph Model

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression

# Download stopwords list
nltk.download('wordnet')
nltk.download('punkt')
stop_words = set(stopwords.words('english')) 

# Interface lemma tokenizer from nltk with sklearn
class LemmaTokenizer:
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`', "'"]
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if t not in self.ignore_tokens]
    

class TrustexModel:
    def __init__(self):
        # Lemmatize the stop words
        self.tokenizer=LemmaTokenizer()
        self.token_stop = self.tokenizer(' '.join(stop_words))
        
    def tfidf(self,body):
        self.tfidf_vectorizer = TfidfVectorizer(stop_words=self.token_stop)
        self.tfidf_vectorizer.fit(body)
        self.vocabulary_tfidf_words = self.tfidf_vectorizer.get_feature_names_out()
        self.bodyTrainTFIDF = self.tfidf_vectorizer.transform(body)
        
    def create_graph(self, body, labels):
        self.nn = NearestNeighbors(n_neighbors=10)
        self.nn.fit(self.bodyTrainTFIDF)
        knn_d,knn_idx = self.nn.kneighbors(self.bodyTrainTFIDF)
        self.graph_knn = []
        self.train_labels = labels
        from tqdm import tqdm
        for id, topIDs in tqdm(enumerate(knn_idx), total=knn_idx.shape[0]):
            vec = self.bodyTrainTFIDF[id]
            newvec = get_graph_node_stats(vec, topIDs[1:], labels, self.bodyTrainTFIDF)
            self.graph_knn.append(newvec)
        print("avg. nodes sim.=",np.mean([x[2]/x[1] for x in self.graph_knn]))

    def graph_transform_test_data(self, body):
        self.bodyTestTFIDF = self.tfidf_vectorizer.transform(body) 
        knn_test_d,knn_test_idx = self.nn.kneighbors(self.bodyTestTFIDF)
        self.graph_test_knn = []
        for id, topIDs in tqdm(enumerate(knn_test_idx), total=knn_test_idx.shape[0]):
            vec = self.bodyTestTFIDF[id]
            newvec = get_graph_node_stats(vec, topIDs[1:], self.train_labels, self.bodyTrainTFIDF)
            self.graph_test_knn.append(newvec)        
              
    def train(self, body, labels):
        print("Building similarity graph")
        self.tfidf(body)
        self.create_graph(body, labels)
          
        self.cls = LogisticRegression(max_iter=10000)
        self.cls.fit(self.graph_knn, labels)

    def predict(self, body):
        self.graph_transform_test_data(body)
        y_pred = self.cls.predict(self.graph_test_knn)
        return y_pred
    


[nltk_data] Downloading package wordnet to /home/rkozik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/rkozik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Semantic relations

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression
from subject_verb_object_extract import findSVOs, nlp
import nltk
from nltk.stem import PorterStemmer
import sqlite3
from tqdm import tqdm
import os

nltk.download("punkt")
nltk.download('wordnet')
nltk.download('punkt')

stop_words = set(stopwords.words('english')) 

class Semantic:
    def __init__(self):
        self.ps = PorterStemmer()
        self.tokenizer=LemmaTokenizer()
        self.token_stop = self.tokenizer(' '.join(stop_words))
        self.con=sqlite3.connect("svo.db")
        
    def extract_svo(self, body, labels):
        self.con.execute("drop table if exists svo")
        self.con.execute("create table if not exists svo(sub, verb, obj, docid, label)")
        self.con.commit()
        
        for docid, txt in tqdm(enumerate(body), total=len(labels)):
            tokens = nlp(txt)
            svos = findSVOs(tokens)
            for svo in svos:
                if len(svo) == 3:
                    self.con.execute("insert into svo values(?,?,?,?,?)", 
                        [svo[0],self.ps.stem(svo[1]),svo[2], docid, int(labels[docid])])
            self.con.commit()
        
    def tfidf(self,body):
        self.tfidf_vectorizer = TfidfVectorizer(stop_words=self.token_stop)
        self.tfidf_vectorizer.fit(body)
        self.vocabulary_tfidf_words = self.tfidf_vectorizer.get_feature_names_out()
        self.bodyTrainTFIDF = self.tfidf_vectorizer.transform(body)
        
    def create_graph(self, body, labels):
        self.nn = NearestNeighbors(n_neighbors=10)
        self.nn.fit(self.bodyTrainTFIDF)
        knn_d,knn_idx = self.nn.kneighbors(self.bodyTrainTFIDF)
        self.train_labels = labels
        
    def transform_test_data(self, body):
        self.bodyTestTFIDF = self.tfidf_vectorizer.transform(body) 
        knn_test_d,knn_test_idx = self.nn.kneighbors(self.bodyTestTFIDF)
        rsvo_body = self.right_svo["redux"].values
        return knn_test_d,knn_test_idx
              
    def train(self, body, labels):
        print("Extracting SVO")
        self.extract_svo(body,labels) 

#         rsvo_body = self.right_svo["redux"].values
#         print(self.right_svo.head())
        
#         print("Building similarity graph")
#         self.tfidf(rsvo_body)
#         self.create_graph(rsvo_body, labels)
          
#         self.cls = LogisticRegression(max_iter=10000)
#         self.cls.fit(self.graph_knn, labels)

    def predict(self, body):
        print(body)
        tokens = nlp(body)
        svos = findSVOs(tokens)
        for svo in svos:
            if len(svo) != 3:
                continue
            verb = self.ps.stem(svo[1])
            print("---\n",svo)    
            self.svo = pd.read_sql(f"""
                select 
                    sum(label) as lab, count(*) as ctr, count(distinct docid) as docs
                    from svo  
                    where 1 
                    and (sub='{svo[0]}' and verb='{verb}' and obj='{svo[2]}')
                """, self.con)
            print(self.svo)

        
        #return self.svo
        #y_pred = self.cls.predict(self.graph_test_knn)
        #return y_pred
    

[nltk_data] Downloading package punkt to /home/rkozik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/rkozik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/rkozik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [40]:
sem = Semantic()
sem.train(body[train],labels[train])

Extracting SVO


100%|██████████████████████████████████████| 8074/8074 [00:55<00:00, 145.20it/s]


In [43]:
body[0]

'A post claims compulsory vacination violates the principles of bioethics, that coronavirus doesnâ€™t exist, that the PCR test returns many false positives, and that influenza vaccine is related to COVID-19.'

In [41]:
sem.predict(body[test][11])

The president of Ghana said that the coronavirus was created in a laboratory
---
 ('a laboratory', 'create', 'the coronavirus')
   lab  ctr  docs
0    0    4     4


In [42]:
body[train][209], labels[train][209]

('5G technology creates coronavirus in human cells', 0)

In [29]:
pd.read_sql("""
select * from svo 
where verb = 'creat'
limit 10
""", sem.con)

Unnamed: 0,sub,verb,obj,docid,label
0,a laboratory,creat,the coronavirus,105,0
1,5 G technology,creat,coronavirus,209,0
2,a laboratory,creat,the coronavirus,222,0
3,a laboratory,creat,the coronavirus,230,0
4,Izmir,creat,5 G base stations,346,0
5,the new coronavirus epidemic period,creat,5 G base stations,346,0
6,a special operation,creat,a concentration camp in,471,0
7,a laboratory,creat,the coronavirus,659,0
8,mass chipping,creat,Dr. Leonard Coldwell :,829,0
9,mass chipping,creat,coronavirus,829,0


In [None]:
pd.read_sql("""
select 
    sum(label),count(*), count(distinct docid)
    from svo  
    where sub='chlorine dioxide' and verb='cures' and obj='COVID-19' 
    -- limit 10
""", con)
right_svo["redux"].values

In [5]:
from subject_verb_object_extract import findSVOs, nlp
tokens = nlp("Seated in Mission Control, Chris Kraft neared the end of a tedious Friday afternoon as he monitored a seemingly interminable ground test of the Apollo 1 spacecraft.")
tokens = nlp("  WHO suggests oral hygiene to prevent the spread of coronavirus.")
svos = findSVOs(tokens)
print(svos)

def semantic_train(body, labels):
    

IndentationError: expected an indented block (1323189935.py, line 8)

In [24]:
import nltk
from nltk.stem import PorterStemmer
nltk.download("punkt")
ps = PorterStemmer()

example_words = ["program","programming","programer","programs","programmed"]
print("{0:20}{1:20}".format("--Word--","--Stem--"))
for word in example_words:
    print("{0:20}{1:20}".format(word, ps.stem(word)))

--Word--            --Stem--            
program             program             
programming         program             
programer           program             
programs            program             
programmed          program             


[nltk_data] Downloading package punkt to /home/rkozik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
!rm svodb

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
rm: cannot remove 'svodb': No such file or directory


In [27]:
import sqlite3
from tqdm import tqdm
import os
con=sqlite3.connect("svo.db")
con.execute("drop table if exists svo")
con.execute("create table if not exists svo(sub, verb, obj, docid, label)")
con.commit()

In [28]:
from subject_verb_object_extract import findSVOs, nlp
con.commit()
for docid, txt in tqdm(enumerate(body[train]), total=len(train)):
    tokens = nlp(txt)
    svos = findSVOs(tokens)
    for svo in svos:
        if len(svo) == 3:
            con.execute("insert into svo values(?,?,?,?,?)", [svo[0],ps.stem(svo[1]),svo[2],docid, int(labels[train][docid])] )
con.commit()

100%|██████████████████████████████████████| 8074/8074 [00:37<00:00, 218.02it/s]


In [25]:
pd.read_sql("select count(*) from svo", con)

Unnamed: 0,count(*)
0,10618


In [132]:
right_svo = pd.read_sql("""
select 
    group_concat(sub)  || " " || verb || " " || obj as redux,
    sum(label) as fakes,
    count(*) as ctr 
    from svo group by verb,obj 
    order by ctr desc 
    -- limit 10
""", con)
right_svo

Unnamed: 0,redux,fakes,ctr
0,"thousands of times,Facebook,a claim,hundreds o...",0,126
1,"chlorine dioxide,Drinking water,Lemon Juice,Pa...",0,75
2,a protest against coronavirus restrictions in ...,0,57
3,"thousands of times,multiple posts on,hundreds ...",0,44
4,"WHO,measures,Boldo tea,that cures,seawater,a p...",0,42
...,...,...,...
7707,There ’s a “ direct correlation ” between Caro...,1,1
7708,There ’s definitive proof are,0,1
7709,That ’s his quote,1,1
7710,there ’s some good data there,1,1


# Infrence 

In [8]:
from sklearn.ensemble import RandomForestClassifier

class Inference:
    def train(self,graph, content, labels):
        newX=[]
        for i,vec in enumerate(content):
            v2 = np.append(content[i], graph[i])
            newX.append(v2)
            
        self.inf = RandomForestClassifier(max_depth=12)
        self.inf.fit(newX, labels)
        
    def predict(self, graph, content):
        newTest=[]
        for i,vec in enumerate(content):
            v2 = np.append(content[i], graph[i])
            newTest.append(v2)
    
        return self.inf.predict(newTest)

# Experiments

In [45]:
body, labels, total_number_of_claims = loader.get("covidfn")
train, test = list(rskf.split(X, labels))[0]

loading covidfn
total_number_of_claims= 8972
labels fake= 461 real= 8511


In [46]:
loader = DatasetLoader()

for dataset in ["covidfn"]:
    body, labels, total_number_of_claims = loader.get(dataset)
    X=range(0,total_number_of_claims)
    
    trustex_quality = Metrics()
    swarog_quality = Metrics()
    inf_quality = Metrics()
    

    for fold_idx, (train, test) in enumerate(rskf.split(X, labels)):
        print(f"fold-{fold_idx}")    
        
        swarog = SwarogModel()
        trustex = TrustexModel()
        inference = Inference()
    
        trustex.train(body[train],labels[train])
        ypred = trustex.predict(body[test])
        trustex_quality.update(labels[test], ypred)
        
        swarog.train(body[train],labels[train])
        ypred = swarog.predict(body[test])
        swarog_quality.update(labels[test], ypred)
        
        inference.train(trustex.graph_knn, swarog.train_prob, labels[train])
        newpred = inference.predict(trustex.graph_test_knn, swarog.test_prob)
        inf_quality.update(labels[test], newpred)

        break

print("Symbolic:")
trustex_quality.print_table()
print("Deep:")
swarog_quality.print_table()
print("Both:")
inf_quality.print_table()


loading covidfn
total_number_of_claims= 8972
labels fake= 461 real= 8511
fold-0
Building similarity graph


100%|█████████████████████████████████████| 8074/8074 [00:07<00:00, 1062.31it/s]


avg. nodes sim.= 0.49843772108989287


100%|███████████████████████████████████████| 898/898 [00:00<00:00, 1064.68it/s]
100%|██████████████████████████████████████| 2019/2019 [00:10<00:00, 184.54it/s]
100%|████████████████████████████████████████| 225/225 [00:01<00:00, 185.76it/s]


Symbolic:
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Score                 Average      Lab:1        Lab:2       
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Accuracy              96.5 ±  0.0  -            -            
  Balanced Accuracy     67.3 ±  0.0  -            -            
  F1                    95.8 ±  0.0  98.2 ±  0.0  50.8 ±  0.0  
  G-mean                60.7 ±  0.0  58.9 ±  0.0  58.9 ±  0.0  
  Precision             96.5 ±  0.0  96.6 ±  0.0  94.1 ±  0.0  
  Recall                96.5 ±  0.0  99.9 ±  0.0  34.8 ±  0.0  
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
Deep:
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Score                 Average      Lab:1        Lab:2       
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Accuracy              97.1 ±  0.0  -            -            
  Balanced Accuracy     77.9 ±  0.0  -            -            
  F1                    96.9 ± 