# Data loader

In [13]:
import pandas as pd

from sklearn.model_selection import RepeatedStratifiedKFold
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=1410)

class DatasetLoader:
    def get(self, name):
        print(f"loading {name}")
        if name == "fnkdd":
            data = pd.read_csv("/home/rkozik/Desktop/swarog_exp_disk/datasets/fakenewskdd/train.csv",sep="\t")
            data.head()
            body = data["text"].values
            labels = 1-data["label"].values
            total_number_of_claims = data.shape[0]
            print("total_number_of_claims=",total_number_of_claims)
            print("labels fake=",sum(labels),"real=", len(labels)-sum(labels))
        
        if name == "mmcovid":
            data = pd.read_csv("/media/rkozik/02FF-A831/data/swarog/datasets/mmcovid/news_collection.csv",sep="\t")
            data["label"] = [ 1 if v =="fake" else 0 for v in data["label"]]
            data["text"] = [ str(v) for v in data["text"]]
            data = data[data["lang"] == "en"]

            body = data["text"].values
            labels = data["label"].values
            total_number_of_claims = data.shape[0]
            print("total_number_of_claims=",total_number_of_claims)
            print("labels fake=",sum(labels),"real=", len(labels)-sum(labels))
        
        if name == "liar":
            data = pd.read_csv("/media/rkozik/02FF-A831/data/swarog/datasets/liar.csv", sep="\t",encoding="utf-8")
            def mpx(x):
                if x in [0,2]:
                    return 0
                elif x in [4,5]:
                    return 1
                else:
                    return -1
            data["text"] = data["statement"]
            data["label"] = [mpx(x) for x in data["label"]]
            data=data[ data["label"] != -1] 
            body = data["text"].values
            labels = data["label"].values
            total_number_of_claims = data.shape[0]
            print("total_number_of_claims=",total_number_of_claims)
            print("labels fake=",sum(labels),"real=", len(labels)-sum(labels))

        if name == "covidfn":
            data = pd.read_csv("covid_fake_news.csv", sep=",")
            body = data["headlines"].values
            labels = data["outcome"].values
            total_number_of_claims = data.shape[0]
            print("total_number_of_claims=",total_number_of_claims)
            print("labels fake=",sum(labels),"real=", len(labels)-sum(labels))
        
        return body, labels, total_number_of_claims

# Utility functions

In [17]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from imblearn.metrics import geometric_mean_score
from IPython.display import clear_output



class Metrics:
    def __init__(self):
        self.scores = {
            'Accuracy': {'func': accuracy_score},
            'Balanced Accuracy': {'func': balanced_accuracy_score},
            'F1': {'func': f1_score},
            'Precision': {'func': precision_score},
            'Recall': {'func': recall_score},
            'G-mean': {'func': geometric_mean_score}
        }
        
        for score_name, score_dict in self.scores.items():
            score_dict["list"] = []
            score_dict["lab"] = []

    def update(self, actual, prediction):
        for score_name, score_dict in self.scores.items():
            if score_name in ["F1","Precision","Recall","G-mean"]:
                scorvaln = score_dict['func'](actual, prediction, average=None)
                score_dict['lab'].append(scorvaln)
                scorval = score_dict['func'](actual, prediction, average="weighted")
                score_dict['list'].append(scorval)
                #print(score_name, scorval, scorvaln)  
            else:
                scorval=score_dict['func'](actual, prediction)
                score_dict['list'].append(scorval)
                
    def print_table(self, labels=None):
        # Print stats
        scores = self.scores
        numlabels = scores["F1"]["lab"][0].shape[0]
        scores["F1"]["lab"][0].shape[0] 
        head = "  %-20s  %-10s  " +  numlabels * " %-10s  " 
        headv = ["Score", "Average"]
        if labels:
            headv.extend([labels[i] for i in range(numlabels)])
        else:
            headv.extend(["Lab:"+str(i+1) for i in range(numlabels)])
        row=head % tuple(headv)
        # table header
        print("―"*len(row))
        print(row)
        print("―"*len(row))
        # table rows
        for score_name, score_dict in sorted(scores.items()) :
            headv = [score_name, np.mean(score_dict['list'])*100, np.std(score_dict['list'])*100]
            for i in range(numlabels):
                if score_name in ["F1","Precision","Recall", "G-mean"]:
                    head = "  %-20s  %4.1f ± %4.1f  " + numlabels* "%4.1f ± %4.1f  "
                    vals = [v[i] for v in scores[score_name]["lab"]]
                    headv.append(np.mean(vals)*100)
                    headv.append(np.std(vals)*100)
                else:
                    head = "  %-20s  %4.1f ± %4.1f  " + numlabels * "%-11s  " 
                    headv.append("-")
            print(head % tuple(headv))
        print("―"*len(row))


def get_graph_node_stats(vec, nearestDocIDs, y_train, bodyTrainTFIDF):   
    vecdense = vec.toarray()[0]
    docids = nearestDocIDs
    trlabels = np.array(y_train)
    labsum = trlabels[docids].sum()
    
    ivec = []
    labmask = []
    for hitdocid in docids:
        value=bodyTrainTFIDF[hitdocid].toarray()[0]
        intersection = (vecdense>0)*(value>0)
        ivec.append(intersection.sum())
        labmask.append(trlabels[hitdocid])
        
    masked_ivec =  np.array(ivec)*np.array(labmask)   
    masked_ivec_neg =  np.array(ivec)*(-1*(np.array(labmask)-1)) 
    ivec = np.array(ivec)
    masked_ivec = np.array(masked_ivec)
    masked_ivec_neg = np.array(masked_ivec_neg)
    
    newvec = [labsum, (vecdense>0).sum(),ivec.max(), ivec.max(), masked_ivec.max(), masked_ivec.min(), masked_ivec_neg.max(), masked_ivec_neg.min()]
    return newvec

# Swarog Model

In [18]:
import numpy as np
import bentoml
from bentoml.io import NumpyNdarray
from bentoml.io import JSON
from annoy import AnnoyIndex
import re

import transformers
transformers.logging.set_verbosity_error()
from torch.utils.data import DataLoader 

from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import TFDistilBertModel, DistilBertTokenizerFast
import torch
from scipy.sparse import csr_matrix

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import pickle5 as pickle
import sqlite3
from tqdm import tqdm
 
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("using device:", device)

if "disilbert_model" not in locals():
    disilbert_tokenizer =  AutoTokenizer.from_pretrained("distilbert-base-uncased")
    disilbert_model = AutoModel.from_pretrained("distilbert-base-uncased")
    handle = disilbert_model.to(device)

class SwarogModel:
    def __init__(self):
        self.tokenizer =  disilbert_tokenizer
        self.model = disilbert_tokenizer
        self.max_length = 256
        self.model_name = disilbert_model

    def fit(self, X=None, y=None):
        pass
    
    def encode(self, txt):
        return self.tokenizer(txt, max_length=self.max_length, 
                              truncation=True, padding=True, return_tensors="pt")

    def transform(self, X=None):
        dataloader = DataLoader(X, batch_size=4, shuffle=False)
        allembeds = []
        for batch in tqdm(dataloader):
            batchenc = disilbert_tokenizer(batch, max_length=256, 
                                           truncation=True, padding=True, return_tensors="pt")
            input_ids = batchenc['input_ids'].to(device)
            attention_mask = batchenc['attention_mask'].to(device)
            batchout = disilbert_model(input_ids, attention_mask=attention_mask, 
                                       output_hidden_states=True)
            embeds = [vec[0].cpu().detach().numpy() for vec in batchout[1][-1]]
            allembeds.extend(embeds)
        return np.array(allembeds)
    
    def train(self, body, labels):
        embeddings = self.transform(body)
        self.cls = LogisticRegression(max_iter=1000)
        self.cls.fit(embeddings, labels)
        self.train_prob = self.cls.predict_proba(embeddings)
        
    def predict(self, body):
        embeddings = self.transform(body)
        self.test_prob = self.cls.predict_proba(embeddings)
        return  self.cls.predict(embeddings) 

using device: cuda


# Graph Model

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression

# Download stopwords list
nltk.download('wordnet')
nltk.download('punkt')
stop_words = set(stopwords.words('english')) 

# Interface lemma tokenizer from nltk with sklearn
class LemmaTokenizer:
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`', "'"]
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if t not in self.ignore_tokens]
    

class TrustexModel:
    def __init__(self):
        # Lemmatize the stop words
        self.tokenizer=LemmaTokenizer()
        self.token_stop = self.tokenizer(' '.join(stop_words))
        
    def tfidf(self,body):
        self.tfidf_vectorizer = TfidfVectorizer(stop_words=self.token_stop)
        self.tfidf_vectorizer.fit(body)
        self.vocabulary_tfidf_words = self.tfidf_vectorizer.get_feature_names_out()
        self.bodyTrainTFIDF = self.tfidf_vectorizer.transform(body)
        
    def create_graph(self, body, labels):
        self.nn = NearestNeighbors(n_neighbors=10)
        self.nn.fit(self.bodyTrainTFIDF)
        knn_d,knn_idx = self.nn.kneighbors(self.bodyTrainTFIDF)
        self.graph_knn = []
        self.train_labels = labels
        from tqdm import tqdm
        for id, topIDs in tqdm(enumerate(knn_idx), total=knn_idx.shape[0]):
            vec = self.bodyTrainTFIDF[id]
            newvec = get_graph_node_stats(vec, topIDs[1:], labels, self.bodyTrainTFIDF)
            self.graph_knn.append(newvec)
        print("avg. nodes sim.=",np.mean([x[2]/x[1] for x in self.graph_knn]))

    def graph_transform_test_data(self, body):
        self.bodyTestTFIDF = self.tfidf_vectorizer.transform(body) 
        knn_test_d,knn_test_idx = self.nn.kneighbors(self.bodyTestTFIDF)
        self.graph_test_knn = []
        for id, topIDs in tqdm(enumerate(knn_test_idx), total=knn_test_idx.shape[0]):
            vec = self.bodyTestTFIDF[id]
            newvec = get_graph_node_stats(vec, topIDs[1:], self.train_labels, self.bodyTrainTFIDF)
            self.graph_test_knn.append(newvec)        
              
    def train(self, body, labels):
        print("Building similarity graph")
        self.tfidf(body)
        self.create_graph(body, labels)
          
        self.cls = LogisticRegression(max_iter=10000)
        self.cls.fit(self.graph_knn, labels)

    def predict(self, body):
        self.graph_transform_test_data(body)
        y_pred = self.cls.predict(self.graph_test_knn)
        return y_pred
    


[nltk_data] Downloading package wordnet to /home/rkozik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/rkozik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# FTS Semantics

In [392]:
import re

class FTSSemantic:
    def __init__(self):
        self.con=sqlite3.connect("svo.db")
        
    def build_index(self, body, labels, tblname="train"): 
        print("Building index")
        self.con.execute(f"drop table if exists {tblname}_fts")
        self.con.execute(f"create VIRTUAL table if not exists {tblname}_fts USING fts5(txt, original, label)")
        
        for docid, txt in tqdm(enumerate(body), total=len(labels)):
            tokens = nlp(txt)
            svos = findSVOs(tokens)
            vecs = []
            newtext=txt
            for xsvo in svos:
                svo = xsvo
                if len(svo) < 3 and len(svo) > 1:
                    svos = [svo[0],svo[1],""]
                if len(svo) == 3:
                    line = " ".join(svo)
                    sentence = sp(line)
                    w = " ".join([word.lemma_ for word in sentence])
                    vecs.append(w)
            if len(vecs) > 0:
                newtext=" ".join(vecs)
                
            self.con.execute(f"insert into {tblname}_fts values(?,?,?)", 
                        [newtext,txt, str(labels[docid])])
            
        self.con.commit() 
        
    def transformData(self, body):
        X = []
        for docid, txt in tqdm(enumerate(body), total=len(body)):
            tokens = nlp(txt)
            svos = findSVOs(tokens)
            svos_vec=self.svos_features(svos)
            X.append(svos_vec)
        return X
            
    def createTrainData(self, body, labels):
        print("Creating trainX")
        X = self.transformData(body)
        with open("fts_train_X.pickle","wb") as fp:
            pickle.dump(X, fp)
        return X
        
    def train(self, body, labels, tblname="train"):    
        #self.build_index(body,labels, tblname)

        #self.createTrainData(body,labels)
        
        with open("fts_train_X.pickle","rb") as fp:
            X=pickle.load(fp)
        
        #self.cls = LogisticRegression(max_iter=10000)
        self.cls = RandomForestClassifier(max_depth=12)
        self.cls.fit(X, labels)
        
    def svo2query(self,svo):
        line = " ".join(svo)
        sentence = sp(line)
        zlepek = [re.sub(r'[^a-zA-Z0-9]', ' ', word.lemma_) for word in sentence]
        zlepek = [" ".join(z.split()) for z in zlepek]
        zlepek = [z for z in zlepek if len(z) > 1]
        w = " OR ".join(zlepek)
        return w
    
    def svos_features(self,svos):
        result = []
        for xsvo in svos:
            svo = xsvo
            if len(svo) < 3 and len(svo) > 1:
                svos = [svo[0],svo[1],""]

            if len(svo) == 3:
                w = self.svo2query(svo)
                #print("   ",w)
                try:
                    q=f"""
                        with top10 as (
                            select bm25(train_fts) as score, cast(label as int) as pred from train_fts  
                             where txt  match "{w}" order by bm25(train_fts) limit 5
                        )
                        select min(score), avg(score), max(score), avg(pred), count(*), sum(pred) from top10
                    """
                    fvector = self.con.execute(q).fetchall()[0]
                    result.append(fvector)
                    #print(w)
                    #print(matchscore)
                except (sqlite3.OperationalError):
                    #print(w)
                    result.append([0]*6)
        if len(result) > 0:
            return np.mean(result, axis=0)
        else:
            return [0]*6

    def predict(self, body, labels):
        pred = []
        X = self.transformData(body)
        return self.cls.predict(X)

In [393]:
sem = FTSSemantic()
sem.train(body[train],labels[train])

In [394]:
ypred=sem.predict(body[test],labels[test])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1021/1021 [00:11<00:00, 89.83it/s]


In [395]:
met = Metrics()
met.update(labels[test], ypred)
met.print_table()

――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Score                 Average      Lab:1        Lab:2       
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Accuracy              95.9 ±  0.0  -            -            
  Balanced Accuracy     60.2 ±  0.0  -            -            
  F1                    94.8 ±  0.0  97.9 ±  0.0  32.3 ±  0.0  
  G-mean                48.5 ±  0.0  45.5 ±  0.0  45.5 ±  0.0  
  Precision             95.1 ±  0.0  96.2 ±  0.0  71.4 ±  0.0  
  Recall                95.9 ±  0.0  99.6 ±  0.0  20.8 ±  0.0  
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――


# Semantic relations

In [232]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.linear_model import LogisticRegression
from subject_verb_object_extract import findSVOs, nlp
import nltk
from nltk.stem import PorterStemmer
import sqlite3
from tqdm import tqdm
import os
import spacy
sp = spacy.load('en_core_web_sm')
import pickle 

nltk.download("punkt")
nltk.download('wordnet')
nltk.download('punkt')

stop_words = set(stopwords.words('english')) 

        
class Semantic:
    def __init__(self):
        self.ps = PorterStemmer()
        self.tokenizer=LemmaTokenizer()
        self.token_stop = self.tokenizer(' '.join(stop_words))
        self.con=sqlite3.connect("svo.db")
        
    def extract_svo(self, body, labels, tblname="train"):
        self.con.execute(f"drop table if exists {tblname}_svo")
        self.con.execute(f"create table if not exists {tblname}_svo(sub, verb, obj, docid, label)")
        self.con.commit()
        
        for docid, txt in tqdm(enumerate(body), total=len(labels)):
            tokens = nlp(txt)
            svos = findSVOs(tokens)
            for svo in svos:
                if len(svo) == 3:
                    self.con.execute(f"insert into {tblname}_svo values(?,?,?,?,?)", 
                        [svo[0],self.ps.stem(svo[1]),svo[2], docid, int(labels[docid])])
            self.con.commit()
            
    def svo_lema(self, tblname="train"):
        self.con.execute(f"drop table if exists {tblname}_svo_lema")
        self.con.execute(f"create table if not exists {tblname}_svo_lema(lema, docid, label)")
        self.con.commit()
        
        redux = pd.read_sql(f"""
        select sub || " " || verb || " " || obj as redux, docid, label from {tblname}_svo 
        -- limit 10
        """, sem.con).values

        for line in tqdm(redux,total=redux.shape[0]):
            sentence = sp(line[0])
            w = " ".join([word.lemma_ for word in sentence])
            self.con.execute(f"insert into {tblname}_svo_lema values(?,?,?)", [w, line[1],line[2]]) 
            
        self.con.commit()
        
    def tfidf(self,body):
        self.tfidf_vectorizer = TfidfVectorizer(stop_words=self.token_stop)
        self.tfidf_vectorizer.fit(body)
        self.vocabulary_tfidf_words = self.tfidf_vectorizer.get_feature_names_out()
        self.bodyTrainTFIDF = self.tfidf_vectorizer.transform(body)
        
    def create_graph(self, body, labels):
        self.nn = NearestNeighbors(n_neighbors=10)
        self.nn.fit(self.bodyTrainTFIDF)
        knn_d,knn_idx = self.nn.kneighbors(self.bodyTrainTFIDF)
        self.graph_knn = []
        self.train_labels = labels
        from tqdm import tqdm
        for id, topIDs in tqdm(enumerate(knn_idx), total=knn_idx.shape[0]):
            #print(body[id])
            vec = self.bodyTrainTFIDF[id]
            newvec = get_graph_node_stats(vec, topIDs[1:], labels, self.bodyTrainTFIDF)
            #print("  ",newvec)
            self.graph_knn.append(newvec)
        print("avg. nodes sim.=",np.mean([x[2]/(x[1]+0.000001) for x in self.graph_knn]))
        
        with open("sem_graph_knn.pickle","wb") as fp:
            pickle.dump(sem.graph_knn, fp)
        
    def transform_test_data(self, body):
        self.bodyTestTFIDF = self.tfidf_vectorizer.transform(body) 
        knn_test_d,knn_test_idx = self.nn.kneighbors(self.bodyTestTFIDF)
        rsvo_body = self.right_svo["redux"].values
        return knn_test_d,knn_test_idx
              
    def train(self, body, labels):
        print("Extracting SVO")
        #self.extract_svo(body,labels) 
        print("Lematizing SVO")
        #self.svo_lema()
  
    
        rexux=pd.read_sql(f"""
        select lema, label from train_svo_lema 
        --limit 10
        """, sem.con)
    
        self.tfidf(rexux["lema"].values)
        #self.create_graph(rexux["lema"].values,rexux["label"].values)
        
        with open("sem_graph_knn.pickle","rb") as fp:
            sem.graph_knn=pickle.load(fp)
        
        
                  
        self.cls = LogisticRegression(max_iter=10000)
        self.cls.fit(self.graph_knn, rexux["label"].values)
        
            
#         rsvo_body = self.right_svo["redux"].values
#         print(self.right_svo.head())
        
#         print("Building similarity graph")
#         self.tfidf(rsvo_body)
#         self.create_graph(rsvo_body, labels)
          
#         self.cls = LogisticRegression(max_iter=10000)
#         self.cls.fit(self.graph_knn, labels)

    def predict(self, body, labels):
        print("Extracting SVO")
        #self.extract_svo(body,labels,"test") 
        print("Lematizing SVO")
        #self.svo_lema("test")
        
            
        rexux=pd.read_sql(f"""
        select lema, label from train_svo_lema 
        --limit 10
        """, sem.con)
    
        self.tfidf(rexux["lema"].values)
        
        self.nn = NearestNeighbors(n_neighbors=10)
        self.nn.fit(self.bodyTrainTFIDF)
        
        with open("sem_graph_knn.pickle","rb") as fp:
            sem.graph_knn=pickle.load(fp)
            
        sem.graph_knn=np.array(sem.graph_knn)
        
        preds=[]
        
        for docid, txt in tqdm(enumerate(body), total=len(labels)):
            tokens = nlp(txt)
            svos = findSVOs(tokens)
            vecs = []

            for xsvo in svos:
                svo = xsvo
                if len(svo) < 3 and len(svo) > 1:
                    svos = [svo[0],svo[1],""]
                
                if len(svo) == 3:
                    line = " ".join([svo[0],self.ps.stem(svo[1]),svo[2]])
                    sentence = sp(line)
                    w = " ".join([word.lemma_ for word in sentence])
                    densev = self.tfidf_vectorizer.transform([w]) 
                    knn_test_d,knn_test_idx = self.nn.kneighbors(densev)
                    nearest = knn_test_idx[0][0]
                    nv = sem.graph_knn[nearest]
                    vecs.append(self.cls.predict([nv])[0])
            if len(vecs) > 0:
                preds.append(np.mean(vecs))
            else:
                preds.append(0)
        return preds

[nltk_data] Downloading package punkt to /home/rkozik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/rkozik/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/rkozik/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [233]:
sem = Semantic()
sem.train(body[train],labels[train])

Extracting SVO
Lematizing SVO


In [234]:
 pred = sem.predict(body[test], labels[test])

Extracting SVO
Lematizing SVO


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1021/1021 [00:11<00:00, 90.24it/s]


In [237]:
ypred = [1 if x > 0.5 else 0 for x in pred]

In [239]:
met = Metrics()
met.update(labels[test], ypred)
met.print_table()

――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Score                 Average      Lab:1        Lab:2       
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Accuracy              95.5 ±  0.0  -            -            
  Balanced Accuracy     53.1 ±  0.0  -            -            
  F1                    93.6 ±  0.0  97.7 ±  0.0  11.5 ±  0.0  
  G-mean                31.9 ±  0.0  25.0 ±  0.0  25.0 ±  0.0  
  Precision             94.6 ±  0.0  95.6 ±  0.0  75.0 ±  0.0  
  Recall                95.5 ±  0.0  99.9 ±  0.0   6.2 ±  0.0  
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――


In [120]:
sem.tfidf_vectorizer.transform(["the principle of , violat compulsory vacination"]).todense()[0]

matrix([[0., 0., 0., ..., 0., 0., 0.]])

In [85]:
pd.read_sql("""
select lema, label from svo_lema 
limit 10
""", sem.con)["lema"]


0     the principle of , violat compulsory vacination
1             many false positive return the PCR test
2                    COVID-19 relat influenza vaccine
3    COVID-19 regist all death by respiratory failure
4                there be a lot of PCR false positive
5                         a lot of spread coronavirus
6           a red ribbon put Households with , Grande
7      their garbage bag put Households with , Grande
8                               they handl Garbagemen
9                         a safe way handl Garbagemen
Name: lema, dtype: object

In [66]:
redux_lema_labels = [x[0] for x in pd.read_sql("""
select label from svo 
""", sem.con).values]
print(len(redux_lema_labels))

11977


In [67]:
pd.read_sql("""
select sub || " " || verb || " " || obj as redux from svo 
-- limit 10
""", sem.con).values

tm = TrustexModel()
tm.train(redux_lema,redux_lema_labels)

Building similarity graph


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 11977/11977 [00:14<00:00, 837.76it/s]

avg. nodes sim.= nan



  print("avg. nodes sim.=",np.mean([x[2]/x[1] for x in self.graph_knn]))


# Infrence 

In [21]:
from sklearn.ensemble import RandomForestClassifier

class Inference:
    def train(self,graph, content, labels):
        newX=[]
        for i,vec in enumerate(content):
            v2 = np.append(content[i], graph[i])
            newX.append(v2)
            
        self.inf = RandomForestClassifier(max_depth=12)
        self.inf.fit(newX, labels)
        
    def predict(self, graph, content):
        newTest=[]
        for i,vec in enumerate(content):
            v2 = np.append(content[i], graph[i])
            newTest.append(v2)
    
        return self.inf.predict(newTest)

# Experiments

In [24]:
loader = DatasetLoader()
body, labels, total_number_of_claims = loader.get("covidfn")
X=range(0,total_number_of_claims)
train, test = list(rskf.split(X, labels))[0]

loading covidfn
total_number_of_claims= 10201
labels fake= 474 real= 9727


In [22]:
loader = DatasetLoader()

for dataset in ["covidfn"]:
    body, labels, total_number_of_claims = loader.get(dataset)
    X=range(0,total_number_of_claims)
    
    trustex_quality = Metrics()
    swarog_quality = Metrics()
    inf_quality = Metrics()
    

    for fold_idx, (train, test) in enumerate(rskf.split(X, labels)):
        print(f"fold-{fold_idx}")    
        
        swarog = SwarogModel()
        trustex = TrustexModel()
        inference = Inference()
    
        trustex.train(body[train],labels[train])
        ypred = trustex.predict(body[test])
        trustex_quality.update(labels[test], ypred)
        
        swarog.train(body[train],labels[train])
        ypred = swarog.predict(body[test])
        swarog_quality.update(labels[test], ypred)
        
        inference.train(trustex.graph_knn, swarog.train_prob, labels[train])
        newpred = inference.predict(trustex.graph_test_knn, swarog.test_prob)
        inf_quality.update(labels[test], newpred)

        break

print("Symbolic:")
trustex_quality.print_table()
print("Deep:")
swarog_quality.print_table()
print("Both:")
inf_quality.print_table()


loading covidfn
total_number_of_claims= 10201
labels fake= 474 real= 9727
fold-0
Building similarity graph


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 9180/9180 [00:08<00:00, 1093.91it/s]


avg. nodes sim.= 0.6016417403761986


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 1021/1021 [00:00<00:00, 1091.77it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 2295/2295 [00:12<00:00, 181.00it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 256/256 [00:01<00:00, 189.25it/s]


Symbolic:
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Score                 Average      Lab:1        Lab:2       
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Accuracy              97.4 ±  0.0  -            -            
  Balanced Accuracy     75.8 ±  0.0  -            -            
  F1                    97.0 ±  0.0  98.6 ±  0.0  64.9 ±  0.0  
  G-mean                72.7 ±  0.0  72.0 ±  0.0  72.0 ±  0.0  
  Precision             97.1 ±  0.0  97.7 ±  0.0  86.2 ±  0.0  
  Recall                97.4 ±  0.0  99.6 ±  0.0  52.1 ±  0.0  
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
Deep:
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Score                 Average      Lab:1        Lab:2       
――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――――
  Accuracy              98.2 ±  0.0  -            -            
  Balanced Accuracy     85.2 ±  0.0  -            -            
  F1                    98.1 ± 