In [50]:
import sqlite3
from contextlib import closing

import pickle5 as pickle

# Load the pre-computed TF-IDF matrix used by the experiments below.
# NOTE(security): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
with open('../../pickles/swarog_data/swarog_tfidf.pickle', 'rb') as handle:
    tfidfvec = pickle.load(handle)
tfidfvec.shape

(185460, 7000)

In [41]:
# Collect the distinct dataset names stored in the `raw` table.
# `closing` guarantees the connection is released even if the query raises.
with closing(sqlite3.connect('../../pickles/swarog_data/swarog.sqlite')) as conn:
    c = conn.cursor()
    c.execute("""
select dataset from raw group by dataset
""")
    names = [n[0] for n in c.fetchall()]

names

['covid_fake_news', 'grafn', 'isot', 'mmcovid_en', 'pubhealth', 'qprop']

In [39]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from imblearn.metrics import geometric_mean_score
from IPython.display import clear_output

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from sklearn.decomposition import PCA


def experiment(foldids, X, y, cls=None, fit=True, n_components=512):
    """Cross-validate a classifier on PCA-reduced features and print a score table.

    For every fold a new PCA is fitted on the training rows only, the
    classifier is (optionally) fitted on the projected training rows, and the
    projected test rows are predicted and scored.

    Parameters
    ----------
    foldids : sequence
        One entry per fold; element ``[1]`` holds the training row indices and
        element ``[2]`` the test row indices (element ``[0]`` is not used).
    X : indexable feature matrix
        ``X[indices]`` must return the selected rows as a 2-D object.
    y : array-like
        Labels; converted with ``np.asarray`` so indexing is always positional.
    cls : estimator with ``fit`` / ``predict``, optional
        Classifier to evaluate. When omitted, a new
        ``LogisticRegression(max_iter=10000)`` is created for this call
        (instead of one instance shared by every call).
    fit : bool, default True
        When false the classifier is not fitted here and is used as passed in.
        NOTE(review): the PCA is still refitted on every fold in that case, so
        its projection may differ from the one ``cls`` was trained with --
        confirm this is intended.
    n_components : int, default 512
        Number of PCA components. Integer values are clipped to the shape of
        the training fold, because PCA rejects more components than
        ``min(n_samples, n_features)``.

    Returns
    -------
    tuple ``(cls, scores, pca)``
        ``cls`` and ``pca`` are in the state left by the last fold. ``scores``
        maps each score name to a dict with keys ``'func'``, ``'list'``
        (one value per fold), ``'lab'`` (per-class arrays per fold; filled only
        for F1 / Precision / Recall), ``'avg'`` and ``'std'``.

    Raises
    ------
    ValueError
        If ``foldids`` is empty, or if ``fit`` is false and no classifier was
        supplied.
    """
    if cls is None:
        if not fit:
            raise ValueError("fit=False requires an already fitted classifier in `cls`")
        cls = LogisticRegression(max_iter=10000)
    if len(foldids) == 0:
        raise ValueError("foldids must contain at least one fold")

    # Positional indexing below must not depend on a pandas index.
    y = np.asarray(y)

    # Scores that are additionally reported per class.
    per_class_scores = ("F1", "Precision", "Recall")

    scores = {
        'Accuracy': {'func': accuracy_score},
        'Balanced Accuracy': {'func': balanced_accuracy_score},
        'F1': {'func': f1_score},
        'Precision': {'func': precision_score},
        'Recall': {'func': recall_score},
        'G-mean': {'func': geometric_mean_score}
    }
    for score_dict in scores.values():
        score_dict["list"] = []
        score_dict["lab"] = []

    for fold_entry in foldids:
        train, test = fold_entry[1], fold_entry[2]
        xin, yin = X[train], y[train]
        xtest, ytest = X[test], y[test]

        # Fit the projection on the training rows only so that no information
        # from the test rows leaks into it.
        if isinstance(n_components, (int, np.integer)):
            fold_components = min(n_components, *xin.shape)
        else:
            fold_components = n_components
        pca = PCA(n_components=fold_components)
        pca.fit(xin)

        if fit:
            cls.fit(pca.transform(xin), yin)
        y_pred = cls.predict(pca.transform(xtest))

        for score_name, score_dict in scores.items():
            score_func = score_dict['func']
            if score_name in per_class_scores:
                score_dict['lab'].append(score_func(ytest, y_pred, average=None))
                score_dict['list'].append(score_func(ytest, y_pred, average="weighted"))
            else:
                score_dict['list'].append(score_func(ytest, y_pred))

    for score_dict in scores.values():
        score_dict['avg'] = np.mean(score_dict['list'])
        score_dict['std'] = np.std(score_dict['list'])

    # Print stats
    numlabels = scores["F1"]["lab"][0].shape[0]
    head = "| %-20s | %-10s |" + numlabels * " %-10s |"
    headv = ["Score", "Average"]
    headv.extend(["Kat_" + str(i + 1) for i in range(numlabels)])
    row = head % tuple(headv)
    print("+" * len(row))
    print(row)
    print("+" * len(row))
    for score_name, score_dict in sorted(scores.items()):
        rowv = [score_name, score_dict['avg'] * 100, score_dict['std'] * 100]
        if score_name in per_class_scores:
            fmt = "| %-20s | %4.1f ± %3.1f |" + numlabels * " %4.1f ± %3.1f |"
            for i in range(numlabels):
                # Mean and spread of class i across all folds.
                vals = [v[i] for v in score_dict["lab"]]
                rowv.append(np.mean(vals) * 100)
                rowv.append(np.std(vals) * 100)
        else:
            fmt = "| %-20s | %4.1f ± %3.1f |" + numlabels * " %-10s |"
            rowv.extend(["-"] * numlabels)
        print(fmt % tuple(rowv))
    print("+" * len(row))
    return cls, scores, pca

In [56]:
# Quick look at the label column of the most recently loaded dataset.
# NOTE(review): `indata` is not defined in any cell above this one; it is only
# assigned inside the per-dataset loop further down, so this cell depends on
# that loop having been run first.
indata['label'].values

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [57]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import RepeatedStratifiedKFold
# 5 repetitions of stratified 2-fold cross-validation with a fixed seed.
rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=5, random_state=1410)

# Score dictionaries returned by `experiment`, keyed by dataset name, so the
# results of every dataset stay available after the loop (not only the last).
scores_by_dataset = {}

# `closing` guarantees the connection is released even if a dataset fails.
with closing(sqlite3.connect('../../pickles/swarog_data/swarog.sqlite')) as conn:
    for name in names:
        print(name)
        # The dataset name is bound as a query parameter instead of being
        # interpolated into the SQL text.
        indata = pd.read_sql_query("""
            select tfidf.rowid as _index, raw.dataset, body, tfidf.gid as gid, raw.label as label 
            from raw join tfidf on (tfidf.gid = raw.ROWID) where raw.dataset = ?
            """, conn, params=(name,))

        y = indata['label'].values

        # Select all rows of this dataset in one sparse indexing call. The
        # stored rowid is shifted by one to obtain the row position in
        # `tfidfvec` (presumably because SQLite rowids start at 1).
        X = tfidfvec[indata['_index'].values - 1].toarray()

        foldids = []
        for fold_idx, (train, test) in enumerate(rskf.split(X, y)):
            foldids.append((fold_idx, train, test))

        cls, scores, pca = experiment(foldids, X, y, LogisticRegression(max_iter=10000))
        scores_by_dataset[name] = scores
        #print(scores)

covid_fake_news
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
| Score                | Average    | Kat_1      | Kat_2      |
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
| Accuracy             | 96.0 ± 0.1 | -          | -          |
| Balanced Accuracy    | 60.9 ± 0.7 | -          | -          |
| F1                   | 94.7 ± 0.1 | 35.7 ± 1.9 | 97.9 ± 0.0 |
| G-mean               | 46.7 ± 1.5 | -          | -          |
| Precision            | 96.1 ± 0.1 | 99.0 ± 1.6 | 95.9 ± 0.1 |
| Recall               | 96.0 ± 0.1 | 21.8 ± 1.4 | 100.0 ± 0.0 |
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
grafn
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
| Score                | Average    | Kat_1      | Kat_2      |
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
| Accuracy             | 92.9 ± 0.2 | -          | -          |
| Balanced Accuracy    | 85.3 ± 0.3 | -          | -          |
| F1             