# SQL definitions

In [93]:
import sqlite3
conn = sqlite3.connect('swarog.sqlite')
c = conn.cursor()
c.execute('''CREATE TABLE raw
             (dataset TEXT, id INT, body TEXT, label INT)''')
conn.commit()
conn.close()

In [82]:
import sqlite3
conn = sqlite3.connect('swarog.sqlite')
c = conn.cursor()
c.execute('''CREATE TABLE bert
             (dataset TEXT, id INT, vec TEXT)''')
conn.commit()
conn.close()

In [94]:
import sqlite3
conn = sqlite3.connect('swarog.sqlite')
c = conn.cursor()
c.execute('''CREATE TABLE bertnp
             (dataset TEXT, id INT, vec BLOB)''')
conn.commit()
conn.close()

In [76]:
import codecs
compressor = 'zlib'

def adapt_array(arr):
    """
    http://stackoverflow.com/a/31312102/190597 (SoulNibbler)
    """
    # zlib uses similar disk size that Matlab v5 .mat files
    # bz2 compress 4 times zlib, but storing process is 20 times slower.
    out = io.BytesIO()
    np.save(out, arr)
    out.seek(0)
    return sqlite3.Binary(codecs.encode(out.read(),compressor))  # zlib, bz2

def convert_array(text):
    out = io.BytesIO(text)
    out.seek(0)
    out = io.BytesIO(codecs.decode(out.read(),compressor))
    return np.load(out)

import sqlite3

# sqlite3.register_adapter(np.ndarray, adapt_array)
# sqlite3.register_converter("array", convert_array)




In [None]:
conn = sqlite3.connect('swarog.sqlite')
c = conn.cursor()
c.execute('''CREATE TABLE bertnp
             (dataset TEXT, id INT, vec array)''')
conn.commit()
conn.close()


# Import raw text

In [97]:
import pandas as pd
from tqdm import tqdm

conn = sqlite3.connect('swarog.sqlite')
c = conn.cursor()

data = pd.read_csv("../raw/covid_fake_news.csv",sep="\t")
for index, row in tqdm(data.iterrows(), total=data.shape[0]):
    values_to_insert = [('covid_fake_news', index, row['text'], row['label'])]
    c.executemany("""INSERT INTO raw(dataset, id, body, label) VALUES (?,?,?,?)""", values_to_insert)
    conn.commit()
    
    
data = pd.read_csv("../raw/mmcovid_en.csv",sep=",")
for index, row in tqdm(data.iterrows(), total=data.shape[0]):
    values_to_insert = [('mmcovid_en', index, row['text'], row['label'])]
    c.executemany("""INSERT INTO raw(dataset, id, body, label) VALUES (?,?,?,?)""", values_to_insert)
    conn.commit()

data = pd.read_csv("../raw/pubhealth.csv",sep=",")
for index, row in tqdm(data.iterrows(), total=data.shape[0]):
    values_to_insert = [('pubhealth', index, row['text'], row['label'])]
    c.executemany("""INSERT INTO raw(dataset, id, body, label) VALUES (?,?,?,?)""", values_to_insert)
    conn.commit()

data = pd.read_csv("../raw/qprop.csv",sep="\t")
for index, row in tqdm(data.iterrows(), total=data.shape[0]):
    values_to_insert = [('qprop', index, row['text'], row['label'])]
    c.executemany("""INSERT INTO raw(dataset, id, body, label) VALUES (?,?,?,?)""", values_to_insert)
    conn.commit()
    
data = pd.read_csv("../raw/isot.csv",sep=",")
for index, row in tqdm(data.iterrows(), total=data.shape[0]):
    values_to_insert = [('isot', index, row['text'], row['label'])]
    c.executemany("""INSERT INTO raw(dataset, id, body, label) VALUES (?,?,?,?)""", values_to_insert)
    conn.commit()
    
    
data = pd.read_csv("../raw/grafn.csv",sep=",")
for index, row in tqdm(data.iterrows(), total=data.shape[0]):
    values_to_insert = [('grafn', index, row['text'], row['label'])]
    c.executemany("""INSERT INTO raw(dataset, id, body, label) VALUES (?,?,?,?)""", values_to_insert)
    conn.commit()
    
        

    
conn.close()

100%|███████████████████████████████████████████████████████████████████████████████████| 8972/8972 [00:22<00:00, 400.03it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 7332/7332 [00:19<00:00, 379.15it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 10075/10075 [00:27<00:00, 364.48it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 51270/51270 [02:50<00:00, 299.88it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 44898/44898 [02:41<00:00, 277.39it/s]
  data = pd.read_csv("../raw/grafn.csv",sep=",")
100%|█████████████████████████████████████████████████████████████████████████████████| 63930/63930 [03:17<00:00, 323.35it/s]


# Import bert CLS vector

In [95]:
import pickle 
import json
import codecs

conn = sqlite3.connect('swarog.sqlite')
c = conn.cursor()


with open('../covid_fake_news_BERTEmbeddings.pickle', 'rb') as handle:
    dst = pickle.load(handle)
    for index, row in tqdm(enumerate(dst),total=len(dst)):
        values_to_insert = [('covid_fake_news', index, adapt_array(row))]
        c.executemany("""INSERT INTO bertnp(dataset, id, vec) VALUES (?,?,?)""", values_to_insert)
        conn.commit()
        
        
with open('../mmcovid_en_BERTEmbeddings.pickle', 'rb') as handle:
    dst = pickle.load(handle)
    for index, row in tqdm(enumerate(dst),total=len(dst)):
        values_to_insert = [('mmcovid_en', index, adapt_array(row))]
        c.executemany("""INSERT INTO bertnp(dataset, id, vec) VALUES (?,?,?)""", values_to_insert)
        conn.commit()
        
with open('../pubhealth_BERTEmbeddings.pickle', 'rb') as handle:
    dst = pickle.load(handle)
    for index, row in tqdm(enumerate(dst),total=len(dst)):
        values_to_insert = [('pubhealth', index, adapt_array(row))]
        c.executemany("""INSERT INTO bertnp(dataset, id, vec) VALUES (?,?,?)""", values_to_insert)
        conn.commit()
        
with open('../qprop_BERTEmbeddings.pickle', 'rb') as handle:
    dst = pickle.load(handle)
    for index, row in tqdm(enumerate(dst),total=len(dst)):
        values_to_insert = [('qprop', index, adapt_array(row))]
        c.executemany("""INSERT INTO bertnp(dataset, id, vec) VALUES (?,?,?)""", values_to_insert)
        conn.commit()

with open('../isot_BERTEmbeddings.pickle', 'rb') as handle:
    dst = pickle.load(handle)
    for index, row in tqdm(enumerate(dst),total=len(dst)):
        values_to_insert = [('isot', index, adapt_array(row))]
        c.executemany("""INSERT INTO bertnp(dataset, id, vec) VALUES (?,?,?)""", values_to_insert)
        conn.commit()
                
with open('../grafn_BERTEmbeddings.pickle', 'rb') as handle:
    dst = pickle.load(handle)
    for index, row in tqdm(enumerate(dst),total=len(dst)):
        values_to_insert = [('grafn', index, adapt_array(row))]
        c.executemany("""INSERT INTO bertnp(dataset, id, vec) VALUES (?,?,?)""", values_to_insert)
        conn.commit()
        
conn.close()

100%|███████████████████████████████████████████████████████████████████████████████████| 8972/8972 [00:34<00:00, 259.38it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 7332/7332 [00:18<00:00, 392.50it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 10075/10075 [00:36<00:00, 275.73it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 51270/51270 [02:57<00:00, 288.65it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 44898/44898 [02:15<00:00, 330.69it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 63930/63930 [03:29<00:00, 304.71it/s]


# Un-pickle `TF-IDF` 

In [None]:
import pickle
import numpy as np
import pandas as pd

DATA = {
    "X" : [],
    "category" : [],
    "y" : []
}

X=DATA["X"]
category=DATA["category"]
y=DATA["y"]


# Un-pickle bert `CLS` tokens

In [4]:
import pickle
import numpy as np
import pandas as pd

DATA = {
    "X" : [],
    "category" : [],
    "y" : []
}

X=DATA["X"]
category=DATA["category"]
y=DATA["y"]


with open('../covid_fake_news_BERTEmbeddings.pickle', 'rb') as handle:
    dst = pickle.load(handle)
    X.extend(dst)
    category.extend(np.repeat(0,len(dst)))
    data = pd.read_csv("../raw/covid_fake_news.csv",sep="\t")
    y.extend(data["label"].values)
    
with open('../mmcovid_en_BERTEmbeddings.pickle', 'rb') as handle:
    dst = pickle.load(handle)
    X.extend(dst)
    category.extend(np.repeat(1,len(dst)))
    data = pd.read_csv("../raw/mmcovid_en.csv",sep=",")
    y.extend(data["label"].values)
    

with open('../pubhealth_BERTEmbeddings.pickle', 'rb') as handle:
    dst = pickle.load(handle)
    X.extend(dst)
    category.extend(np.repeat(2,len(dst)))
    data = pd.read_csv("../raw/pubhealth.csv",sep=",")
    y.extend(data["label"].values)

    
with open('../qprop_BERTEmbeddings.pickle', 'rb') as handle:
    dst = pickle.load(handle)
    X.extend(dst)
    category.extend(np.repeat(3,len(dst)))
    data = pd.read_csv("../raw/qprop.csv",sep="\t")
    y.extend(data["label"].values)
    
with open('../isot_BERTEmbeddings.pickle', 'rb') as handle:
    dst = pickle.load(handle)
    X.extend(dst)
    category.extend(np.repeat(4,len(dst)))
    data = pd.read_csv("../raw/isot.csv",sep=",")
    y.extend(data["label"].values)
    
with open('../grafn_BERTEmbeddings.pickle', 'rb') as handle:
    dst = pickle.load(handle)
    X.extend(dst)
    category.extend(np.repeat(5,len(dst)))
    data = pd.read_csv("../raw/grafn.csv",sep=",")
    y.extend(data["label"].values)
    
    
DATA["X"]=np.array(X)
DATA["category"]=np.array(category)
DATA["y"]=np.array(y)
DATA["folds"] = []

  data = pd.read_csv("../raw/grafn.csv",sep=",")


# 5x2 CV experiment

In [7]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from imblearn.metrics import geometric_mean_score
from IPython.display import clear_output

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def experiment(foldids, X, y, cls = LogisticRegression(max_iter=10000), fit=True):

    scores = {
        'Accuracy': {'func': accuracy_score},
        'Balanced Accuracy': {'func': balanced_accuracy_score},
        'F1': {'func': f1_score},
        'Precision': {'func': precision_score},
        'Recall': {'func': recall_score},
        'G-mean': {'func': geometric_mean_score}
    }

    for score_name, score_dict in scores.items():
        scores[score_name]["list"] = []
        scores[score_name]["lab"] = []

    for fold,j in enumerate(foldids):
        train = foldids[fold][1]
        test = foldids[fold][2]
        xin, yin = X[train], np.array(y[train])
        
        pca = PCA(n_components=512)
        pca.fit(xin)
        
        
        if fit == True:
            cls.fit(pca.transform(xin), yin)
        y_pred = cls.predict(pca.transform(X[test]))
        for score_name, score_dict in scores.items():
            if score_name in ["F1","Precision","Recall"]:
                scorvaln = score_dict['func'](y[test], y_pred, average=None)
                score_dict['lab'].append(scorvaln)
                scorval = score_dict['func'](y[test], y_pred, average="weighted")
                score_dict['list'].append(scorval)
                #print(score_name, scorval, scorvaln)  
            else:
                scorval=score_dict['func'](y[test], y_pred)
                score_dict['list'].append(scorval)
                #print(score_name, scorval)
        #print(" ")

    #clear_output()
    for score_name, score_dict in scores.items():
        score_dict['avg'] = np.mean(score_dict['list'])
        score_dict['std'] = np.std(score_dict['list'])
 
    # Print stats
    numlabels = scores["F1"]["lab"][0].shape[0]
    scores["F1"]["lab"][0].shape[0] 
    head = "| %-20s | %-10s |" +  numlabels * " %-10s |" 
    headv = ["Score", "Average"]
    headv.extend(["Kat_"+str(i+1) for i in range(numlabels)])
    row=head % tuple(headv)
    print("+"*len(row))
    print(row)
    print("+"*len(row))
    for score_name, score_dict in sorted(scores.items()) :
        headv = [score_name, np.mean(score_dict['list'])*100, np.std(score_dict['list'])*100]
        for i in range(numlabels):
            if score_name in ["F1","Precision","Recall"]:
                head = "| %-20s | %4.1f ± %3.1f |" + numlabels* " %4.1f ± %3.1f |"
                vals = [v[i] for v in scores[score_name]["lab"]]
                headv.append(np.mean(vals)*100)
                headv.append(np.std(vals)*100)
            else:
                head = "| %-20s | %4.1f ± %3.1f |" + numlabels * " %-10s |" 
                headv.append("-")
        print(head % tuple(headv))
    print("+"*len(row))
    return cls, scores, pca

### Recording test/train folds

In [8]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.decomposition import PCA

DATA["folds"] = []
rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=5, random_state=1410)


for t in range(np.max(DATA["category"]+1)):
    print("TASK",t)
    X = DATA["X"][DATA["category"] == t]
    y = DATA["y"][DATA["category"] == t]

    foldids = []
    for fold_idx, (train, test) in enumerate(rskf.split(X, y)):
        foldids.append((fold_idx,train,test))

    #print("shapes X",X.shape,"y", y.shape)
    DATA["folds"].append(foldids)


TASK 0
TASK 1
TASK 2
TASK 3
TASK 4
TASK 5


<hr style="border: 5px dashed red">

# Single Task Performance (BERT `CLS`)

In [9]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.decomposition import PCA

DATA["folds"] = []
rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=5, random_state=1410)

stl=[]

for t in range(np.max(DATA["category"]+1)):
    print("TASK",t)
    X = DATA["X"][DATA["category"] == t]
    y = DATA["y"][DATA["category"] == t]

    foldids = []
    for fold_idx, (train, test) in enumerate(rskf.split(X, y)):
        foldids.append((fold_idx,train,test))

    #print("shapes X",X.shape,"y", y.shape)
    DATA["folds"].append(foldids)
    model1, scores1, pca = experiment(foldids, X, y, 
                                 LogisticRegression(max_iter=10000))
    stl.append((model1,pca,scores1))

TASK 0
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
| Score                | Average    | Kat_1      | Kat_2      |
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
| Accuracy             | 97.2 ± 0.1 | -          | -          |
| Balanced Accuracy    | 79.3 ± 1.3 | -          | -          |
| F1                   | 97.0 ± 0.1 | 68.7 ± 1.6 | 98.5 ± 0.1 |
| G-mean               | 76.8 ± 1.7 | -          | -          |
| Precision            | 97.0 ± 0.2 | 81.8 ± 3.5 | 97.8 ± 0.1 |
| Recall               | 97.2 ± 0.1 | 59.4 ± 2.7 | 99.3 ± 0.2 |
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
TASK 1
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
| Score                | Average    | Kat_1      | Kat_2      |
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
| Accuracy             | 92.6 ± 0.2 | -          | -          |
| Balanced Accuracy    | 88.2 ± 0.7 | -          | -          |
| F1                   | 9

<hr style="border: 5px dashed red">

# Domain Classifier

In [10]:
X = DATA["X"]
y = DATA["y"]
category = DATA["category"]

rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=5, random_state=1410)
foldids = []
for fold_idx, (train, test) in enumerate(rskf.split(X, y)):
    foldids.append((fold_idx,train,test))
    
print("shapes X",X.shape,"y", category.shape)

domain_model, domain_scores, domain_pca = experiment(foldids, X, 
                                         category, LogisticRegression(max_iter=10000, class_weight='balanced'))

shapes X (186477, 768) y (186477,)
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
| Score                | Average    | Kat_1      | Kat_2      | Kat_3      | Kat_4      | Kat_5      | Kat_6      |
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
| Accuracy             | 79.6 ± 0.1 | -          | -          | -          | -          | -          | -          |
| Balanced Accuracy    | 85.7 ± 0.1 | -          | -          | -          | -          | -          | -          |
| F1                   | 79.6 ± 0.1 | 96.7 ± 0.2 | 82.8 ± 0.4 | 72.8 ± 0.5 | 72.8 ± 0.2 | 87.2 ± 0.1 | 77.9 ± 0.1 |
| G-mean               | 85.1 ± 0.1 | -          | -          | -          | -          | -          | -          |
| Precision            | 80.2 ± 0.1 | 95.2 ± 0.3 | 75.4 ± 0.6 | 61.5 ± 0.7 | 73.1 ± 0.2 | 85.0 ± 0.2 | 83.8 ± 0.3 |
| Recall               | 79.6 ± 0.1 |

<hr style="border: 5px dashed red">

# Global model with category pred.

In [13]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from imblearn.metrics import geometric_mean_score
from IPython.display import clear_output

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def cls_max_predict(X_test):
    ypred = np.array([stl[t][0].predict(stl[t][1].transform(X_test)) for t in range(6)])
    return np.mean(ypred, axis=0) > 0.5

def cls_weighted_predict(X_test):
    ypred = np.array([stl[t][0].predict(stl[t][1].transform(X_test)) for t in range(6)])
    return np.average(ypred, weights=[78,83,74,73,90,73], axis=0) > 0.4

def cls_max_predict_raw(X_test):
    ypred = np.array([stl[t][0].predict(stl[t][1].transform(X_test)) for t in range(6)])
    return ypred

def cls_predict(X_test):
    Xd=domain_pca.transform(X_test)
    domain_pred = domain_model.predict(Xd)
    ypred = []
    for i,dpred in enumerate(domain_pred):
        model, model_pca, _ = stl[dpred] 
        xpca = model_pca.transform(X_test[i:i+1])
        ypred.append(model.predict(xpca)[0])
    return ypred
        
def experiment2(foldids, X, y, cback):

    scores = {
        'Accuracy': {'func': accuracy_score},
        'Balanced Accuracy': {'func': balanced_accuracy_score},
        'F1': {'func': f1_score},
        'Precision': {'func': precision_score},
        'Recall': {'func': recall_score},
        'G-mean': {'func': geometric_mean_score}
    }

    for score_name, score_dict in scores.items():
        scores[score_name]["list"] = []
        scores[score_name]["lab"] = []

    for fold,j in enumerate(foldids):
        train = foldids[fold][1]
        test = foldids[fold][2]
        xin, yin = X[train], np.array(y[train])
        
        y_pred = cback(X[test])
        
        for score_name, score_dict in scores.items():
            if score_name in ["F1","Precision","Recall"]:
                scorvaln = score_dict['func'](y[test], y_pred, average=None)
                score_dict['lab'].append(scorvaln)
                scorval = score_dict['func'](y[test], y_pred, average="weighted")
                score_dict['list'].append(scorval)
                print(score_name, scorval, scorvaln)  
            else:
                scorval=score_dict['func'](y[test], y_pred)
                score_dict['list'].append(scorval)
                print(score_name, scorval)
        print(" ")

    clear_output()
    for score_name, score_dict in scores.items():
        score_dict['avg'] = np.mean(score_dict['list'])
        score_dict['std'] = np.std(score_dict['list'])
 
    # Print stats
    numlabels = scores["F1"]["lab"][0].shape[0]
    scores["F1"]["lab"][0].shape[0] 
    head = "| %-20s | %-10s |" +  numlabels * " %-10s |" 
    headv = ["Score", "Average"]
    headv.extend(["Kat_"+str(i+1) for i in range(numlabels)])
    row=head % tuple(headv)
    print("+"*len(row))
    print(row)
    print("+"*len(row))
    for score_name, score_dict in sorted(scores.items()) :
        headv = [score_name, np.mean(score_dict['list'])*100, np.std(score_dict['list'])*100]
        for i in range(numlabels):
            if score_name in ["F1","Precision","Recall"]:
                head = "| %-20s | %4.1f ± %3.1f |" + numlabels* " %4.1f ± %3.1f |"
                vals = [v[i] for v in scores[score_name]["lab"]]
                headv.append(np.mean(vals)*100)
                headv.append(np.std(vals)*100)
            else:
                head = "| %-20s | %4.1f ± %3.1f |" + numlabels * " %-10s |" 
                headv.append("-")
        print(head % tuple(headv))
    print("+"*len(row))
    return scores, pca

In [14]:
X = DATA["X"]
y = DATA["y"]


rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=5, random_state=1410)
foldids = []
for fold_idx, (train, test) in tqdm(enumerate(rskf.split(X, y)), total=rskf.get_n_splits()):
    foldids.append((fold_idx,train,test))
    
print("shapes X",X.shape,"y", y.shape)

gscr, gpca = experiment2(foldids, X, y, cls_predict)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 169.72it/s]

shapes X (186477, 768) y (186477,)





Accuracy 0.8906895183346025
Balanced Accuracy 0.8738772228942854
F1 0.890706375178059 [0.9199887  0.82753486]
Precision 0.8907234951046911 [0.92023432 0.82705902]
Recall 0.8906895183346025 [0.9197432  0.82801124]
G-mean 0.8726727407962598
 
Accuracy 0.8902378858405371
Balanced Accuracy 0.8740442812747655
F1 0.8903299506300135 [0.919563  0.8272626]
Precision 0.8904298659660493 [0.92090929 0.82467358]
Recall 0.8902378858405371 [0.91822063 0.82986793]
G-mean 0.8729271766294437
 
Accuracy 0.8901210866697412
Balanced Accuracy 0.8736792558319861
F1 0.8901881059739745 [0.9195088 0.8269338]
Precision 0.8902592631467223 [0.92048511 0.82505225]
Recall 0.8901210866697412 [0.91853456 0.82882395]
G-mean 0.8725270445120594
 
Accuracy 0.8908063236019649
Balanced Accuracy 0.8742422345763926
F1 0.8908483410282234 [0.92004304 0.82786372]
Precision 0.8908920018230735 [0.92065762 0.82667567]
Recall 0.8908063236019649 [0.91942927 0.8290552 ]
G-mean 0.8730736603814897
 
Accuracy 0.8904535655680563
Balanced 

In [16]:
from sklearn.pipeline import Pipeline
pipe_domain = Pipeline([('pca', domain_pca), ('head', domain_model)])

# Pickle models

In [17]:
import pickle
# SAVE
with open('domain_cls.pickle', 'wb') as handle:
    pickle.dump(pipe_domain, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
domain_model_pipe = []
for i,elem in enumerate(stl):
    with open(f'model_{i}.pickle', 'wb') as handle:
        p = Pipeline([('pca', elem[1]), ('head', elem[0])])
        domain_model_pipe.append(p)
        pickle.dump(p, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Un-pickle models

In [3]:
import pickle5 as pickle
# load
with open('../../pickles/models/domain_cls.pickle', 'rb') as handle:
    pipe_domain = pickle.load(handle)
    
domain_model_pipe = []
for i in range(6):
    with open(f'../../pickles/models/model_{i}.pickle', 'rb') as handle:
        p=pickle.load(handle)
        domain_model_pipe.append(p)
        

In [4]:
cls_predict(X[0:2]), y[0:2]

NameError: name 'cls_predict' is not defined

In [91]:
import bentoml

from sklearn import svm

saved_model = bentoml.sklearn.save_model("big_puppy", bp)
print(f"Model saved: {saved_model}")

Model saved: Model(tag="big_puppy:wo3hgzkt6wg6itgm")


# BENTO models

In [5]:
import bentoml

from sklearn import svm

saved_model = bentoml.sklearn.save_model("domain_cls", pipe_domain, signatures={
        "predict": {"batchable": True, "batch_dim": 0},
        "predict_proba": {"batchable": True, "batch_dim": 0},
    })
print(f"Model saved: {saved_model}")


Model saved: Model(tag="domain_cls:4ljhzhc27sjskdg5")


In [6]:
for i in range(6):
    saved_model = bentoml.sklearn.save_model(f"model_{i}", domain_model_pipe[i], signatures={
        "predict": {"batchable": True, "batch_dim": 0},
        "predict_proba": {"batchable": True, "batch_dim": 0},
    })
    print(f"Model saved: {saved_model}")


Model saved: Model(tag="model_0:4zhlxcc27sxigdg5")
Model saved: Model(tag="model_1:426ssdc27s5j2dg5")
Model saved: Model(tag="model_2:44j2ors27sxkedg5")
Model saved: Model(tag="model_3:45jvync27svgwdg5")
Model saved: Model(tag="model_4:46tsvgs27s3rgdg5")
Model saved: Model(tag="model_5:476kxhc27sbbcdg5")


### check rest api

In [10]:
import requests

requests.post(
     "http://localhost:3000/predict",
     headers={"content-type": "application/json"},
     data='{"text":"I love it"}'
).text

'{"result":1,"result_proba":[0.12162435873841593,0.8783756412615841],"domain":5,"domain_proba":[3.7271139151509206e-05,0.2524385728679994,0.0001431362617570179,0.011453838931336738,0.350516113051998,0.38541106774775735],"similar_articles":[{"text":"LUDHIANA: A 22-year-old man was sentenced to 10-year rigorous imprisonment for raping a minor in 2015. The court of additional sessions judge Karamjit Singh Sular also imposed a fine of Rs 40,000 on the convict and in default of payment of fine, he will have to further undergo a rigorous imprisonment of two years.On November 12, 2015, police had booked the 22-year-old accused under section 376 ( rape ) of Indian Penal Code (IPC) and section 3 and 4 of Protection of Children from Sexual Offences Act (POCSO).As per prosecution, a police party led by sub-inspector Suresh Kumar reached the place of the incident after receiving information about the accused. The accused was arrested from near a gurdwara in Meherban area. Later, the father of the 