In [None]:
!pip install transformers --quiet

[K     |████████████████████████████████| 4.7 MB 5.0 MB/s 
[K     |████████████████████████████████| 120 kB 63.7 MB/s 
[K     |████████████████████████████████| 6.6 MB 42.8 MB/s 
[?25h

In [2]:
import pickle
import numpy as np

# One pickle file per corpus; the integer is the class id given to every
# embedding loaded from that file.
EMBEDDING_FILES = [
    ('qprop_BERTEmbeddings.pickle', 0),
    ('covid_fake_news_BERTEmbeddings.pickle', 1),
    ('mmcovid_en_BERTEmbeddings.pickle', 2),
    ('isot_BERTEmbeddings.pickle', 3),
    ('grafn_BERTEmbeddings.pickle', 4),
    ('pubhealth_BERTEmbeddings.pickle', 5),
]

X = []
y = []
for embedding_path, dataset_label in EMBEDDING_FILES:
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # that come from a trusted source.
    with open(embedding_path, 'rb') as handle:
        dst = pickle.load(handle)
    X.extend(dst)
    # One label per loaded embedding.
    y.extend(np.repeat(dataset_label, len(dst)))

X = np.array(X)
y = np.array(y)

# Sanity check: array shapes and the number of samples per class.
X.shape, y.shape, np.unique(y, return_counts=True)

((137171, 768),
 (137171,),
 (array([0, 1, 2, 3, 4, 5]),
  array([ 1964,  8972,  7332, 44898, 63930, 10075])))

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import RepeatedStratifiedKFold

# Stratified 2-fold cross-validation repeated 5 times with a fixed seed, so
# the same splits are produced on every run.
rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=5, random_state=1410)

# Materialise the splits as (fold index, train indices, test indices) tuples.
fold_iter = tqdm(enumerate(rskf.split(X, y)), total=rskf.get_n_splits())
foldids = [(fold_idx, train, test) for fold_idx, (train, test) in fold_iter]

print("shapes X",X.shape,"y", y.shape)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 216.33it/s]

shapes X (137171, 768) y (137171,)





In [61]:
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from imblearn.metrics import geometric_mean_score
from IPython.display import clear_output

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Metric registry. For every metric:
#   'func' - the scoring callable,
#   'list' - one aggregate value per fold,
#   'lab'  - one per-class score array per fold (filled only for the metrics
#            that are evaluated with average=None).
scores = {
    metric_name: {'func': metric_func, 'list': [], 'lab': []}
    for metric_name, metric_func in [
        ('Accuracy', accuracy_score),
        ('Balanced Accuracy', balanced_accuracy_score),
        ('F1', f1_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('G-mean', geometric_mean_score),
    ]
}
    

def experiment(cls=None):
    """Cross-validate a classifier on the precomputed folds and record scores.

    For every ``(fold_idx, train, test)`` entry of the global ``foldids`` the
    classifier is fitted on ``X[train]`` / ``y[train]`` and used to predict
    ``X[test]``. Each metric in the global ``scores`` registry is then
    evaluated on those predictions.

    Side effects on the global ``scores`` dictionary, per metric:
      * ``'list'`` - reset, then filled with one value per fold (the weighted
        average for F1 / Precision / Recall, the plain value otherwise);
      * ``'lab'``  - reset, then filled with one per-class score array per
        fold (F1 / Precision / Recall only);
      * ``'avg'`` / ``'std'`` - mean and standard deviation of ``'list'``.

    Parameters
    ----------
    cls : estimator with ``fit(X, y)`` and ``predict(X)``, optional
        Classifier to evaluate. When omitted, a fresh
        ``LogisticRegression(max_iter=10000)`` is created for this call.

    Returns
    -------
    The classifier instance, in the state left by the fit on the last fold.
    """
    # Metrics that are additionally reported per class.
    per_class_metrics = ("F1", "Precision", "Recall")

    # Build the default estimator per call: a default argument is evaluated
    # only once, so an instance placed in the signature would be shared (and
    # refitted) by every call that relies on the default.
    if cls is None:
        cls = LogisticRegression(max_iter=10000)

    # Start from empty result lists so that re-running the experiment (or
    # running it with another classifier) does not mix results of several
    # runs into one average.
    for score_dict in scores.values():
        score_dict['list'] = []
        score_dict['lab'] = []

    for _, train, test in foldids:
        cls.fit(X[train], y[train])
        y_true = y[test]
        y_pred = cls.predict(X[test])

        for score_name, score_dict in scores.items():
            if score_name in per_class_metrics:
                # Per-class values first, then the support-weighted average.
                scorvaln = score_dict['func'](y_true, y_pred, average=None)
                score_dict['lab'].append(scorvaln)
                scorval = score_dict['func'](y_true, y_pred, average="weighted")
                score_dict['list'].append(scorval)
                print(score_name, scorval, scorvaln)
            else:
                scorval = score_dict['func'](y_true, y_pred)
                score_dict['list'].append(scorval)
                print(score_name, scorval)

        print(" ")

    # Remove the per-fold log from the cell output once all folds are done.
    clear_output()

    for score_dict in scores.values():
        score_dict['avg'] = np.mean(score_dict['list'])
        score_dict['std'] = np.std(score_dict['list'])

    return cls

In [8]:
# Run the cross-validation with the default classifier. The returned object is
# the classifier in the state left by the fit on the last fold.
bertlgr = experiment()

Accuracy             | 0.915 ± 0.001 | -             | -               
Balanced Accuracy    | 0.778 ± 0.001 | -             | -               
G-mean               | 0.615 ± 0.008 | -             | -               


In [76]:
# Metrics for which per-class values were stored under "lab".
per_class_scores = ["F1", "Precision", "Recall"]

# Number of classes, taken from the first stored per-class F1 array.
numlabels = scores["F1"]["lab"][0].shape[0]

# Header line: one "Average" column followed by one column per class.
title_fmt = "| %-20s | %-10s |" + numlabels * " %-10s |"
titles = ["Score", "Average"] + ["Kat_" + str(k + 1) for k in range(numlabels)]
row = title_fmt % tuple(titles)
rule = "+" * len(row)
print(rule)
print(row)
print(rule)

# One table line per metric, in alphabetical order; values are percentages
# printed as "mean ± std" over the folds.
for score_name in sorted(scores):
    score_dict = scores[score_name]
    cells = [score_name, np.mean(score_dict['list'])*100, np.std(score_dict['list'])*100]
    if score_name in per_class_scores:
        line_fmt = "| %-20s | %4.1f ± %3.1f |" + numlabels * " %4.1f ± %3.1f |"
        for k in range(numlabels):
            # Scores of class k across all folds.
            per_fold = [fold_vals[k] for fold_vals in score_dict["lab"]]
            cells.append(np.mean(per_fold)*100)
            cells.append(np.std(per_fold)*100)
    else:
        # No per-class breakdown for this metric: fill the columns with "-".
        line_fmt = "| %-20s | %4.1f ± %3.1f |" + numlabels * " %-10s |"
        cells.extend(["-"] * numlabels)
    print(line_fmt % tuple(cells))

print(rule)

+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
| Score                | Average    | Kat_1      | Kat_2      | Kat_3      | Kat_4      | Kat_5      | Kat_6      |
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
| Accuracy             | 91.5 ± 0.1 | -          | -          | -          | -          | -          | -          |
| Balanced Accuracy    | 77.8 ± 0.1 | -          | -          | -          | -          | -          | -          |
| F1                   | 91.0 ± 0.1 | 14.2 ± 0.9 | 97.5 ± 0.1 | 88.7 ± 0.3 | 92.7 ± 0.1 | 92.1 ± 0.1 | 87.0 ± 0.2 |
| G-mean               | 61.5 ± 0.8 | -          | -          | -          | -          | -          | -          |
| Precision            | 90.9 ± 0.1 | 45.6 ± 3.4 | 97.2 ± 0.2 | 89.6 ± 0.7 | 91.8 ± 0.2 | 91.4 ± 0.1 | 87.9 ± 0.4 |
| Recall               | 91.5 ± 0.1 |  8.4 ± 0.6 | 97.8 ± 0.2 | 87.8 ± 0

In [10]:
import pickle
# Write the `scores` dictionary to disk with the newest pickle protocol.
with open('scores_mmlab.pickle', 'wb') as scores_file:
    pickle.dump(scores, scores_file, protocol=pickle.HIGHEST_PROTOCOL)

In [63]:
import pickle
# Read the `scores` dictionary back from disk, replacing the in-memory one.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('scores_mmlab.pickle', 'rb') as scores_file:
    scores = pickle.load(scores_file)

In [77]:
import pickle
# Write the classifier returned by experiment() to disk with the newest
# pickle protocol.
with open('bertlgr_mmlab.pickle', 'wb') as model_file:
    pickle.dump(bertlgr, model_file, protocol=pickle.HIGHEST_PROTOCOL)

# clustering

In [6]:
import pickle
import random

import nltk
import numpy as np
from nltk.cluster import KMeansClusterer

# NOTE(review): this cell rebinds the names X and y; any X / y created by an
# earlier cell is no longer available after it has run.
X = []
y = []

# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('isot_BERTEmbeddings.pickle', 'rb') as handle:
    dst = pickle.load(handle)
    X.extend(dst)
    y.extend(np.repeat(3,len(dst)))

NUM_CLUSTERS = 15
# Fixed seed for the clusterer's random number generator so that repeated runs
# of this cell give the same cluster assignments.
CLUSTER_SEED = 1410

# k-means with cosine distance, 25 randomised trials.
kclusterer = KMeansClusterer(
    NUM_CLUSTERS, distance=nltk.cluster.util.cosine_distance,
    repeats=25, avoid_empty_clusters=True,
    rng=random.Random(CLUSTER_SEED))

# One cluster index per vector in X.
assigned_clusters = kclusterer.cluster(X, assign_clusters=True)