In [None]:
!pip install transformers --quiet

In [None]:
! id="12vbx8UMl4KpoSPM4ZHqqggjtKrn35odG"; \
conf=$(wget --quiet --save-cookies /tmp/cookies.txt \
--keep-session-cookies --no-check-certificate \
'https://docs.google.com/uc?export=download&id='$id -O- | \
sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p'); \
wget --load-cookies /tmp/cookies.txt \
"https://docs.google.com/uc?export=download&confirm=$conf&id=$id" -O qprop.csv && \
rm -rf /tmp/cookies.txt

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import RepeatedStratifiedKFold

data = pd.read_csv("../../raw/qprop.csv",sep="\t")
print("labels\n",data["label"].value_counts())
print("shape  \n",data.shape)

bootstrap_size = 0

if bootstrap_size != 0:
    bootstrap_factor = bootstrap_size / data.shape[0]
    bootstrap = np.random.uniform(size=data.shape[0]) < bootstrap_factor
    data = data.iloc[bootstrap]

X, y = data["text"].values, data["label"].values

rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=5, random_state=1410)
foldids = []
for fold_idx, (train, test) in tqdm(enumerate(rskf.split(X, y)), total=rskf.get_n_splits()):
    foldids.append((fold_idx,train,test))
    
print("shapes X",X.shape,"y", y.shape)

labels
 0    45534
1     5736
Name: label, dtype: int64
shape  
 (51270, 14)


100%|██████████████████████████████████████████| 10/10 [00:00<00:00, 605.33it/s]

shapes X (51270,) y (51270,)





In [2]:
data.columns

Index(['text', 'event_location', 'average_tone', 'article_URL',
       'MBFC_factuality_label', 'article_URL2', 'MBFC_factuality_label2',
       'URL_to_MBFC_page', 'source_name', 'MBFC_notes_about_source',
       'MBFC_bias_label', 'source_URL', 'published_utc', 'label'],
      dtype='object')

In [3]:
import transformers
transformers.logging.set_verbosity_error()
from torch.utils.data import DataLoader 

from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import TFDistilBertModel, DistilBertTokenizerFast

from sklearn.base import BaseEstimator, TransformerMixin
import torch
import numpy as np

from scipy.sparse import csr_matrix
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("using device:", device)

if "disilbert_model" not in locals():
    disilbert_tokenizer =  AutoTokenizer.from_pretrained("distilbert-base-uncased")
    disilbert_model = AutoModel.from_pretrained("distilbert-base-uncased")
    handle = disilbert_model.to(device)


class BERTEmbeddings(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.tokenizer =  disilbert_tokenizer
        self.model = disilbert_tokenizer
        self.max_length = 256
        self.model_name = disilbert_model

    def fit(self, X=None, y=None):
        pass
    
    def encode(self, txt):
        return self.tokenizer(txt, max_length=self.max_length, 
                              truncation=True, padding=True, return_tensors="pt")

    def transform(self, X=None):
        dataloader = DataLoader(X, batch_size=4, shuffle=False)
        allembeds = []
        for batch in tqdm(dataloader):
            batchenc = disilbert_tokenizer(batch, max_length=256, truncation=True, padding=True, return_tensors="pt")
            input_ids = batchenc['input_ids'].to(device)
            attention_mask = batchenc['attention_mask'].to(device)
            batchout = disilbert_model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
            embeds = [vec[0].cpu().detach().numpy() for vec in batchout[1][-1]]
            allembeds.extend(embeds)
        return csr_matrix(allembeds)


class BertHead(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.head = LogisticRegression(class_weight='auto', max_iter=10000)

    def fit(self, X=None, y=None):
        self.head.fit(X, y)


    def transform(self, X=None):
        pass
    
    def predict(self, X=None):    
        return self.head.predict(X)
    
    def predict_proba(self, X=None):    
        return self.head.predict_proba(X)



using device: cuda


In [3]:
bert = BERTEmbeddings()
X_dstil_numpy = bert.transform(X).toarray()

100%|███████████████████████████████████████| 3205/3205 [01:39<00:00, 32.16it/s]


In [5]:
import pickle
# SAVE
with open('qprop_BERTEmbeddings.pickle', 'wb') as handle:
    pickle.dump(X_dstil_numpy, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [4]:
import pickle
# LOAD
with open('../../pickles/bertcls_embeddings/qprop_BERTEmbeddings.pickle', 'rb') as handle:
    X_dstil_numpy=pickle.load(handle)

In [5]:
from sklearn.metrics import log_loss,accuracy_score, balanced_accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
from imblearn.metrics import geometric_mean_score
from IPython.display import clear_output

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

def log_los_nonorm(x,y,average=False):
    return log_loss(x,y,normalize=True)

scores = {
    'Accuracy': {'func': accuracy_score},
    'Balanced Accuracy': {'func': balanced_accuracy_score},
    'F1': {'func': f1_score},
    'Precision': {'func': precision_score},
    'Recall': {'func': recall_score},
    'Log-likelihood': {'func': log_los_nonorm},
    #'G-mean': {'func': geometric_mean_score}
}

for score_name, score_dict in scores.items():
    scores[score_name]["list"] = []
    scores[score_name]["lab"] = []


for fold,j in enumerate(foldids):
    train = foldids[fold][1]
    test = foldids[fold][2]
    xin, yin = X_dstil_numpy[train], np.array(y[train])
    cls = BertHead()
    
    cls.fit(xin, yin)
    y_pred = cls.predict(X_dstil_numpy[test])
    y_pred_proba = cls.predict_proba(X_dstil_numpy[test])
    
    for score_name, score_dict in scores.items():
        if score_name == "Log-likelihood":
            scorval=score_dict['func'](y[test], y_pred_proba)
            score_dict['list'].append(scorval)
            print(score_name, scorval)
        elif score_name in ["F1","Precision","Recall"]:
            scorvaln = score_dict['func'](y[test], y_pred, average=None)
            score_dict['lab'].append(scorvaln)
            scorval = score_dict['func'](y[test], y_pred, average="weighted")
            score_dict['list'].append(scorval)
            print(score_name, scorval, scorvaln)  
        else:
            scorval=score_dict['func'](y[test], y_pred)
            score_dict['list'].append(scorval)
            print(score_name, scorval)
    print()

Accuracy 0.9425394967817438
Balanced Accuracy 0.8148175127951653
F1 0.9399287559755203 [0.96802483 0.7168941 ]
Precision 0.9392494389122782 [0.95695279 0.7987152 ]
Recall 0.9425394967817438 [0.97935609 0.65027894]
Log-likelihood 0.14983674875876737

Accuracy 0.942032377608738
Balanced Accuracy 0.8049323297520286
F1 0.9387593291117811 [0.96782157 0.70805501]
Precision 0.9384164750521965 [0.95446974 0.8109811 ]
Recall 0.942032377608738 [0.98155225 0.62831241]
Log-likelihood 0.14875411432786673

Accuracy 0.9409401209284182
Balanced Accuracy 0.8105648168006508
F1 0.9382398307969639 [0.96713697 0.70884615]
Precision 0.9374753989627116 [0.95601425 0.79030875]
Recall 0.9409401209284182 [0.97852154 0.64260809]
Log-likelihood 0.15060901882233826

Accuracy 0.9432416617905208
Balanced Accuracy 0.8121653036684344
F1 0.9403454030825453 [0.96845528 0.71720117]
Precision 0.9398774052475104 [0.95620344 0.81027668]
Recall 0.9432416617905208 [0.98102517 0.64330544]
Log-likelihood 0.1498183520103225

Acc

In [6]:
numlabels = scores["F1"]["lab"][0].shape[0]
scores["F1"]["lab"][0].shape[0] 
head = "| %-20s | %-12s |" +  numlabels * " %-10s |" 
headv = ["Score", "Average"]
headv.extend(["Kat_"+str(i+1) for i in range(numlabels)])
row=head % tuple(headv)
#print("+"*len(row))
print(row)
#print("+"*len(row))

for score_name, score_dict in sorted(scores.items()) :
    if score_name == "Log-likelihood":
        headv = [score_name, np.mean(score_dict['list'])*1, np.std(score_dict['list'])*1]
    else:
        headv = [score_name, np.mean(score_dict['list'])*100, np.std(score_dict['list'])*100]
    
    for i in range(numlabels):
        if score_name == "Log-likelihood":
            head = "| %-20s | %3.2f ± %3.3f |" + numlabels * " %-10s |" 
            headv.append("-")
        elif score_name in ["F1","Precision","Recall"]:
            head = "| %-20s | %4.1f ± %3.3f |" + numlabels* " %4.1f ± %3.1f |"
            vals = [v[i] for v in scores[score_name]["lab"]]
            headv.append(np.mean(vals)*100)
            headv.append(np.std(vals)*100)
        else:
            head = "| %-20s | %4.1f ± %3.3f |" + numlabels * " %-10s |" 
            headv.append("-")
    print(head % tuple(headv))
    
#print("+"*len(row))

| Score                | Average      | Kat_1      | Kat_2      |
| Accuracy             | 94.2 ± 0.103 | -          | -          |
| Balanced Accuracy    | 81.0 ± 0.395 | -          | -          |
| F1                   | 93.9 ± 0.108 | 96.8 ± 0.1 | 71.1 ± 0.5 |
| Log-likelihood       | 0.15 ± 0.001 | -          | -          |
| Precision            | 93.8 ± 0.114 | 95.6 ± 0.1 | 80.2 ± 0.8 |
| Recall               | 94.2 ± 0.103 | 98.0 ± 0.1 | 63.9 ± 0.8 |
