In [None]:
!pip install transformers --quiet

In [None]:
# Download the dataset from Google Drive into raw/qprop.csv, the path the loading
# cell reads from. The first wget request stores the session cookies and extracts the
# `confirm` token from the response; the second request uses both to fetch the file.
# The cookie file is removed once the download succeeds.
! id="12vbx8UMl4KpoSPM4ZHqqggjtKrn35odG"; \
mkdir -p raw; \
conf=$(wget --quiet --save-cookies /tmp/cookies.txt \
--keep-session-cookies --no-check-certificate \
'https://docs.google.com/uc?export=download&id='$id -O- | \
sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p'); \
wget --load-cookies /tmp/cookies.txt \
"https://docs.google.com/uc?export=download&confirm=$conf&id=$id" -O raw/qprop.csv && \
rm -rf /tmp/cookies.txt

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import RepeatedStratifiedKFold

# Load the tab-separated dataset and report its class balance and size.
data = pd.read_csv("raw/qprop.csv",sep="\t")
print("labels\n",data["label"].value_counts())
print("shape  \n",data.shape)

# Approximate number of rows to keep for quick experiments; 0 keeps the full dataset.
bootstrap_size = 2000

if bootstrap_size != 0:
    # Each row is kept independently with probability bootstrap_size / n_rows, so the
    # resulting sample size is close to, but not exactly, bootstrap_size.
    bootstrap_factor = bootstrap_size / data.shape[0]
    # Draw the mask from a seeded generator so the same rows are selected on every
    # run; the unseeded global RNG made the sample (and all scores) change per run.
    rng = np.random.default_rng(1410)
    bootstrap = rng.random(data.shape[0]) < bootstrap_factor
    data = data.iloc[bootstrap]

X, y = data["text"].values, data["label"].values

# 5 repetitions of stratified 2-fold cross-validation -> 10 (train, test) index splits.
rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=5, random_state=1410)
foldids = []
for fold_idx, (train, test) in tqdm(enumerate(rskf.split(X, y)), total=rskf.get_n_splits()):
    # Store the positional indices so every later experiment reuses identical splits.
    foldids.append((fold_idx,train,test))
    
print("shapes X",X.shape,"y", y.shape)

labels
 0    45534
1     5736
Name: label, dtype: int64
shape  
 (51270, 14)


100%|███████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 4191.79it/s]

shapes X (2002,) y (2002,)





In [6]:
# List the column names of the (possibly subsampled) frame.
data.columns

Index(['text', 'event_location', 'average_tone', 'article_URL',
       'MBFC_factuality_label', 'article_URL2', 'MBFC_factuality_label2',
       'URL_to_MBFC_page', 'source_name', 'MBFC_notes_about_source',
       'MBFC_bias_label', 'source_URL', 'published_utc', 'label'],
      dtype='object')

In [5]:
# Count how many rows each value of `source_name` contributes to the current frame.
data["source_name"].value_counts()

msn.com                        210
The News International         103
washingtonpost.com              62
timesofindia.indiatimes.com     61
Chicago Tribune                 53
                              ... 
texastribune.org                 1
Utah Public Radio (UPR)          1
St. Louis Post-Dispatch          1
kansas.com                       1
National Review                  1
Name: source_name, Length: 135, dtype: int64

In [69]:
import transformers
transformers.logging.set_verbosity_error()

from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import TFDistilBertModel, DistilBertTokenizerFast

from sklearn.base import BaseEstimator, TransformerMixin
import torch
import numpy as np

from scipy.sparse import csr_matrix
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("using device:", device)

# Load the pretrained tokenizer and model only when they are not already present in
# the namespace, so re-running this cell does not reload the weights.
if "disilbert_model" not in locals():
    disilbert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    disilbert_model = AutoModel.from_pretrained("distilbert-base-uncased")
    # Move the model to the selected device; the returned module is kept in `handle`.
    handle = disilbert_model.to(device)


class BERTEmbeddings(BaseEstimator, TransformerMixin):
    """Transformer that maps raw texts to first-token DistilBERT embeddings.

    The tokenizer and model are the module-level ``disilbert_tokenizer`` and
    ``disilbert_model`` objects, which must be loaded before instantiation.

    Parameters
    ----------
    max_length : int, default=256
        Maximum number of tokens per text; longer inputs are truncated.
    batch_size : int, default=16
        Number of texts encoded per forward pass.
    """

    def __init__(self, max_length=256, batch_size=16):
        self.max_length = max_length
        self.batch_size = batch_size
        self.tokenizer = disilbert_tokenizer
        # Previously the tokenizer was stored here by mistake.
        self.model = disilbert_model
        # Name of the checkpoint the shared model is loaded from (previously this
        # attribute held the model object itself).
        self.model_name = "distilbert-base-uncased"

    def fit(self, X=None, y=None):
        """No-op: the embeddings come from a pretrained model.

        Returns ``self`` so that ``fit(...).transform(...)``, ``fit_transform`` and
        scikit-learn pipelines work.
        """
        return self

    def encode(self, txt):
        """Tokenize a text or list of texts into padded, truncated PyTorch tensors."""
        return self.tokenizer(txt, max_length=self.max_length,
                              truncation=True, padding=True, return_tensors="pt")

    def transform(self, X=None):
        """Embed every text in ``X``.

        Parameters
        ----------
        X : sequence of str
            Texts to embed, processed in order in chunks of ``batch_size``.

        Returns
        -------
        scipy.sparse.csr_matrix
            One row per input text holding the last-layer hidden state of the first
            token. Callers that need a dense array can use ``.toarray()``.
        """
        texts = list(X)
        allembeds = []
        # Inference only: disabling autograd avoids building graphs for every batch.
        with torch.no_grad():
            # Plain slicing replaces torch's DataLoader, which was never imported.
            for start in tqdm(range(0, len(texts), self.batch_size)):
                batchenc = self.encode(texts[start:start + self.batch_size])
                input_ids = batchenc['input_ids'].to(device)
                attention_mask = batchenc['attention_mask'].to(device)
                batchout = self.model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
                # batchout[1] is the tuple of per-layer hidden states; take the last
                # layer and keep only the vector at token position 0 of each text.
                first_token = batchout[1][-1][:, 0]
                allembeds.extend(first_token.cpu().numpy())
        return csr_matrix(allembeds)


class BertHead(BaseEstimator, TransformerMixin):
    """Logistic-regression classification head trained on precomputed embeddings."""

    def __init__(self):
        # 'balanced' reweights classes inversely to their frequency. The former value
        # 'auto' is not an accepted `class_weight` in current scikit-learn releases.
        self.head = LogisticRegression(class_weight='balanced', max_iter=10000)

    def fit(self, X=None, y=None):
        """Fit the logistic regression on features ``X`` and labels ``y``.

        Returns ``self`` so calls can be chained, following the scikit-learn convention.
        """
        self.head.fit(X, y)
        return self

    def transform(self, X=None):
        """Not implemented: this head only predicts, so the method returns None."""
        return None

    def predict(self, X=None):
        """Return the predicted label for each row of ``X``."""
        return self.head.predict(X)



using device: cuda


In [63]:
# Embed every text once up front; the cross-validation folds below only index into
# this dense array, so the model is not re-run per fold.
bert = BERTEmbeddings()
X_dstil_numpy = bert.transform(X).toarray()

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 125/125 [00:03<00:00, 31.75it/s]


In [64]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.linear_model import LogisticRegression

# Balanced accuracy of a freshly trained head on every precomputed split.
arr = []

for fold, train, test in foldids:
    # Train on the embeddings of the training indices only.
    xin = X_dstil_numpy[train]
    yin = np.array(y[train])
    cls = BertHead()
    cls.fit(xin, yin)

    # Score on the held-out half of the split.
    y_pred = cls.predict(X_dstil_numpy[test])
    bac = balanced_accuracy_score(y[test], y_pred)
    arr.append(bac)

    print(" BAC=", bac)

# Summary over all splits: mean and standard deviation.
print("-" * 10)
print("AVG. BAC=",np.mean(arr),"+/-",np.std(arr))

 BAC= 0.7476174334140435
 BAC= 0.7133849878934625
 BAC= 0.7368571428571429
 BAC= 0.7617094430992736
 BAC= 0.7133849878934625
 BAC= 0.7362857142857142
 BAC= 0.7258595641646489
 BAC= 0.7162421307506053
 BAC= 0.7351428571428571
 BAC= 0.7393801452784503
----------
AVG. BAC= 0.7325864406779661 +/- 0.014860929092860893


# Tests

In [70]:
# Pick one stored split (index 1) for the manual experiments that follow.
# Each entry of `foldids` is (fold index, train indices, test indices).
fold = 1
train, test = foldids[fold][1:]

