In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import yaml
import xopen
import json
import warnings
warnings.filterwarnings("ignore")
from os.path import dirname
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer
from data import infer_preprocess
from model import Classifier, BERTDiseaseClassifier
from utils import decide_subject
from nltk.tokenize import sent_tokenize
import blingfire
import spacy

In [2]:
def cut_sentences(text, tokenizer, nlp):
    if tokenizer == 'blingfire':
        sents = blingfire.text_to_sentences(text.strip()).split("\n")
    if tokenizer == 'nltk':
        sents = sent_tokenize(text.strip())
    if tokenizer == 'spacysm':
        doc = nlp(text)
        sents = [sent.text.strip() for sent in doc.sents]
    if tokenizer == 'spacylg':
        doc = nlp(text)
        sents = [sent.text.strip() for sent in doc.sents]
    if tokenizer == 'spacytrf':
        doc = nlp(text)
        sents = [sent.text.strip() for sent in doc.sents]
    return sents

In [3]:
options = ['blingfire', 'nltk', 'spacysm', 'spacylg', 'spacytrf']
for name in options:
    datastore = []
    senttokenizer = name
    nlp = ''
    # set spacy tokenizer
    if senttokenizer == 'spacysm':
        nlp = spacy.load("en_core_web_sm")
    if senttokenizer == 'spacylg':
        nlp = spacy.load("en_core_web_lg")
    if senttokenizer == 'spacytrf':
        nlp = spacy.load("en_core_web_trf")    


    if __name__ == "__main__":
        batch_size = 64
        input_dir = "../../../data/postdatalines.json"
        ckpt_dir = "lightning_logs/version_0/checkpoints/epoch=1-step=133.ckpt"
        hparams_dir = os.path.join(dirname(dirname(ckpt_dir)), 'hparams.yaml')
        hparams = yaml.load(open(hparams_dir),Loader=yaml.Loader)
        max_len = hparams["max_len"]
        tokenizer = AutoTokenizer.from_pretrained(hparams["model_type"])
        clf = Classifier.load_from_checkpoint(ckpt_dir, symps=['uncertain'])
        clf.eval()
        clf.cuda()
        
        with xopen.xopen(input_dir) as fi:
            for i, line in tqdm(enumerate(fi)):
                record = json.loads(line)
                user_sents = []
                sent_bounds = [0]
                curr_sid = 0
                post_subj = []
                for post in record["text"]:
                    post_subj.append(decide_subject(post))
                    sents = cut_sentences(post, senttokenizer, nlp)
                    curr_sid += len(sents)
                    sent_bounds.append(curr_sid)
                    user_sents.extend(sents)

                all_probs = []
                for i in range(0, len(user_sents), batch_size):
                    curr_texts = user_sents[i:i+batch_size]
                    processed_batch = infer_preprocess(curr_texts, tokenizer, max_len)
                    for k, v in processed_batch.items():
                        processed_batch[k] = v.to(clf.device)
                    with torch.no_grad():
                        logits = clf(processed_batch)
                        probs = logits.sigmoid().detach().cpu().numpy()
                    all_probs.append(probs)
                all_probs = np.concatenate(all_probs, 0)

                # merge all sentence features into post-level feature by max pooling
                all_post_probs = []
                for i in range(len(sent_bounds)-1):
                    lbound, rbound = sent_bounds[i], sent_bounds[i+1]
                    post_prob = all_probs[lbound:rbound, 0].max()
                    all_post_probs.append(post_prob)
                all_post_probs = np.array(all_post_probs)
                post_subj = np.array(post_subj)
                data = {
                    "uncertain": all_post_probs,
                    "subj": post_subj
                }
                datastore.append(data)


    df = pd.read_json("../../../data/postdatalines.json", lines=True)
    vector_df = pd.DataFrame(datastore)
    df = pd.concat([df,vector_df],axis=1)
    df.to_json(f"../../../data/reweighting/uncertaintySubject{senttokenizer}.json",lines=True, orient='records')

Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


"accumulation":       1
"bal_sample":         False
"bs":                 64
"control_ratio":      0.75
"exp_name":           mbert_uncertain_only_666
"gradient_clip_val":  0.1
"input_dir":          ../data/symp_data
"loss_mask":          True
"loss_type":          bce
"loss_weighting":     mean
"lr":                 0.0003
"max_len":            64
"model_type":         mental/mental-bert-base-uncased
"patience":           4
"pos_weight_setting": default
"seed":               666
"threshold":          0.5
"uncertain":          only
"write_result_dir":   ./lightning_logs/baseline_records.json


797it [00:22, 35.48it/s]
Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


"accumulation":       1
"bal_sample":         False
"bs":                 64
"control_ratio":      0.75
"exp_name":           mbert_uncertain_only_666
"gradient_clip_val":  0.1
"input_dir":          ../data/symp_data
"loss_mask":          True
"loss_type":          bce
"loss_weighting":     mean
"lr":                 0.0003
"max_len":            64
"model_type":         mental/mental-bert-base-uncased
"patience":           4
"pos_weight_setting": default
"seed":               666
"threshold":          0.5
"uncertain":          only
"write_result_dir":   ./lightning_logs/baseline_records.json


797it [00:19, 40.76it/s]
Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


"accumulation":       1
"bal_sample":         False
"bs":                 64
"control_ratio":      0.75
"exp_name":           mbert_uncertain_only_666
"gradient_clip_val":  0.1
"input_dir":          ../data/symp_data
"loss_mask":          True
"loss_type":          bce
"loss_weighting":     mean
"lr":                 0.0003
"max_len":            64
"model_type":         mental/mental-bert-base-uncased
"patience":           4
"pos_weight_setting": default
"seed":               666
"threshold":          0.5
"uncertain":          only
"write_result_dir":   ./lightning_logs/baseline_records.json


797it [03:00,  4.42it/s]
Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


"accumulation":       1
"bal_sample":         False
"bs":                 64
"control_ratio":      0.75
"exp_name":           mbert_uncertain_only_666
"gradient_clip_val":  0.1
"input_dir":          ../data/symp_data
"loss_mask":          True
"loss_type":          bce
"loss_weighting":     mean
"lr":                 0.0003
"max_len":            64
"model_type":         mental/mental-bert-base-uncased
"patience":           4
"pos_weight_setting": default
"seed":               666
"threshold":          0.5
"uncertain":          only
"write_result_dir":   ./lightning_logs/baseline_records.json


797it [03:13,  4.12it/s]
Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


"accumulation":       1
"bal_sample":         False
"bs":                 64
"control_ratio":      0.75
"exp_name":           mbert_uncertain_only_666
"gradient_clip_val":  0.1
"input_dir":          ../data/symp_data
"loss_mask":          True
"loss_type":          bce
"loss_weighting":     mean
"lr":                 0.0003
"max_len":            64
"model_type":         mental/mental-bert-base-uncased
"patience":           4
"pos_weight_setting": default
"seed":               666
"threshold":          0.5
"uncertain":          only
"write_result_dir":   ./lightning_logs/baseline_records.json


797it [26:30,  2.00s/it]
