In [1]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import yaml
import xopen
import json
import pickle
import warnings
warnings.filterwarnings("ignore")
from os.path import dirname
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer
from data import infer_preprocess, cut_sentences
from model import Classifier, BERTDiseaseClassifier
from utils import default_symps

In [3]:
def pool_post_outputs(sent_probs, sent_feats, sent_bounds):
    """Collapse sentence-level outputs into one row per post.

    Args:
        sent_probs: 2-D array-like of per-sentence probabilities (one row per sentence).
        sent_feats: 2-D array-like of per-sentence features (one row per sentence).
        sent_bounds: cumulative sentence offsets; post ``k`` owns rows
            ``sent_bounds[k]:sent_bounds[k + 1]``.

    Returns:
        ``(post_probs, post_feats)``: probabilities are max-pooled and features
        are mean-pooled over each post's sentences, then stacked along axis 0.
        Posts that own no sentences are skipped, because reducing an empty
        slice would raise.

    Raises:
        ValueError: if no post owns at least one sentence (nothing to stack).
    """
    sent_probs = np.asarray(sent_probs)
    sent_feats = np.asarray(sent_feats)
    post_probs, post_feats = [], []
    for lbound, rbound in zip(sent_bounds[:-1], sent_bounds[1:]):
        if rbound == lbound:
            # Post produced no sentences: max/mean over zero rows is undefined.
            continue
        post_probs.append(sent_probs[lbound:rbound].max(0))
        post_feats.append(sent_feats[lbound:rbound].mean(0))
    return np.stack(post_probs, 0), np.stack(post_feats, 0)


def score_sentences(clf, tokenizer, sentences, batch_size, max_len):
    """Run ``clf`` over ``sentences`` in mini-batches on the GPU.

    Each batch is prepared with ``infer_preprocess``, moved to CUDA and passed
    to ``clf.feat_extract_avg``. ``sentences`` must be non-empty.

    Returns:
        ``(probs, feats)``: the per-batch ``sigmoid(logits)`` and features,
        each concatenated along axis 0 as numpy arrays.
    """
    batch_probs, batch_feats = [], []
    for start in range(0, len(sentences), batch_size):
        curr_texts = sentences[start:start + batch_size]
        processed_batch = infer_preprocess(curr_texts, tokenizer, max_len)
        for k, v in processed_batch.items():
            processed_batch[k] = v.cuda()
        with torch.no_grad():
            feats, logits = clf.feat_extract_avg(processed_batch)
        batch_feats.append(feats.detach().cpu().numpy())
        batch_probs.append(logits.sigmoid().detach().cpu().numpy())
    return np.concatenate(batch_probs, 0), np.concatenate(batch_feats, 0)


# One dict per user; filled inside the guard below. Defined up front so that
# building `df` at the bottom never refers to an undefined name.
split2dataset = []

if __name__ == "__main__":
    batch_size = 64
    patient_dir = "../data/datastoreOZP/dvlog_wtext.json"
    # NOTE: this directory is created, but nothing in this cell writes into it.
    output_dir = "../../Relevance_inference_data"
    os.makedirs(output_dir, exist_ok=True)
    ckpt_dir = "lightning_logs/version_0/checkpoints/epoch=0-step=720.ckpt"
    hparams_dir = os.path.join(dirname(dirname(ckpt_dir)), 'hparams.yaml')
    # SECURITY NOTE: yaml.Loader can construct arbitrary Python objects. Only
    # load hparams files you trust (presumably produced by your own training run).
    with open(hparams_dir) as hparams_file:
        hparams = yaml.load(hparams_file, Loader=yaml.Loader)
    max_len = hparams["max_len"]
    tokenizer = AutoTokenizer.from_pretrained(hparams["model_type"])
    clf = Classifier.load_from_checkpoint(ckpt_dir, symps=default_symps)
    clf.eval()
    clf.cuda()

    # The input is read as JSON Lines: one user record per line.
    with xopen.xopen(patient_dir) as fi:
        for line in fi:
            record = json.loads(line)
            aid = "P" + str(record['id'])
            posts = record['posts']
            if not posts:
                # Skip users without posts instead of aborting the whole run,
                # which would silently drop every record after this one.
                continue

            user_sents = []
            sent_bounds = [0]
            for post in posts:
                user_sents.extend(cut_sentences(post))
                sent_bounds.append(len(user_sents))
            if not user_sents:
                # No sentence in any post: there is nothing to feed the model.
                continue

            all_probs, all_feats = score_sentences(
                clf, tokenizer, user_sents, batch_size, max_len
            )

            # Merge sentence-level outputs into post-level ones:
            # max pooling for probabilities, mean pooling for features.
            all_post_probs, all_post_feats = pool_post_outputs(
                all_probs, all_feats, sent_bounds
            )
            split2dataset.append({
                "id": aid,
                "diseases": record["diseases"],
                "probs": all_post_probs,
                "feats": all_post_feats
            })


df = pd.DataFrame(split2dataset)

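# NOTE: disabled legacy export, kept for reference only. `split2dataset` is now a
# list of per-user dicts (not a mapping of split name -> dataset), so the
# `.items()` loop below must be rewritten before it can be re-enabled.
# print("Writing")
# for split, dataset in split2dataset.items():
#     with open(os.path.join(output_dir, f"{split}.pkl"), "wb") as fo:   
#         pickle.dump(dataset, fo)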

Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


"accumulation":       1
"bal_sample":         True
"bs":                 64
"control_ratio":      0.5
"exp_name":           mbert_label_enhance_bal_sample_050_666
"gradient_clip_val":  0.1
"input_dir":          ../data/symp_data_w_control
"loss_mask":          True
"loss_type":          bce
"loss_weighting":     mean
"lr":                 0.0003
"max_len":            64
"model_type":         mental/mental-bert-base-uncased
"patience":           4
"pos_weight_setting": default
"seed":               666
"threshold":          0.5
"uncertain":          exclude
"write_result_dir":   ./lightning_logs/bal_sample_records.json


In [4]:
# Persist the per-user inference table twice: once as CSV and once as
# JSON Lines (one record per line).
# NOTE(review): the "probs"/"feats" cells hold numpy arrays; confirm that their
# text form in the CSV is what downstream readers expect. The JSON Lines file
# is likely the safer source of truth.
csv_path = '../data/datastoreOZP/Relevance_inference_data.csv'
jsonl_path = '../data/datastoreOZP/Relevance_inference_data.json'
df.to_csv(csv_path)
df.to_json(jsonl_path, orient='records', lines=True)