In [4]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import yaml
import xopen
import json
import warnings
warnings.filterwarnings("ignore")
from os.path import dirname
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer
from data import infer_preprocess
from model import Classifier, BERTDiseaseClassifier
from utils import default_symps
from nltk.tokenize import sent_tokenize
import blingfire
import spacy

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


ModuleNotFoundError: No module named 'torch'

In [3]:
def cut_sentences(text, tokenizer, nlp):
    """Split ``text`` into a list of sentence strings.

    Args:
        text: The raw text to segment.
        tokenizer: Name of the segmentation backend. One of ``'blingfire'``,
            ``'nltk'``, ``'spacysm'``, ``'spacylg'`` or ``'spacytrf'``.
        nlp: Callable used by the spaCy backends. It is called with ``text``
            and its result must expose ``.sents``, an iterable of objects
            with a ``.text`` attribute. Ignored by the other backends.

    Returns:
        A list of sentence strings. For the spaCy backends each sentence is
        stripped of surrounding whitespace; for ``'blingfire'`` and
        ``'nltk'`` the input text is stripped before segmentation.

    Raises:
        ValueError: If ``tokenizer`` is not a supported name, or a spaCy
            backend is requested while ``nlp`` is ``None``.
    """
    if tokenizer == 'blingfire':
        # blingfire returns one newline-separated string of sentences.
        return blingfire.text_to_sentences(text.strip()).split("\n")

    if tokenizer == 'nltk':
        return sent_tokenize(text.strip())

    # The three spaCy options only differ in which pipeline was loaded into
    # `nlp` by the caller, so they share a single code path here.
    if tokenizer in ('spacysm', 'spacylg', 'spacytrf'):
        if nlp is None:
            raise ValueError(
                f"Sentence tokenizer {tokenizer!r} requires a loaded spaCy pipeline, got nlp=None"
            )
        doc = nlp(text)
        return [sent.text.strip() for sent in doc.sents]

    # Previously an unknown name fell through to `return sents` and failed
    # with an UnboundLocalError that did not mention the offending value.
    raise ValueError(
        f"Unknown sentence tokenizer {tokenizer!r}; expected one of "
        "'blingfire', 'nltk', 'spacysm', 'spacylg', 'spacytrf'"
    )

In [34]:
datastore = []
# options : blingfire, nltk, spacysm, spacylg, spacytrf
senttokenizer = 'spacysm'

# spaCy pipeline name for each spaCy-backed tokenizer option.
_SPACY_MODELS = {
    'spacysm': "en_core_web_sm",
    'spacylg': "en_core_web_lg",
    'spacytrf': "en_core_web_trf",
}

# set spacy tokenizer
# `nlp` is always defined so that the later `cut_sentences(..., nlp)` call does
# not raise NameError when a non-spaCy option (blingfire / nltk) is selected;
# those backends do not use it.
if senttokenizer in _SPACY_MODELS:
    nlp = spacy.load(_SPACY_MODELS[senttokenizer])
else:
    nlp = None

# Symptom names used as keys when labelling the per-sentence probabilities.
symps = ["Anxious_Mood","Autonomic_symptoms","Cardiovascular_symptoms","Catatonic_behavior","Decreased_energy_tiredness_fatigue","Depressed_Mood","Gastrointestinal_symptoms","Genitourinary_symptoms","Hyperactivity_agitation","Impulsivity","Inattention","Indecisiveness","Respiratory_symptoms","Suicidal_ideas","Worthlessness_and_guilty","avoidance_of_stimuli","compensatory_behaviors_to_prevent_weight_gain","compulsions","diminished_emotional_expression","do_things_easily_get_painful_consequences","drastical_shift_in_mood_and_energy","fear_about_social_situations","fear_of_gaining_weight","fears_of_being_negatively_evaluated","flight_of_ideas","intrusion_symptoms","loss_of_interest_or_motivation","more_talktive","obsession","panic_fear","pessimism","poor_memory","sleep_disturbance","somatic_muscle","somatic_symptoms_others","somatic_symptoms_sensory","weight_and_appetite_change","Anger_Irritability"]

if __name__ == "__main__":
    # Runs the symptom classifier over every sentence of every record in the
    # input file and collects the results in `allPostSentences`.
    batch_size = 64
    input_dir = "../../../data/postdatalines.json"
    ckpt_dir = "lightning_logs/version_0/checkpoints/epoch=0-step=720.ckpt"
    # hparams.yaml is looked up two directory levels above the checkpoint file.
    hparams_dir = os.path.join(dirname(dirname(ckpt_dir)), 'hparams.yaml')
    # NOTE(review): yaml.Loader can construct arbitrary Python objects. It is
    # kept because the hparams file may rely on it; only load trusted files.
    with open(hparams_dir) as hparams_file:
        hparams = yaml.load(hparams_file, Loader=yaml.Loader)
    max_len = hparams["max_len"]
    tokenizer = AutoTokenizer.from_pretrained(hparams["model_type"])
    clf = Classifier.load_from_checkpoint(ckpt_dir, symps=default_symps)
    clf.eval()
    clf.cuda()

    allPostSentences = []

    with xopen.xopen(input_dir) as fi:
        # One JSON document per input line.
        for line in tqdm(fi):
            record = json.loads(line)

            # A record without text is skipped. The previous `break` ended the
            # whole loop here and silently dropped every remaining record.
            if record['text'] is None:
                continue

            post_data = {
                "gender": record["gender"],
                "label": record["label"],
                "sentences": []
            }

            # Flatten all posts of this record into one list of sentences.
            # `sent_bounds` records the cumulative sentence count after each
            # post; it is not read again within this cell.
            user_sents = []
            sent_bounds = [0]
            curr_sid = 0
            for post in record["text"]:
                sents = cut_sentences(post, senttokenizer, nlp)
                curr_sid += len(sents)
                sent_bounds.append(curr_sid)
                user_sents.extend(sents)

            # Score the sentences in batches of `batch_size`.
            all_probs = []
            for start in range(0, len(user_sents), batch_size):
                curr_texts = user_sents[start:start + batch_size]
                processed_batch = infer_preprocess(curr_texts, tokenizer, max_len)
                # Move every entry of the batch to the GPU, where `clf` lives.
                for k, v in processed_batch.items():
                    processed_batch[k] = v.cuda()
                with torch.no_grad():
                    # Only the logits are needed; the first return value is unused.
                    _, logits = clf.feat_extract_avg(processed_batch)
                    probs = logits.sigmoid().detach().cpu().numpy()
                all_probs.extend(probs)

            # NOTE(review): probabilities are labelled with the notebook-level
            # `symps` list while the model was loaded with `default_symps`;
            # presumably both have the same order and length — confirm, since
            # zip() would silently truncate on a mismatch.
            for sent, probs in zip(user_sents, all_probs):
                sent_data = {"sentence": sent, "probabilities": dict(zip(symps, probs))}
                post_data['sentences'].append(sent_data)

            allPostSentences.append(post_data)

Some weights of BertModel were not initialized from the model checkpoint at mental/mental-bert-base-uncased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


"accumulation":       1
"bal_sample":         True
"bs":                 64
"control_ratio":      0.5
"exp_name":           mbert_label_enhance_bal_sample_050_666
"gradient_clip_val":  0.1
"input_dir":          ../data/symp_data_w_control
"loss_mask":          True
"loss_type":          bce
"loss_weighting":     mean
"lr":                 0.0003
"max_len":            64
"model_type":         mental/mental-bert-base-uncased
"patience":           4
"pos_weight_setting": default
"seed":               666
"threshold":          0.5
"uncertain":          exclude
"write_result_dir":   ./lightning_logs/bal_sample_records.json


797it [03:15,  4.07it/s]


In [2]:
import pandas as pd

In [6]:
# Reuse the cached sentence-level predictions when the file is present;
# otherwise build the frame from the in-memory inference results
# (`allPostSentences`) and write the cache.
# An explicit existence check replaces the former bare `except:`, which also
# swallowed unrelated failures (e.g. a corrupt cache file) and then overwrote
# the cache.
sentence_cache = '../../../data/annotationData/sentencedata.json'
if os.path.exists(sentence_cache):
    df = pd.read_json(sentence_cache, orient='records')
else:
    df = pd.DataFrame(allPostSentences)
    df.to_json(sentence_cache, orient='records')

# Keep only records labelled 'depression', split by the 'gender' column.
female = df.loc[(df['label'] == 'depression') & (df['gender'] == 'f')]
male = df.loc[(df['label'] == 'depression') & (df['gender'] == 'm')]

# Fixed random_state keeps the 20-record annotation samples reproducible.
# sample() raises ValueError if a group has fewer than 20 rows.
femaleAnnotation = female.sample(n=20, random_state=99)
maleAnnotation = male.sample(n=20, random_state=99)

femaleAnnotation.to_json('../../../data/annotationData/femaledata.json',orient='records')
maleAnnotation.to_json('../../../data/annotationData/maledata.json',orient='records')


In [5]:
# Build the "filled" prediction set: for each sufficiently long sentence, list
# every symptom scoring at least 0.3 together with a '1'/'0' flag that tells
# whether the score also reaches 0.5.
gender = 'female'
df = pd.read_json(f'../../../data/annotationData/{gender}data.json', orient='records')
symps = ["Anxious_Mood","Autonomic_symptoms","Cardiovascular_symptoms","Catatonic_behavior","Decreased_energy_tiredness_fatigue","Depressed_Mood","Gastrointestinal_symptoms","Genitourinary_symptoms","Hyperactivity_agitation","Impulsivity","Inattention","Indecisiveness","Respiratory_symptoms","Suicidal_ideas","Worthlessness_and_guilty","avoidance_of_stimuli","compensatory_behaviors_to_prevent_weight_gain","compulsions","diminished_emotional_expression","do_things_easily_get_painful_consequences","drastical_shift_in_mood_and_energy","fear_about_social_situations","fear_of_gaining_weight","fears_of_being_negatively_evaluated","flight_of_ideas","intrusion_symptoms","loss_of_interest_or_motivation","more_talktive","obsession","panic_fear","pessimism","poor_memory","sleep_disturbance","somatic_muscle","somatic_symptoms_others","somatic_symptoms_sensory","weight_and_appetite_change","Anger_Irritability"]

relevant_sents = []
# postid counts posts from 1; sentid is a running counter over all sentences
# of all posts (it is not reset between posts).
postid = 0
sentid = 0

for post in df['sentences']:
    postid += 1
    for sent in post:
        sentid += 1
        meet_threshold = []

        # Sentences with fewer than 10 whitespace-separated tokens are ignored.
        text = sent['sentence']
        if len(text.split()) < 10:
            continue

        # First entry identifies the sentence; the rest are (score, flag) pairs.
        meet_threshold.append({'id':[postid,sentid],'sentence':text})
        scores = sent['probabilities']
        for symptom in symps:
            score = scores[symptom]
            if score >= 0.3:
                flag = '1' if score >= 0.5 else '0'
                meet_threshold.extend([{symptom:score}, flag])

        # 1 header + 2 entries per symptom: a length of 4 or more means at
        # least two symptoms reached the 0.3 threshold.
        if len(meet_threshold) >= 4:
            relevant_sents.append(meet_threshold)


print(len(relevant_sents))

df = pd.DataFrame(relevant_sents)
df.to_csv(f'../../../data/annotationData/{gender}PredictionsetFilled.csv')
df


120


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,"{'id': [2, 69], 'sentence': 'and there's so ma...",{'Anxious_Mood': 0.9780091047},1,{'Depressed_Mood': 0.7143710852},1,{'Hyperactivity_agitation': 0.8948042989},1,{'Anger_Irritability': 0.6481491327000001},1,,...,,,,,,,,,,
1,"{'id': [2, 70], 'sentence': 'so I don't want p...",{'Anxious_Mood': 0.8915029764},1,{'Depressed_Mood': 0.7208292484000001},1,{'Hyperactivity_agitation': 0.8669700623000001},1,{'diminished_emotional_expression': 0.3815910518},0,,...,,,,,,,,,,
2,"{'id': [2, 81], 'sentence': 'and I think espec...",{'Anxious_Mood': 0.9970052838000001},1,{'fear_about_social_situations': 0.44122979040...,0,{'panic_fear': 0.3154592216},0,,,,...,,,,,,,,,,
3,"{'id': [3, 86], 'sentence': 'but I really want...",{'Anxious_Mood': 0.9335232377},1,{'Depressed_Mood': 0.9749916792000001},1,{'Hyperactivity_agitation': 0.4232343435},0,{'Suicidal_ideas': 0.3876424134},0,{'diminished_emotional_expression': 0.415440917},...,{'flight_of_ideas': 0.6484325528},1,{'panic_fear': 0.337185204},0,{'Anger_Irritability': 0.5772311091},1,,,,
4,"{'id': [3, 87], 'sentence': 'and I think that'...",{'Anxious_Mood': 0.8684682846},1,{'Depressed_Mood': 0.9883437157},1,{'diminished_emotional_expression': 0.6806105375},1,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,"{'id': [18, 913], 'sentence': 'and i don't kno...",{'fear_of_gaining_weight': 0.9580567479000001},1,{'weight_and_appetite_change': 0.8464787602},1,,,,,,...,,,,,,,,,,
116,"{'id': [19, 921], 'sentence': 'and I wanted to...",{'Anxious_Mood': 0.932423234},1,{'Depressed_Mood': 0.8644747734},1,{'flight_of_ideas': 0.5232628584},1,{'intrusion_symptoms': 0.6444249153},1,{'obsession': 0.8978157043},...,{'panic_fear': 0.3426011503},0,{'sleep_disturbance': 0.4238805473},0,,,,,,
117,"{'id': [19, 944], 'sentence': 'and I get reall...",{'Hyperactivity_agitation': 0.537912786},1,{'do_things_easily_get_painful_consequences': ...,0,{'drastical_shift_in_mood_and_energy': 0.97944...,1,{'Anger_Irritability': 0.581135869},1,,...,,,,,,,,,,
118,"{'id': [20, 961], 'sentence': 'I was drinking ...",{'do_things_easily_get_painful_consequences': ...,1,{'drastical_shift_in_mood_and_energy': 0.95118...,1,,,,,,...,,,,,,,,,,


In [132]:
# Build the blank annotation sheet: same sentence selection as the filled
# prediction set, but each qualifying symptom gets a '-' placeholder followed
# by an 'annotation' cell instead of the model score and flag.
gender = 'female'
df = pd.read_json(f'../../../data/annotationData/{gender}data.json', orient='records')
symps = ["Anxious_Mood","Autonomic_symptoms","Cardiovascular_symptoms","Catatonic_behavior","Decreased_energy_tiredness_fatigue","Depressed_Mood","Gastrointestinal_symptoms","Genitourinary_symptoms","Hyperactivity_agitation","Impulsivity","Inattention","Indecisiveness","Respiratory_symptoms","Suicidal_ideas","Worthlessness_and_guilty","avoidance_of_stimuli","compensatory_behaviors_to_prevent_weight_gain","compulsions","diminished_emotional_expression","do_things_easily_get_painful_consequences","drastical_shift_in_mood_and_energy","fear_about_social_situations","fear_of_gaining_weight","fears_of_being_negatively_evaluated","flight_of_ideas","intrusion_symptoms","loss_of_interest_or_motivation","more_talktive","obsession","panic_fear","pessimism","poor_memory","sleep_disturbance","somatic_muscle","somatic_symptoms_others","somatic_symptoms_sensory","weight_and_appetite_change","Anger_Irritability"]

relevant_sents = []
# postid counts posts from 1; sentid is a running counter over all sentences
# of all posts (it is not reset between posts).
postid = 0
sentid = 0

for post in df['sentences']:
    postid += 1
    for sent in post:
        sentid += 1
        meet_threshold = []

        # Sentences with fewer than 10 whitespace-separated tokens are ignored.
        text = sent['sentence']
        if len(text.split()) < 10:
            continue

        # First entry identifies the sentence; each qualifying symptom then
        # contributes a placeholder dict and an 'annotation' cell.
        meet_threshold.append({'id':[postid,sentid],'sentence':text})
        scores = sent['probabilities']
        for symptom in symps:
            if scores[symptom] >= 0.3:
                meet_threshold.extend([{symptom:'-'}, 'annotation'])

        # 1 header + 2 entries per symptom: a length of 4 or more means at
        # least two symptoms reached the 0.3 threshold.
        if len(meet_threshold) >= 4:
            relevant_sents.append(meet_threshold)


print(len(relevant_sents))

df = pd.DataFrame(relevant_sents)
df.to_csv(f'../../../data/annotationData/{gender}annotationset.csv')



120
