# Select 200 abstracts for benchmarking the NER 

### Server Part (Done on Computerome)

0. Copy required scripts for Tagger
1. Prepare required input files for Tagger
2. Run Tagger
3. Generate abstracts and PMID-to-journal mapper
4. Process Tagger output

    1. pmid_to_journal_processed.tsv 
        * maps each PMID to its journal, e.g. pmid_to_journal_dict[18086704] ==> 'Nucleic Acids Res'
    2. pmid_to_wordcount 
        * abstract size (word count) per PMID
    3. LSF_count_per_journal_per_category.tsv
        * two-dimensional dict storing the LSF count per journal for every category
        * example: LSF_count_per_journal_per_category['Beauty and Cleaning']['Nucleic Acids Res']

In [15]:
import random
from collections import defaultdict
import pandas as pd

In [2]:
# Load the PMID -> journal table. The file pmid_to_journal_processed.tsv.gz
# has to be downloaded from the Zenodo project into the data directory first.
file_path = '../../data/NER-Benchmarking/pmid_to_journal_processed.tsv.gz'
pmid_to_journal = pd.read_csv(
    file_path,
    sep='\t',
)

# Preview the first rows of the table.
pmid_to_journal.head()


Unnamed: 0,pmid,journal
0,16923182,Reprod Biol Endocrinol
1,12086586,BMC Blood Disord
2,16606691,J Cell Biol
3,18086704,Nucleic Acids Res
4,18710930,J Exp Med


In [3]:
# PMIDs of full-text articles, which are excluded from the candidate pool.
# Download pmcoa_pmids_2022.list.gz from the Zenodo project into the same data
# directory as the other input files of this notebook.
pmc_file_path = '../../data/NER-Benchmarking/pmcoa_pmids_2022.list.gz'
pmc = pd.read_csv(pmc_file_path, sep='\t', header=None)
pmc.columns = ['pmid']
# A set gives constant-time membership tests for the filter below.
pmc_pmids = set(pmc.pmid.tolist())

# Keep only the records whose PMID is not in the full-text list.
pmid_to_journal = pmid_to_journal[~pmid_to_journal['pmid'].isin(pmc_pmids)]

# Lookup table PMID -> journal, built from the (pmid, journal) rows;
# e.g. pmid_to_journal_dict[18086704] ==> 'Nucleic Acids Res'
pmid_to_journal_dict = dict(pmid_to_journal.itertuples(index=False, name=None))


In [4]:
# Number of abstracts available for every journal (second column of the table),
# e.g. abstract_count_per_journal['PLoS One'] ==> 266267
counts = pmid_to_journal.iloc[:, 1].value_counts()
abstract_count_per_journal = dict(counts.items())


In [5]:
# LSF counts with one row per journal (first file column is used as the index)
# and one column per category.
df_LSF_count_per_journal_per_category = pd.read_csv(
    '../../data/NER-Benchmarking/LSF_count_per_journal_per_category.tsv',
    sep='\t',
    index_col=0,
)


In [6]:
# Minimum number of abstracts a journal must have to stay in the candidate
# pool; journals below this threshold are removed further down.
min_abstract_treshold = 1000

In [7]:
# Collect the journals whose abstract count is below the threshold.
# Journals without an entry in abstract_count_per_journal are skipped, so they
# stay in the table; only that specific lookup failure is tolerated, any other
# error is allowed to surface.
journals_to_remove = []
for journal in df_LSF_count_per_journal_per_category.index:
    try:
        abstract_count = abstract_count_per_journal[journal]
    except KeyError:
        continue
    if abstract_count < min_abstract_treshold:
        journals_to_remove.append(journal)

In [8]:
# Drop, in place, the rows of every journal that fell below the
# abstract-count threshold.
df_LSF_count_per_journal_per_category.drop(
    index=journals_to_remove,
    inplace=True,
)

In [9]:
# Keep only the rows whose journal name (the index label) is not missing.
journals_with_name = df_LSF_count_per_journal_per_category.index.dropna()
df_LSF_count_per_journal_per_category = df_LSF_count_per_journal_per_category.loc[journals_with_name]

In [10]:
# Independent (deep) copy that will receive the per-abstract averages, so the
# raw count table stays untouched.
df_LSF_avg_count_per_journal_per_category = df_LSF_count_per_journal_per_category.copy()

In [11]:
# Turn the raw LSF counts into averages per abstract: every row (journal) is
# divided by that journal's abstract count, for all categories at once.
# Journals without an entry in abstract_count_per_journal get divisor 1, i.e.
# they keep their raw counts, which is what the cell-by-cell loop produced.
# NOTE(review): such rows hold counts rather than averages -- confirm whether
# they should be excluded before the ranking step.
abstracts_per_row = [
    abstract_count_per_journal.get(journal, 1)
    for journal in df_LSF_count_per_journal_per_category.index
]
# axis=0 applies the list positionally, one divisor per row.
df_LSF_avg_count_per_journal_per_category = df_LSF_count_per_journal_per_category.div(
    abstracts_per_row,
    axis=0,
)


In [12]:
def get_top_journals_per_category(n):
    """Rank journals by average LSF count per abstract, category by category.

    Reads the module-level table df_LSF_avg_count_per_journal_per_category
    (rows: journals, columns: categories).

    Args:
        n (int): number of highest-scoring journals to keep per category.

    Returns:
        tuple: (dict mapping each category to a one-column DataFrame holding
        its top n journals and their averages, flat list of the journal names
        of all categories in column order). A journal that ranks in several
        categories appears once per category in the list.
    """
    averages = df_LSF_avg_count_per_journal_per_category
    best_per_category = {}
    best_journal_names = []
    for column in averages.columns:
        # n rows with the largest value in this category, reduced to that column
        best_rows = averages.nlargest(n, [column])[[column]]
        best_per_category[column] = best_rows
        best_journal_names += list(best_rows.index)
    return best_per_category, best_journal_names



In [13]:
# Top 3 journals per category, plus the flat list of their names.
top_journals_dict, top_journals_list = get_top_journals_per_category(n=3)

In [145]:
# Display the top journals (with their average LSF count) for each category.
top_journals_dict

{'Beauty and Cleaning':                  Beauty and Cleaning
 J Dent Hyg                  0.998329
 J Orthod                    0.996172
 J Orofac Orthop             0.735552,
 'Nutrition':                       Nutrition
 Plant Foods Hum Nutr   3.061862
 Eur J Nutr             3.022776
 Int J Food Sci Nutr    3.016188,
 'Drugs':                         Drugs
 Nicotine Tob Res     2.509218
 Drug Alcohol Depend  1.553877
 Tob Control          1.531500,
 'Environmental exposures':                          Environmental exposures
 Environ Int                             1.379979
 Indoor Air                              1.369270
 J Air Waste Manag Assoc                 1.350343,
 'Non physical leisure time activities':                              Non physical leisure time activities
 J Gambl Stud                                             1.505271
 Cyberpsychol Behav Soc Netw                              0.651148
 J Phys Act Health                                        0.238985,
 'Physi

In [16]:
# Invert pmid_to_journal_dict: for every journal, the list of its PMIDs.
journal_to_pmids_dict = defaultdict(list)
for pmid, journal in pmid_to_journal_dict.items():
    journal_to_pmids_dict[journal].append(pmid)


'../'

In [19]:
# Word count of every abstract. The file pmid_to_wordcount.tsv.gz has to be
# downloaded from Zenodo into the data directory first.
file_path = '../../data/NER-Benchmarking/pmid_to_wordcount.tsv.gz'
pmid_to_wordcount = pd.read_csv(
    file_path,
    sep='\t',
)

# Abstract size (word count) keyed by PMID, built from the two-column rows.
pmid_to_wordcount_dict = dict(pmid_to_wordcount.itertuples(index=False, name=None))


In [20]:
def select_pmids_of_selected_journals(top_journals_dict,selection_count=200,min_word_count=100,max_word_count=200):
    """Randomly pick PMIDs from the top journals of every category.

    The requested total is split evenly over the categories, and each
    category's share evenly over its journals; the first category and the
    first journal of each category additionally receive the remainder of the
    respective division.

    Candidates are drawn one at a time, without replacement, from the
    module-level journal_to_pmids_dict. Every drawn PMID is removed from that
    pool (so it cannot be drawn again, also not by a later call) and is kept
    only if its word count in the module-level pmid_to_wordcount_dict lies in
    [min_word_count, max_word_count]. PMIDs without a word count are treated
    as not eligible.

    Args:
        top_journals_dict (dict): category -> DataFrame whose index holds the
            journal names selected for that category.
        selection_count (int, optional): total number of PMIDs to select.
            Defaults to 200.
        min_word_count (int, optional): smallest accepted abstract length
            (inclusive). Defaults to 100.
        max_word_count (int, optional): largest accepted abstract length
            (inclusive). Defaults to 200.

    Returns:
        list: selected PMIDs, grouped by category and journal in iteration
        order. Holds fewer than selection_count entries when a journal runs
        out of eligible abstracts (a message is printed in that case), and is
        empty when top_journals_dict is empty.
    """
    selected_pmids = []

    category_count = len(top_journals_dict)
    if category_count == 0:
        return selected_pmids
    docs_per_category = selection_count // category_count
    added_docs = selection_count - docs_per_category * category_count

    for i, category in enumerate(top_journals_dict):
        # first category gets the leftover
        count = docs_per_category + (added_docs if i == 0 else 0)

        journals = list(top_journals_dict[category].index)
        if not journals:
            print(category, ' , no journals to select from')
            continue
        count_per_journal = count // len(journals)
        added_count_category = count - count_per_journal * len(journals)

        for j, journal in enumerate(journals):
            # first journal gets the leftover
            count_selection = count_per_journal + (added_count_category if j == 0 else 0)
            pmids = journal_to_pmids_dict[journal]
            if len(pmids) <= count_selection:
                print(category, ' , ', journal, len(pmids))

            # Draw single candidates until the quota is met or the pool is
            # used up; drawing one at a time never asks for more items than
            # the pool still holds.
            while count_selection > 0 and pmids:
                selected_pmid = pmids.pop(random.randrange(len(pmids)))  # removed to prevent reselection
                word_count = pmid_to_wordcount_dict.get(selected_pmid)
                if word_count is not None and min_word_count <= word_count <= max_word_count:
                    selected_pmids.append(selected_pmid)
                    count_selection -= 1

            if count_selection > 0:
                print(category, ' , ', journal, 'ran out of eligible abstracts, missing', count_selection)

    return selected_pmids
            


        

In [21]:
# Draw the benchmark set: 200 abstracts of 100 to 200 words each.
selected_pmids = select_pmids_of_selected_journals(
    top_journals_dict,
    selection_count=200,
    min_word_count=100,
    max_word_count=200,
)


In [22]:
# Sanity check: number of PMIDs that were selected.
len(selected_pmids)

200

In [152]:

# Write the selected PMIDs to a TSV file, one per line, with neither a header
# row nor an index column.
# Original note: the first 200 lines are the main selection and the rest are
# reserved ones.
# NOTE(review): selected_pmids is produced with selection_count=200, so no
# reserve entries appear to be written here -- confirm.
df = pd.DataFrame(selected_pmids)
df.to_csv(
    '../../data/NER-Benchmarking/selected_200_pmids.tsv',
    sep='\t',
    header=False,
    index=False,
)
