# Merging annotations with the original BRAT format


- Merging usage + purpose annotations
- keeping all other annotations

## 1. PLOS

In [1]:
import os
from os import listdir
from os.path import isfile, join

path = 'SoMeSci/PLoS_methods/'


# 1. Collect the names of all BRAT annotation (.ann) files, sorted by name.
PLOS_filesList = sorted(
    file_name for file_name in os.listdir(path) if file_name.endswith('.ann')
)


# Annotation types that are carried over unchanged.
# NOTE: this list is informational only. The loop below keeps every line whose
# type is not in merge_list, whether or not the type is listed here.
keep_list = [
    'Application_Creation', 'PlugIn_Creation', 'Application_Deposition',
    'PlugIn_Deposition', 'SoftwareCoreference_Deposition',
    'Application_Mention', 'ProgrammingEnvironment_Mention',
    'OperatingSystem_Mention', 'PlugIn_Mention', 'Version', 'Developer',
    'URL', 'License', 'Citation', 'Abbreviation', 'AlternativeName',
    'Release', 'Extension',
]

# Usage and purpose annotation types: these are the candidates for merging.
merge_list = [
    'Application_Usage', 'ProgrammingEnvironment_Usage', 'PlugIn_Usage',
    'OperatingSystem_Usage', 'Purpose_Analysis', 'Purpose_Modelling',
    'Purpose_Stimulation', 'Purpose_DataCollection', 'Purpose_DataPreProcss',
    'Purpose_Simulation', 'Purpose_Visualization', 'Purpose_Programming',
]


mergeAnn_dict = {}  # file name -> tab-split annotation lines to be merged
keepAnn_dict = {}   # file name -> tab-split annotation lines kept as they are


# 2. Split the lines of every file into "to be merged" and "to be kept".
for file in PLOS_filesList:

    file_path = path + file

    mergAnn_line = []  # usage / purpose lines of this file
    keepANn_line = []  # every other line of this file

    with open(file_path, "r") as a_file:

        for line in a_file:

            # A blank line carries no annotation and has no type field to read.
            if not line.strip():
                continue

            fields = line.split('\t')

            # The second tab-separated field holds "Type START STOP"; lines
            # without it get an empty type and therefore fall into the kept set.
            type_tokens = fields[1].split() if len(fields) > 1 else []
            annotataion_type = type_tokens[0] if type_tokens else ''

            # Every usage/purpose text-bound line is collected, including
            # several on the same span: a purpose line is expected to share
            # its span with the usage line it will later be merged into.
            if line.startswith('T') and annotataion_type in merge_list:
                mergAnn_line.append(fields)
            else:
                keepANn_line.append(fields)

    mergeAnn_dict[file] = mergAnn_line
    keepAnn_dict[file] = keepANn_line

### Merger function

In [2]:
def mergeList(list_1, list_2):
    """Merge two BRAT text-bound annotations that start or end at the same offset.

    Each argument is one ``.ann`` line already split on tabs, i.e. a list of
    the form ``[id, "Type START STOP", text]``, for example
    ``['T2', 'Application_Usage 13536 13539', 'SAS']`` (the text field normally
    still carries its trailing newline).

    Args:
        list_1: Annotation whose id, label, offsets and text are kept.
        list_2: Annotation whose label contributes its second
            underscore-separated component (``Analysis`` for
            ``Purpose_Analysis``); a label without an underscore is used whole.

    Returns:
        ``[id_1, "Label1_Suffix2 <offsets of list_1>", text_1]`` when both
        annotations have the same first offset or the same last offset,
        otherwise an empty list. An empty list is also returned when either
        argument lacks the expected fields, so malformed lines are skipped
        instead of raising ``IndexError``.
    """
    # list_1 must provide id, header and text; list_2 only needs its header.
    if len(list_1) < 3 or len(list_2) < 2:
        return []

    header_1 = list_1[1].split()
    header_2 = list_2[1].split()

    # A usable header is a label followed by at least a start and a stop.
    if len(header_1) < 3 or len(header_2) < 3:
        return []

    label_1, offsets_1 = header_1[0], header_1[1:]
    label_2, offsets_2 = header_2[0], header_2[1:]

    # Compare start with start and stop with stop only. Comparing a start with
    # a stop would merge annotations that merely touch (one ends where the
    # other begins) although they cover different text.
    same_start = offsets_1[0] == offsets_2[0]
    same_stop = offsets_1[-1] == offsets_2[-1]
    if not (same_start or same_stop):
        return []

    label_parts = label_2.split('_')
    suffix = label_parts[1] if len(label_parts) > 1 else label_2

    # Keep every offset token of list_1 so that discontinuous spans
    # ("0 5;16 23") are written back complete.
    merged_header = ' '.join([label_1 + '_' + suffix] + offsets_1)

    return [list_1[0], merged_header, list_1[2]]

### Merge annotations of usage and purpose

In [3]:
# file name -> list of merged usage+purpose annotations found in that file
merged_dict = {}

# every merged annotation of every file, in one flat list
all_annotations_list = []


# NOTE(review): an annotation for which mergeList() returns an empty list with
# every partner contributes nothing to merged_dict. The label and id of a merged
# line come from whichever of the two annotations is listed first in the file.
# Confirm both are intended.
for key, candidates in mergeAnn_dict.items():

    merged_annotationlist_perfile = []

    # start/stop strings of spans merged while handling earlier annotations
    seen_offsets = set()

    for first in candidates:

        # offsets merged for `first`; published only after its inner loop ends,
        # so one annotation may merge with several partners
        offsets_this_round = []

        for second in candidates:

            # never pair an annotation with itself (or an identical line)
            if first == second:
                continue

            r = mergeList(first, second)
            if not r:
                continue

            span = r[1].split()[1:]

            # skip the pair when both of its offsets were already merged
            if span[0] in seen_offsets and span[1] in seen_offsets:
                continue

            merged_annotationlist_perfile.append(r)
            all_annotations_list.append(r)
            offsets_this_round.extend(span)

        seen_offsets.update(offsets_this_round)

    merged_dict[key] = merged_annotationlist_perfile

In [4]:
# Inspect the merged usage+purpose annotations stored for one sample file
# (evaluates to None if the file name is not a key of merged_dict).
merged_dict.get('PMC1657052.ann')

[['T1', 'Application_Usage_Modelling 3670 3681', 'DS modeling\n'],
 ['T2', 'Application_Usage_DataCollection 5677 5683', 'pCLAMP\n'],
 ['T3', 'Application_Usage_Analysis 1203 1211', 'ENSEMBLE\n']]

In [5]:
# Inspect the usage / purpose lines of the same sample file before merging.
mergeAnn_dict.get('PMC1657052.ann')

[['T1', 'Application_Usage 3670 3681', 'DS modeling\n'],
 ['T2', 'Application_Usage 5677 5683', 'pCLAMP\n'],
 ['T3', 'Application_Usage 1203 1211', 'ENSEMBLE\n'],
 ['T4', 'Purpose_Analysis 1203 1211', 'ENSEMBLE\n'],
 ['T6', 'Purpose_Modelling 3670 3681', 'DS modeling\n'],
 ['T12', 'Purpose_DataCollection 5677 5683', 'pCLAMP\n']]

In [6]:
# Inspect the lines of the same sample file that are kept unchanged.
keepAnn_dict.get('PMC1657052.ann')

[['T5', 'Developer 1224 1240', 'Sanger Institute\n'],
 ['R1', 'Developer_of Arg1:T5 Arg2:T3', '\n'],
 ['T7', 'Version 3699 3702', '1.1\n'],
 ['R3', 'Version_of Arg1:T7 Arg2:T1', '\n'],
 ['T8', 'Developer 3704 3712', 'Accelrys\n'],
 ['R4', 'Developer_of Arg1:T8 Arg2:T1', '\n'],
 ['T9', 'URL 3714 3737', 'http://www.accelrys.com\n'],
 ['R5', 'URL_of Arg1:T9 Arg2:T8', '\n'],
 ['T10', 'Version 5683 5684', '9\n'],
 ['R6', 'Version_of Arg1:T10 Arg2:T2', '\n'],
 ['T11', 'Developer 5686 5702', 'Axon Instruments\n'],
 ['R7', 'Developer_of Arg1:T11 Arg2:T2', '\n']]

## Combine merged annotations with the other annotations

In [7]:
# Final annotation list per file: the merged usage+purpose lines come first,
# followed by the lines that were kept unchanged.
combinedAnnotations_PLOS = {
    file: merged_dict.get(file) + keepAnn_dict.get(file)
    for file in mergeAnn_dict
}

In [8]:
# Inspect the combined (merged + kept) annotations of one sample file.
combinedAnnotations_PLOS.get('PMC2265305.ann')

[['T1', 'Application_Usage_Analysis 3540 3545', 'Stata\n'],
 ['T2', 'Application_Usage_Modelling 4256 4261', 'Stata\n'],
 ['T7', 'Version 3576 3579', '9.2\n'],
 ['R3', 'Version_of Arg1:T7 Arg2:T1', '\n'],
 ['T8', 'Developer 3581 3598', 'Stata Corporation\n'],
 ['R4', 'Developer_of Arg1:T8 Arg2:T1', '\n']]

### PLoS: writing merged annotations to the `PLoS_methodsClean` folder

In [9]:
import os

# Output folder for the cleaned PLoS annotation files.
path2 = 'SoMeSci/PLoS_methodsClean/'

# Create the folder if needed; open(..., "w") does not create missing directories.
os.makedirs(path2, exist_ok=True)

for file, annotation_lines in combinedAnnotations_PLOS.items():

    file_path = path2 + file

    with open(file_path, "w") as f1:

        for line in annotation_lines:

            # Each entry is a tab-split line; joining restores the BRAT layout.
            txt = '\t'.join(line)

            # The last field normally still ends in a newline. Add one when it
            # does not, so that this annotation cannot run into the next one.
            if not txt.endswith('\n'):
                txt += '\n'

            f1.write(txt)

print("# of files with Combined annotations:", len(combinedAnnotations_PLOS))

# of files with Combined annotations: 480


## 2. PubMed

In [10]:
import os
from os import listdir
from os.path import isfile, join

mypath = "SoMeSci/Pubmed_fulltext/"


# 1. Collect the names of all BRAT annotation (.ann) files, sorted by name.
PubMed_filesList = sorted(
    file_name for file_name in os.listdir(mypath) if file_name.endswith('.ann')
)


# Annotation types that are carried over unchanged.
# NOTE: this list is informational only. The loop below keeps every line whose
# type is not in merge_list, whether or not the type is listed here.
keep_list = [
    'Application_Creation', 'PlugIn_Creation', 'Application_Deposition',
    'PlugIn_Deposition', 'SoftwareCoreference_Deposition',
    'Application_Mention', 'ProgrammingEnvironment_Mention',
    'OperatingSystem_Mention', 'PlugIn_Mention', 'Version', 'Developer',
    'URL', 'License', 'Citation', 'Abbreviation', 'AlternativeName',
    'Release', 'Extension',
]

# Usage and purpose annotation types: these are the candidates for merging.
merge_list = [
    'Application_Usage', 'ProgrammingEnvironment_Usage', 'PlugIn_Usage',
    'OperatingSystem_Usage', 'Purpose_Analysis', 'Purpose_Modelling',
    'Purpose_Stimulation', 'Purpose_DataCollection', 'Purpose_DataPreProcss',
    'Purpose_Simulation', 'Purpose_Visualization', 'Purpose_Programming',
]


mergeAnn_dict2 = {}  # file name -> tab-split annotation lines to be merged
keepAnn_dict2 = {}   # file name -> tab-split annotation lines kept as they are


# 2. Split the lines of every file into "to be merged" and "to be kept".
for file in PubMed_filesList:

    file_path = mypath + file

    mergAnn_line2 = []  # usage / purpose lines of this file
    keepANn_line2 = []  # every other line of this file

    with open(file_path, "r") as a_file:

        for line in a_file:

            # A blank line carries no annotation and has no type field to read.
            if not line.strip():
                continue

            fields = line.split('\t')

            # The second tab-separated field holds "Type START STOP"; lines
            # without it get an empty type and therefore fall into the kept set.
            type_tokens = fields[1].split() if len(fields) > 1 else []
            annotataion_type = type_tokens[0] if type_tokens else ''

            # Every usage/purpose text-bound line is collected, including
            # several on the same span: a purpose line is expected to share
            # its span with the usage line it will later be merged into.
            if line.startswith('T') and annotataion_type in merge_list:
                mergAnn_line2.append(fields)
            else:
                keepANn_line2.append(fields)

    mergeAnn_dict2[file] = mergAnn_line2
    keepAnn_dict2[file] = keepANn_line2

### Merge annotations

In [11]:
# file name -> list of merged usage+purpose annotations found in that file
merged_dict2 = {}

# every merged annotation of every file, in one flat list
all_annotations_list2 = []


# NOTE(review): an annotation for which mergeList() returns an empty list with
# every partner contributes nothing to merged_dict2. The label and id of a merged
# line come from whichever of the two annotations is listed first in the file.
# Confirm both are intended.
for key, candidates in mergeAnn_dict2.items():

    merged_annotationlist_perfile = []

    # start/stop strings of spans merged while handling earlier annotations
    seen_offsets = set()

    for first in candidates:

        # offsets merged for `first`; published only after its inner loop ends,
        # so one annotation may merge with several partners
        offsets_this_round = []

        for second in candidates:

            # never pair an annotation with itself (or an identical line)
            if first == second:
                continue

            r = mergeList(first, second)
            if not r:
                continue

            span = r[1].split()[1:]

            # skip the pair when both of its offsets were already merged
            if span[0] in seen_offsets and span[1] in seen_offsets:
                continue

            merged_annotationlist_perfile.append(r)
            all_annotations_list2.append(r)
            offsets_this_round.extend(span)

        seen_offsets.update(offsets_this_round)

    merged_dict2[key] = merged_annotationlist_perfile

## Combine merged annotations with the other annotations

In [12]:
# Final annotation list per file: the merged usage+purpose lines come first,
# followed by the lines that were kept unchanged.
combinedAnnotations_PubMed = {
    file: merged_dict2.get(file) + keepAnn_dict2.get(file)
    for file in merged_dict2
}

In [13]:
#combinedAnnotations_PubMed.get('PMC3568080.ann')

### PubMed: writing merged annotations to the `Pubmed_fulltext_Clean` folder

In [14]:
import os

# Output folder for the cleaned PubMed annotation files.
path2 = 'SoMeSci/Pubmed_fulltext_Clean/'

# Create the folder if needed; open(..., "w") does not create missing directories.
os.makedirs(path2, exist_ok=True)

for file, annotation_lines in combinedAnnotations_PubMed.items():

    file_path = path2 + file

    with open(file_path, "w") as f1:

        for line in annotation_lines:

            # Each entry is a tab-split line; joining restores the BRAT layout.
            txt = '\t'.join(line)

            # The last field normally still ends in a newline. Add one when it
            # does not, so that this annotation cannot run into the next one.
            if not txt.endswith('\n'):
                txt += '\n'

            f1.write(txt)

print("# of files with Combined annotations:", len(combinedAnnotations_PubMed))

# of files with Combined annotations: 100
