# Merging Annotations


##  1. PLoS Methods

In [1]:
import os
from os import listdir
from os.path import isfile, join


# Annotation labels we care about: the software-usage label plus every
# purpose label.
interest_list = [
    'Application_Usage',
    'Purpose_Analysis',
    'Purpose_Modelling',
    'Purpose_Stimulation',
    'Purpose_DataCollection',
    'Purpose_DataPreProcss',
    'Purpose_Simulation',
    'Purpose_Visualization',
    'Purpose_Programming',
]

# Directory that holds the PLoS methods annotation files.
path = 'SoMeSci/PLoS_methods/'

# Names of every .ann file found directly in that directory, in sorted order.
PLOS_filesList = sorted(
    entry for entry in os.listdir(path) if entry.endswith('.ann')
)

# Report how many files were found and preview the first five names.
print('.ann PLosMethods file count : ', len(PLOS_filesList), '\n')
print('Firts 5 files: ', PLOS_filesList[:5])

.ann PLosMethods file count :  480 

Firts 5 files:  ['PMC1088287.ann', 'PMC1657052.ann', 'PMC1831734.ann', 'PMC1880851.ann', 'PMC1939732.ann']


In [2]:
"""

There are different kinds of annotations in the .ann file. 
-----------------------------------------------------------

TYPES OF ANNOTATIONS

T - text bound annotations
R - relations
E - event
A - attribute
M - modification
N - Normalization 
-----------------------------------------------------------

We are interested in "text bound annotations only" that indicate software_usage and software_purpose.

Therefore get a list of all annotations in the .ann file iff:
        
        1. annotations are text bound ( start with T) & 
        2. annotations indicate software_usage or software_purpose
        
the result is then stored in a dictionary 

where,

    key of dict == name of the file 
    Value of dict == list of list of annotation lines in the file
"""

# Maps each .ann file name to the list of tokenised annotation lines kept
# from that file.
fileAn_dict = {}

# Retained from the original cell for backward compatibility; it is never
# populated.
annotation_filelist = []

# loop over each file
for indx, file in enumerate(PLOS_filesList):

    # path to each file (``path`` already ends with a separator)
    file_path = path + file

    # annotation lines kept for the current file
    annotation_line = []

    # Read with an explicit encoding so the result does not depend on the
    # platform's default locale.
    with open(file_path, "r", encoding="utf-8") as a_file:

        for line in a_file:

            tokens = line.split()

            # Blank or truncated lines have no annotation-type column.
            # Skip them instead of failing with an IndexError.
            if len(tokens) < 2:
                continue

            # Keep text-bound annotations (id starts with 'T') whose type is
            # one of the labels of interest.
            #
            # The original cell also tested ``line.split()[2:4] not in
            # annotation_line``. That compared a [start, stop] pair with whole
            # token lists, so it never matched and filtered nothing. It is
            # dropped here; every matching line is kept, exactly as before,
            # which the later merge step needs in order to pair lines that
            # share a start offset.
            if line.startswith('T') and tokens[1] in interest_list:
                annotation_line.append(tokens)
                print(indx, file, tokens)

    fileAn_dict[file] = annotation_line

1 PMC1657052.ann ['T1', 'Application_Usage', '3670', '3681', 'DS', 'modeling']
1 PMC1657052.ann ['T2', 'Application_Usage', '5677', '5683', 'pCLAMP']
1 PMC1657052.ann ['T3', 'Application_Usage', '1203', '1211', 'ENSEMBLE']
1 PMC1657052.ann ['T4', 'Purpose_Analysis', '1203', '1211', 'ENSEMBLE']
1 PMC1657052.ann ['T6', 'Purpose_Modelling', '3670', '3681', 'DS', 'modeling']
1 PMC1657052.ann ['T12', 'Purpose_DataCollection', '5677', '5683', 'pCLAMP']
2 PMC1831734.ann ['T1', 'Application_Usage', '1447', '1450', 'RIU']
2 PMC1831734.ann ['T2', 'Application_Usage', '8512', '8517', 'Stata']
2 PMC1831734.ann ['T9', 'Purpose_DataCollection', '1447', '1450', 'RIU']
2 PMC1831734.ann ['T10', 'Purpose_Analysis', '8512', '8517', 'Stata']
4 PMC1939732.ann ['T1', 'Application_Usage', '1984', '1990', 'AsReml']
4 PMC1939732.ann ['T3', 'Application_Usage', '7300', '7306', 'AsReml']
4 PMC1939732.ann ['T10', 'Purpose_Modelling', '1984', '1990', 'AsReml']
4 PMC1939732.ann ['T7', 'Purpose_Stimulation', '7238',

358 PMC5409177.ann ['T1', 'Application_Usage', '7227', '7232', 'STATA']
358 PMC5409177.ann ['T3', 'Purpose_Analysis', '7227', '7232', 'STATA']
359 PMC5425029.ann ['T1', 'Application_Usage', '1836', '1842', 'ImageJ']
359 PMC5425029.ann ['T2', 'Application_Usage', '2348', '2351', 'JMP']
359 PMC5425029.ann ['T3', 'Application_Usage', '10569', '10572', 'JMP']
359 PMC5425029.ann ['T4', 'Application_Usage', '10782', '10786', 'SPSS']
359 PMC5425029.ann ['T12', 'Purpose_DataCollection', '1836', '1842', 'ImageJ']
359 PMC5425029.ann ['T13', 'Purpose_Analysis', '2348', '2351', 'JMP']
359 PMC5425029.ann ['T14', 'Purpose_Analysis', '10569', '10572', 'JMP']
359 PMC5425029.ann ['T15', 'Purpose_Analysis', '10782', '10786', 'SPSS']
359 PMC5425029.ann ['T16', 'Purpose_Modelling', '15862', '15863', 'R']
359 PMC5425029.ann ['T17', 'Purpose_Modelling', '15891', '15895', 'nlme']
360 PMC5435321.ann ['T1', 'Application_Usage', '6663', '6667', 'SPSS']
360 PMC5435321.ann ['T2', 'Application_Usage', '6729', '673

In [3]:
# Check the claim printed below instead of stating it unconditionally:
# every listed .ann file must have exactly one entry in the mapping.
assert len(fileAn_dict) == len(PLOS_filesList), \
    'fileAn_dict does not contain one entry per .ann file'

print('The size of the mapping dictionary: ', len(fileAn_dict))
print('ALl files included!')

The size of the mapping dictionary:  480
ALl files included!


## Merging Annotations

In [4]:
def mergeList(list_1, list_2):
    """Merge two tokenised annotation lines that share the same START value.

    Each annotation line has the form::

        [Tx, Annotation, START, STOP, name-of-software tokens...]

    for example ``['T2', 'Application_Usage', '13536', '13539', 'SAS']``.

    Parameters
    ----------
    list_1, list_2 : list
        Tokenised annotation lines to compare.

    Returns
    -------
    list
        ``[id, 'annotation_1:annotation_2', start, stop, name_tokens]`` when
        both lines carry the same START value. The id, start, stop and name
        tokens all come from ``list_1``; only the annotation label is combined
        with the one from ``list_2``. An empty list is returned when either
        line has no START field or the START values differ.
    """
    # A line with fewer than three fields has no START value to compare.
    if len(list_1) < 3 or len(list_2) < 3:
        return []

    # Only annotations anchored at the same start position are merged.
    if list_1[2] != list_2[2]:
        return []

    return [
        list_1[0],                     # id of the first annotation
        list_1[1] + ':' + list_2[1],   # both annotation labels, ':'-separated
        list_1[2],                     # start number
        list_1[3],                     # end number
        list_1[4:],                    # software name tokens, kept as a list
    ]

In [5]:

# Collects every non-empty merge result across all files.
merged_annotationlist = []

for file_annotations in fileAn_dict.values():

    # Within one file, try every ordered pair of annotation lines that are
    # not equal to each other. mergeList returns a non-empty record only
    # when the two lines share the same start number.
    pair_results = (
        mergeList(first, second)
        for first in file_annotations
        for second in file_annotations
        if first != second
    )

    merged_annotationlist.extend(
        merged for merged in pair_results if len(merged) != 0
    )

In [6]:
# Report the number of merged records, then preview the first two of them.
merged_count = len(merged_annotationlist)
print('size of merged annotatioon list: ', merged_count, '\n')

print('merged annotation list overview:')
merged_annotationlist[:2]

size of merged annotatioon list:  2056 

merged annotation list overview:


[['T1',
  'Application_Usage:Purpose_Modelling',
  '3670',
  '3681',
  ['DS', 'modeling']],
 ['T2',
  'Application_Usage:Purpose_DataCollection',
  '5677',
  '5683',
  ['pCLAMP']]]

### Store the merged annotation lists in a DataFrame

In [7]:
import pandas as pd

# One row per merged annotation record; the five columns follow the order of
# the fields in each record.
df_PLOS = pd.DataFrame(
    data=merged_annotationlist,
    columns=['code', 'annotation', 'start', 'stop', 'software'],
)
df_PLOS.head(10)

Unnamed: 0,code,annotation,start,stop,software
0,T1,Application_Usage:Purpose_Modelling,3670,3681,"[DS, modeling]"
1,T2,Application_Usage:Purpose_DataCollection,5677,5683,[pCLAMP]
2,T3,Application_Usage:Purpose_Analysis,1203,1211,[ENSEMBLE]
3,T4,Purpose_Analysis:Application_Usage,1203,1211,[ENSEMBLE]
4,T6,Purpose_Modelling:Application_Usage,3670,3681,"[DS, modeling]"
5,T12,Purpose_DataCollection:Application_Usage,5677,5683,[pCLAMP]
6,T1,Application_Usage:Purpose_DataCollection,1447,1450,[RIU]
7,T2,Application_Usage:Purpose_Analysis,8512,8517,[Stata]
8,T9,Purpose_DataCollection:Application_Usage,1447,1450,[RIU]
9,T10,Purpose_Analysis:Application_Usage,8512,8517,[Stata]


In [8]:

"""

our data has duplicates, 
because it is retaining bi-directinal comparison.

example: (T1 T2) as well as (T2 T1) is retained.

drop the redudndent entries!

"""

# Keep the first row for every distinct start value and renumber the index.
# NOTE(review): the frame has no file column, and the rows come from many
# files. De-duplicating on 'start' alone therefore also drops an annotation
# from a different file that happens to share the same start offset.
# TODO: confirm whether the file name should be carried along and used here.
df_PLOS = (
    df_PLOS
    .drop_duplicates(subset='start', keep="first")
    .reset_index(drop=True)
)

# Join the name tokens in the "software" column to regenerate the original
# software name. Only list or tuple values are joined, so re-running this
# cell leaves names that are already strings untouched. A plain
# ``.str.join(" ")`` would split such a string into single characters.
df_PLOS["software"] = df_PLOS["software"].map(
    lambda name: " ".join(name) if isinstance(name, (list, tuple)) else name
)
df_PLOS

Unnamed: 0,code,annotation,start,stop,software
0,T1,Application_Usage:Purpose_Modelling,3670,3681,DS modeling
1,T2,Application_Usage:Purpose_DataCollection,5677,5683,pCLAMP
2,T3,Application_Usage:Purpose_Analysis,1203,1211,ENSEMBLE
3,T1,Application_Usage:Purpose_DataCollection,1447,1450,RIU
4,T2,Application_Usage:Purpose_Analysis,8512,8517,Stata
...,...,...,...,...,...
991,T54,Application_Usage:Purpose_DataPreProcss,4548,4578,Haplotype Reference Consortium
992,T1,Application_Usage:Purpose_Analysis,10140,10145,Stata
993,T2,Application_Usage:Purpose_DataCollection,8352,8358,Enketo
994,T4,Application_Usage:Purpose_DataCollection,8487,8493,Enketo


### Save the df_PLOS image

In [9]:
import dataframe_image as dfi

# Style the first ten rows with a background gradient and render them to PNG.
PLOS_head = df_PLOS.head(10).style.background_gradient()
dfi.export(PLOS_head, "PLOS_head.png")

# Same for the last ten rows.
PLOS_tail = df_PLOS.tail(10).style.background_gradient()
dfi.export(PLOS_tail, "PLOS_tail.png")

# Write the full table to CSV without the index column.
df_PLOS.to_csv(r'PLOS.csv', index = False)

[0106/143221.871682:ERROR:sandbox_linux.cc(376)] InitializeSandbox() called with multiple threads in process gpu-process.
[0106/143221.980374:INFO:headless_shell.cc(653)] Written to file /tmp/tmpfoei7stz/temp.png.
[0106/143222.105989:ERROR:sandbox_linux.cc(376)] InitializeSandbox() called with multiple threads in process gpu-process.
[0106/143222.208018:INFO:headless_shell.cc(653)] Written to file /tmp/tmp3a6a6myy/temp.png.


## 2. PubMed Full Text

In [10]:
import os
from os import listdir
from os.path import isfile, join


# Annotation labels we care about: the software-usage label plus every
# purpose label.
interest_list = ['Application_Usage', 'Purpose_Analysis','Purpose_Modelling', 'Purpose_Stimulation', 
                 'Purpose_DataCollection', 'Purpose_DataPreProcss', 'Purpose_Simulation', 'Purpose_Visualization', 
                 'Purpose_Programming']


#  1. get a list of PubMed file names---------------------------------------------------------------------

# Directory that holds the PubMed full-text annotation files. It is assigned
# once; the original repeated the same assignment further down.
mypath = "SoMeSci/Pubmed_fulltext/"

# Names of every .ann file found directly in that directory, in sorted order.
Pubmed_filesList = sorted(
    entry for entry in os.listdir(mypath) if entry.endswith('.ann')
)


# 2. get file name and list of annotations into a dict-------------------------------------------------------

# Maps each .ann file name to the list of tokenised annotation lines kept
# from that file.
filename_annotation_dict2 = {}

# Retained from the original cell for backward compatibility; it is never
# populated.
annotation_filelist = []

for file in Pubmed_filesList:

    # ``mypath`` already ends with a separator
    file_path = mypath + file

    # annotation lines kept for the current file
    annotation_line = []

    # Read with an explicit encoding so the result does not depend on the
    # platform's default locale.
    with open(file_path, "r", encoding="utf-8") as a_file:

        for line in a_file:

            tokens = line.split()

            # Blank or truncated lines have no annotation-type column.
            # Skip them instead of failing with an IndexError.
            if len(tokens) < 2:
                continue

            # Keep text-bound annotations (id starts with 'T') whose type is
            # one of the labels of interest. The original extra guard compared
            # a [start, stop] pair with whole token lists, so it never matched
            # and filtered nothing. It is dropped without changing which lines
            # are kept.
            if line.startswith('T') and tokens[1] in interest_list:
                annotation_line.append(tokens)

    filename_annotation_dict2[file] = annotation_line


# 3. Merge annotations -----------------------------------------------------------------------

# Collects every non-empty merge result across all PubMed files.
merged_annotationlist2 = []

for key in filename_annotation_dict2:

    file_annotations = filename_annotation_dict2[key]

    # Compare every annotation line with every other, non-equal line of the
    # same file. mergeList returns a non-empty record only when the two
    # lines share the same start number.
    for ls1 in file_annotations:
        for ls2 in file_annotations:
            if ls1 != ls2:
                r = mergeList(ls1, ls2)
                if len(r) != 0:
                    merged_annotationlist2.append(r)

In [11]:

# Report how many PubMed files were mapped, then preview the first two
# merged records.
mapped_file_count = len(filename_annotation_dict2)
print('The size of the mapping for PUB Med Full tex dictionary: ', mapped_file_count)
print('overview of merged annotation list of PubMed full text :')
merged_annotationlist2[:2]

The size of the mapping for PUB Med Full tex dictionary:  100
overview of merged annotation list of PubMed full text :


[['T1', 'Application_Usage:Purpose_Analysis', '15525', '15534', ['SigmaStat']],
 ['T5',
  'Application_Usage:Purpose_DataCollection',
  '23622',
  '23636',
  ['Image-Pro', 'Plus']]]

## Store the merged PubMed annotations in a DataFrame

In [12]:
import pandas as pd

# Build the table from the merged records, keep the first row for every
# distinct start value, and renumber the index.
# NOTE(review): the frame has no file column, so de-duplicating on 'start'
# alone also drops rows from different files that share a start offset.
# TODO: confirm this is intended.
PubMed = (
    pd.DataFrame(
        data=merged_annotationlist2,
        columns=['code', 'annotation', 'start', 'stop', 'software'],
    )
    .drop_duplicates(subset='start', keep="first")
    .reset_index(drop=True)
)

# Join the name tokens in the "software" column to re-create the original
# software name.
PubMed["software"] = PubMed["software"].str.join(" ")

# Write the table to CSV without the index column.
PubMed.to_csv(r'PubMed.csv', index = False)

### Save the PubMed DataFrame image

In [13]:
import dataframe_image as dfi

# Style the first ten rows with a background gradient and render them to PNG.
PubMed_head = PubMed.head(10).style.background_gradient()
dfi.export(PubMed_head, "PubMed_head.png")

# Same for the last ten rows.
PubMed_tail = PubMed.tail(10).style.background_gradient()
dfi.export(PubMed_tail, "PubMed_tail.png")

[0106/143235.885781:ERROR:sandbox_linux.cc(376)] InitializeSandbox() called with multiple threads in process gpu-process.
[0106/143235.995839:INFO:headless_shell.cc(653)] Written to file /tmp/tmpfocnku6h/temp.png.
[0106/143236.116055:ERROR:sandbox_linux.cc(376)] InitializeSandbox() called with multiple threads in process gpu-process.
[0106/143236.234229:INFO:headless_shell.cc(653)] Written to file /tmp/tmpwu_34j3e/temp.png.


## 3. Merging PLOS and PubMed

In [14]:
# Display the PLoS table for comparison before the two tables are combined.
df_PLOS

Unnamed: 0,code,annotation,start,stop,software
0,T1,Application_Usage:Purpose_Modelling,3670,3681,DS modeling
1,T2,Application_Usage:Purpose_DataCollection,5677,5683,pCLAMP
2,T3,Application_Usage:Purpose_Analysis,1203,1211,ENSEMBLE
3,T1,Application_Usage:Purpose_DataCollection,1447,1450,RIU
4,T2,Application_Usage:Purpose_Analysis,8512,8517,Stata
...,...,...,...,...,...
991,T54,Application_Usage:Purpose_DataPreProcss,4548,4578,Haplotype Reference Consortium
992,T1,Application_Usage:Purpose_Analysis,10140,10145,Stata
993,T2,Application_Usage:Purpose_DataCollection,8352,8358,Enketo
994,T4,Application_Usage:Purpose_DataCollection,8487,8493,Enketo


In [15]:
# Display the PubMed table for comparison before the two tables are combined.
PubMed

Unnamed: 0,code,annotation,start,stop,software
0,T1,Application_Usage:Purpose_Analysis,15525,15534,SigmaStat
1,T5,Application_Usage:Purpose_DataCollection,23622,23636,Image-Pro Plus
2,T10,Application_Usage:Purpose_DataCollection,23735,23744,Scope-Pro
3,T14,Application_Usage:Purpose_DataPreProcss,24827,24830,IPP
4,T15,Application_Usage:Purpose_Visualization,24913,24922,Photoshop
...,...,...,...,...,...
256,T74,Application_Usage:Purpose_DataPreProcss,47120,47130,Visiopharm
257,T1,Application_Usage:Purpose_DataPreProcss,9337,9367,Draeger EIT Data Analysis Tool
258,T3,Application_Usage:Purpose_Analysis,11679,11683,SPSS
259,T1,Application_Usage:Purpose_DataPreProcss,11418,11429,Trimmomatic


In [16]:
# Stack the PubMed rows on top of the PLoS rows, then renumber the index
# from zero so the row labels are unique.
PubMed_PLoS = pd.concat([PubMed, df_PLOS]).reset_index(drop=True)
PubMed_PLoS

Unnamed: 0,code,annotation,start,stop,software
0,T1,Application_Usage:Purpose_Analysis,15525,15534,SigmaStat
1,T5,Application_Usage:Purpose_DataCollection,23622,23636,Image-Pro Plus
2,T10,Application_Usage:Purpose_DataCollection,23735,23744,Scope-Pro
3,T14,Application_Usage:Purpose_DataPreProcss,24827,24830,IPP
4,T15,Application_Usage:Purpose_Visualization,24913,24922,Photoshop
...,...,...,...,...,...
1252,T54,Application_Usage:Purpose_DataPreProcss,4548,4578,Haplotype Reference Consortium
1253,T1,Application_Usage:Purpose_Analysis,10140,10145,Stata
1254,T2,Application_Usage:Purpose_DataCollection,8352,8358,Enketo
1255,T4,Application_Usage:Purpose_DataCollection,8487,8493,Enketo


In [17]:
# EXPORT RESULTS

# Style the first and last ten rows of the combined table.
PubMed_PLoS_head = PubMed_PLoS.head(10).style.background_gradient()
PubMed_PLoS_tail = PubMed_PLoS.tail(10).style.background_gradient()

# Render both previews to PNG.
# Fix: the head image was previously exported from ``PubMed_tail``, a
# copy-paste slip, so "PubMed_PLoS_head.png" showed the tail of the PubMed
# table rather than the head of the combined one.
dfi.export(PubMed_PLoS_head, "PubMed_PLoS_head.png")
dfi.export(PubMed_PLoS_tail, "PubMed_PLoS_tail.png")

# save to csv (without the index column)
PubMed_PLoS.to_csv(r'PubMed_PLoS.csv', index = False)

[0106/143246.095056:ERROR:sandbox_linux.cc(376)] InitializeSandbox() called with multiple threads in process gpu-process.
[0106/143246.217664:INFO:headless_shell.cc(653)] Written to file /tmp/tmpnw5ae7c3/temp.png.
[0106/143246.343862:ERROR:sandbox_linux.cc(376)] InitializeSandbox() called with multiple threads in process gpu-process.
[0106/143246.441764:INFO:headless_shell.cc(653)] Written to file /tmp/tmpblggrkrc/temp.png.
