# Imports

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

# Load Mined Data

In [2]:
#text = pd.read_csv("Data/minedText.csv")["title_abstract"]
text = pd.read_csv("Data/manualCurate.csv")["title_abstract"]
# text = text[0:300]
text = pd.read_csv("Data/query1_2-manualCurate.csv")["title_abstract"]
#text = pd.concat([text[0:300],text2]).reset_index(drop=True)

# Split Data into Train and Test Sets

In [3]:
train, test = train_test_split(text, test_size = 0.2, random_state=555) # 20% testing 80% training

In [4]:
train.head()

421    MetaComp: comprehensive analysis software for ...
394    Ion mobility collision cross-section atlas for...
112    Direct Infusion Based Metabolomics Identifies ...
86     Exploring human metabolites using the human me...
334    A novel approach to transforming a non-targete...
Name: title_abstract, dtype: object

# Process Data

In [5]:
# Define a function to process the data
def processText(text):
    # Get abstracts into one continuous string
    text = text.str.cat()
    # Tokenize the string object by word
    text = word_tokenize(text)
    # Remove stop words
    stop = set(stopwords.words('english'))
    text = [w for w in text if not w in stop]
    # Tag each word by the appropriate part of speech (POS) tag
    text = pos_tag(text)
    # Reshape the data into a dataframe
    text = pd.DataFrame(text, columns=['word','POS'])
    return(text)

In [6]:
# Process training data
train = processText(train)
train.head()

Unnamed: 0,word,POS
0,MetaComp,NN
1,:,:
2,comprehensive,JJ
3,analysis,NN
4,software,NN


In [7]:
# Process testing data
test = processText(test)

In [8]:
# What do the POS mean? Here is a key
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

# Create named entity tags

In [9]:
# We want to detect metabolomics software tools, so we will tag some tools
tags = pd.read_csv("Data/CuratedTools.csv")
#toolsToTag = tags['0'].tolist() + ["OSCA", "ROIMCR", "SIRIUS", "TarMet", "COBRA", "FELLA", "PhenoMeNal", "Lilikoi", "MetNet", "KIMBLE", "MRMAnalyzer", "C/VDdb", "MetGem", "MetaboGroup S", "MixProTool", "HappyTools", "MetaboAnalystR","MARSI", "PROMIS", "PaintOmics", "SimExTargId", "MetaboAnalyst", "MRMAssayDB", "MetExplore", "MetaboDiff", "SECIMTools", "VSClust", "CMM", "GSimp", "rDolphin", "DExSI", "polyPK", "ROMANCE", "MINMA", "Curatr", "HMDB", "WikiPathways", "FreeClust", "VANTED", "COMICS", "ADAP-GC", "UC2 search", "proFIA", "MWASTools", "MetaboDrift", "MaBoSS", "MetaboLights", "OLS Client/ OLS Dialog", "MetExtract", "pyQms", "KPIC2", "NOREVA", "Pathview Web", "LipidFinder", "mzML2ISA/NMRL2ISA", "MetCirc", "IMGMD", "BioContainers", "MetCCS predictor", "MS-FLO", "compMS2Miner", "COLMARm","Meneco", "PlantMat", "MetMatch", "GC2MS", "InterpretMSSpectrum", "eRah", "OpenMS","SASMeQ", "R.JIVE", "LaCyTools","SMART", "Heatmapper", "NMRPro","RIPPER", "Elder", "MBROLE", "PIPINO", "Galaxy-M", "specmine", "Unipept", "iMet-Q", "ChainRank", "geoRge", "MassyTools", "FALCON", "Escher", "SIMAT", "MESSI", "OmicsTract/SynpExtractor", "IPO", "Metabolome searcher", "DiPR", "BioSMXpress", "PathVisio", "TrackSM", "GridMass", "EigenMS", "Workflow4Metabolomics", "Pathomx", "LocFuse", "Lipid-Pro", "ALLocator", "MUSCLE", "Dolphin", "Haystack", "MetMSLine", "LipidBlast", "PathCaseMAW", "isoMETLIN", "COCONETS", "MPINet", "STATegra EMS", "OMICtools", "IIS", "KOMICS", "RAMClust", "CGBayesNets", "CFM-ID", "MET-COFEA", "PeptidePicker", "MRMPROBS", "MetDisease", "jmzTab", "MAIT", "iMatch", "MVAPACK", "MASSyPup", "X13CMS", "BiPACE 2D", "Focus", "ORCA", "HAMMER", "ProfileDB", "MetSizeR","MetaboNetworks", "MSPrep", "EBI metagenomics", "CASI", "MAPA/MASS WESTERN/PROMEX/COVAIN", "iPEAP", "INVEX", "INMEX", "tO2PLS", "NMRbot","MetPP", "MetaboQuant", "ATTED-II", "NTFD", "MultiAlign", "BRAIN", "xMSanalyzer","KNApSAcK", "PRIMe Update", "DiffCorr", "mzMatch-ISO", "The ConsensusPathDB", "MetabR", "SEED servers", "IPAD", "MolFind", "MaConDa", "EasyLCMS", "MetiTree","MassTRIX", "MetaboSearch", "ADAP-GC", "BATMAN", "ISIS", "MetaboAnalyst", "MarVis-Filter", "CompExtractor", "MAVEN", "IDEOM", "MetRxn", "Metscape", "MetaCyc database", "CluPA", "Pathos", "MetaboHunter", "TIGER", "MetSign", "MetaboLab", "IQMNMR", "MADMAX", "MetATT", "Metab", "MeRy-B", "MGV", "Guineu", "AStream", "PUTMEDID_LCMS", "CCPN Metabolomics", "Paintomics", "MetaP-server", "PAPi", "MetAssimulo", "Chemical Translation Service", "MetDAT", "MZmine", "MetPA", "DISCO", "MSEA", "PCANS", "Metscape", "VitisNet", "VIBE", "FiehnLib","T3DB", "rNMR", "Pathway Editor", "Grafta", "integrOmics", "ChromA", "TOPPView", "MetaboliteDetector", "MarVis", "TICL", "Automics", "MDAS", "MetaFIND", "GabiPD", "MeltDB", "FiD", "DrugViz", "XCMS", "PolySearch", "KEGG Atlas", "anNET", "OpenMS", "DECOMP", "BioSpider", "MetaNetwork", "P-BOSS", "HiRes", "MeMo", "ASCA", "COMPSARI", "MSFACTS"]
#toolsToTag = list(set(toolsToTag))
toolsToTag = tags.CuratedTools.tolist()

In [10]:
# Tag the training data
train["label"] = ["T" if x in toolsToTag else "O" for x in train.word]
# Tag the testing data
test["label"] = ["T" if x in toolsToTag else "O" for x in test.word]

In [11]:
# Check that we have successfully labeled tools
train.groupby("label").count()

Unnamed: 0_level_0,word,POS
label,Unnamed: 1_level_1,Unnamed: 2_level_1
O,76099,76099
T,1036,1036


In [12]:
# Check that we have successfully labeled tools
test.groupby("label").count()

Unnamed: 0_level_0,word,POS
label,Unnamed: 1_level_1,Unnamed: 2_level_1
O,18536,18536
T,286,286


# Identify individual sentences

In [13]:
# Define a function that identifies where sentences begin/end
def identSentence(textDf):
    # start at sentence 1
    n_sent = 1
    sents = [] # init empty array to wholed sentence identifiers
    # Loop through text incrementing n_sent after each period
    for word in textDf.word:
        if word == ".":
            sents.append(n_sent)
            n_sent += 1
        else:
            sents.append(np.nan) # If we are still before the end of the sentence label it as NA
    textDf['Sent_id'] = sents # Generate a column of the sentences 
    textDf['Sent_id'] = textDf['Sent_id'].bfill() # back fill the NAs to get the correct sentence IDs
    return(textDf)

In [14]:
# Find sentence boundaries in training set
train = identSentence(train)
train.head()

Unnamed: 0,word,POS,label,Sent_id
0,MetaComp,NN,T,1.0
1,:,:,O,1.0
2,comprehensive,JJ,O,1.0
3,analysis,NN,O,1.0
4,software,NN,O,1.0


In [15]:
train.sort_values(by='Sent_id', ascending=False)

Unnamed: 0,word,POS,label,Sent_id
77134,.,.,O,3947.0
77133,identified,VBN,O,3947.0
77132,phases,NNS,O,3947.0
77131,growth,NN,O,3947.0
77130,analyzed,VBD,O,3947.0
...,...,...,...,...
7,including,VBG,O,1.0
8,comparative,JJ,O,1.0
9,metagenomics,NNS,O,1.0
10,.,.,O,1.0


In [16]:
# Find sentence boundaries in testing set
test = identSentence(test)

# Balance sentences

In [17]:
# Define a function to find positive sentences
def balanceSents(textDf):
    positiveSents = textDf[textDf.Sent_id.isin(textDf.groupby(['Sent_id', 'label']).filter(lambda x: len(x) < 2).Sent_id)].copy()
    positiveSents['sent_sign'] = "P"
    numPosSents = positiveSents.groupby('Sent_id').count().shape[0]
    negativeSents = textDf[~textDf.index.isin(positiveSents.index)].copy()
    uniqNegSents = negativeSents['Sent_id'].unique().tolist()
    negativeSents = negativeSents[negativeSents.Sent_id.isin(np.random.choice(uniqNegSents, numPosSents, replace = False))]
    negativeSents['sent_sign'] = "N"
    textDf = pd.concat([positiveSents, negativeSents]).sort_index()
    return(textDf)

In [18]:
#train = balanceSents(train)
#train.head()

# Format using conference on natural language learning (CoNLL) specifications

In [19]:
# Define a function that reformats the data into CoNLL format
def conllFormatter(textDf):
    conll_lines = "-DOCSTART- -X- -X- -O-\n\n"
    save = 0

    for sent, token, pos, label in zip(textDf['Sent_id'], textDf['word'], textDf['POS'], textDf['label']):
        # If we start a new sentence, add empty line.
        if save!=sent:
            conll_lines+="\n"
        
        # Save the line
        conll_lines += "{} {} {} {}\n".format(token, pos, pos, label)
        save = sent
    return(conll_lines)

In [20]:
# Convert the training data to CoNLL format
train = conllFormatter(train)
# Convert the testing data to CoNLL format
test = conllFormatter(test)

In [21]:
# Output the processed training data to a txt file
with open("Data/trainingTextProcessed.txt", "w") as txtfile:
    for line in train:
        txtfile.write(line)
txtfile.close()

# Output the processed testing data to a txt file
with open("Data/testingTextProcessed.txt", "w") as txtfile:
    for line in test:
        txtfile.write(line)
txtfile.close()