# Installing Packages

In [None]:
pip install scikit-multilearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
[K     |████████████████████████████████| 89 kB 7.0 MB/s 
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [None]:
pip install neattext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting neattext
  Downloading neattext-0.1.3-py3-none-any.whl (114 kB)
[K     |████████████████████████████████| 114 kB 34.2 MB/s 
[?25hInstalling collected packages: neattext
Successfully installed neattext-0.1.3


# Importing libraries

In [None]:
import pandas as pd
import numpy as np
import neattext as nt
import neattext.functions as nfx
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import accuracy_score,hamming_loss
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset
from sklearn.metrics import f1_score
import scipy
from skmultilearn.adapt import MLkNN
from sklearn.naive_bayes import GaussianNB
from skmultilearn.ensemble import RakelD
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

# Loading Dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
data=pd.read_csv('/content/drive/MyDrive/MLBioMedLAT-780-Questions (1).csv',sep='\t')

In [None]:
data.head()

Unnamed: 0,No.,Question ID,Question,Labels
0,1,55046d5ff8aee20f27000007,List signaling molecules (ligands) that intera...,umls:aapp
1,2,52f8b2902059c6d71c000053,Which thyroid hormone transporter is implicate...,umls:enzy
2,3,553fa78b1d53b76422000007,Which miRNAs could be used as potential biomar...,"umls:nusq,umls:gngm"
3,4,5149199dd24251bc05000040,Which acetylcholinesterase inhibitors are used...,"umls:orch,umls:clnd,umls:phsu"
4,5,5709e4b2cf1c32585100001c,List the human genes encoding for the dishevel...,"tmtool:Gene,umls:bacs,umls:aapp"


# Preprocessing

In [None]:
data['Question'].apply(lambda x:nt.TextExtractor(x).extract_stopwords())

0                      [that, with, the]
1                        [which, is, in]
2      [which, could, be, used, as, for]
3            [which, are, used, for, of]
4                        [the, for, the]
                     ...                
775                [which, are, the, of]
776         [which, is, the, most, used]
777                 [what, are, the, of]
778         [what, is, the, of, through]
779                          [which, in]
Name: Question, Length: 780, dtype: object

In [None]:
data['Question']=data['Question'].apply(nfx.remove_stopwords)

In [None]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
def remove_punctuation(text):
    """Return ``text`` with every punctuation character removed.

    ``text`` is walked item by item; items found in ``string.punctuation`` are
    discarded and the remaining items are concatenated into a single string.
    """
    kept = []
    for ch in text:
        if ch in string.punctuation:
            continue  # punctuation: drop it
        kept.append(ch)
    punctuationfree = "".join(kept)
    return punctuationfree

data['Question']= data['Question'].apply(lambda x:remove_punctuation(x))

# Genia Tagger

In [None]:
!git clone https://github.com/saffsd/geniatagger && cd geniatagger && make #Installing genia tagger C code

Cloning into 'geniatagger'...
remote: Enumerating objects: 53, done.[K
remote: Total 53 (delta 0), reused 0 (delta 0), pack-reused 53[K
Unpacking objects: 100% (53/53), done.
g++ -c -O2 -DNDEBUG main.cpp
In file included from [01m[K/usr/include/c++/7/ext/hash_map:60:0[m[K,
                 from [01m[Kmaxent.h:20[m[K,
                 from [01m[Kmain.cpp:11[m[K:
  [01;35m[K^~~~~~~[m[K
g++ -c -O2 -DNDEBUG maxent.cpp
In file included from [01m[K/usr/include/c++/7/ext/hash_map:60:0[m[K,
                 from [01m[Kmaxent.h:20[m[K,
                 from [01m[Kmaxent.cpp:5[m[K:
  [01;35m[K^~~~~~~[m[K
g++ -c -O2 -DNDEBUG tokenize.cpp
g++ -c -O2 -DNDEBUG bidir.cpp
In file included from [01m[K/usr/include/c++/7/ext/hash_map:60:0[m[K,
                 from [01m[Kmaxent.h:20[m[K,
                 from [01m[Kbidir.cpp:14[m[K:
  [01;35m[K^~~~~~~[m[K
g++ -c -O2 -DNDEBUG morph.cpp
g++ -c -O2 -DNDEBUG chunking.cpp
In file included from [01m[K/usr/inc

In [None]:
cd geniatagger/

/content/geniatagger


In [None]:
Question=data['Question']

In [None]:
# Dump the cleaned questions, one per line, into the file that is handed to the tagger.
with open('Questions.txt', 'w') as questions_file:
    questions_file.writelines("%s\n" % item for item in Question)

In [None]:
tagger_result=! ./geniatagger Questions.txt 

In [None]:
tagger_result

['loading morphdic...done.',
 'loading pos_models................done.',
 'loading chunk_models....done.',
 'loading named_entity_models..done.',
 'List\tList\tNN\tB-NP\tO',
 'signaling\tsignal\tVBG\tB-VP\tO',
 'molecules\tmolecule\tNNS\tB-NP\tO',
 'ligands\tligand\tNNS\tI-NP\tO',
 'interact\tinteract\tVBP\tB-VP\tO',
 'receptor\treceptor\tNN\tB-NP\tO',
 'EGFR\tEGFR\tNN\tI-NP\tB-protein',
 '',
 'thyroid\tthyroid\tNN\tB-NP\tB-protein',
 'hormone\thormone\tNN\tI-NP\tI-protein',
 'transporter\ttransporter\tNN\tI-NP\tI-protein',
 'implicated\timplicate\tVBD\tB-VP\tO',
 'thyroid\tthyroid\tNN\tB-NP\tO',
 'hormone\thormone\tNN\tI-NP\tO',
 'resistance\tresistance\tNN\tI-NP\tO',
 'syndrome\tsyndrome\tNN\tI-NP\tO',
 '',
 'miRNAs\tmiRNA\tNNS\tB-NP\tO',
 'potential\tpotential\tJJ\tI-NP\tO',
 'biomarkers\tbiomarker\tNNS\tI-NP\tO',
 'epithelial\tepithelial\tJJ\tB-NP\tO',
 'ovarian\tovarian\tJJ\tI-NP\tO',
 'cancer\tcancer\tNN\tI-NP\tO',
 '',
 'acetylcholinesterase\tacetylcholinesterase\tNN\tB-NP\tO',


In [None]:
c=tagger_result[4:] #Slice array to get question's POS

In [None]:
tag_questions=[]

In [None]:
# Group the tagger's per-token lines into one list per question.
# In the tagger output an empty string marks the end of a question.
tag_questions = []  # rebuilt here so re-running this cell does not duplicate questions
question = []
for line in c:
  if line != '':
    question.append(line)
  else:
    tag_questions.append(question)
    question = []
if question:
  # The output did not end with an empty separator line: keep the last
  # question instead of silently dropping it.
  tag_questions.append(question)
  question = []

In [None]:
tag_questions

[['List\tList\tNN\tB-NP\tO',
  'signaling\tsignal\tVBG\tB-VP\tO',
  'molecules\tmolecule\tNNS\tB-NP\tO',
  'ligands\tligand\tNNS\tI-NP\tO',
  'interact\tinteract\tVBP\tB-VP\tO',
  'receptor\treceptor\tNN\tB-NP\tO',
  'EGFR\tEGFR\tNN\tI-NP\tB-protein'],
 ['thyroid\tthyroid\tNN\tB-NP\tB-protein',
  'hormone\thormone\tNN\tI-NP\tI-protein',
  'transporter\ttransporter\tNN\tI-NP\tI-protein',
  'implicated\timplicate\tVBD\tB-VP\tO',
  'thyroid\tthyroid\tNN\tB-NP\tO',
  'hormone\thormone\tNN\tI-NP\tO',
  'resistance\tresistance\tNN\tI-NP\tO',
  'syndrome\tsyndrome\tNN\tI-NP\tO'],
 ['miRNAs\tmiRNA\tNNS\tB-NP\tO',
  'potential\tpotential\tJJ\tI-NP\tO',
  'biomarkers\tbiomarker\tNNS\tI-NP\tO',
  'epithelial\tepithelial\tJJ\tB-NP\tO',
  'ovarian\tovarian\tJJ\tI-NP\tO',
  'cancer\tcancer\tNN\tI-NP\tO'],
 ['acetylcholinesterase\tacetylcholinesterase\tNN\tB-NP\tO',
  'inhibitors\tinhibitor\tNNS\tI-NP\tO',
  'treatment\ttreatment\tNN\tI-NP\tO',
  'myasthenia\tmyasthenia\tNN\tI-NP\tO',
  'gravis\tgrav

In [None]:
len(tag_questions)

780

In [None]:
# Turn every tab-separated tagger line into a list of its fields.
# Each inner list is updated in place, so `tag_questions` keeps its identity.
for tagged_question in tag_questions:
  tagged_question[:] = [token_line.split('\t') for token_line in tagged_question]
    

In [None]:
tag_questions

[[['List', 'List', 'NN', 'B-NP', 'O'],
  ['signaling', 'signal', 'VBG', 'B-VP', 'O'],
  ['molecules', 'molecule', 'NNS', 'B-NP', 'O'],
  ['ligands', 'ligand', 'NNS', 'I-NP', 'O'],
  ['interact', 'interact', 'VBP', 'B-VP', 'O'],
  ['receptor', 'receptor', 'NN', 'B-NP', 'O'],
  ['EGFR', 'EGFR', 'NN', 'I-NP', 'B-protein']],
 [['thyroid', 'thyroid', 'NN', 'B-NP', 'B-protein'],
  ['hormone', 'hormone', 'NN', 'I-NP', 'I-protein'],
  ['transporter', 'transporter', 'NN', 'I-NP', 'I-protein'],
  ['implicated', 'implicate', 'VBD', 'B-VP', 'O'],
  ['thyroid', 'thyroid', 'NN', 'B-NP', 'O'],
  ['hormone', 'hormone', 'NN', 'I-NP', 'O'],
  ['resistance', 'resistance', 'NN', 'I-NP', 'O'],
  ['syndrome', 'syndrome', 'NN', 'I-NP', 'O']],
 [['miRNAs', 'miRNA', 'NNS', 'B-NP', 'O'],
  ['potential', 'potential', 'JJ', 'I-NP', 'O'],
  ['biomarkers', 'biomarker', 'NNS', 'I-NP', 'O'],
  ['epithelial', 'epithelial', 'JJ', 'B-NP', 'O'],
  ['ovarian', 'ovarian', 'JJ', 'I-NP', 'O'],
  ['cancer', 'cancer', 'NN', 'I

In [None]:
tag_questions[0] #first question

[['List', 'List', 'NN', 'B-NP', 'O'],
 ['signaling', 'signal', 'VBG', 'B-VP', 'O'],
 ['molecules', 'molecule', 'NNS', 'B-NP', 'O'],
 ['ligands', 'ligand', 'NNS', 'I-NP', 'O'],
 ['interact', 'interact', 'VBP', 'B-VP', 'O'],
 ['receptor', 'receptor', 'NN', 'B-NP', 'O'],
 ['EGFR', 'EGFR', 'NN', 'I-NP', 'B-protein']]

# Extracting noun phrases from questions

In [None]:
p=[]
o=[]

In [None]:
# Collect the nouns of every question.
# Each token row has the form [word, base form, POS tag, chunk tag, NE tag],
# so the POS tag is read from column 2 only and the surface word from column 0.
# (Scanning every column and reading `row[j-2]` would pick up a wrong field via
# negative indexing whenever a word or base form happens to equal a tag name.)
NOUN_TAGS = {'NN', 'NNP', 'NNPS', 'NNS'}

o = []  # rebuilt here so re-running this cell does not append the questions twice
for tagged_question in tag_questions:
  p = [token[0] for token in tagged_question
       if len(token) > 2 and token[2] in NOUN_TAGS]
  o.append(p)
p = []  # scratch list is left empty, as before

# Inspect a sample with `o[:5]` instead of printing every noun.

List
molecules
ligands
receptor
EGFR
thyroid
hormone
transporter
thyroid
hormone
resistance
syndrome
miRNAs
biomarkers
cancer
acetylcholinesterase
inhibitors
treatment
myasthenia
gravis
genes
proteins
synonym
Acrokeratosis
paraneoplastica
classes
drugs
VaughanWilliams
classification
isoforms
Notch
receptor
characteristics
senescence
Orteronel
treatment
cancer
protein
gene
GLT8D1
Yamanaka
factors
protein
Pannexin1
mode
inheritance
Wilsons
disease
inheritance
dystrophy
FSHD
proteins
formation
ryanodine
receptor
complex
kind
chromatography
HILIC
diagnoses
children
autism
MAP
kinase
transcription
factor
cjun
acronym
TAILS
protein
Nterminomics
indications
lacosamide
fusion
protein
development
Ewing
sarcoma
List
Hemolytic
Uremic
Syndrome
Triad
effects
protein
km231
DYNLRB1
cell
Treatment
disease
MR
CLEAN
study
factors
gene
expression
transition
zebrafish
enzyme
Evolocumab
methyl
donor
DNA
cytosine5methyltransferases
phase
cell
cycle
arrest
Fanconi
anemia
Mutation
gene
adenoma
mutations
tropo

In [None]:
len(o[0])

5

In [None]:
o

[['List', 'molecules', 'ligands', 'receptor', 'EGFR'],
 ['thyroid',
  'hormone',
  'transporter',
  'thyroid',
  'hormone',
  'resistance',
  'syndrome'],
 ['miRNAs', 'biomarkers', 'cancer'],
 ['acetylcholinesterase', 'inhibitors', 'treatment', 'myasthenia', 'gravis'],
 ['genes', 'proteins'],
 ['synonym', 'Acrokeratosis', 'paraneoplastica'],
 ['classes', 'drugs', 'VaughanWilliams', 'classification'],
 ['isoforms', 'Notch', 'receptor'],
 ['characteristics', 'senescence'],
 ['Orteronel', 'treatment', 'cancer'],
 ['protein', 'gene', 'GLT8D1'],
 ['Yamanaka', 'factors'],
 ['protein', 'Pannexin1'],
 ['mode', 'inheritance', 'Wilsons', 'disease'],
 ['inheritance', 'dystrophy', 'FSHD'],
 ['proteins', 'formation', 'ryanodine', 'receptor', 'complex'],
 ['kind', 'chromatography', 'HILIC'],
 ['diagnoses', 'children', 'autism'],
 ['MAP', 'kinase', 'transcription', 'factor', 'cjun'],
 ['acronym', 'TAILS', 'protein', 'Nterminomics'],
 ['indications', 'lacosamide'],
 ['fusion', 'protein', 'development'

In [None]:
len(o)

780

In [None]:
o[0]

['List', 'molecules', 'ligands', 'receptor', 'EGFR']

# Extracting verb phrases from questions

In [None]:
y=[]
verb=[]

In [None]:
# Collect the verbs of every question.
# Each token row has the form [word, base form, POS tag, chunk tag, NE tag],
# so the POS tag is read from column 2 only and the surface word from column 0.
# (Scanning every column and reading `row[j-2]` would pick up a wrong field via
# negative indexing whenever a word or base form happens to equal a tag name.)
# NOTE(review): VBG (gerund) was not in the original tag list and is still
# excluded here - confirm that this is intended.
VERB_TAGS = {'VB', 'VBD', 'VBN', 'VBZ', 'VBP'}

verb = []  # rebuilt here so re-running this cell does not append the questions twice
for tagged_question in tag_questions:
  y = [token[0] for token in tagged_question
       if len(token) > 2 and token[2] in VERB_TAGS]
  verb.append(y)
y = []  # scratch list is left empty, as before

# Inspect a sample with `verb[:5]` instead of printing every verb.

interact
implicated
developed
encoded
participate
reported
phosphorylates
involved
investigated
activate
targeted
impaired
implicated
isolated
found
cause
targets
ubiquitinates
implicated
targeted
inhibited
related
measured
required
considered
Tay
known
utilized
amiodaroneinduced
contains
offlabel
neonates
cited
inhibied
thought
regulated
contain
caused
inhibited
found
mutated
known
needed
known
localized
chromodomain
delayed
utilized
phopspholamban
found
associated
autophagy
activates
appears
mutated
reported
exist
determined
characterized
administered
proposed
forms
associated
subject
found
cause
targeted
transcriptioncoupled
linked
included
inhibit
found
interact
contain
targets
implicated
set
associated
abacavir
diagnosis
revealed
associated
proposed
involved
cause
involved
associated
compared
screened
comprise
tested
localized
associated
associated
induce
shown
encoded
belong
dinucleotide
argininemethylated
isolated
derived
caused
involved
approved
imprinted
regulated
achieved
alp

In [None]:
verb

[['interact'],
 ['implicated'],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['developed'],
 ['encoded'],
 [],
 [],
 [],
 [],
 ['participate'],
 [],
 ['reported'],
 ['phosphorylates'],
 [],
 [],
 ['involved'],
 [],
 [],
 ['investigated'],
 ['activate'],
 ['targeted'],
 [],
 ['impaired'],
 ['implicated', 'isolated'],
 ['found', 'cause'],
 ['targets'],
 ['ubiquitinates'],
 ['implicated'],
 ['targeted'],
 ['inhibited'],
 ['related'],
 [],
 [],
 ['measured'],
 [],
 ['required'],
 ['considered'],
 ['Tay'],
 ['known'],
 [],
 [],
 ['utilized', 'amiodaroneinduced'],
 [],
 ['contains'],
 ['offlabel', 'neonates', 'cited'],
 ['inhibied'],
 ['thought', 'regulated'],
 ['contain'],
 ['caused'],
 [],
 ['inhibited'],
 ['found', 'mutated'],
 ['known'],
 [],
 [],
 ['needed'],
 [],
 [],
 ['known'],
 [],
 [],
 ['localized'],
 ['chromodomain'],
 ['delayed'],
 [],
 [],
 ['utilized'],
 ['phopspholamban', 'found'],
 [],
 ['associated'],
 ['autophagy'],
 ['activates'],
 ['appears', 'mutated', 'reported'],
 ['exist', 'd

In [None]:
len(verb)

780

In [None]:
verb[0]

['interact']

In [None]:
cd ..

/content


# Protein-Protein Interaction Feature

In [None]:
protein=[]
p=[]

In [None]:
# Build the protein feature: exactly one label per token of each question.
# The named-entity tag is the last column (index 4) of a token row; it is kept
# when it marks a protein and replaced by 'O' otherwise.
# (Looping over all five columns of a row produced five labels per token, four
# of which were always filler 'O'.)
PROTEIN_TAGS = {'B-protein', 'I-protein'}

protein = []  # rebuilt here so re-running this cell does not append the questions twice
for tagged_question in tag_questions:
  p = [token[4] if len(token) > 4 and token[4] in PROTEIN_TAGS else 'O'
       for token in tagged_question]
  protein.append(p)
p = []  # scratch list is left empty, as before

In [None]:
len(protein)

780

In [None]:
data['Protein']=protein

In [None]:
data['Protein'] = [','.join(map(str, l)) for l in data['Protein']]

In [None]:
data.head()

Unnamed: 0,No.,Question ID,Question,Labels,Protein
0,1,55046d5ff8aee20f27000007,List signaling molecules ligands interact rece...,umls:aapp,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,2,52f8b2902059c6d71c000053,thyroid hormone transporter implicated thyroid...,umls:enzy,"O,O,O,O,B-protein,O,O,O,O,I-protein,O,O,O,O,I-..."
2,3,553fa78b1d53b76422000007,miRNAs potential biomarkers epithelial ovarian...,"umls:nusq,umls:gngm","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,4,5149199dd24251bc05000040,acetylcholinesterase inhibitors treatment myas...,"umls:orch,umls:clnd,umls:phsu","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
4,5,5709e4b2cf1c32585100001c,List human genes encoding dishevelled proteins,"tmtool:Gene,umls:bacs,umls:aapp","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


# MetaMap API

In [None]:
!python3 -m pip install requests requests-html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting requests-html
  Downloading requests_html-0.10.0-py3-none-any.whl (13 kB)
Collecting pyquery
  Downloading pyquery-1.4.3-py3-none-any.whl (22 kB)
Collecting parse
  Downloading parse-1.19.0.tar.gz (30 kB)
Collecting fake-useragent
  Downloading fake-useragent-0.1.11.tar.gz (13 kB)
Collecting w3lib
  Downloading w3lib-2.0.1-py3-none-any.whl (20 kB)
Collecting pyppeteer>=0.0.14
  Downloading pyppeteer-1.0.2-py3-none-any.whl (83 kB)
[K     |████████████████████████████████| 83 kB 1.6 MB/s 
Collecting websockets<11.0,>=10.0
  Downloading websockets-10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (112 kB)
[K     |████████████████████████████████| 112 kB 37.3 MB/s 
Collecting pyee<9.0.0,>=8.1.0
  Downloading pyee-8.2.2-py2.py3-none-any.whl (12 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-p

In [None]:
!git clone 'https://github.com/lhncbc/skr_web_python_api'

Cloning into 'skr_web_python_api'...
remote: Enumerating objects: 49, done.[K
remote: Counting objects: 100% (49/49), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 49 (delta 23), reused 45 (delta 19), pack-reused 0[K
Unpacking objects: 100% (49/49), done.


In [None]:
cd skr_web_python_api/

/content/skr_web_python_api


In [None]:
!python3 -m pip install --upgrade pip
!python3 -m pip install --upgrade build
!python3 -m build --no-isolation

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-22.2.2-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 5.1 MB/s 
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-22.2.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting build
  Downloading build-0.8.0-py3-none-any.whl (17 kB)
Installing collected packages: build
Successfully installed build-0.8.0
[0m[1m* Getting dependencies for sdist...[0m
running egg_info
creating src/skr_web_api.egg-info
writing src/skr_web_api.egg-info/PKG-INFO
writing dependency_links to src/skr_web_api.egg-info/dependency_links.txt
writing top-level names to src/skr_web_api.egg-info/top_level.txt
writing manifest file 'src/skr_web_ap

In [None]:
!python3 -m pip install dist/skr_web_api-0.1-py3-none-any.whl

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Processing ./dist/skr_web_api-0.1-py3-none-any.whl
Installing collected packages: skr-web-api
Successfully installed skr-web-api-0.1
[0m

In [None]:
from skr_web_api import Submission, METAMAP_INTERACTIVE_URL

In [None]:
r=[]
cc=[]
response=[]

In [None]:
# Number of extracted nouns per question; used later to regroup the flat API responses.
length = [len(nouns) for nouns in o]

In [None]:
response=[]

In [None]:
import os

# Credentials for the MetaMap interactive service are read from the environment
# so they never end up in the notebook. Set SKR_API_EMAIL and SKR_API_KEY before
# running this cell. The key that used to be written here in plain text should
# be treated as compromised and replaced.
email = os.environ['SKR_API_EMAIL']
apikey = os.environ['SKR_API_KEY']

# One request per noun, in question order; the flat `response` list is regrouped
# per question afterwards using the per-question noun counts.
response = []  # rebuilt here so re-running this cell does not accumulate old responses
for nouns in o:
  for noun in nouns:
    inst = Submission(email, apikey)
    inputtext = noun
    inst.init_mm_interactive(inputtext)
    response.append(inst.submit())

In [None]:
response[0].text

'/dmzfiler/II_Group/MetaMap2020/public_mm/bin/SKRrun.20 /dmzfiler/II_Group/MetaMap2020/public_mm/bin/metamap20.BINARY.Linux --lexicon db -Z 2020AB -N\nUSER|MMI|5.18|List|C0745732|[inpr]|["List"-tx-1-"List"-noun-0]|TX|0/4|\nUSER|MMI|5.18|Sequence Data Type|C3272378|[inpr]|["LIST"-tx-1-"List"-noun-0]|TX|0/4|\n'

In [None]:
ss=[]

In [None]:
# Split the raw text of every response on '|' to get its individual fields.
ss.extend(resp.text.split('|') for resp in response)

# Extracting noun concepts from response

In [None]:
noun_API_concepts=[]

In [None]:
# Cut the flat list `ss` back into per-question groups: `length` says how many
# consecutive entries belong to each question.
prev = 0
for n_terms in length:
    stop = prev + n_terms
    noun_API_concepts.append([*ss[prev:stop]])
    prev = stop

In [None]:
len(noun_API_concepts)

780

In [None]:
def index_in_list(a_list, index):
    """Return True when ``a_list[index]`` can be read without an IndexError.

    Args:
        a_list: any sized sequence.
        index: integer position; negative values count from the end.

    Returns:
        bool: True if ``index`` is a valid position in ``a_list``.
    """
    # Negative indices are valid only down to -len(a_list); a plain
    # `index < len(a_list)` check would accept every negative number.
    return -len(a_list) <= index < len(a_list)

In [None]:
nn=[]
noun_concepts=[]

In [None]:
# For every question, keep field 5 of each split response. In the sample
# response shown above that field is the bracketed semantic type of the first
# result line (e.g. '[inpr]'). Responses with fewer than six fields are skipped.
for question_responses in noun_API_concepts:
  nn = [fields[5] for fields in question_responses if index_in_list(fields, 5)]
  noun_concepts.append(nn)
nn = []
    

In [None]:
len(noun_concepts)

780

In [None]:
# Save the per-question split responses, one question per line.
# NOTE(review): this writes `noun_API_concepts` (the full split responses), not
# the extracted `noun_concepts` - confirm that this is the intended content.
with open('noun_concepts.txt', 'w') as out_file:
    out_file.writelines("%s\n" % item for item in noun_API_concepts)

In [None]:
data['Noun Concepts']=noun_concepts

In [None]:
data['Noun Concepts'] = [','.join(map(str, l)) for l in data['Noun Concepts']]

In [None]:
data.head()

Unnamed: 0,No.,Question ID,Question,Labels,Protein,Noun Concepts
0,1,55046d5ff8aee20f27000007,List signaling molecules ligands interact rece...,umls:aapp,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...","[inpr],[sbst],[chem],[aapp,rcpt],[aapp,rcpt]"
1,2,52f8b2902059c6d71c000053,thyroid hormone transporter implicated thyroid...,umls:enzy,"O,O,O,O,B-protein,O,O,O,O,I-protein,O,O,O,O,I-...","[irda],[qlco],[aapp,bacs],[irda],[qlco],[ftcn]..."
2,3,553fa78b1d53b76422000007,miRNAs potential biomarkers epithelial ovarian...,"umls:nusq,umls:gngm","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...","[bacs,nnon],[clna],[euka]"
3,4,5149199dd24251bc05000040,acetylcholinesterase inhibitors treatment myas...,"umls:orch,umls:clnd,umls:phsu","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O","[aapp,enzy],[chvf]"
4,5,5709e4b2cf1c32585100001c,List human genes encoding dishevelled proteins,"tmtool:Gene,umls:bacs,umls:aapp","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...",


# Extracting verb concepts 

In [None]:
# Start the verb pass from clean state.
r, cc, response = [], [], []
# Number of extracted verbs per question; used later to regroup the flat API responses.
length = [len(verbs) for verbs in verb]

In [None]:
length

[1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 2,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 2,
 0,
 1,
 3,
 1,
 2,
 1,
 1,
 0,
 1,
 2,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 2,
 0,
 1,
 1,
 1,
 3,
 2,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 2,
 2,
 1,
 0,
 1,
 0,
 1,
 2,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 3,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 2,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 2,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 2,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 3,
 1,
 0,
 1,
 0,
 0,
 2,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 2,
 1,
 1,
 0,
 1,
 1,
 1,
 2,
 0,
 0,
 0,
 3,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,


In [None]:
response=[]

In [None]:
import os

# Credentials for the MetaMap interactive service are read from the environment
# so they never end up in the notebook. Set SKR_API_EMAIL and SKR_API_KEY before
# running this cell. The key that used to be written here in plain text should
# be treated as compromised and replaced.
email = os.environ['SKR_API_EMAIL']
apikey = os.environ['SKR_API_KEY']

# One request per verb, in question order; the flat `response` list is regrouped
# per question afterwards using the per-question verb counts.
response = []  # rebuilt here so re-running this cell does not accumulate old responses
for verbs in verb:
  for verb_word in verbs:
    inst = Submission(email, apikey)
    inputtext = verb_word
    inst.init_mm_interactive(inputtext)
    response.append(inst.submit())

In [None]:
response[0].text

'/dmzfiler/II_Group/MetaMap2020/public_mm/bin/SKRrun.20 /dmzfiler/II_Group/MetaMap2020/public_mm/bin/metamap20.BINARY.Linux --lexicon db -Z 2020AB -N\n'

In [None]:
ss=[]

In [None]:
# Split the raw text of every response on '|' to get its individual fields.
ss.extend(resp.text.split('|') for resp in response)

In [None]:
verb_API_concepts=[]

In [None]:
# Cut the flat list `ss` back into per-question groups: `length` says how many
# consecutive entries belong to each question.
prev = 0
for n_terms in length:
    stop = prev + n_terms
    verb_API_concepts.append([*ss[prev:stop]])
    prev = stop

In [None]:
len(verb_API_concepts)

780

In [None]:
pwd

'/content/skr_web_python_api'

In [None]:
def index_in_list(a_list, index):
    """Return True when ``a_list[index]`` can be read without an IndexError.

    Args:
        a_list: Any sized sequence (the notebook passes lists of '|'-separated fields).
        index: Integer position to test; negative values count from the end.

    Returns:
        bool: True if ``index`` is a valid position in ``a_list``.
    """
    # Bound the index on both sides: the former `index < len(a_list)` check
    # reported True for any negative index, even one past the start of the list.
    return -len(a_list) <= index < len(a_list)

In [None]:
nn=[]
verb_concepts=[]

In [None]:
# For each question, collect field 5 of every verb response that has such a
# field, then store that per-question list and start a fresh one.
for api_rows in verb_API_concepts:
  nn.extend(row[5] for row in api_rows if index_in_list(row, 5))
  verb_concepts.append(nn)
  nn=[]
    

In [None]:
len(verb_concepts)

780

In [None]:
data['Verb Concepts']=verb_concepts

In [None]:
# Flatten each per-question list into one comma-separated string.
# NOTE: not idempotent — rerunning on the already-joined strings would comma-separate their characters.
data['Verb Concepts'] = [','.join(str(concept) for concept in concept_list) for concept_list in data['Verb Concepts']]

In [None]:
# Save one line per question with the repr of its entry.
# NOTE(review): this writes `verb_API_concepts` (the split response fields), not
# the filtered `verb_concepts` list — confirm that is the intended content.
with open('verb_concepts.txt', 'w') as f:
    f.writelines("%s\n" % item for item in verb_API_concepts)

# Focus Concepts

In [None]:
focus=['EGFR','thyroid hormone ','miRNAs','acetylcholinesterase inhibitors','human genes','Acrokeratosis paraneoplastica','anti-arrhythmic drugs','isoforms','cellular senescence',
'Orteronel','protein','Yamanaka factors','Pannexin1','mode of inheritance','mode of inheritance','proteins','chromatography','endoscopic diagnoses','MAP kinase phosphorylates',
'TAILS','indications','fusion protein','Hemolytic Uremic Syndrome','protein km23-1 (DYNLRB1)','disease','zygotic gene','enzyme','methyl donor','cell cycle arrest','gene','gene',
'enzyme','E3 ubiquitin ligase','gene','enzyme','interleukins','genes','side effects','SGLT2','FASP','measured','symptoms','gene','transcription factor','Tay syndrome','cell types',
'EWS/FLI1 fusion','genetic basis','drugs','genetic defect','selenoprotein','clinical trials','enzyme','genes','archaeal genomes','pathological conditions','mTORC1',
'kinase','genes','viruses','vildagliptin','prognosis','MMP proteins','hyperosmia','number','bacterium','symptoms','plant DNA','histone variant CENPA','proteins',
'prognostic','drugs','cellular localization','drugs','mutations','antioxidant','enzymatic activity','markers','metabolite','amino acid ','databases','transcriptional regulator BACH1',
'disease','disease','oprozomib','biological roles','proteins','syndrome',' drug Olaparib','proteins','transporter','phospholamban gene','gene','chromosomes','branch site consensus sequence',
'drugs','probes','enzyme','protein','number of protein','antibiotics','Gliolan','genes','protein','diseases','proportion','protein gel','Synostosis','medulloblastoma',
'symptoms','biomarker','proteins','Tuberous Sclerosis','sweating and anaerobic','motor speech disorder','enzyme','Dyke-Davidoff-Masson syndrome','gene','Polycomb group (PcG) proteins',
'phosphorylation consensus','medication','genes','treatment','genera','organs','CENPA','gene','cancer','CCFDN syndrome','oncogenes','protein','proteins','proteins',
'mDia','genomic signatures','arginine-methylated peptides','Rindopepimut','gene','tissue','Edwards syndrom','microbial enzyme','treatment','genes','diseases','tyrosine kinase',
'empagliflozin FDA','symptoms','how many','cellular processes','prokaryotic organisms','alpha-myosin' ,'Marfan syndrome',' cardiac death','rate of survival','loss of protein Lon1',
'gene','TAp73 isoforms','Ras oncogenes','ERAP2','anti-CD52 monoclonal antibody','brain structures','monoclonal antibodies ','deiodinase polymorphisms',' pharmacological treatment',
'tissue','transcription factor','diseases','restless leg syndrome','histone modification','memory problems','neuroendocrine tumors ','	How many','metabolism','hormone',
'protein','residue of alpha-synuclein','Genes','SSRIs','brodalumab','Long interspersed nuclear element','gene','receptors','genes','nemaline myopathy','syndrome',
'virus','chromosomes','translocation','effect of CRD-BP ','medical diagnostic tests','protein','echogenic bowel','enzymatic','percentage','disease','histones','indication of Daonil',
'peptide sequence','proteases','proteins','molecule','symptom','genes','clinical Villefranche criteria','treatment of neuropathic pain ','crystal structure','Rheumatoid Arthritis',
'methyl-CpG-binding protein','deiodinases','genes','molecular weight','(SGLT2) inhibitors ','gene','atypical neuroleptic drugs','Andersen syndrome','disease','protein','disease',
'deiodinase','proteins','transcriptional regulator BACH1 ','molecule','diabetes','pharmacophore modelling','enzymes','inhibitors','life expectancy ','genes','JAK (Janus kinase) inhibitor',
'DNA nucleases','dishevelled proteins','defective protein ','inflammatory caspase proteins','invertebrates','gene','Subependymal Giant Cell Astrocytoma','databases','disease','antibodies','ILK ablation',
'RNA polymerase','Parkin binding partners','drug Adempas','CRISPR/Cas9','protein','antiepileptic drug','bacteria','drug JTV519','long non- coding RNA malat-1 ','Oxantel',
'protein','enzyme',' Romano Ward long QT syndrome','gene',' mobile applications fields','genes','RNASeq experiments','Lewy bodies?','MC1R gene variants','enzyme','protein',
'antagonists','protein Drp1','diseases','myeloma','Atg8','Zika virus','K-ras alterations','polypill','Marfan syndrome','myotonic dystrophy','mammalian orthologs',
' outer diameter of microtubules','protein','Neurostimulation','myosin isozymes','proteins','Marchesani syndrome','Tumor-treating fields','angiocrine factors','Ctf4',
'hormone','Replisome Progression Complex','CFTR','CDK targets','plant organelles','itaconic acid levels','genes','transcription factors','Simpson grading','cell','enzyme',
'protein',' treatment options for anxiety ','adenosine A2A receptor antagonists','cardiac T1','autophagy pathway','TSC','fish anti-freeze proteins','phosphorylated residue',
'non coding RNAs','diet','mTOR Complex 2 (mTORC2)','proteins','risk factors ','multiple kinase inhibitors ','gene','LeuRS translational quality control','drug Denosumab','enzyme',
'gene test','cells','protein','antibody','micro-RNAs','enzymes','protein S100A7','protein-protein interactions','coronary artery bypass graft surger','proteins','SAGA complex',
'polyQ tract protein','subviral pathogens','prevention of sarcopenia','database of molecular recognition','database','kinase','genetic manipulation of cells','cellular target',
'acute myelogenous leukemia','different mutations','genes','DX-88','genes','Sleeping Beauty or the piggyBac transposons','inhibitors','Nalmefene','gene prioritization analysis','disease',
'Treprostinil','enzyme','pediatric cerebellar tumor','E3 ubiquitin ligase','phycobiliproteins','microsporidia','Emery-Dreifuss muscular dystrophy','CLN3 gene','genes',
'transmembrane protein','EMT','genetic lesion','active and poised enhancers','Li–Fraumeni','pituitary adenoma ','cardiac effects','Matuzumab','prostate-specific antigen',
'biomedical question answering systems','dediodinases','Fanconi anemia pathogenesis','symptoms','transcriptional initiation or elongation','polyadenylate-binding protein 4',
'average diameter','cryptochrome-1','intermediate filament (IF) protein','enzyme','genes','proportion','cell type','kinases','Magnetic beads','gene','Tumors','protein',
'human population','homologs ','mTOR','scales','calcium channels ',' sedimentation coefficient','proteins','receptor','trinucleotide repeat disorders ',' cellular targets ','Tuberous Sclerosis',
'drugs','bacterial microflora','families of deadenylases','translocation','ficolin-3','phosphorylated peptides','disorders','antibody','Facioscapulohumeral muscular dystrophy',
'diseases','commotio cordis','Phthiriasis Palpebrarum','gene','disorder','peripheral neuropathy','receptor','short QT syndrome','clinical trials','biotracers','isochores',
'mutational hotspots','intraflagellar transport (IFT) motor protein','transcript','TDP-43 and FUS proteins','Chompret criteria','enzyme','thyroid',' Kartagener Syndrome Triad','disease',
'G protein','APOBEC3 family',' histone modifications','application of SWATH-MS','Nothobranchius furzeri','mass-tag','gene','atechol-O-methyl transferase (COMT) inhibitors',
'structures','prognostic impact','genes','programs','disorders','gene','RTS S AS01 vaccine','deiodinases','computational methods ','regulatory molecule','gene','active neurotoxin',
'medication','enzymes','lubag disease','treatment','execution time','mutations','SEA0400','protein GATA1','DNA damage reponse','tele monitoring applications','ultraconserved elements',
'disease','selenoproteins','Src, Cortactin','cohesin','neurodevelopmental disorder','dovitinib','diseases','heat shock protein','gene','anorexia','indications','scaffold proteins','families of mammalian',
'classes of endogenous retroelements','GATA-1 interacting partners','yeast genome',' stem loop','proteins','symptoms','genetic manipulation',
' SWI/SNF protein','fungal hypoxanthine-adenine-guanine transporter','proteins','substances','archaeal genomes','mutations of SCN5A','network meta-analysis','clotting factor',
'syndromes','ESKAPE organisms','bladder wall','LY450139','gene','methyl donor','carbapenemases','gene','cardiomyopathy','genomic positions','sequence','Abnormality','anti-arrhythmic activity',
'arrhythmia','FLAMSA chemotherapy','IkB protein kinase (IKK)','glands','computer programs','transcription factors','gene','protein','gene and chromosomes','genes','METLIN database',
'RNA sequence','computational frameworks','proteins','genes','SCENAR therapy','symptoms','relationship','SPAN-100 score','genes','color','phylogenetic tree methods','histone modification',
'interleukin','disorder','subcellular localization','treatments','bone protein','receptors','contaminants','gene','Trypan blue','Selexipag','DEND syndrome','genes','disorder','bortezomib',
'genes','Idarucizumab','mesaure of gene expression','enzyme','pathogenic species','thyroid hormone transporter','subunits','protein','receptors','drugs','incidence of cystic fibrosis','Her2 status','X-ray free electron laser',
'protein','enzyme','disorder',' binding site motif','autoimmune disorders','clinical indication','thyroid cancer','role of LKB1 ','prognostic role','philadelphia translocation','gene mutations',
'syndrome','lung cancer','disease','algorithms','cytoplasmic or nuclear','promoterome mining','life expectancy','computational tools',' protein lacritin','triple screening test',
'Viagra','organisms','disorder','compounds','genes','receptors','fluorescent reporter proteins','transcription factors','lincRNA','SGK1-mediated phosphorylation of FOXO3a','drugs',
'caspases','BRAF inhibitors ','risk factors ','coactivators','genes','transcription-associated mutagenesis (TAM)','proteins','tumor','drug',' Shapiro syndrome','autism spectrum disorder (ASD)',
'treatment','cardiac death','Mutations','bacteria','duration of the QT interval',' protein neprilysin','disease','calcium pump','inheritance pattern ','Saccharomyces cerevisiae Rrm3p','ubiquitin proteome',
'RIP1','sympathetic nervous system','clinical trials of the polypill','cancers','Apert syndrome','multiple sclerosis (MS)','HER-2 status','treatment','human transmembrane nucleoporins','E3 ubiquitin ligase',
'genes','histone methyltransferases','bioinformatics tools ','phytoalexin','syndrome','selenoprotein synthesis','genes/proteins','disorders','protein','protein','pharmacogenetic test',
'oligonucleotides','Stiff man Syndrome','enzymes','indications for Glivec','proteins','g6PD-deficient','drug','Catecholaminergic Polymorphic Ventricular Tachycardia (CPVT)',
'protein-protein interactions','A-type lamins',' membrane protein','histone modifications','DNA G-quadruplexes','drugs','triple negative breast cancer','enzyme','deiodinase','treatments','genetic determinant',
'drugs','forms of cancer','hydrochlorothiazide','Apert syndrome','disease','genes','short QT syndrome','sports','breath test biomarkers',' DNA mutations ','diseases','calcium binding protein',
'diseases','anti-amyloid-beta monoclonal antibodies','anticancer drugs','inhibitors','Fanconi anemia','Gulf war syndrome','proteins','applications of machine learning algorithms','molecule',
'mutation','DNA repair system','cancers','hormone','species','types of cancer','subtypes',' typical rash','DNA nucleases','proteins','vector of Louping','receptor','diseases','threshold','genes code',
'neurotransmitters','enamel matrix derivative','acceptable sequence coverage(depth) ','HPV vaccination','clinical meaning','disease','DNA hypo or hypermethylation','parallel or antiparallel','protein','Pfeiffer syndrome',
' How large complexes','Palbociclib','genes','oligonucleotides','gold standard treatment','CHEK2 genetic variant','hormone abnormalities','assays','hormone','cell','molecule','causative agent of malaria',
'adriamycin(doxorubicin)','eukaryotic mRNA',' APOBEC3 protein','genetic defects','computational tools','disease','molecule','calcium/calmodulin','histone trimethylation','gene test','diseases',
'DNA (cytosine-5-)-methyltransferases inhibitors','histone modifications ','drug','complexes','NMD factors','histone marks','isradipine','non-surgical treatment ','genes','factors','DNA repair pathways ',
'musculoskeletal manifestations','proteins','effects of gabapentin',' transcription factors','Bruton tyrosine kinase inhibitor',' disulfide bridges','intramolecular phosphorylation ',
'methods','receptors','diseases','thyroid tissues','metaxin complex','classes of retrotransposons ','Diamond-Blackfan anemia','SerH3 immobilization antigen','promoter proximal pausing of RNA',
'Fanconi anemia','immunosuppressive drug cyclosporin A (CsA)','drugs','gene','UvrAB complex ','localization of the protein','treatment for CCSVI ','hormone','QT Jervell and Lange-Nielsen',
'programs','genes','promote or inhibit T-cell','number of CTCF','mutations','population','substrate of the haspin kinase','side-effects','driver gene mutations','cyclin- dependent kinase inhibitor','microRNAs',
'protein','process','bacterial species','properties of the mRNA','Tau hyperphosphorylation','histone residue','cells','components','disease','genes','gene','molecular target',
'disorder',' protein GATA1','Acromicric dysplasia','treatment','sclerostin interaction partners','imaging modalities ','software tools','tissue kallikrein genes','triad of Wernicke encephalopathy',
'ryanodine receptor','compound','molecule','protein','sequence consensus ','Drosophila melanogaster Groucho protein','enzyme','cardiac effects','anti-TNF drug','pharmacophore models',
'frequency of mutations','drugs']

In [None]:
import os

# Read the SKR/MetaMap credentials from the environment instead of hard-coding
# them, so the API key is never stored in the notebook. Set SKR_EMAIL and
# SKR_API_KEY before running this cell; a missing variable raises KeyError.
email = os.environ['SKR_EMAIL']
apikey = os.environ['SKR_API_KEY']

# Start from an empty list: `response` still holds the verb responses collected
# earlier, and leaving them in would make the focus concepts derived from it
# longer than `focus` and misaligned with the questions.
response = []
for i in range(len(focus)):
  inst = Submission(email, apikey)  # fresh submission object per request, as before
  inputtext = focus[i]
  inst.init_mm_interactive(inputtext)
  response.append(inst.submit())

In [None]:
len(focus)

780

In [None]:
# Rebuild `ss` from scratch: one list of '|'-separated fields per response.
ss=[resp.text.split('|') for resp in response]

In [None]:
concepts=[]

In [None]:
# Keep field 5 of each response; responses without that field get the
# placeholder string "[]" so `concepts` stays aligned with `ss`.
for fields in ss:
  concepts.append(fields[5] if index_in_list(fields, 5) else "[]")


In [None]:
# Save the focus concepts, one per line.
with open('concept_focus.txt', 'w') as f:
    f.writelines("%s\n" % item for item in concepts)

# Biomedical Named Entity

In [None]:
# NOTE(review): the two installs that follow pull their own spaCy (their logs show
# spacy 3.4.1 and then 3.2.4 being collected), so this pin presumably does not survive — confirm it is still needed.
!pip install spacy==3.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spacy==3.0
  Downloading spacy-3.0.0-cp37-cp37m-manylinux2014_x86_64.whl (12.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting thinc<8.1.0,>=8.0.0
  Downloading thinc-8.0.17-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (660 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m660.6/660.6 kB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m
Collecting pydantic<1.8.0,>=1.7.1
  Downloading pydantic-1.7.4-cp37-cp37m-manylinux2014_x86_64.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m
Collecting typer<0.4.0,>=0.3.0
  Downloading typer-0.3.2-py3-none-any.whl (21 kB)
Installing collected packages: typer, pydantic, thinc, spacy
  Attempting uninstall: typer
    Found existing installation: t

In [None]:
!pip install scispacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scispacy
  Downloading scispacy-0.5.1-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.9/44.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting nmslib>=1.7.3.6
  Downloading nmslib-2.1.1-cp37-cp37m-manylinux2010_x86_64.whl (13.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m40.1 MB/s[0m eta [36m0:00:00[0m
Collecting pysbd
  Downloading pysbd-0.3.4-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.1/71.1 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
Collecting spacy<3.5.0,>=3.4.0
  Downloading spacy-3.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m81.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting conllu
  Downloading conllu-4.5.2-py2.

In [None]:
# Install the small scispaCy model, release v0.5.0, straight from its tarball URL.
!pip install 'https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz
  Downloading https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_core_sci_sm-0.5.0.tar.gz (15.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.9/15.9 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting spacy<3.3.0,>=3.2.3
  Downloading spacy-3.2.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
Collecting thinc<8.1.0,>=8.0.12
  Using cached thinc-8.0.17-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (660 kB)
Collecting typing-extensions<4.0.0.0,>=3.7.4
  Downloading typing_extensions-3.10.0.2-py3-none-any.whl (26 kB)
Building wheels

In [None]:
import spacy
import scispacy

# EntityLinker is not referenced by name below; presumably this import is what makes
# the "scispacy_linker" pipe name available to add_pipe — TODO confirm.
from scispacy.linking import EntityLinker

# Load the scispaCy model installed above.
nlp = spacy.load("en_core_sci_sm")

# Append the UMLS entity linker; the first run downloads the linker data files (see the log below).
# NOTE(review): the cells shown here only read doc.ents, so the linker output may be unused — confirm it is needed.
nlp.add_pipe("scispacy_linker", config={"resolve_abbreviations": True, "linker_name": "umls"})

https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2020-10-09/umls/tfidf_vectors_sparse.npz not found in cache, downloading to /tmp/tmpreynajfe
Finished download, copying /tmp/tmpreynajfe to cache at /root/.scispacy/datasets/e9f7327283e43f0482f7c0c71b71dec278a58ccb3ffdd03c2c2350159e7ef146.f2a350ad19015b2591545f7feeed6a6d6d2fffcd635d868a5d7fc0dfc3cadfd8.tfidf_vectors_sparse.npz
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2020-10-09/umls/nmslib_index.bin not found in cache, downloading to /tmp/tmpkfitfqbr
Finished download, copying /tmp/tmpkfitfqbr to cache at /root/.scispacy/datasets/f48455d6c79262057cce66b4619123c2b558b21092d42fac97f47bb99a5b8f9f.dd70d3dffe7d90d7ac8914460e16a48375dab32485fb6313a34e6fbcaf53218b.nmslib_index.bin
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2020-10-09/umls/tfidf_vectorizer.joblib not found in cache, downloading to /tmp/tmph_d9iv5n
Finished download, copying /tmp/tmph_d9iv5n to cache at /root/.scispacy/da

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/linkers/2020-10-09/umls/concept_aliases.json not found in cache, downloading to /tmp/tmp0t1il24e
Finished download, copying /tmp/tmp0t1il24e to cache at /root/.scispacy/datasets/1428ec15d3b1061731ea273c03699130b3d6b90948993e74bda66af605ff8e2a.aeb7a686c654df6bccb6c2c23d3eda3eb381daaefda4592b58158d0bee53b352.concept_aliases.json
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/kbs/2020-10-09/umls_2020_aa_cat0129.jsonl not found in cache, downloading to /tmp/tmpk8wsnj2d
Finished download, copying /tmp/tmpk8wsnj2d to cache at /root/.scispacy/datasets/4d7fb8fcae1035d1e0a47d9072b43d5a628057d35497fbfb2499b4b7b2dd4dd7.05ec7eef12f336d4666da85b7fa69b9401883a7dd4244473f7b88b413ccbba03.umls_2020_aa_cat0129.jsonl
https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/data/umls_semantic_type_tree.tsv not found in cache, downloading to /tmp/tmpul3s5h0p
Finished download, copying /tmp/tmpul3s5h0p to cache at /root/.scispacy/datasets/21a1012c53

<scispacy.linking.EntityLinker at 0x7f258824bd10>

In [None]:
quest=data['Question']

In [None]:
entities=[]

In [None]:
# Run the pipeline on each question, echo the entity tuple it found and keep
# it in `entities` (one tuple per question, in question order).
for i in range(len(quest)):
  doc = nlp(quest[i])
  found = doc.ents
  print(found)
  entities.append(found)

(List, signaling molecules, interact receptor)
(thyroid hormone transporter, thyroid hormone resistance syndrome)
(miRNAs, potential, biomarkers, epithelial ovarian cancer)
(acetylcholinesterase inhibitors, treatment myasthenia gravis)
(List, human, genes, dishevelled proteins)
(synonym, Acrokeratosis paraneoplastica)
(antiarrhythmic drugs, VaughanWilliams classification)
(isoforms, mammalian, Notch receptor)
(characteristics, cellular senescence)
(Orteronel, treatment cancer)
(gene GLT8D1,)
(Yamanaka factors,)
(protein, Pannexin1)
(Wilsons disease,)
(Facioscapulohumeral muscular dystrophy, FSHD)
(proteins, participate formation, ryanodine receptor, quaternary macromolecular complex)
(HILIC,)
(List, endoscopic, diagnoses, children, autism)
(MAP kinase, phosphorylates, transcription factor, cjun)
(TAILS protein,)
(indications,)
(development, Ewing sarcoma)
(List, Hemolytic Uremic Syndrome)
(effects depleting, protein km231 DYNLRB1 cell)
(Treatment disease, investigated, MR, CLEAN study)

In [None]:
data['ent']=entities

In [None]:
# Turn each question's entity tuple into one comma-separated string.
# NOTE: not idempotent — rerunning on the already-joined strings would comma-separate their characters.
data['ent'] = [','.join(str(span) for span in spans) for spans in data['ent']]

# MultiLabelBinarizer

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Learn the label vocabulary and encode every question's labels as a 0/1 row of Y.
# NOTE(review): MultiLabelBinarizer treats each entry as an iterable of labels —
# confirm data.Labels holds lists/tuples here rather than comma-separated strings.
multilabel_binarizer = MultiLabelBinarizer()
Y = multilabel_binarizer.fit_transform(data.Labels)

# TF-IDF of questions

In [None]:
# 80/20 split of the question text against the label matrix Y, with a fixed seed.
x=data.Question
X_train,X_test,y_train,y_test = train_test_split(x, Y, test_size=0.2, random_state=9000)

In [None]:
# Fit TF-IDF on the training questions only, then encode both splits.
# NOTE: X_train/X_test are rebound from text to matrices, so this cell cannot be rerun on its own.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)
for matrix in (X_train, X_test):
  print(matrix.shape)

(624, 1744)
(156, 1744)


# Loading Feature File

In [None]:
# Load the precomputed feature table. NOTE(review): this rebinds `data`, while Y and the TF-IDF
# split above were built from the previous frame — assumes both have the same rows in the same order (TODO confirm).
data=pd.read_csv('/content/drive/MyDrive/Features.csv') #protein feature changed to yes or no manually
data.head()

Unnamed: 0.1,Unnamed: 0,No.,Question ID,Question,Labels,Noun_Concepts,Verb_Concepts,Focus,ent,Protein
0,0,1,55046d5ff8aee20f27000007,List signaling molecules ligands interact rece...,umls:aapp,"[inpr],[sbst],[chem],[aapp,rcpt],[aapp,rcpt]",,"[aapp,rcpt]","List,signaling molecules,interact receptor",No
1,1,2,52f8b2902059c6d71c000053,thyroid hormone transporter implicated thyroid...,umls:enzy,"[irda],[qlco],[aapp,bacs],[irda],[qlco],[ftcn]...",,[horm],"thyroid hormone transporter,thyroid hormone re...",No
2,2,3,553fa78b1d53b76422000007,miRNAs potential biomarkers epithelial ovarian...,"umls:nusq,umls:gngm","[bacs,nnon],[clna],[euka]",[fndg],"[bacs,nnon]","miRNAs,potential,biomarkers,epithelial ovarian...",No
3,3,4,5149199dd24251bc05000040,acetylcholinesterase inhibitors treatment myas...,"umls:orch,umls:clnd,umls:phsu","[aapp,enzy],[chvf],[topp],[dsyn],[fndg]",[fndg],[phsu],"acetylcholinesterase inhibitors,treatment myas...",No
4,4,5,5709e4b2cf1c32585100001c,List human genes encoding dishevelled proteins,"tmtool:Gene,umls:bacs,umls:aapp","[gngm],[aapp,bacs]",[inpr],[gngm],"List,human,genes,dishevelled proteins",Yes


# Noun Concepts feature

In [None]:
# Same test_size and random_state as the question split, so (for equal row counts) the same rows land in train/test.
xn=data.Noun_Concepts
Xtrain,Xtest,ytrain,ytest = train_test_split(xn, Y, test_size=0.2, random_state=9000)

In [None]:
# Binary bag-of-tokens over the noun-concept strings; vocabulary comes from the training rows only.
vect = CountVectorizer(binary=True).fit(Xtrain)
train=vect.transform(Xtrain)
test=vect.transform(Xtest)

# Verb Concepts

In [None]:
data.Verb_Concepts=data.Verb_Concepts.fillna(0)

In [None]:
data.Verb_Concepts = [str (item) for item in data.Verb_Concepts]

In [None]:
# Same test_size and random_state as the question split; ytrain/ytest are simply overwritten with the same label split.
xv=data.Verb_Concepts
Vtrain,Vtest,ytrain,ytest = train_test_split(xv, Y, test_size=0.2, random_state=9000)

In [None]:
# Binary bag-of-tokens over the verb-concept strings; vocabulary comes from the training rows only.
vectv = CountVectorizer(binary=True).fit(Vtrain)
Vtrain=vectv.transform(Vtrain)
Vtest=vectv.transform(Vtest)

# Focus Concept

In [None]:
data.Focus=data.Focus.fillna(0)

In [None]:
data.Focus = [str (item) for item in data.Focus]

In [None]:
# Same test_size and random_state as the question split; ytrain/ytest are simply overwritten with the same label split.
xf=data.Focus
ftrain,ftest,ytrain,ytest = train_test_split(xf, Y, test_size=0.2, random_state=9000)

In [None]:
# Binary bag-of-tokens over the focus-concept strings; vocabulary comes from the training rows only.
vectf = CountVectorizer(binary=True).fit(ftrain)
ftrain=vectf.transform(ftrain)
ftest=vectf.transform(ftest)

# Entity

In [None]:
# Missing entity strings become 0 and are then stringified, so every row is a str for the vectorizer.
data.ent=data.ent.fillna(0)
data.ent = [str (item) for item in data.ent]
xe=data.ent
# Same test_size and random_state as the question split.
etrain,etest,ytrain,ytest = train_test_split(xe, Y, test_size=0.2, random_state=9000)

In [None]:
# Token counts (not binary) over the entity strings; vocabulary comes from the training rows only.
vecte = CountVectorizer().fit(etrain)
etrain=vecte.transform(etrain)
etest=vecte.transform(etest)

In [None]:
etrain.shape

(624, 1334)

# Protein

In [None]:
# Protein is the manually edited Yes/No column; same test_size and random_state as the question split.
xp=data.Protein
ptrain,ptest,ytrain,ytest = train_test_split(xp, Y, test_size=0.2, random_state=9000)

In [None]:
# Encode the Yes/No protein flag as token counts (two columns in the run shown below).
vectp = CountVectorizer().fit(ptrain)
ptrain=vectp.transform(ptrain)
ptest=vectp.transform(ptest)

In [None]:
ptrain.shape

(624, 2)

# Combining features and TF-IDF

In [None]:
# Training design matrix: TF-IDF columns followed by the noun, verb, focus, entity and protein blocks.
# (Pass only (X_train, train) to reproduce the TF-IDF + noun-concepts variant.)
X=scipy.sparse.hstack((X_train, train, Vtrain, ftrain, etrain, ptrain))

In [None]:
X.shape

(624, 3284)

In [None]:
# Test design matrix, with the blocks in the same column order as the training matrix.
# (Pass only (X_test, test) to reproduce the TF-IDF + noun-concepts variant.)
ttest=scipy.sparse.hstack((X_test, test, Vtest, ftest, etest, ptest))

In [None]:
ttest.shape

(156, 3284)

In [None]:
X

<624x3284 sparse matrix of type '<class 'numpy.float64'>'
	with 12675 stored elements in COOrdinate format>

# Classifiers

In [None]:
# One-vs-rest multinomial naive Bayes on the combined features.
# NOTE(review): an inline note here used to read "72.9", which does not match the
# F1 printed in this cell's output — presumably left over from an earlier run.
clf = OneVsRestClassifier(MultinomialNB())
y_pred = clf.fit(X, y_train).predict(ttest)
print("Hamming Loss",hamming_loss(y_test,y_pred))
print("F1 score",f1_score(y_test,y_pred,average='micro'))

Hamming Loss 0.10718545020870603
F1 score 0.74656327106098


  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


In [None]:
# One-vs-rest logistic regression on the combined features.
# NOTE(review): an inline note here used to read "72.7", which does not match the
# F1 printed in this cell's output — presumably left over from an earlier run.
clf = OneVsRestClassifier(LogisticRegression())
y_pred = clf.fit(X, y_train).predict(ttest)
print("Hamming Loss",hamming_loss(y_test,y_pred))
print("F1 score",f1_score(y_test,y_pred,average='micro'))

  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


Hamming Loss 0.1050983899821109
F1 score 0.7607736681370886


In [None]:
# One-vs-rest linear SVM on the combined features.
# NOTE(review): an inline note here used to read "77.2", which does not match the
# F1 printed in this cell's output — presumably left over from an earlier run.
clf = OneVsRestClassifier(LinearSVC())
y_pred = clf.fit(X, y_train).predict(ttest)
print("Hamming Loss",hamming_loss(y_test,y_pred))
print("F1 score",f1_score(y_test,y_pred,average='micro'))

Hamming Loss 0.10673822301729279
F1 score 0.7669270833333334


  "Label %s is present in all training examples." % str(classes[c])
  "Label %s is present in all training examples." % str(classes[c])


# Binary Relevance

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# Binary relevance with an AdaBoost base learner; predictions are densified
# before scoring. The cell displays the Hamming loss.
classifier = BinaryRelevance(AdaBoostClassifier())
classifier.fit(X, y_train)
predictions = classifier.predict(ttest).toarray()
hamming_loss(y_test,predictions)

0.11687537268932618

In [None]:
f1_score(y_test,predictions,average='micro') #74.8

0.7429508196721313

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Binary relevance with a random-forest base learner; the cell displays the Hamming loss.
# NOTE: `classifierrf` / `predictionsrf` are reused by later random-forest cells, so they always hold the most recent model.
classifierrf = BinaryRelevance(RandomForestClassifier())
classifierrf.fit(X, y_train)
predictionsrf = classifierrf.predict(ttest)
hamming_loss(y_test,predictionsrf)

0.10405485986881335

In [None]:
f1_score(y_test,predictionsrf,average='micro') 

0.7510699001426534

# Classifier Chain

In [None]:
# Classifier chain with an AdaBoost base learner; predictions are densified
# before scoring. The cell displays the Hamming loss.
classifierCC = ClassifierChain(AdaBoostClassifier())
classifierCC.fit(X, y_train)
predictionsCC = classifierCC.predict(ttest).toarray()
hamming_loss(y_test,predictionsCC)

0.120304114490161

In [None]:
f1_score(y_test,predictionsCC,average='micro') 

0.7379019162065605

In [None]:
# Classifier chain with a random-forest base learner; the cell displays the Hamming loss.
# NOTE: rebinds `classifierrf` / `predictionsrf`, replacing the binary-relevance model and predictions.
classifierrf = ClassifierChain(RandomForestClassifier())
classifierrf.fit(X, y_train)
predictionsrf = classifierrf.predict(ttest)
hamming_loss(y_test,predictionsrf)

0.10494931425163984

In [None]:
f1_score(y_test,predictionsrf,average='micro') 

0.744927536231884

# Label PowerSet

In [None]:
#LP
classifierLB = LabelPowerset(AdaBoostClassifier())
classifierLB.fit(X, y_train)
predictionsLB = classifierLB.predict(ttest)
predictionsLB=predictionsLB.toarray()
hamming_loss(y_test,predictionsLB)

0.142814549791294

In [None]:
f1_score(y_test,predictionsLB,average='micro') 

0.670563961485557

In [None]:
# Label powerset with a random-forest base learner; the cell displays the Hamming loss.
# NOTE: rebinds `classifierrf` / `predictionsrf`, replacing the classifier-chain model and predictions.
classifierrf = LabelPowerset(RandomForestClassifier())
classifierrf.fit(X, y_train)
predictionsrf = classifierrf.predict(ttest)
hamming_loss(y_test,predictionsrf)

0.10166964818127609

In [None]:
f1_score(y_test,predictionsrf,average='micro') #75.6

0.7756578947368421

# RakelD

In [None]:
# RakelD with label subsets of size 4 and an AdaBoost base learner; the base
# learner is fed dense feature and label matrices.
classifier = RakelD(base_classifier=AdaBoostClassifier(),
                    base_classifier_require_dense=[True, True],
                    labelset_size=4)
classifier.fit(X, y_train)
prediction6 = classifier.predict(ttest)

  return np.array(label_sets)


In [None]:

prediction6=prediction6.toarray()


In [None]:
hamming_loss(y_test,prediction6)

0.1325283243887895

In [None]:
f1_score(y_test,prediction6,average='micro') #69.3

0.6968973747016706

In [None]:
# RakelD with label subsets of size 4 and a random-forest base learner.
classifier = RakelD(
            base_classifier=RandomForestClassifier(),
            base_classifier_require_dense=[True, True],
            labelset_size=4
        )
# Fit and score the RakelD model built above. The cell previously fitted
# `classifierrf`, which still referred to the LabelPowerset forest, so the
# RakelD random-forest model was never trained or evaluated.
classifier.fit(X, y_train)
# Densify as the AdaBoost RakelD cells do, so both variants are scored on the same array type.
predictionsrf = classifier.predict(ttest).toarray()
hamming_loss(y_test,predictionsrf)

0.10524746571258199

In [None]:
f1_score(y_test,predictionsrf,average='micro') #74.7

0.7680683311432326

In [None]:
#pip uninstall scikit-learn -y  # install a different version of scikit-learn before running MLkNN to avoid errors

In [None]:
#pip install scikit-learn==0.24.1

In [None]:
c = MLkNN(k=3)

In [None]:
c.fit(X, y_train)



MLkNN(k=3)

In [None]:
predictionsKNN = c.predict(ttest)

In [None]:
predictionsKNN= predictionsKNN.toarray()

In [None]:
hamming_loss(y_test,predictionsKNN)

0.12090041741204532

In [None]:
f1_score(y_test,predictionsKNN,average='micro')

0.7403138008325327